Implement Recovery Mode: Use yt-dlp to recover missing metadata for unindexed files

2026-01-05 01:21:38 -05:00 · 2026-01-05 01:21:38 -05:00 · 07a7dd3c07
commit 07a7dd3c07
parent 180e0632e5
4 changed files with 291 additions and 7 deletions
--- a/3
+++ b/3
@ -3,7 +3,8 @@ WORKDIR /app

 # 1. Install System Deps (ffmpeg) FIRST
 # These rarely change, so Docker will cache this layer forever.
-RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*
+RUN apt-get update && apt-get install -y ffmpeg curl && rm -rf /var/lib/apt/lists/*
+# -f makes curl exit non-zero on an HTTP error, so a failed download breaks the
+# build instead of silently installing the error page as the yt-dlp executable.
+RUN curl -fsSL https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp && chmod a+rx /usr/local/bin/yt-dlp

 # 2. Install Python Deps SECOND
 # Only re-runs if requirements.txt changes
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -7,6 +7,7 @@ services:
      - /path/to/your/source:/app/source
      - /path/to/your/target:/app/target
      - /path/to/your/data:/app/data
+      - /path/to/your/import:/app/import
    ports:
      - "8002:5000"
    environment:
--- a/ta_symlink.py
+++ b/ta_symlink.py
@ -6,6 +6,7 @@ import sys
 import threading
 import time
 import ipaddress
+import shutil
 from functools import wraps
 from flask import Flask, jsonify, render_template, request, abort, Response

@ -19,6 +20,7 @@ UI_USERNAME = os.getenv("UI_USERNAME", "admin")
 UI_PASSWORD = os.getenv("UI_PASSWORD", "password")
 SOURCE_DIR = Path("/app/source")
 TARGET_DIR = Path("/app/target")
+IMPORT_DIR = Path("/app/import")
 HEADERS = {"Authorization": f"Token {API_TOKEN}"}

 app = Flask(__name__)
@ -439,6 +441,148 @@ def check_orphaned_links():
    log(f"✅ Check complete. Scanned {total_checked} files, found {len(orphaned)} orphaned symlinks.")
    return orphaned

+def extract_id_from_filename(filename):
+    """
+    Pull the 11-character YouTube video ID out of a file name.
+
+    Two layouts are recognised, checked in this order:
+      1. 'Title [VIDEO_ID].ext' - the ID sits in square brackets at the very
+         end of the stem (a bare '[VIDEO_ID].ext' matches as well).
+      2. 'VIDEO_ID.ext' - the whole stem is the ID.
+
+    Returns the ID string, or None when neither layout matches.
+    """
+    stem = Path(filename).stem
+
+    # Layout 1: bracketed ID closing the stem.
+    bracketed = re.search(r'\[([a-zA-Z0-9_-]{11})\]$', stem)
+    if bracketed is not None:
+        return bracketed.group(1)
+
+    # Layout 2: the stem is nothing but the ID.
+    bare = re.match(r'^[a-zA-Z0-9_-]{11}$', stem)
+    return stem if bare else None
+
+def scan_for_unindexed_videos():
+    """
+    Scans SOURCE_DIR for video files that are NOT in the TubeArchivist metadata.
+
+    Only files located directly inside a first-level sub-folder of SOURCE_DIR
+    (the "channel folder") are inspected. A file is reported when it has one
+    of the known video extensions, a YouTube ID can be extracted from its
+    name, and that ID is not a key of the mapping returned by
+    fetch_all_metadata().
+
+    Returns:
+        A list of dicts with the keys "path", "filename", "video_id",
+        "channel_folder" and "size_mb" (size in MiB, rounded to two decimals).
+        The list is empty when SOURCE_DIR does not exist.
+    """
+    # Extensions treated as video files (adjust if needed).
+    video_extensions = {'.mp4', '.mkv', '.webm', '.mov'}
+
+    log("🔍 Scanning for unindexed files...")
+
+    # Nothing to scan: bail out before paying for the metadata fetch.
+    if not SOURCE_DIR.exists():
+        return []
+
+    # IDs already known to TubeArchivist; a set keeps membership tests cheap.
+    known_ids = set(fetch_all_metadata())
+
+    unindexed = []
+    for channel_path in SOURCE_DIR.iterdir():
+        if not channel_path.is_dir():
+            continue
+
+        for video_file in channel_path.glob("*.*"):
+            if video_file.suffix.lower() not in video_extensions:
+                continue
+
+            # Files without a recognisable ID cannot be recovered, and files
+            # whose ID is already indexed need no recovery.
+            vid_id = extract_id_from_filename(video_file.name)
+            if not vid_id or vid_id in known_ids:
+                continue
+
+            # is_file() is False for directories and broken symlinks; the
+            # try/except covers a file that disappears between the two calls.
+            # Either way one bad entry must not abort the whole scan.
+            if not video_file.is_file():
+                continue
+            try:
+                size_bytes = video_file.stat().st_size
+            except OSError:
+                continue
+
+            unindexed.append({
+                "path": str(video_file),
+                "filename": video_file.name,
+                "video_id": vid_id,
+                "channel_folder": channel_path.name,
+                "size_mb": round(size_bytes / (1024 * 1024), 2)
+            })
+
+    log(f"✅ Found {len(unindexed)} unindexed video files.")
+    return unindexed
+
+def recover_video_metadata(filepath):
+    """
+    Uses yt-dlp to fetch metadata for a video file and prepares it for import.
+
+    The video is staged in IMPORT_DIR next to a '<stem>.info.json' sidecar:
+      1. yt-dlp is asked for the metadata only (--skip-download).
+      2. If yt-dlp exits non-zero, a minimal placeholder .info.json is
+         written instead, using the file and folder names as best guesses.
+      3. The video is hardlinked into IMPORT_DIR, or copied when hardlinking
+         fails. The source file itself is left in place.
+
+    Args:
+        filepath: Path to the video file. It must lie inside SOURCE_DIR and
+            its name must contain a YouTube ID (see extract_id_from_filename).
+
+    Returns:
+        A (success, message) tuple. Failures are reported through the tuple.
+    """
+    import subprocess
+    import json
+
+    # The path arrives from an HTTP request, so confine it to SOURCE_DIR.
+    # abspath() collapses '..' segments without following symlinks, so this
+    # is a lexical check: symlinked media inside SOURCE_DIR keeps working.
+    src_path = Path(os.path.abspath(str(filepath)))
+    try:
+        src_path.relative_to(SOURCE_DIR)
+    except ValueError:
+        return False, "File is outside the source directory"
+
+    if not src_path.is_file():
+        return False, "File not found"
+
+    vid_id = extract_id_from_filename(src_path.name)
+    if not vid_id:
+        return False, "Could not extract Video ID from filename"
+
+    # Target paths
+    dest_video = IMPORT_DIR / src_path.name
+    dest_json = IMPORT_DIR / f"{src_path.stem}.info.json"
+
+    log(f"🚑 Recovering: {vid_id} ...")
+
+    # 1. Fetch Metadata using yt-dlp
+    # '%' introduces a field in yt-dlp output templates, so literal percent
+    # signs in the file name are doubled to keep the sidecar name predictable.
+    # The deprecated --id flag is dropped; an explicit -o is supplied anyway.
+    output_template = f"{IMPORT_DIR}/{src_path.stem}".replace("%", "%%")
+    cmd = [
+        "yt-dlp",
+        "--write-info-json",
+        "--skip-download",
+        f"https://www.youtube.com/watch?v={vid_id}",
+        "-o", output_template,
+    ]
+
+    try:
+        # Ensure import dir exists
+        IMPORT_DIR.mkdir(parents=True, exist_ok=True)
+
+        # The timeout keeps a stalled network call from pinning the worker
+        # thread forever; expiry surfaces as a failure via the handler below.
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
+
+        if result.returncode != 0:
+            log(f"   ⚠️ yt-dlp failed (Video likely deleted). Generating offline metadata...")
+            # Create a minimal .info.json manually.
+            # Title = stem without the bracketed ID; fall back to the ID so a
+            # bare '[VIDEO_ID].ext' file does not end up titled '[VIDEO_ID]'.
+            title = src_path.stem.replace(f"[{vid_id}]", "").strip() or vid_id
+            offline_meta = {
+                "id": vid_id,
+                "title": title,
+                "description": "Recovered by TA-Organizerr (Offline Mode)",
+                "uploader": src_path.parent.name,  # Guess channel from folder name
+                "channel_id": "UC_UNKNOWN",  # We can't know this without online check
+                "upload_date": "20000101",  # Unknown
+                "thumbnail": "",  # No thumbnail
+                "webpage_url": f"https://www.youtube.com/watch?v={vid_id}",
+            }
+            with open(dest_json, 'w', encoding='utf-8') as f:
+                json.dump(offline_meta, f, indent=4)
+            log("   ✅ Generated offline metadata.")
+        else:
+            log("   ✅ Fetched online metadata.")
+
+        # 2. Stage the video file. A symlink is not used because the TA import
+        # consumes files; a hardlink is cheap, a copy is the fallback.
+        try:
+            if dest_video.exists():
+                dest_video.unlink()
+
+            try:
+                os.link(src_path, dest_video)
+                log("   🔗 Hardlinked video file.")
+            except OSError:
+                shutil.copy2(src_path, dest_video)
+                log("   📂 Copied video file.")
+
+        except OSError as e:
+            return False, f"Failed to link or copy video: {e}"
+
+        return True, "Ready for import"
+
+    except Exception as e:
+        log(f"   ❌ Recovery failed: {e}")
+        return False, str(e)
+
 # Main logic

 def process_videos():
@ -711,6 +855,28 @@ def api_transcode_logs():
            "next_index": len(transcode_log_buffer)
        })

+@app.route("/api/recovery/scan", methods=["POST"])
+@requires_auth
+def api_recovery_scan():
+    """Run the unindexed-file scan and return the candidates and their count as JSON."""
+    unindexed = scan_for_unindexed_videos()
+    payload = {"files": unindexed, "count": len(unindexed)}
+    return jsonify(payload)
+
+@app.route("/api/recovery/start", methods=["POST"])
+@requires_auth
+def api_recovery_start():
+    """
+    Start recovery of one file in a background thread.
+
+    Expects a JSON object body with a string "filepath". Responds with HTTP 400
+    when the body is missing, is not a JSON object, or carries no usable
+    "filepath"; otherwise returns immediately while the recovery runs in its
+    own thread and reports its outcome through log().
+    """
+    # silent=True turns a missing or malformed JSON body into None so every
+    # bad request ends in the same JSON 400 below instead of an AttributeError.
+    data = request.get_json(silent=True)
+    filepath = data.get('filepath') if isinstance(data, dict) else None
+
+    if not filepath or not isinstance(filepath, str):
+        return jsonify({"error": "No filepath provided"}), 400
+
+    def run_recovery():
+        _success, msg = recover_video_metadata(filepath)
+        log(f"Recovery Result for {filepath}: {msg}")
+
+    threading.Thread(target=run_recovery).start()
+    return jsonify({"message": "Recovery started", "status": "started"})
+    
 if __name__ == "__main__":
    # Start scheduler in background thread
    thread = threading.Thread(target=scheduler, daemon=True)
--- a/templates/dashboard.html
+++ b/templates/dashboard.html
@ -56,9 +56,20 @@
            margin-right: 5px;
        }

-        .status-green { background-color: var(--accent-success); box-shadow: 0 0 8px var(--accent-success); }
-        .status-yellow { background-color: var(--accent-warning); box-shadow: 0 0 8px var(--accent-warning); }
-        .status-red { background-color: var(--accent-danger); box-shadow: 0 0 8px var(--accent-danger); }
+        .status-green {
+            background-color: var(--accent-success);
+            box-shadow: 0 0 8px var(--accent-success);
+        }
+
+        .status-yellow {
+            background-color: var(--accent-warning);
+            box-shadow: 0 0 8px var(--accent-warning);
+        }
+
+        .status-red {
+            background-color: var(--accent-danger);
+            box-shadow: 0 0 8px var(--accent-danger);
+        }

        .btn-xl {
            padding: 15px 20px;
@ -72,9 +83,11 @@

 <body>
    <div class="container-fluid p-4" style="max-width: 1600px;">
-        <header class="d-flex justify-content-between align-items-center mb-5 border-bottom pb-3" style="border-color: #333 !important;">
+        <header class="d-flex justify-content-between align-items-center mb-5 border-bottom pb-3"
+            style="border-color: #333 !important;">
            <div class="d-flex align-items-center">
-                <h1 class="display-6 mb-0 me-3"><i class="bi bi-collection-play-fill text-primary"></i> TA Organizer</h1>
+                <h1 class="display-6 mb-0 me-3"><i class="bi bi-collection-play-fill text-primary"></i> TA Organizer
+                </h1>
                <span class="badge bg-secondary" id="connection-status">Connecting...</span>
            </div>
            <div>
@ -134,6 +147,10 @@
                            <i class="bi bi-binoculars"></i> Check Orphaned Links
                        </button>

+                        <button class="btn btn-outline-info" onclick="showRecoveryModal()">
+                            <i class="bi bi-bandaid"></i> Recovery Mode
+                        </button>
+
                        <!-- Clean button disabled as requested -->
                        <!-- 
                        <button class="btn btn-danger" onclick="triggerCleanup()">
@ -163,6 +180,48 @@
            </div>
        </div>

+        <!-- Recovery Modal -->
+        <div class="modal fade" id="recoveryModal" tabindex="-1">
+            <div class="modal-dialog modal-lg">
+                <div class="modal-content" style="background-color: var(--bg-card); border: 1px solid #444;">
+                    <div class="modal-header border-bottom border-secondary">
+                        <h5 class="modal-title"><i class="bi bi-bandaid"></i> File Recovery</h5>
+                        <button type="button" class="btn-close btn-close-white" data-bs-dismiss="modal"></button>
+                    </div>
+                    <div class="modal-body">
+                        <div class="alert alert-info">
+                            This tool scans for video files in your source folder that are <strong>NOT</strong> in TA's
+                            database.
+                            It will attempt to fetch metadata (using <code>yt-dlp</code>) and move them to the Import
+                            folder.
+                        </div>
+                        <div class="d-grid mb-3">
+                            <button class="btn btn-primary" onclick="scanRecoveryFiles()">
+                                <i class="bi bi-search"></i> Scan for Unindexed Files
+                            </button>
+                        </div>
+                        <div class="table-responsive" style="max-height: 400px;">
+                            <table class="table table-dark table-striped table-hover mb-0">
+                                <thead>
+                                    <tr>
+                                        <th>Video ID</th>
+                                        <th>Filename</th>
+                                        <th>Size</th>
+                                        <th>Action</th>
+                                    </tr>
+                                </thead>
+                                <tbody id="recovery-table-body">
+                                    <tr>
+                                        <td colspan="4" class="text-center text-muted">Click Scan to begin...</td>
+                                    </tr>
+                                </tbody>
+                            </table>
+                        </div>
+                    </div>
+                </div>
+            </div>
+        </div>
+
        <div class="card">
            <div class="card-header d-flex justify-content-between align-items-center">
                <span>Video Matrix</span>
@ -275,6 +334,63 @@
            setTimeout(() => { resultsDiv.innerHTML = ''; }, 10000);
        }

+        // Recovery Functions
+        // Bootstrap modal controller for the recovery dialog, created once at load.
+        const recoveryModalElement = document.getElementById('recoveryModal');
+        const recoveryModal = new bootstrap.Modal(recoveryModalElement);
+
+        /** Open the recovery dialog (wired to the "Recovery Mode" button). */
+        function showRecoveryModal() {
+            recoveryModal.show();
+        }
+
+        /**
+         * Ask the backend for unindexed video files and render them as table rows.
+         *
+         * Rows are built with DOM APIs and textContent rather than an HTML string:
+         * file names and paths come from disk and may contain quotes or angle
+         * brackets that would otherwise break the markup or inject script.
+         */
+        async function scanRecoveryFiles() {
+            const tbody = document.getElementById('recovery-table-body');
+            tbody.innerHTML = '<tr><td colspan="4" class="text-center"><div class="spinner-border text-primary" role="status"></div> Scanning...</td></tr>';
+
+            // Replace the table body with a single full-width status row.
+            const showStatus = (text, cssClass) => {
+                const td = document.createElement('td');
+                td.colSpan = 4;
+                td.className = `text-center ${cssClass}`;
+                td.textContent = text;
+                const tr = document.createElement('tr');
+                tr.appendChild(td);
+                tbody.innerHTML = '';
+                tbody.appendChild(tr);
+            };
+
+            try {
+                const res = await fetch('/api/recovery/scan', { method: 'POST' });
+                // Surface auth/server failures instead of parsing an error page.
+                if (!res.ok) {
+                    throw new Error(`HTTP ${res.status}`);
+                }
+                const data = await res.json();
+                const files = Array.isArray(data.files) ? data.files : [];
+
+                if (files.length === 0) {
+                    showStatus('No unindexed files found!', 'text-success');
+                    return;
+                }
+
+                tbody.innerHTML = '';
+                files.forEach(f => {
+                    const idCode = document.createElement('code');
+                    idCode.textContent = f.video_id;
+                    const idCell = document.createElement('td');
+                    idCell.appendChild(idCode);
+
+                    const nameCell = document.createElement('td');
+                    nameCell.title = f.path;
+                    nameCell.style.cssText = 'max-width: 300px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;';
+                    nameCell.textContent = f.filename;
+
+                    const sizeCell = document.createElement('td');
+                    sizeCell.textContent = `${f.size_mb} MB`;
+
+                    // The path is captured by the closure, so no escaping into
+                    // an inline onclick attribute is needed.
+                    const button = document.createElement('button');
+                    button.className = 'btn btn-sm btn-success';
+                    button.innerHTML = '<i class="bi bi-cloud-arrow-up"></i> Recover';
+                    button.addEventListener('click', () => startRecovery(f.path));
+                    const actionCell = document.createElement('td');
+                    actionCell.appendChild(button);
+
+                    const tr = document.createElement('tr');
+                    tr.append(idCell, nameCell, sizeCell, actionCell);
+                    tbody.appendChild(tr);
+                });
+            } catch (e) {
+                showStatus(`Error: ${e}`, 'text-danger');
+            }
+        }
+
+        /**
+         * Ask the backend to start recovery for one file, after user confirmation.
+         *
+         * The backend only acknowledges that the job was started; the outcome is
+         * written to the logs. A non-2xx response is shown as an error rather
+         * than as a successful start.
+         */
+        async function startRecovery(filepath) {
+            if (!confirm("Start recovery for this file? This will try to fetch metadata and move it to the Import folder.")) return;
+
+            try {
+                const res = await fetch('/api/recovery/start', {
+                    method: 'POST',
+                    headers: { 'Content-Type': 'application/json' },
+                    body: JSON.stringify({ filepath })
+                });
+                // An error response may not carry a JSON body; fall back to {}.
+                const data = await res.json().catch(() => ({}));
+                if (!res.ok) {
+                    alert("Error starting recovery: " + (data.error || `HTTP ${res.status}`));
+                    return;
+                }
+                alert(data.message || "Recovery started! Check logs.");
+                // Optionally remove the row
+            } catch (e) {
+                alert("Error starting recovery: " + e);
+            }
+        }
+
        function clearLogs() {
            document.getElementById('log-container').innerHTML = '';
        }