From 07a7dd3c07e8f2dd71f5d3c9ccf8abc14fe78c68 Mon Sep 17 00:00:00 2001 From: wander Date: Mon, 5 Jan 2026 01:21:38 -0500 Subject: [PATCH] Implement Recovery Mode: Use yt-dlp to recover missing metadata for unindexed files --- Dockerfile | 3 +- docker-compose.yml | 1 + ta_symlink.py | 166 +++++++++++++++++++++++++++++++++++++++ templates/dashboard.html | 128 ++++++++++++++++++++++++++++-- 4 files changed, 291 insertions(+), 7 deletions(-) diff --git a/Dockerfile b/Dockerfile index af3bfb0..d45f0da 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,8 @@ WORKDIR /app # 1. Install System Deps (ffmpeg) FIRST # These rarely change, so Docker will cache this layer forever. -RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install -y ffmpeg curl && rm -rf /var/lib/apt/lists/* +RUN curl -L https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp && chmod a+rx /usr/local/bin/yt-dlp # 2. Install Python Deps SECOND # Only re-runs if requirements.txt changes diff --git a/docker-compose.yml b/docker-compose.yml index c8188ca..603e388 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -7,6 +7,7 @@ services: - /path/to/your/source:/app/source - /path/to/your/target:/app/target - /path/to/your/data:/app/data + - /path/to/your/import:/app/import ports: - "8002:5000" environment: diff --git a/ta_symlink.py b/ta_symlink.py index c41c352..131469d 100644 --- a/ta_symlink.py +++ b/ta_symlink.py @@ -6,6 +6,7 @@ import sys import threading import time import ipaddress +import shutil from functools import wraps from flask import Flask, jsonify, render_template, request, abort, Response @@ -19,6 +20,7 @@ UI_USERNAME = os.getenv("UI_USERNAME", "admin") UI_PASSWORD = os.getenv("UI_PASSWORD", "password") SOURCE_DIR = Path("/app/source") TARGET_DIR = Path("/app/target") +IMPORT_DIR = Path("/app/import") HEADERS = {"Authorization": f"Token {API_TOKEN}"} app = Flask(__name__) @@ -439,6 +441,148 
@@ def check_orphaned_links(): log(f"✅ Check complete. Scanned {total_checked} files, found {len(orphaned)} orphaned symlinks.") return orphaned +def extract_id_from_filename(filename): + """ + Extracts YouTube ID from filename. + Expects format: 'Title [VIDEO_ID].ext' or just '[VIDEO_ID].ext' + """ + # Regex for [VIDEO_ID] at end of stem + match = re.search(r'\[([a-zA-Z0-9_-]{11})\]$', Path(filename).stem) + if match: + return match.group(1) + + # Fallback: maybe the whole filename is the ID? + if re.match(r'^[a-zA-Z0-9_-]{11}$', Path(filename).stem): + return Path(filename).stem + + return None + +def scan_for_unindexed_videos(): + """ + Scans SOURCE_DIR for files that are NOT in the TubeArchivist database/metadata. + Returns a list of candidate files for recovery. + """ + log("🔍 Scanning for unindexed files...") + + # 1. Fetch current known IDs + video_map = fetch_all_metadata() + known_ids = set(video_map.keys()) + + unindexed = [] + + if not SOURCE_DIR.exists(): + return [] + + for channel_path in SOURCE_DIR.iterdir(): + if not channel_path.is_dir(): + continue + + for video_file in channel_path.glob("*.*"): + # Skip non-video files broadly (adjust extensions if needed) + if video_file.suffix.lower() not in ['.mp4', '.mkv', '.webm', '.mov']: + continue + + # Try to identify + vid_id = extract_id_from_filename(video_file.name) + + # If we found an ID and it's NOT in known_ids + if vid_id and vid_id not in known_ids: + unindexed.append({ + "path": str(video_file), + "filename": video_file.name, + "video_id": vid_id, + "channel_folder": channel_path.name, + "size_mb": round(video_file.stat().st_size / (1024 * 1024), 2) + }) + elif not vid_id: + # File without ID? Maybe worth listing too + pass + + log(f"✅ Found {len(unindexed)} unindexed video files.") + return unindexed + +def recover_video_metadata(filepath): + """ + Uses yt-dlp to fetch metadata for a video file and prepares it for import. 
+    """
+    import subprocess
+    import shutil
+    import json
+
+    src_path = Path(filepath)
+    if not src_path.exists():
+        return False, "File not found"
+
+    vid_id = extract_id_from_filename(src_path.name)
+    if not vid_id:
+        return False, "Could not extract Video ID from filename"
+
+    # Ensure import dir exists
+    IMPORT_DIR.mkdir(parents=True, exist_ok=True)
+
+    # Target paths
+    dest_video = IMPORT_DIR / src_path.name
+    dest_json = IMPORT_DIR / f"{src_path.stem}.info.json"
+
+    log(f"🚑 Recovering: {vid_id} ...")
+
+    # 1. Fetch Metadata using yt-dlp
+    cmd = [
+        "yt-dlp",
+        "--write-info-json",
+        "--skip-download",
+        # legacy youtube-dl "--id" flag removed: unsupported by yt-dlp and conflicts with -o
+        f"https://www.youtube.com/watch?v={vid_id}",
+        "-o", f"{IMPORT_DIR}/{src_path.stem}"
+    ]
+
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)  # don't hang the worker thread if the network stalls
+
+        if result.returncode != 0:
+            log(f" ⚠️ yt-dlp failed (Video likely deleted). Generating offline metadata...")
+            # START OFFLINE GENERATION
+            # Create a minimal .info.json manually
+            offline_meta = {
+                "id": vid_id,
+                "title": src_path.stem.replace(f" [{vid_id}]", ""),
+                "description": "Recovered by TA-Organizerr (Offline Mode)",
+                "uploader": src_path.parent.name,  # Guess channel from folder name
+                "channel_id": "UC_UNKNOWN",  # We can't know this without online check
+                "upload_date": "20000101",  # Unknown
+                "thumbnail": "",  # No thumbnail
+                "webpage_url": f"https://www.youtube.com/watch?v={vid_id}",
+            }
+            with open(dest_json, 'w') as f:
+                json.dump(offline_meta, f, indent=4)
+            log(" ✅ Generated offline metadata.")
+        else:
+            log(" ✅ Fetched online metadata.")
+
+        # 2. Copy/Symlink Video File
+        try:
+            # We hardlink if possible to save space/time, otherwise copy
+            if dest_video.exists():
+                dest_video.unlink()
+
+            # Try symlink first? No, TA import consumes files. Copying is safer or hardlink.
+ # Let's try hardlink (link) + try: + os.link(src_path, dest_video) + log(" 🔗 Hardlinked video file.") + except OSError: + shutil.copy2(src_path, dest_video) + log(" 📂 Copied video file.") + + except Exception as e: + return False, f"Failed to move video: {e}" + + return True, "Ready for import" + + except Exception as e: + log(f" ❌ Recovery failed: {e}") + return False, str(e) + # Main logic def process_videos(): @@ -711,6 +855,28 @@ def api_transcode_logs(): "next_index": len(transcode_log_buffer) }) +@app.route("/api/recovery/scan", methods=["POST"]) +@requires_auth +def api_recovery_scan(): + files = scan_for_unindexed_videos() + return jsonify({"files": files, "count": len(files)}) + +@app.route("/api/recovery/start", methods=["POST"]) +@requires_auth +def api_recovery_start(): + data = request.get_json() + filepath = data.get('filepath') + + if not filepath: + return jsonify({"error": "No filepath provided"}), 400 + + def run_recovery(): + success, msg = recover_video_metadata(filepath) + log(f"Recovery Result for {filepath}: {msg}") + + threading.Thread(target=run_recovery).start() + return jsonify({"message": "Recovery started", "status": "started"}) + if __name__ == "__main__": # Start scheduler in background thread thread = threading.Thread(target=scheduler, daemon=True) diff --git a/templates/dashboard.html b/templates/dashboard.html index 216d6e3..c37fe1b 100644 --- a/templates/dashboard.html +++ b/templates/dashboard.html @@ -56,9 +56,20 @@ margin-right: 5px; } - .status-green { background-color: var(--accent-success); box-shadow: 0 0 8px var(--accent-success); } - .status-yellow { background-color: var(--accent-warning); box-shadow: 0 0 8px var(--accent-warning); } - .status-red { background-color: var(--accent-danger); box-shadow: 0 0 8px var(--accent-danger); } + .status-green { + background-color: var(--accent-success); + box-shadow: 0 0 8px var(--accent-success); + } + + .status-yellow { + background-color: var(--accent-warning); + box-shadow: 0 
0 8px var(--accent-warning); + } + + .status-red { + background-color: var(--accent-danger); + box-shadow: 0 0 8px var(--accent-danger); + } .btn-xl { padding: 15px 20px; @@ -72,9 +83,11 @@
-
+
-

TA Organizer

+

TA Organizer +

Connecting...
@@ -129,11 +142,15 @@ - + + + + +
Video Matrix @@ -275,6 +334,63 @@ setTimeout(() => { resultsDiv.innerHTML = ''; }, 10000); } + // Recovery Functions + const recoveryModal = new bootstrap.Modal(document.getElementById('recoveryModal')); + + function showRecoveryModal() { + recoveryModal.show(); + } + + async function scanRecoveryFiles() { + const tbody = document.getElementById('recovery-table-body'); + tbody.innerHTML = '
Scanning...'; + + try { + const res = await fetch('/api/recovery/scan', { method: 'POST' }); + const data = await res.json(); + + tbody.innerHTML = ''; + if (data.count === 0) { + tbody.innerHTML = 'No unindexed files found!'; + return; + } + + data.files.forEach(f => { + const tr = document.createElement('tr'); + tr.innerHTML = ` + ${f.video_id} + ${f.filename} + ${f.size_mb} MB + + + + `; + tbody.appendChild(tr); + }); + } catch (e) { + tbody.innerHTML = `Error: ${e}`; + } + } + + async function startRecovery(filepath) { + if (!confirm("Start recovery for this file? This will try to fetch metadata and move it to the Import folder.")) return; + + try { + const res = await fetch('/api/recovery/start', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ filepath }) + }); + const data = await res.json(); + alert(data.message || "Recovery started! Check logs."); + // Optionally remove the row + } catch (e) { + alert("Error starting recovery: " + e); + } + } + function clearLogs() { document.getElementById('log-container').innerHTML = ''; }