Implement Recovery Mode: Use yt-dlp to recover missing metadata for unindexed files
This commit is contained in:
parent
180e0632e5
commit
07a7dd3c07
4 changed files with 291 additions and 7 deletions
|
|
@ -3,7 +3,8 @@ WORKDIR /app
|
||||||
|
|
||||||
# 1. Install System Deps (ffmpeg) FIRST
|
# 1. Install System Deps (ffmpeg) FIRST
|
||||||
# These rarely change, so Docker will cache this layer forever.
|
# These rarely change, so Docker will cache this layer forever.
|
||||||
RUN apt-get update && apt-get install -y ffmpeg && rm -rf /var/lib/apt/lists/*
|
RUN apt-get update && apt-get install -y ffmpeg curl && rm -rf /var/lib/apt/lists/*
|
||||||
|
# Install the standalone yt-dlp release binary.
# -f makes curl exit non-zero on an HTTP error, so a failed download breaks
# the build instead of silently installing an error page as the executable.
RUN curl -fL https://github.com/yt-dlp/yt-dlp/releases/latest/download/yt-dlp -o /usr/local/bin/yt-dlp && chmod a+rx /usr/local/bin/yt-dlp
|
||||||
|
|
||||||
# 2. Install Python Deps SECOND
|
# 2. Install Python Deps SECOND
|
||||||
# Only re-runs if requirements.txt changes
|
# Only re-runs if requirements.txt changes
|
||||||
|
|
|
||||||
|
|
@ -7,6 +7,7 @@ services:
|
||||||
- /path/to/your/source:/app/source
|
- /path/to/your/source:/app/source
|
||||||
- /path/to/your/target:/app/target
|
- /path/to/your/target:/app/target
|
||||||
- /path/to/your/data:/app/data
|
- /path/to/your/data:/app/data
|
||||||
|
- /path/to/your/import:/app/import
|
||||||
ports:
|
ports:
|
||||||
- "8002:5000"
|
- "8002:5000"
|
||||||
environment:
|
environment:
|
||||||
|
|
|
||||||
166
ta_symlink.py
166
ta_symlink.py
|
|
@ -6,6 +6,7 @@ import sys
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
import ipaddress
|
import ipaddress
|
||||||
|
import shutil
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
from flask import Flask, jsonify, render_template, request, abort, Response
|
from flask import Flask, jsonify, render_template, request, abort, Response
|
||||||
|
|
||||||
|
|
@ -19,6 +20,7 @@ UI_USERNAME = os.getenv("UI_USERNAME", "admin")
|
||||||
UI_PASSWORD = os.getenv("UI_PASSWORD", "password")
|
UI_PASSWORD = os.getenv("UI_PASSWORD", "password")
|
||||||
SOURCE_DIR = Path("/app/source")
|
SOURCE_DIR = Path("/app/source")
|
||||||
TARGET_DIR = Path("/app/target")
|
TARGET_DIR = Path("/app/target")
|
||||||
|
IMPORT_DIR = Path("/app/import")
|
||||||
HEADERS = {"Authorization": f"Token {API_TOKEN}"}
|
HEADERS = {"Authorization": f"Token {API_TOKEN}"}
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
|
|
@ -439,6 +441,148 @@ def check_orphaned_links():
|
||||||
log(f"✅ Check complete. Scanned {total_checked} files, found {len(orphaned)} orphaned symlinks.")
|
log(f"✅ Check complete. Scanned {total_checked} files, found {len(orphaned)} orphaned symlinks.")
|
||||||
return orphaned
|
return orphaned
|
||||||
|
|
||||||
|
def extract_id_from_filename(filename):
    """
    Extract the 11-character YouTube video ID from a filename.

    The check is made against the stem (the name without its final
    extension). Two forms are recognised:

      * 'Title [VIDEO_ID].ext' or '[VIDEO_ID].ext' -- bracketed ID closing
        the stem
      * 'VIDEO_ID.ext' -- the whole stem is the ID

    Returns the ID string, or None when neither form matches.
    """
    stem = Path(filename).stem

    # Preferred form: [VIDEO_ID] at the end of the stem.
    bracketed = re.search(r'\[([a-zA-Z0-9_-]{11})\]$', stem)
    if bracketed:
        return bracketed.group(1)

    # Fallback: the whole stem is the ID. fullmatch() is used instead of
    # '^...$' because '$' also matches just before a trailing newline, which
    # would accept such a stem and return it (newline included) as the ID.
    if re.fullmatch(r'[a-zA-Z0-9_-]{11}', stem):
        return stem

    return None
|
||||||
|
|
||||||
|
def scan_for_unindexed_videos():
    """
    Scan SOURCE_DIR for video files whose ID is missing from the metadata.

    Only the direct children of each top-level folder in SOURCE_DIR are
    inspected. A file is reported when it is a regular file with a video
    extension, an ID can be extracted from its name, and that ID is not a
    key of the mapping returned by fetch_all_metadata().

    Returns a list of dicts with the keys "path", "filename", "video_id",
    "channel_folder" and "size_mb". Returns an empty list when SOURCE_DIR
    does not exist.
    """
    video_extensions = {'.mp4', '.mkv', '.webm', '.mov'}

    log("🔍 Scanning for unindexed files...")

    # Nothing to scan: bail out before fetching any metadata.
    if not SOURCE_DIR.exists():
        return []

    # 1. Fetch current known IDs
    video_map = fetch_all_metadata()
    known_ids = set(video_map.keys())

    unindexed = []

    # 2. Walk <SOURCE_DIR>/<channel folder>/<file>
    for channel_path in SOURCE_DIR.iterdir():
        if not channel_path.is_dir():
            continue

        for video_file in channel_path.glob("*.*"):
            # Skip non-video files broadly (adjust extensions if needed)
            if video_file.suffix.lower() not in video_extensions:
                continue

            # Directories and broken symlinks can also match the glob.
            if not video_file.is_file():
                continue

            vid_id = extract_id_from_filename(video_file.name)

            # Files without a recognisable ID are not listed; files whose ID
            # is already known need no recovery.
            if not vid_id or vid_id in known_ids:
                continue

            try:
                size_bytes = video_file.stat().st_size
            except OSError:
                # The file disappeared between the glob and the stat; one
                # unreadable entry should not abort the whole scan.
                continue

            unindexed.append({
                "path": str(video_file),
                "filename": video_file.name,
                "video_id": vid_id,
                "channel_folder": channel_path.name,
                "size_mb": round(size_bytes / (1024 * 1024), 2),
            })

    log(f"✅ Found {len(unindexed)} unindexed video files.")
    return unindexed
|
||||||
|
|
||||||
|
def recover_video_metadata(filepath):
    """
    Prepare an unindexed video file for import.

    Steps:
      1. Extract the video ID from the filename.
      2. Run yt-dlp to write '<stem>.info.json' into IMPORT_DIR. If yt-dlp
         exits non-zero or leaves no such file, write a minimal placeholder
         .info.json instead.
      3. Hardlink the video into IMPORT_DIR, falling back to a copy.

    Args:
        filepath: Path of the video file to recover.

    Returns:
        A (success, message) tuple: (True, "Ready for import") on success,
        otherwise (False, <reason>).
    """
    import subprocess
    import shutil
    import json

    src_path = Path(filepath)
    if not src_path.exists():
        return False, "File not found"

    vid_id = extract_id_from_filename(src_path.name)
    if not vid_id:
        return False, "Could not extract Video ID from filename"

    # Ensure import dir exists
    IMPORT_DIR.mkdir(parents=True, exist_ok=True)

    # Target paths
    dest_video = IMPORT_DIR / src_path.name
    dest_json = IMPORT_DIR / f"{src_path.stem}.info.json"
    watch_url = f"https://www.youtube.com/watch?v={vid_id}"

    log(f"🚑 Recovering: {vid_id} ...")

    # 1. Fetch Metadata using yt-dlp
    # '%' introduces a field in yt-dlp output templates, so a literal '%'
    # in the filename has to be doubled. The former '--id' flag is not
    # passed: it selects an ID-based output name and conflicts with -o.
    output_template = str(IMPORT_DIR / src_path.stem).replace("%", "%%")
    cmd = [
        "yt-dlp",
        "--write-info-json",
        "--skip-download",
        watch_url,
        "-o", output_template,
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True)

        if result.returncode == 0 and dest_json.exists():
            log(" ✅ Fetched online metadata.")
        else:
            if result.returncode != 0:
                log(" ⚠️ yt-dlp failed (Video likely deleted). Generating offline metadata...")
            else:
                # yt-dlp reported success but the expected file is missing;
                # do not claim "Ready for import" without any metadata.
                log(" ⚠️ yt-dlp wrote no metadata file. Generating offline metadata...")

            # Create a minimal .info.json manually
            offline_meta = {
                "id": vid_id,
                "title": src_path.stem.replace(f" [{vid_id}]", ""),
                "description": "Recovered by TA-Organizerr (Offline Mode)",
                "uploader": src_path.parent.name,  # Guess channel from folder name
                "channel_id": "UC_UNKNOWN",  # We can't know this without online check
                "upload_date": "20000101",  # Unknown
                "thumbnail": "",  # No thumbnail
                "webpage_url": watch_url,
            }
            with open(dest_json, 'w', encoding="utf-8") as f:
                json.dump(offline_meta, f, indent=4)
            log(" ✅ Generated offline metadata.")

        # 2. Hardlink (preferred, saves space/time) or copy the video file
        try:
            if dest_video.exists() and dest_video.samefile(src_path):
                # The source already is the import-folder file (or a
                # hardlink to it). Unlinking it first could delete the only
                # copy, so leave it in place.
                log(" 🔗 Video file already present in import folder.")
            else:
                if dest_video.exists():
                    dest_video.unlink()

                try:
                    os.link(src_path, dest_video)
                    log(" 🔗 Hardlinked video file.")
                except OSError:
                    # e.g. source and import dir are on different filesystems
                    shutil.copy2(src_path, dest_video)
                    log(" 📂 Copied video file.")

        except Exception as e:
            return False, f"Failed to move video: {e}"

        return True, "Ready for import"

    except Exception as e:
        # Top-level boundary for the recovery job: report, never raise.
        log(f" ❌ Recovery failed: {e}")
        return False, str(e)
|
||||||
|
|
||||||
# Main logic
|
# Main logic
|
||||||
|
|
||||||
def process_videos():
|
def process_videos():
|
||||||
|
|
@ -711,6 +855,28 @@ def api_transcode_logs():
|
||||||
"next_index": len(transcode_log_buffer)
|
"next_index": len(transcode_log_buffer)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@app.route("/api/recovery/scan", methods=["POST"])
@requires_auth
def api_recovery_scan():
    """Run the unindexed-file scan and return the candidates with their count as JSON."""
    candidates = scan_for_unindexed_videos()
    payload = {"files": candidates, "count": len(candidates)}
    return jsonify(payload)
|
||||||
|
|
||||||
|
@app.route("/api/recovery/start", methods=["POST"])
@requires_auth
def api_recovery_start():
    """
    Start recovery of a single file in a background thread.

    Expects a JSON object body with a "filepath" string that points inside
    SOURCE_DIR. Responds with HTTP 400 when the body is missing or not a
    JSON object, when "filepath" is absent or not a string, or when the
    path lies outside SOURCE_DIR. Otherwise it returns immediately with a
    "started" status; the outcome of the recovery is written via log().
    """
    # silent=True: a missing/invalid JSON body yields None instead of raising,
    # so every bad request ends in the same explicit 400 below.
    data = request.get_json(silent=True)
    filepath = data.get('filepath') if isinstance(data, dict) else None

    if not filepath:
        return jsonify({"error": "No filepath provided"}), 400

    # The path comes from the client: normalise it (collapsing any '..')
    # and refuse anything that does not live under SOURCE_DIR.
    try:
        if not isinstance(filepath, str):
            raise ValueError("filepath is not a string")
        safe_path = os.path.abspath(filepath)
        Path(safe_path).relative_to(SOURCE_DIR)
    except ValueError:
        return jsonify({"error": "filepath must be inside the source directory"}), 400

    def run_recovery():
        success, msg = recover_video_metadata(safe_path)
        log(f"Recovery Result for {safe_path}: {msg}")

    threading.Thread(target=run_recovery).start()
    return jsonify({"message": "Recovery started", "status": "started"})
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Start scheduler in background thread
|
# Start scheduler in background thread
|
||||||
thread = threading.Thread(target=scheduler, daemon=True)
|
thread = threading.Thread(target=scheduler, daemon=True)
|
||||||
|
|
|
||||||
|
|
@ -56,9 +56,20 @@
|
||||||
margin-right: 5px;
|
margin-right: 5px;
|
||||||
}
|
}
|
||||||
|
|
||||||
.status-green { background-color: var(--accent-success); box-shadow: 0 0 8px var(--accent-success); }
|
.status-green {
|
||||||
.status-yellow { background-color: var(--accent-warning); box-shadow: 0 0 8px var(--accent-warning); }
|
background-color: var(--accent-success);
|
||||||
.status-red { background-color: var(--accent-danger); box-shadow: 0 0 8px var(--accent-danger); }
|
box-shadow: 0 0 8px var(--accent-success);
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-yellow {
|
||||||
|
background-color: var(--accent-warning);
|
||||||
|
box-shadow: 0 0 8px var(--accent-warning);
|
||||||
|
}
|
||||||
|
|
||||||
|
.status-red {
|
||||||
|
background-color: var(--accent-danger);
|
||||||
|
box-shadow: 0 0 8px var(--accent-danger);
|
||||||
|
}
|
||||||
|
|
||||||
.btn-xl {
|
.btn-xl {
|
||||||
padding: 15px 20px;
|
padding: 15px 20px;
|
||||||
|
|
@ -72,9 +83,11 @@
|
||||||
|
|
||||||
<body>
|
<body>
|
||||||
<div class="container-fluid p-4" style="max-width: 1600px;">
|
<div class="container-fluid p-4" style="max-width: 1600px;">
|
||||||
<header class="d-flex justify-content-between align-items-center mb-5 border-bottom pb-3" style="border-color: #333 !important;">
|
<header class="d-flex justify-content-between align-items-center mb-5 border-bottom pb-3"
|
||||||
|
style="border-color: #333 !important;">
|
||||||
<div class="d-flex align-items-center">
|
<div class="d-flex align-items-center">
|
||||||
<h1 class="display-6 mb-0 me-3"><i class="bi bi-collection-play-fill text-primary"></i> TA Organizer</h1>
|
<h1 class="display-6 mb-0 me-3"><i class="bi bi-collection-play-fill text-primary"></i> TA Organizer
|
||||||
|
</h1>
|
||||||
<span class="badge bg-secondary" id="connection-status">Connecting...</span>
|
<span class="badge bg-secondary" id="connection-status">Connecting...</span>
|
||||||
</div>
|
</div>
|
||||||
<div>
|
<div>
|
||||||
|
|
@ -134,6 +147,10 @@
|
||||||
<i class="bi bi-binoculars"></i> Check Orphaned Links
|
<i class="bi bi-binoculars"></i> Check Orphaned Links
|
||||||
</button>
|
</button>
|
||||||
|
|
||||||
|
<button class="btn btn-outline-info" onclick="showRecoveryModal()">
|
||||||
|
<i class="bi bi-bandaid"></i> Recovery Mode
|
||||||
|
</button>
|
||||||
|
|
||||||
<!-- Clean button disabled as requested -->
|
<!-- Clean button disabled as requested -->
|
||||||
<!--
|
<!--
|
||||||
<button class="btn btn-danger" onclick="triggerCleanup()">
|
<button class="btn btn-danger" onclick="triggerCleanup()">
|
||||||
|
|
@ -163,6 +180,48 @@
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<!-- Recovery Modal -->
|
||||||
|
<div class="modal fade" id="recoveryModal" tabindex="-1">
|
||||||
|
<div class="modal-dialog modal-lg">
|
||||||
|
<div class="modal-content" style="background-color: var(--bg-card); border: 1px solid #444;">
|
||||||
|
<div class="modal-header border-bottom border-secondary">
|
||||||
|
<h5 class="modal-title"><i class="bi bi-bandaid"></i> File Recovery</h5>
|
||||||
|
<button type="button" class="btn-close btn-close-white" data-bs-dismiss="modal"></button>
|
||||||
|
</div>
|
||||||
|
<div class="modal-body">
|
||||||
|
<div class="alert alert-info">
|
||||||
|
This tool scans for video files in your source folder that are <strong>NOT</strong> in TA's
|
||||||
|
database.
|
||||||
|
It will attempt to fetch metadata (using <code>yt-dlp</code>) and move them to the Import
|
||||||
|
folder.
|
||||||
|
</div>
|
||||||
|
<div class="d-grid mb-3">
|
||||||
|
<button class="btn btn-primary" onclick="scanRecoveryFiles()">
|
||||||
|
<i class="bi bi-search"></i> Scan for Unindexed Files
|
||||||
|
</button>
|
||||||
|
</div>
|
||||||
|
<div class="table-responsive" style="max-height: 400px;">
|
||||||
|
<table class="table table-dark table-striped table-hover mb-0">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Video ID</th>
|
||||||
|
<th>Filename</th>
|
||||||
|
<th>Size</th>
|
||||||
|
<th>Action</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody id="recovery-table-body">
|
||||||
|
<tr>
|
||||||
|
<td colspan="4" class="text-center text-muted">Click Scan to begin...</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="card">
|
<div class="card">
|
||||||
<div class="card-header d-flex justify-content-between align-items-center">
|
<div class="card-header d-flex justify-content-between align-items-center">
|
||||||
<span>Video Matrix</span>
|
<span>Video Matrix</span>
|
||||||
|
|
@ -275,6 +334,63 @@
|
||||||
setTimeout(() => { resultsDiv.innerHTML = ''; }, 10000);
|
setTimeout(() => { resultsDiv.innerHTML = ''; }, 10000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Recovery Functions
|
||||||
|
const recoveryModal = new bootstrap.Modal(document.getElementById('recoveryModal'));
|
||||||
|
|
||||||
|
/** Open the File Recovery modal dialog. */
function showRecoveryModal() {
    recoveryModal.show();
}
|
||||||
|
|
||||||
|
/**
 * Ask the server for unindexed video files and render them into the
 * recovery table, one row per file with a "Recover" button.
 *
 * Rows are built with DOM APIs and textContent instead of an HTML string:
 * file names and paths come from disk and may contain quotes or markup
 * characters, which would otherwise be parsed as HTML or break the inline
 * click handler.
 */
async function scanRecoveryFiles() {
    const tbody = document.getElementById('recovery-table-body');
    tbody.innerHTML = '<tr><td colspan="4" class="text-center"><div class="spinner-border text-primary" role="status"></div> Scanning...</td></tr>';

    // Replace the table body with a single full-width message row.
    const showMessage = (text, cssClass) => {
        tbody.innerHTML = '';
        const td = document.createElement('td');
        td.colSpan = 4;
        td.className = `text-center ${cssClass}`;
        td.textContent = text;
        tbody.appendChild(document.createElement('tr')).appendChild(td);
    };

    try {
        const res = await fetch('/api/recovery/scan', { method: 'POST' });
        if (!res.ok) {
            // An error response has no file list; do not try to render one.
            throw new Error(`Scan failed (HTTP ${res.status})`);
        }
        const data = await res.json();
        const files = data.files || [];

        if (files.length === 0) {
            showMessage('No unindexed files found!', 'text-success');
            return;
        }

        tbody.innerHTML = '';
        files.forEach(f => {
            const tr = document.createElement('tr');

            const idCode = document.createElement('code');
            idCode.textContent = f.video_id;
            tr.insertCell().appendChild(idCode);

            const nameCell = tr.insertCell();
            nameCell.title = f.path;
            nameCell.style.cssText = 'max-width: 300px; overflow: hidden; text-overflow: ellipsis; white-space: nowrap;';
            nameCell.textContent = f.filename;

            tr.insertCell().textContent = `${f.size_mb} MB`;

            const btn = document.createElement('button');
            btn.type = 'button';
            btn.className = 'btn btn-sm btn-success';
            btn.innerHTML = '<i class="bi bi-cloud-arrow-up"></i> Recover';
            // Pass the path as a value; no string escaping is needed.
            btn.addEventListener('click', () => startRecovery(f.path));
            tr.insertCell().appendChild(btn);

            tbody.appendChild(tr);
        });
    } catch (e) {
        showMessage(`Error: ${e}`, 'text-danger');
    }
}
|
||||||
|
|
||||||
|
/**
 * Ask the server to start recovery for one file, after user confirmation.
 *
 * @param {string} filepath - Path of the file, as returned by the scan.
 *
 * The server answers right away; the result of the recovery itself only
 * shows up in the logs. A non-2xx answer is reported as an error instead of
 * being announced as a successful start.
 */
async function startRecovery(filepath) {
    if (!confirm("Start recovery for this file? This will try to fetch metadata and move it to the Import folder.")) return;

    try {
        const res = await fetch('/api/recovery/start', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ filepath })
        });

        // An error response may not carry a JSON body; fall back to {}.
        const data = await res.json().catch(() => ({}));

        if (!res.ok) {
            alert("Error starting recovery: " + (data.error || `HTTP ${res.status}`));
            return;
        }

        alert(data.message || "Recovery started! Check logs.");
        // Optionally remove the row
    } catch (e) {
        alert("Error starting recovery: " + e);
    }
}
|
||||||
|
|
||||||
function clearLogs() {
|
function clearLogs() {
|
||||||
document.getElementById('log-container').innerHTML = '';
|
document.getElementById('log-container').innerHTML = '';
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue