Implement Advanced Recovery with Safety Checks (Redundant vs Rescue detection)

wander 2026-01-05 01:58:58 -05:00
parent 07a7dd3c07
commit ee7d07618d

@@ -459,47 +459,94 @@ def extract_id_from_filename(filename):
 def scan_for_unindexed_videos():
     """
-    Scans SOURCE_DIR for files that are NOT in the TubeArchivist database/metadata.
-    Returns a list of candidate files for recovery.
+    Scans both SOURCE_DIR and TARGET_DIR for files.
+    Classifies them as:
+    - unindexed: Not in TA DB (Needs Import)
+    - redundant: In TA DB AND Source exists (Safe Duplicate)
+    - rescue: In TA DB BUT Source missing (Needs Rescue/Import)
     """
-    log("🔍 Scanning for unindexed files...")
+    log("🔍 Scanning for unindexed and legacy files...")

-    # 1. Fetch current known IDs
-    video_map = fetch_all_metadata()
+    # 1. Fetch current known IDs and their source paths
+    video_map = fetch_all_metadata()  # {id: {path: ..., ...}}
     known_ids = set(video_map.keys())

-    unindexed = []
+    results = {
+        "unindexed": [],
+        "redundant": [],
+        "rescue": []
+    }

-    if not SOURCE_DIR.exists():
-        return []
+    # Helper to check if file is video
+    def is_video(f):
+        return f.suffix.lower() in ['.mp4', '.mkv', '.webm', '.mov']

-    for channel_path in SOURCE_DIR.iterdir():
-        if not channel_path.is_dir():
-            continue
-
-        for video_file in channel_path.glob("*.*"):
-            # Skip non-video files broadly (adjust extensions if needed)
-            if video_file.suffix.lower() not in ['.mp4', '.mkv', '.webm', '.mov']:
-                continue
-
-            # Try to identify
-            vid_id = extract_id_from_filename(video_file.name)
-
-            # If we found an ID and it's NOT in known_ids
-            if vid_id and vid_id not in known_ids:
-                unindexed.append({
-                    "path": str(video_file),
-                    "filename": video_file.name,
-                    "video_id": vid_id,
-                    "channel_folder": channel_path.name,
-                    "size_mb": round(video_file.stat().st_size / (1024 * 1024), 2)
-                })
-            elif not vid_id:
-                # File without ID? Maybe worth listing too
-                pass
-
-    log(f"✅ Found {len(unindexed)} unindexed video files.")
-    return unindexed
+    # --- Scan SOURCE_DIR (Standard Orphan Check) ---
+    if SOURCE_DIR.exists():
+        for channel_path in SOURCE_DIR.iterdir():
+            if not channel_path.is_dir(): continue
+            for video_file in channel_path.glob("*.*"):
+                if not is_video(video_file): continue
+
+                vid_id = extract_id_from_filename(video_file.name)
+                if vid_id and vid_id not in known_ids:
+                    results["unindexed"].append({
+                        "path": str(video_file),
+                        "filename": video_file.name,
+                        "video_id": vid_id,
+                        "type": "source_orphan",
+                        "size_mb": round(video_file.stat().st_size / (1024 * 1024), 2)
+                    })
+
+    # --- Scan TARGET_DIR (Legacy "Pinchflat" Check) ---
+    if TARGET_DIR.exists():
+        for channel_path in TARGET_DIR.iterdir():
+            if not channel_path.is_dir(): continue
+            for video_file in channel_path.glob("*.*"):
+                if not is_video(video_file): continue
+
+                # We only care about REAL files, not symlinks
+                if video_file.is_symlink():
+                    continue
+
+                vid_id = extract_id_from_filename(video_file.name)
+
+                # Case 1: ID NOT in TA -> Recoverable
+                if vid_id and vid_id not in known_ids:
+                    results["unindexed"].append({
+                        "path": str(video_file),
+                        "filename": video_file.name,
+                        "video_id": vid_id,
+                        "type": "target_realfile",
+                        "size_mb": round(video_file.stat().st_size / (1024 * 1024), 2)
+                    })
+                # Case 2: ID IS in TA
+                elif vid_id:
+                    # Check if TA's source file actually exists
+                    ta_source_path = Path(video_map[vid_id]['filesystem_path'])
+                    if ta_source_path.exists():
+                        # TA has it, Source exists. This file is REDUNDANT.
+                        results["redundant"].append({
+                            "path": str(video_file),
+                            "filename": video_file.name,
+                            "video_id": vid_id,
+                            "ta_source": str(ta_source_path),
+                            "size_mb": round(video_file.stat().st_size / (1024 * 1024), 2)
+                        })
+                    else:
+                        # TA has it, BUT source is MISSING. This file is a RESCUE candidate.
+                        results["rescue"].append({
+                            "path": str(video_file),
+                            "filename": video_file.name,
+                            "video_id": vid_id,
+                            "ta_source": str(ta_source_path),  # Missing path
+                            "size_mb": round(video_file.stat().st_size / (1024 * 1024), 2)
+                        })
+
+    log(f"✅ Scan complete. Unindexed: {len(results['unindexed'])}, Redundant: {len(results['redundant'])}, Rescue: {len(results['rescue'])}")
+    return results
 def recover_video_metadata(filepath):
     """
@@ -877,6 +924,31 @@ def api_recovery_start():
     threading.Thread(target=run_recovery).start()
     return jsonify({"message": "Recovery started", "status": "started"})

+@app.route("/api/recovery/delete", methods=["POST"])
+@requires_auth
+def api_recovery_delete():
+    data = request.get_json()
+    filepath = data.get('filepath')
+
+    if not filepath:
+        return jsonify({"error": "No filepath provided"}), 400
+
+    p = Path(filepath)
+    if not p.exists() or not p.is_file():
+        return jsonify({"error": "File not found"}), 404
+
+    # Safety Check: Never delete anything from SOURCE_DIR via this endpoint
+    if str(SOURCE_DIR) in str(p.resolve()):
+        return jsonify({"error": "Safety Block: Cannot delete files from Source Config."}), 403
+
+    try:
+        p.unlink()
+        log(f"🗑️ Deleted redundant file: {filepath}")
+        return jsonify({"success": True, "message": "File deleted"})
+    except Exception as e:
+        log(f"❌ Delete failed: {e}")
+        return jsonify({"error": str(e)}), 500
+
 if __name__ == "__main__":
     # Start scheduler in background thread
     thread = threading.Thread(target=scheduler, daemon=True)
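
One caveat on the safety check in the hunk above: `str(SOURCE_DIR) in str(p.resolve())` is a plain substring match. It also blocks sibling paths such as /data/source-old when SOURCE_DIR is /data/source, and if SOURCE_DIR is itself a symlink, the resolved file path may not contain the configured string at all, letting a source file through. A stricter containment test is sketched below; it assumes Python 3.9+ for `Path.is_relative_to` and is an editorial suggestion rather than part of this commit.

from pathlib import Path

def is_under(path: Path, root: Path) -> bool:
    # True if `path` resolves to a location inside `root`,
    # with symlinks resolved on both sides.
    try:
        return path.resolve().is_relative_to(root.resolve())
    except (OSError, RuntimeError):
        # If resolution fails (e.g. a symlink loop), err on the
        # side of refusing the delete.
        return True

# Drop-in replacement for the substring check in api_recovery_delete:
#     if is_under(p, SOURCE_DIR):
#         return jsonify({"error": "Safety Block: Cannot delete files from Source Config."}), 403

Exercising the endpoint is unchanged either way: a POST with a JSON body like {"filepath": "..."}, plus whatever credentials `requires_auth` expects.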