Implement Advanced Recovery with Safety Checks (Redundant vs Rescue detection)

2026-01-05 01:58:58 -05:00 · 2026-01-05 01:58:58 -05:00 · ee7d07618d
commit ee7d07618d
parent 07a7dd3c07
1 changed files with 107 additions and 35 deletions
--- a/ta_symlink.py
+++ b/ta_symlink.py
@ -459,47 +459,94 @@ def extract_id_from_filename(filename):

 def scan_for_unindexed_videos():
    """
-    Scans SOURCE_DIR for files that are NOT in the TubeArchivist database/metadata.
-    Returns a list of candidate files for recovery.
+    Scans both SOURCE_DIR and TARGET_DIR for files.
+    Classifies them as:
+    - unindexed: Not in TA DB (Needs Import)
+    - redundant: In TA DB AND Source exists (Safe Duplicate)
+    - rescue: In TA DB BUT Source missing (Needs Rescue/Import)
    """
-    log("🔍 Scanning for unindexed files...")
+    log("🔍 Scanning for unindexed and legacy files...")
    
-    # 1. Fetch current known IDs
-    video_map = fetch_all_metadata()
+    # 1. Fetch current known IDs and their source paths
+    video_map = fetch_all_metadata() # {id: {path: ..., ...}}
    known_ids = set(video_map.keys())
    
-    unindexed = []
+    results = {
+        "unindexed": [],
+        "redundant": [],
+        "rescue": []
+    }

-    if not SOURCE_DIR.exists():
-        return []
+    # Helper to check if file is video
+    def is_video(f):
+        return f.suffix.lower() in ['.mp4', '.mkv', '.webm', '.mov']

-    for channel_path in SOURCE_DIR.iterdir():
-        if not channel_path.is_dir():
-            continue
+    # --- Scan SOURCE_DIR (Standard Orphan Check) ---
+    if SOURCE_DIR.exists():
+        for channel_path in SOURCE_DIR.iterdir():
+            if not channel_path.is_dir(): continue
+            for video_file in channel_path.glob("*.*"):
+                if not is_video(video_file): continue
                
-        for video_file in channel_path.glob("*.*"):
-            # Skip non-video files broadly (adjust extensions if needed)
-            if video_file.suffix.lower() not in ['.mp4', '.mkv', '.webm', '.mov']:
-                continue
+                vid_id = extract_id_from_filename(video_file.name)
+                if vid_id and vid_id not in known_ids:
+                    results["unindexed"].append({
+                        "path": str(video_file),
+                        "filename": video_file.name,
+                        "video_id": vid_id,
+                        "type": "source_orphan",
+                        "size_mb": round(video_file.stat().st_size / (1024 * 1024), 2)
+                    })

-            # Try to identify
-            vid_id = extract_id_from_filename(video_file.name)
+    # --- Scan TARGET_DIR (Legacy "Pinchflat" Check) ---
+    if TARGET_DIR.exists():
+        for channel_path in TARGET_DIR.iterdir():
+            if not channel_path.is_dir(): continue
+            for video_file in channel_path.glob("*.*"):
+                if not is_video(video_file): continue
                
-            # If we found an ID and it's NOT in known_ids
-            if vid_id and vid_id not in known_ids:
-                unindexed.append({
-                    "path": str(video_file),
-                    "filename": video_file.name,
-                    "video_id": vid_id,
-                    "channel_folder": channel_path.name,
-                    "size_mb": round(video_file.stat().st_size / (1024 * 1024), 2)
-                })
-            elif not vid_id:
-                # File without ID? Maybe worth listing too
-                pass
+                # We only care about REAL files, not symlinks
+                if video_file.is_symlink():
+                    continue
                    
-    log(f"✅ Found {len(unindexed)} unindexed video files.")
-    return unindexed
+                vid_id = extract_id_from_filename(video_file.name)
+                
+                # Case 1: ID NOT in TA -> Recoverable
+                if vid_id and vid_id not in known_ids:
+                     results["unindexed"].append({
+                        "path": str(video_file),
+                        "filename": video_file.name,
+                        "video_id": vid_id,
+                        "type": "target_realfile",
+                        "size_mb": round(video_file.stat().st_size / (1024 * 1024), 2)
+                    })
+                
+                # Case 2: ID IS in TA
+                elif vid_id:
+                    # Check if TA's source file actually exists
+                    ta_source_path = Path(video_map[vid_id]['filesystem_path'])
+                    
+                    if ta_source_path.exists():
+                        # TA has it, Source exists. This file is REDUNDANT.
+                        results["redundant"].append({
+                            "path": str(video_file),
+                            "filename": video_file.name,
+                            "video_id": vid_id,
+                            "ta_source": str(ta_source_path),
+                            "size_mb": round(video_file.stat().st_size / (1024 * 1024), 2)
+                        })
+                    else:
+                        # TA has it, BUT source is MISSING. This file is a RESCUE candidate.
+                        results["rescue"].append({
+                            "path": str(video_file),
+                            "filename": video_file.name,
+                            "video_id": vid_id,
+                            "ta_source": str(ta_source_path), # Missing path
+                            "size_mb": round(video_file.stat().st_size / (1024 * 1024), 2)
+                        })
+
+    log(f"✅ Scan complete. Unindexed: {len(results['unindexed'])}, Redundant: {len(results['redundant'])}, Rescue: {len(results['rescue'])}")
+    return results

 def recover_video_metadata(filepath):
    """
@ -877,6 +924,31 @@ def api_recovery_start():
    threading.Thread(target=run_recovery).start()
    return jsonify({"message": "Recovery started", "status": "started"})

+@app.route("/api/recovery/delete", methods=["POST"])
+@requires_auth
+def api_recovery_delete():
+    data = request.get_json()
+    filepath = data.get('filepath')
+    
+    if not filepath:
+        return jsonify({"error": "No filepath provided"}), 400
+        
+    p = Path(filepath)
+    if not p.exists() or not p.is_file():
+        return jsonify({"error": "File not found"}), 404
+        
+    # Safety Check: Never delete anything from SOURCE_DIR via this endpoint
+    if str(SOURCE_DIR) in str(p.resolve()):
+        return jsonify({"error": "Safety Block: Cannot delete files from Source Config."}), 403
+
+    try:
+        p.unlink()
+        log(f"🗑️ Deleted redundant file: {filepath}")
+        return jsonify({"success": True, "message": "File deleted"})
+    except Exception as e:
+        log(f"❌ Delete failed: {e}")
+        return jsonify({"error": str(e)}), 500
+    
 if __name__ == "__main__":
    # Start scheduler in background thread
    thread = threading.Thread(target=scheduler, daemon=True)