feat: Introduce scheduled background scanning and old-folder cleanup, optimize metadata fetching into a single paginated pass, update the Docker configuration, and add API tests.

wander 2025-11-20 01:45:38 -05:00
parent 1ecb31ae12
commit 94f077944b
4 changed files with 207 additions and 26 deletions


@@ -1,10 +1,10 @@
 services:
   ta-organizer:
-    build: .
+    build: /mnt/user/appdata/dockerbuildings
     container_name: ta-organizer
     volumes:
-      - ./source:/app/source:ro
-      - ./target:/app/target
+      - /mnt/user/appdata/dockerbuildings/source:/app/source:ro
+      - /mnt/user/appdata/dockerbuildings/target:/app/target
     environment:
       - API_TOKEN=${API_TOKEN}
-    env_file: .env
+      - SCAN_INTERVAL=${SCAN_INTERVAL:-60}
+    env_file: /mnt/user/appdata/dockerbuildings/.env
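
Note on the compose change: SCAN_INTERVAL gets a shell-style default ("${SCAN_INTERVAL:-60}") so the container sees a value even when .env omits it, and the app repeats the fallback on its side. A minimal sketch of that double fallback, assuming minutes as the unit (per the app's comment):

    import os

    # Compose substitutes 60 when SCAN_INTERVAL is unset in .env;
    # the app applies the same default so it also works outside Docker.
    interval_minutes = int(os.getenv("SCAN_INTERVAL", 60))
    print(f"Sleeping {interval_minutes * 60} seconds between scans")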

ta-organizerr.tar.gz Normal file

Binary file not shown.


@@ -4,12 +4,15 @@ import os
 import requests
 import re
 import sys
+import threading
+import time
 from flask import Flask, jsonify, render_template_string, request

 # Load config from environment variables
 API_URL = os.getenv("API_URL", "http://localhost:8457/api")
 VIDEO_URL = os.getenv("VIDEO_URL", "http://localhost:8457/video/")
 API_TOKEN = os.getenv("API_TOKEN", "")
+SCAN_INTERVAL = int(os.getenv("SCAN_INTERVAL", 60))  # Default 60 minutes
 SOURCE_DIR = Path("/app/source")
 TARGET_DIR = Path("/app/target")
 HEADERS = {"Authorization": f"Token {API_TOKEN}"}
@@ -24,41 +27,137 @@ def sanitize(text):
     text = re.sub(r'[\/:*?"<>|]', "_", text)
     return text.strip()

-def fetch_video_metadata(video_id):
-    url = f"{API_URL}/video/{video_id}/"
-    try:
-        response = requests.get(url, headers=HEADERS)
-        response.raise_for_status()
-        data = response.json()
-        title = data.get("title", "unknown_title")
-        channel_info = data.get("channel", {})
-        channel_id = channel_info.get("channel_id", "unknown_channel")
-        channel_name = channel_info.get("channel_name") or channel_info.get("channel_title") or "Unknown Channel"
-        published = data.get("published", "unknown_date").replace("/", "-")
-        return {
-            "title": title,
-            "channel_id": channel_id,
-            "channel_name": channel_name,
-            "published": published
-        }
-    except Exception as e:
-        print(f"❌ Error fetching metadata for {video_id}: {e}", flush=True)
-        return None
+def fetch_all_metadata():
+    print("📥 Fetching all video metadata...", flush=True)
+    video_map = {}
+    page = 1
+    while True:
+        url = f"{API_URL}/video/?page={page}"
+        try:
+            response = requests.get(url, headers=HEADERS)
+            response.raise_for_status()
+            data = response.json()
+            if 'data' not in data or not data['data']:
+                break
+            for video in data['data']:
+                # The ID field may be 'youtube_id' or '_id' depending on the API version
+                vid_id = video.get("youtube_id") or video.get("_id")
+                if not vid_id:
+                    continue
+                title = video.get("title", "unknown_title")
+                channel_info = video.get("channel", {})
+                channel_name = channel_info.get("channel_name") or channel_info.get("channel_title") or "Unknown Channel"
+                # Fix date format: take only the first 10 chars (YYYY-MM-DD)
+                raw_date = video.get("published", "unknown_date")
+                published = raw_date[:10] if len(raw_date) >= 10 else raw_date.replace("/", "-")
+                video_map[vid_id] = {
+                    "title": title,
+                    "channel_name": channel_name,
+                    "published": published
+                }
+            # Check pagination to see if we are done
+            if 'paginate' in data:
+                current = data['paginate'].get('current_page')
+                last = data['paginate'].get('last_page')
+                if current is not None and last is not None and current >= last:
+                    break
+            else:
+                # No pagination info: keep looping; the empty-'data' check above ends the loop
+                pass
+            print(f" - Page {page} fetched. Total videos so far: {len(video_map)}", flush=True)
+            page += 1
+        except Exception as e:
+            print(f"❌ Error fetching page {page}: {e}", flush=True)
+            # Stop on a failed page instead of retrying, to avoid an infinite loop on auth errors
+            break
+    print(f"✅ Metadata fetch complete. Found {len(video_map)} videos.", flush=True)
+    return video_map
+
+def cleanup_old_folders():
+    """
+    Scans TARGET_DIR for folders containing '+00:00'.
+    Safely deletes them ONLY if they contain no real files (only symlinks or empty).
+    """
+    print("🧹 Starting cleanup. Scanning ONLY for folders containing '+00:00'...", flush=True)
+    cleaned_count = 0
+    skipped_count = 0
+    if not TARGET_DIR.exists():
+        return
+    # Walk top-down
+    for channel_dir in TARGET_DIR.iterdir():
+        if not channel_dir.is_dir():
+            continue
+        for video_dir in channel_dir.iterdir():
+            if not video_dir.is_dir():
+                continue
+            if "+00:00" in video_dir.name:
+                # Safety check: any entry that is not a symlink is a real file
+                safe_to_delete = True
+                reason = ""
+                for item in video_dir.iterdir():
+                    if not item.is_symlink():
+                        safe_to_delete = False
+                        reason = "Contains real files"
+                        break
+                if safe_to_delete:
+                    try:
+                        # Remove all symlinks first, then the directory itself
+                        for item in video_dir.iterdir():
+                            item.unlink()
+                        video_dir.rmdir()
+                        print(f" [DELETED] {video_dir.name}", flush=True)
+                        cleaned_count += 1
+                    except Exception as e:
+                        print(f" ❌ Failed to delete {video_dir.name}: {e}", flush=True)
+                else:
+                    print(f" ⚠️ SKIPPING {video_dir.name} - {reason}", flush=True)
+                    skipped_count += 1
+    print(f"🧹 Cleanup complete. Removed: {cleaned_count}, Skipped: {skipped_count}", flush=True)

 # Main logic
 def process_videos():
     global processed_videos
     processed_videos = []
+    # 1. Fetch all metadata first
+    video_map = fetch_all_metadata()
+    # 2. Run cleanup
+    cleanup_old_folders()
+    # Statistics
+    new_links = 0
+    verified_links = 0
     try:
         for channel_path in SOURCE_DIR.iterdir():
             if not channel_path.is_dir():
                 continue
             for video_file in channel_path.glob("*.*"):
                 video_id = video_file.stem
-                meta = fetch_video_metadata(video_id)
+                # 3. Look the ID up in the local map instead of calling the API per file
+                meta = video_map.get(video_id)
                 if not meta:
                     continue
                 sanitized_channel_name = sanitize(meta["channel_name"])
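
Review note: fetch_all_metadata assumes a TubeArchivist-style response shape, {"data": [...], "paginate": {"current_page": ..., "last_page": ...}}. A minimal sketch of the same loop against a stubbed fetcher, handy for checking both exit conditions (empty page, last page) without a live server; the stub and its field values are hypothetical:

    def fetch_page_stub(page, pages=3, per_page=2):
        # Hypothetical stand-in for requests.get(...).json()
        if page > pages:
            return {"data": []}
        return {
            "data": [{"youtube_id": f"vid{page}-{i}"} for i in range(per_page)],
            "paginate": {"current_page": page, "last_page": pages},
        }

    def collect(fetch):
        video_map, page = {}, 1
        while True:
            data = fetch(page)
            if not data.get("data"):
                break                      # exit 1: empty page
            for video in data["data"]:
                vid_id = video.get("youtube_id") or video.get("_id")
                if vid_id:
                    video_map[vid_id] = video
            p = data.get("paginate", {})
            if p.get("current_page", 0) >= p.get("last_page", float("inf")):
                break                      # exit 2: last page reached
            page += 1
        return video_map

    assert len(collect(fetch_page_stub)) == 6   # 3 pages x 2 videos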
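
The safety rule in cleanup_old_folders above (delete a '+00:00' folder only when every entry is a symlink, or the folder is empty) can be exercised on a throwaway directory. A small sketch under the same pathlib layout, POSIX-only since it creates symlinks:

    import tempfile
    from pathlib import Path

    def only_symlinks(folder: Path) -> bool:
        # Mirrors the safety check: any non-symlink entry blocks deletion
        return all(item.is_symlink() for item in folder.iterdir())

    with tempfile.TemporaryDirectory() as tmp:
        video_dir = Path(tmp) / "Some Video 2025-11-20T01:45:38+00:00"
        video_dir.mkdir()
        (video_dir / "real.mp4").write_text("not a symlink")
        assert not only_symlinks(video_dir)    # real file present -> skip
        (video_dir / "real.mp4").unlink()
        (video_dir / "link.mp4").symlink_to("/app/source/whatever.mp4")
        assert only_symlinks(video_dir)        # symlinks only -> safe to delete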
@@ -81,8 +180,14 @@ def process_videos():
                         if current_target.resolve() != host_source_path.resolve():
                             dest_file.unlink()
                             os.symlink(host_source_path, dest_file)
+                            print(f" [FIX] Relinked: {folder_name}", flush=True)
+                            new_links += 1
+                        else:
+                            verified_links += 1
                     else:
                         os.symlink(host_source_path, dest_file)
+                        print(f" [NEW] Linked: {folder_name}", flush=True)
+                        new_links += 1
                 except Exception:
                     pass
                 processed_videos.append({
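
Review note on the branch above: a link is recreated only when it resolves to a different target, so repeated scans are idempotent, and the counters split the outcomes into fixed, verified, and new. A condensed sketch of that decision (the helper name is hypothetical):

    import os
    from pathlib import Path

    def ensure_link(dest_file: Path, source: Path) -> str:
        # Hypothetical condensation of the relink logic in process_videos
        if dest_file.is_symlink():
            if dest_file.resolve() != source.resolve():   # points elsewhere -> relink
                dest_file.unlink()
                os.symlink(source, dest_file)
                return "fixed"
            return "verified"                             # already correct
        os.symlink(source, dest_file)                     # first run -> create
        return "new"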
@@ -94,8 +199,18 @@ def process_videos():
                 })
     except Exception as e:
         return str(e)
+    print(f"✅ Scan complete. Processed {len(processed_videos)} videos.", flush=True)
+    print(f" - New/Fixed Links: {new_links}", flush=True)
+    print(f" - Verified Links: {verified_links}", flush=True)
     return None
+
+def scheduler():
+    print(f"🕒 Background scheduler started. Scanning every {SCAN_INTERVAL} minutes.", flush=True)
+    while True:
+        print("🔄 Running scheduled scan...", flush=True)
+        process_videos()
+        time.sleep(SCAN_INTERVAL * 60)

 # Flask routes
 @app.route("/")
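
Note on scheduler: because time.sleep runs after each scan finishes, the effective period is scan duration plus SCAN_INTERVAL, so runs drift later over time. That is usually fine here; if fixed wall-clock spacing ever mattered, a variant like this sketch (not what the commit does) would compensate:

    import time

    def scheduler_fixed_rate(interval_seconds, task):
        # Sleep until the next slot on a fixed grid rather than
        # a fixed delay after the task finishes
        next_run = time.monotonic()
        while True:
            task()
            next_run += interval_seconds
            time.sleep(max(0.0, next_run - time.monotonic()))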
@@ -146,4 +261,8 @@ def api_videos():
     return jsonify(processed_videos)

 if __name__ == "__main__":
+    # Start scheduler in background thread
+    thread = threading.Thread(target=scheduler, daemon=True)
+    thread.start()
     app.run(host="0.0.0.0", port=5000)
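
Note: daemon=True ties the scheduler thread's lifetime to the Flask process, which is the right call here. One caveat worth flagging: app.run() uses defaults, so there is a single process; if the debug reloader were ever enabled, werkzeug forks a second process and the thread would start twice. The usual guard, sketched under that assumption:

    import os

    if __name__ == "__main__":
        # With the reloader on, werkzeug sets WERKZEUG_RUN_MAIN=true in the
        # child that actually serves requests; start the scheduler only there
        if os.environ.get("WERKZEUG_RUN_MAIN") == "true":
            threading.Thread(target=scheduler, daemon=True).start()
        app.run(host="0.0.0.0", port=5000, debug=True)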

test_api.py Normal file

@@ -0,0 +1,62 @@
+import requests
+import os
+import json
+
+# Manually load .env
+try:
+    with open('.env', 'r') as f:
+        for line in f:
+            line = line.strip()
+            if line and not line.startswith('#'):
+                key, value = line.split('=', 1)
+                os.environ[key] = value
+except FileNotFoundError:
+    print("Warning: .env file not found")
+
+API_URL = os.getenv("API_URL")
+API_TOKEN = os.getenv("API_TOKEN")
+headers = {"Authorization": f"Token {API_TOKEN}"}
+
+print(f"Testing API at: {API_URL}")
+
+def test_endpoint(path):
+    url = f"{API_URL}{path}"
+    print(f"\n--- Testing {url} ---")
+    try:
+        response = requests.get(url, headers=headers, timeout=5)
+        print(f"Status Code: {response.status_code}")
+        try:
+            data = response.json()
+            print("Response JSON (truncated):")
+            print(json.dumps(data, indent=2)[:500] + "..." if len(str(data)) > 500 else json.dumps(data, indent=2))
+            return data
+        except json.JSONDecodeError:
+            print("Response is not JSON")
+            print(response.text[:200])
+            return None
+    except Exception as e:
+        print(f"Error: {e}")
+        return None
+
+# Test Root API
+test_endpoint("")
+
+# Test Search Parameters
+target_id = "K1Uw_YVgCBsww"
+print(f"\n--- Testing Search Params for {target_id} ---")
+
+# Test Page Size
+print(f"\n--- Testing Page Size ---")
+sizes = [12, 50, 100]
+for size in sizes:
+    url = f"/video/?page_size={size}"
+    print(f"Testing {url}...")
+    data = test_endpoint(url)
+    if data and isinstance(data, dict) and 'data' in data:
+        count = len(data['data'])
+        print(f"Requested {size}, got {count} items.")
+        if 'paginate' in data:
+            print(f"Pagination meta: {data['paginate']}")