소스 검색

fix: backfill script uses DB_PATH from config, same pattern as normalize script

The script now:
- Imports DB_PATH from news_mcp.config (which reads NEWS_MCP_DB_PATH env var)
- Accepts --db override arg
- Checks file existence before connecting
- Follows the same structure as normalize_cluster_timestamps.py

Docker exec command:
  docker exec -it news-mcp python3 scripts/backfill_junction_tables.py

No need to set NEWS_MCP_DB_PATH manually — it's already set in docker-compose.yml
Lukas Goldschmidt 1 주 전
부모
커밋
a3d3aa2c38
1개의 변경된 파일, 33개의 추가 그리고 17개의 삭제
  1. 33 17
      scripts/backfill_junction_tables.py

+ 33 - 17
scripts/backfill_junction_tables.py

@@ -7,6 +7,9 @@ and inserts them into the junction tables.  Idempotent — safe to re-run.
 Usage:
     python3 scripts/backfill_junction_tables.py
     python3 scripts/backfill_junction_tables.py --db /path/to/news.sqlite
+
+The script uses NEWS_MCP_DB_PATH env var or --db arg to locate the database.
+In the Docker container, NEWS_MCP_DB_PATH is set to ./data/news.sqlite.
 """
 
 from __future__ import annotations
@@ -17,20 +20,27 @@ import sqlite3
 import sys
 from pathlib import Path
 
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+
+from news_mcp.config import DB_PATH
+
 
 def main() -> None:
     parser = argparse.ArgumentParser(
         description="Populate cluster_entities and cluster_keywords from existing payloads"
     )
-    parser.add_argument(
-        "--db",
-        default=str(Path(__file__).resolve().parent.parent / "news_mcp" / "data" / "news.sqlite"),
-        help="Path to news.sqlite (default: dev DB)",
-    )
+    parser.add_argument("--db", type=Path, default=DB_PATH)
     args = parser.parse_args()
 
-    db_path = args.db
+    db_path = str(args.db)
+    if not Path(db_path).exists():
+        print(f"ERROR: database file not found: {db_path}")
+        print("Check NEWS_MCP_DB_PATH env var or pass --db /path/to/db")
+        sys.exit(1)
+
     conn = sqlite3.connect(db_path)
+    conn.execute("PRAGMA journal_mode=WAL")
 
     # Ensure junction tables exist (same DDL as _init_db)
     conn.execute(
@@ -65,17 +75,20 @@ def main() -> None:
             "GENERATED ALWAYS AS (json_extract(payload, '$.timestamp')) VIRTUAL"
         )
     except sqlite3.OperationalError:
-        pass  # already exists
+        pass  # column already exists
 
     rows = conn.execute("SELECT cluster_id, payload FROM clusters").fetchall()
     total = len(rows)
-    entities_count = 0
-    keywords_count = 0
+    entities_inserted = 0
+    keywords_inserted = 0
 
-    for cluster_id, payload_text in rows:
-        payload = json.loads(payload_text)
+    for i, (cluster_id, payload_text) in enumerate(rows):
+        try:
+            payload = json.loads(payload_text)
+        except Exception:
+            continue
 
-        # Clear stale entries (idempotent re-run)
+        # Clear stale entries (idempotent re-run handles re-enrichment)
         conn.execute("DELETE FROM cluster_entities WHERE cluster_id = ?", (cluster_id,))
         conn.execute("DELETE FROM cluster_keywords WHERE cluster_id = ?", (cluster_id,))
 
@@ -86,7 +99,7 @@ def main() -> None:
                     "INSERT OR IGNORE INTO cluster_entities(cluster_id, entity) VALUES(?, ?)",
                     (cluster_id, ent_norm),
                 )
-                entities_count += 1
+                entities_inserted += 1
 
         for kw in payload.get("keywords", []):
             kw_norm = str(kw).strip().lower()
@@ -95,7 +108,10 @@ def main() -> None:
                     "INSERT OR IGNORE INTO cluster_keywords(cluster_id, keyword) VALUES(?, ?)",
                     (cluster_id, kw_norm),
                 )
-                keywords_count += 1
+                keywords_inserted += 1
+
+        if (i + 1) % 500 == 0 or i == total - 1:
+            print(f"  Processed {i+1}/{total}...", flush=True)
 
     conn.commit()
 
@@ -104,10 +120,10 @@ def main() -> None:
     final_keywords = conn.execute("SELECT COUNT(*) FROM cluster_keywords").fetchone()[0]
     conn.close()
 
-    print(f"Backfill complete:")
+    print(f"\nBackfill complete:")
     print(f"  Clusters processed: {total}")
-    print(f"  Entity rows inserted this run: {entities_count}")
-    print(f"  Keyword rows inserted this run: {keywords_count}")
+    print(f"  Entity rows inserted this run: {entities_inserted}")
+    print(f"  Keyword rows inserted this run: {keywords_inserted}")
     print(f"  Total entity rows in DB: {final_entities}")
     print(f"  Total keyword rows in DB: {final_keywords}")