#!/usr/bin/env python3 """Backfill seen_articles table from existing clusters. Run once after deploying the seen_articles feature: docker exec -it news-mcp python3 /app/scripts/backfill_seen_articles.py Or locally against the repo DB: python3 scripts/backfill_seen_articles.py """ import json import sqlite3 import sys from datetime import datetime, timezone # Add parent dir so we can import news_mcp when run as a standalone script from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from news_mcp.article_identity import article_key def main(db_path: str = "./data/news.sqlite"): conn = sqlite3.connect(db_path) # Ensure the table exists conn.execute(""" CREATE TABLE IF NOT EXISTS seen_articles ( article_key TEXT PRIMARY KEY, cluster_id TEXT NOT NULL, first_seen TEXT NOT NULL, url TEXT NOT NULL DEFAULT '' ) """) rows = conn.execute("SELECT cluster_id, payload FROM clusters").fetchall() now = datetime.now(timezone.utc).isoformat() added = 0 skipped = 0 for cluster_id, payload_json in rows: try: payload = json.loads(payload_json) except json.JSONDecodeError: skipped += 1 continue for art in payload.get("articles", []): akey = article_key(art) if not akey: continue art_url = str(art.get("url") or "").strip() try: conn.execute( "INSERT OR IGNORE INTO seen_articles(article_key, cluster_id, first_seen, url) VALUES(?,?,?,?)", (akey, cluster_id, now, art_url), ) if conn.execute("SELECT changes()").fetchone()[0] > 0: added += 1 except Exception as e: print(f" ERROR: {e}", file=sys.stderr) skipped += 1 conn.commit() total = conn.execute("SELECT count(*) FROM seen_articles").fetchone()[0] conn.close() print(f"Backfill complete: added={added} skipped={skipped} total_seen={total}") if __name__ == "__main__": db = sys.argv[1] if len(sys.argv) > 1 else "./data/news.sqlite" main(db)