#!/usr/bin/env python3 """Backfill seen_articles table from existing clusters. Run once after deploying the seen_articles feature: docker exec -it news-mcp python3 /app/scripts/backfill_seen_articles.py Or locally against the repo DB: python3 scripts/backfill_seen_articles.py """ import json import sqlite3 import sys from datetime import datetime, timezone from urllib.parse import urlparse def _article_key(article: dict) -> str: url = str(article.get("url") or "").strip() if not url: return str(article.get("title") or "") try: parsed = urlparse(url) parts = [p for p in parsed.path.split("/") if p] if parts: return parts[-1] except Exception: pass return url def main(db_path: str = "./data/news.sqlite"): conn = sqlite3.connect(db_path) # Ensure the table exists conn.execute(""" CREATE TABLE IF NOT EXISTS seen_articles ( article_key TEXT PRIMARY KEY, cluster_id TEXT NOT NULL, first_seen TEXT NOT NULL, url TEXT NOT NULL DEFAULT '' ) """) rows = conn.execute("SELECT cluster_id, payload FROM clusters").fetchall() now = datetime.now(timezone.utc).isoformat() added = 0 skipped = 0 for cluster_id, payload_json in rows: try: payload = json.loads(payload_json) except json.JSONDecodeError: skipped += 1 continue for art in payload.get("articles", []): akey = _article_key(art) if not akey: continue art_url = str(art.get("url") or "").strip() try: conn.execute( "INSERT OR IGNORE INTO seen_articles(article_key, cluster_id, first_seen, url) VALUES(?,?,?,?)", (akey, cluster_id, now, art_url), ) if conn.execute("SELECT changes()").fetchone()[0] > 0: added += 1 except Exception as e: print(f" ERROR: {e}", file=sys.stderr) skipped += 1 conn.commit() total = conn.execute("SELECT count(*) FROM seen_articles").fetchone()[0] conn.close() print(f"Backfill complete: added={added} skipped={skipped} total_seen={total}") if __name__ == "__main__": db = sys.argv[1] if len(sys.argv) > 1 else "./data/news.sqlite" main(db)