| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970 |
- #!/usr/bin/env python3
- """Backfill seen_articles table from existing clusters.
- Run once after deploying the seen_articles feature:
- docker exec -it news-mcp python3 /app/scripts/backfill_seen_articles.py
- Or locally against the repo DB:
- python3 scripts/backfill_seen_articles.py
- """
- import json
- import sqlite3
- import sys
- from datetime import datetime, timezone
- # Add parent dir so we can import news_mcp when run as a standalone script
- from pathlib import Path
- sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
- from news_mcp.article_identity import article_key
- def main(db_path: str = "./data/news.sqlite"):
- conn = sqlite3.connect(db_path)
- # Ensure the table exists
- conn.execute("""
- CREATE TABLE IF NOT EXISTS seen_articles (
- article_key TEXT PRIMARY KEY,
- cluster_id TEXT NOT NULL,
- first_seen TEXT NOT NULL,
- url TEXT NOT NULL DEFAULT ''
- )
- """)
- rows = conn.execute("SELECT cluster_id, payload FROM clusters").fetchall()
- now = datetime.now(timezone.utc).isoformat()
- added = 0
- skipped = 0
- for cluster_id, payload_json in rows:
- try:
- payload = json.loads(payload_json)
- except json.JSONDecodeError:
- skipped += 1
- continue
- for art in payload.get("articles", []):
- akey = article_key(art)
- if not akey:
- continue
- art_url = str(art.get("url") or "").strip()
- try:
- conn.execute(
- "INSERT OR IGNORE INTO seen_articles(article_key, cluster_id, first_seen, url) VALUES(?,?,?,?)",
- (akey, cluster_id, now, art_url),
- )
- if conn.execute("SELECT changes()").fetchone()[0] > 0:
- added += 1
- except Exception as e:
- print(f" ERROR: {e}", file=sys.stderr)
- skipped += 1
- conn.commit()
- total = conn.execute("SELECT count(*) FROM seen_articles").fetchone()[0]
- conn.close()
- print(f"Backfill complete: added={added} skipped={skipped} total_seen={total}")
- if __name__ == "__main__":
- db = sys.argv[1] if len(sys.argv) > 1 else "./data/news.sqlite"
- main(db)
|