backfill_seen_articles.py 2.2 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970
  1. #!/usr/bin/env python3
  2. """Backfill seen_articles table from existing clusters.
  3. Run once after deploying the seen_articles feature:
  4. docker exec -it news-mcp python3 /app/scripts/backfill_seen_articles.py
  5. Or locally against the repo DB:
  6. python3 scripts/backfill_seen_articles.py
  7. """
  8. import json
  9. import sqlite3
  10. import sys
  11. from datetime import datetime, timezone
  12. # Add parent dir so we can import news_mcp when run as a standalone script
  13. from pathlib import Path
  14. sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
  15. from news_mcp.article_identity import article_key
  16. def main(db_path: str = "./data/news.sqlite"):
  17. conn = sqlite3.connect(db_path)
  18. # Ensure the table exists
  19. conn.execute("""
  20. CREATE TABLE IF NOT EXISTS seen_articles (
  21. article_key TEXT PRIMARY KEY,
  22. cluster_id TEXT NOT NULL,
  23. first_seen TEXT NOT NULL,
  24. url TEXT NOT NULL DEFAULT ''
  25. )
  26. """)
  27. rows = conn.execute("SELECT cluster_id, payload FROM clusters").fetchall()
  28. now = datetime.now(timezone.utc).isoformat()
  29. added = 0
  30. skipped = 0
  31. for cluster_id, payload_json in rows:
  32. try:
  33. payload = json.loads(payload_json)
  34. except json.JSONDecodeError:
  35. skipped += 1
  36. continue
  37. for art in payload.get("articles", []):
  38. akey = article_key(art)
  39. if not akey:
  40. continue
  41. art_url = str(art.get("url") or "").strip()
  42. try:
  43. conn.execute(
  44. "INSERT OR IGNORE INTO seen_articles(article_key, cluster_id, first_seen, url) VALUES(?,?,?,?)",
  45. (akey, cluster_id, now, art_url),
  46. )
  47. if conn.execute("SELECT changes()").fetchone()[0] > 0:
  48. added += 1
  49. except Exception as e:
  50. print(f" ERROR: {e}", file=sys.stderr)
  51. skipped += 1
  52. conn.commit()
  53. total = conn.execute("SELECT count(*) FROM seen_articles").fetchone()[0]
  54. conn.close()
  55. print(f"Backfill complete: added={added} skipped={skipped} total_seen={total}")
  56. if __name__ == "__main__":
  57. db = sys.argv[1] if len(sys.argv) > 1 else "./data/news.sqlite"
  58. main(db)