backfill_seen_articles.py 2.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879
  1. #!/usr/bin/env python3
  2. """Backfill seen_articles table from existing clusters.
  3. Run once after deploying the seen_articles feature:
  4. docker exec -it news-mcp python3 /app/scripts/backfill_seen_articles.py
  5. Or locally against the repo DB:
  6. python3 scripts/backfill_seen_articles.py
  7. """
  8. import json
  9. import sqlite3
  10. import sys
  11. from datetime import datetime, timezone
  12. from urllib.parse import urlparse
  13. def _article_key(article: dict) -> str:
  14. url = str(article.get("url") or "").strip()
  15. if not url:
  16. return str(article.get("title") or "")
  17. try:
  18. parsed = urlparse(url)
  19. parts = [p for p in parsed.path.split("/") if p]
  20. if parts:
  21. return parts[-1]
  22. except Exception:
  23. pass
  24. return url
  25. def main(db_path: str = "./data/news.sqlite"):
  26. conn = sqlite3.connect(db_path)
  27. # Ensure the table exists
  28. conn.execute("""
  29. CREATE TABLE IF NOT EXISTS seen_articles (
  30. article_key TEXT PRIMARY KEY,
  31. cluster_id TEXT NOT NULL,
  32. first_seen TEXT NOT NULL,
  33. url TEXT NOT NULL DEFAULT ''
  34. )
  35. """)
  36. rows = conn.execute("SELECT cluster_id, payload FROM clusters").fetchall()
  37. now = datetime.now(timezone.utc).isoformat()
  38. added = 0
  39. skipped = 0
  40. for cluster_id, payload_json in rows:
  41. try:
  42. payload = json.loads(payload_json)
  43. except json.JSONDecodeError:
  44. skipped += 1
  45. continue
  46. for art in payload.get("articles", []):
  47. akey = _article_key(art)
  48. if not akey:
  49. continue
  50. art_url = str(art.get("url") or "").strip()
  51. try:
  52. conn.execute(
  53. "INSERT OR IGNORE INTO seen_articles(article_key, cluster_id, first_seen, url) VALUES(?,?,?,?)",
  54. (akey, cluster_id, now, art_url),
  55. )
  56. if conn.execute("SELECT changes()").fetchone()[0] > 0:
  57. added += 1
  58. except Exception as e:
  59. print(f" ERROR: {e}", file=sys.stderr)
  60. skipped += 1
  61. conn.commit()
  62. total = conn.execute("SELECT count(*) FROM seen_articles").fetchone()[0]
  63. conn.close()
  64. print(f"Backfill complete: added={added} skipped={skipped} total_seen={total}")
  65. if __name__ == "__main__":
  66. db = sys.argv[1] if len(sys.argv) > 1 else "./data/news.sqlite"
  67. main(db)