| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879 |
- #!/usr/bin/env python3
- """Backfill seen_articles table from existing clusters.
- Run once after deploying the seen_articles feature:
- docker exec -it news-mcp python3 /app/scripts/backfill_seen_articles.py
- Or locally against the repo DB:
- python3 scripts/backfill_seen_articles.py
- """
- import json
- import sqlite3
- import sys
- from datetime import datetime, timezone
- from urllib.parse import urlparse
- def _article_key(article: dict) -> str:
- url = str(article.get("url") or "").strip()
- if not url:
- return str(article.get("title") or "")
- try:
- parsed = urlparse(url)
- parts = [p for p in parsed.path.split("/") if p]
- if parts:
- return parts[-1]
- except Exception:
- pass
- return url
- def main(db_path: str = "./data/news.sqlite"):
- conn = sqlite3.connect(db_path)
- # Ensure the table exists
- conn.execute("""
- CREATE TABLE IF NOT EXISTS seen_articles (
- article_key TEXT PRIMARY KEY,
- cluster_id TEXT NOT NULL,
- first_seen TEXT NOT NULL,
- url TEXT NOT NULL DEFAULT ''
- )
- """)
- rows = conn.execute("SELECT cluster_id, payload FROM clusters").fetchall()
- now = datetime.now(timezone.utc).isoformat()
- added = 0
- skipped = 0
- for cluster_id, payload_json in rows:
- try:
- payload = json.loads(payload_json)
- except json.JSONDecodeError:
- skipped += 1
- continue
- for art in payload.get("articles", []):
- akey = _article_key(art)
- if not akey:
- continue
- art_url = str(art.get("url") or "").strip()
- try:
- conn.execute(
- "INSERT OR IGNORE INTO seen_articles(article_key, cluster_id, first_seen, url) VALUES(?,?,?,?)",
- (akey, cluster_id, now, art_url),
- )
- if conn.execute("SELECT changes()").fetchone()[0] > 0:
- added += 1
- except Exception as e:
- print(f" ERROR: {e}", file=sys.stderr)
- skipped += 1
- conn.commit()
- total = conn.execute("SELECT count(*) FROM seen_articles").fetchone()[0]
- conn.close()
- print(f"Backfill complete: added={added} skipped={skipped} total_seen={total}")
- if __name__ == "__main__":
- db = sys.argv[1] if len(sys.argv) > 1 else "./data/news.sqlite"
- main(db)
|