|
@@ -0,0 +1,97 @@
|
|
|
|
|
+from __future__ import annotations
|
|
|
|
|
+
|
|
|
|
|
+"""Deduplicate article entries inside stored clusters.
|
|
|
|
|
+
|
|
|
|
|
+This cleans existing SQLite payloads so a cluster only keeps one article record
|
|
|
|
|
+per canonical article key (preferably the URL path/article id).
|
|
|
|
|
+
|
|
|
|
|
+Usage:
|
|
|
|
|
+ ./.venv/bin/python scripts/dedup_articles_in_clusters.py --dry-run
|
|
|
|
|
+ ./.venv/bin/python scripts/dedup_articles_in_clusters.py
|
|
|
|
|
+"""
|
|
|
|
|
+
|
|
|
|
|
+import argparse
|
|
|
|
|
+import json
|
|
|
|
|
+import sys
|
|
|
|
|
+from pathlib import Path
|
|
|
|
|
+from typing import Any
|
|
|
|
|
+from urllib.parse import urlparse
|
|
|
|
|
+
|
|
|
|
|
+ROOT = Path(__file__).resolve().parents[1]
|
|
|
|
|
+sys.path.insert(0, str(ROOT))
|
|
|
|
|
+
|
|
|
|
|
+from news_mcp.config import DB_PATH
|
|
|
|
|
+from news_mcp.storage.sqlite_store import SQLiteClusterStore
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
+def _article_key(article: dict[str, Any]) -> str:
|
|
|
|
|
+ url = str(article.get("url") or "").strip()
|
|
|
|
|
+ if not url:
|
|
|
|
|
+ return str(article.get("title") or "")
|
|
|
|
|
+ try:
|
|
|
|
|
+ parsed = urlparse(url)
|
|
|
|
|
+ parts = [p for p in parsed.path.split("/") if p]
|
|
|
|
|
+ if parts:
|
|
|
|
|
+ return parts[-1]
|
|
|
|
|
+ except Exception:
|
|
|
|
|
+ pass
|
|
|
|
|
+ return url
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def _dedup_articles(articles: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Return *articles* with duplicate entries removed.

    Two entries are duplicates when ``_article_key`` maps them to the same
    canonical key. The first occurrence wins and original order is kept.
    """
    kept: list[dict[str, Any]] = []
    seen_keys: set[str] = set()
    for entry in articles:
        entry_key = _article_key(entry)
        if entry_key not in seen_keys:
            seen_keys.add(entry_key)
            kept.append(entry)
    return kept
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def main() -> None:
    """CLI entry point: scan stored clusters and rewrite any whose payload
    contains duplicate article entries.

    Flags:
        --db       path to the SQLite database (defaults to ``DB_PATH``)
        --dry-run  report what would change without writing
        --limit    only process the first N clusters (oldest first)
    """
    cli = argparse.ArgumentParser(description="Deduplicate article entries inside stored clusters")
    cli.add_argument("--db", type=Path, default=DB_PATH)
    cli.add_argument("--dry-run", action="store_true")
    cli.add_argument("--limit", type=int, default=None)
    opts = cli.parse_args()

    store = SQLiteClusterStore(opts.db)
    # Reaching into the store's private connection is acceptable for a
    # one-off maintenance script.
    with store._conn() as conn:  # noqa: SLF001 - maintenance script
        rows = conn.execute(
            "SELECT cluster_id, topic, payload FROM clusters ORDER BY updated_at ASC"
        ).fetchall()

    if opts.limit is not None:
        rows = rows[: opts.limit]

    scanned = 0
    changed = 0

    print(f"starting article dedup: clusters={len(rows)} dry_run={opts.dry_run}")

    for _cluster_id, topic, payload_json in rows:
        scanned += 1
        try:
            cluster = json.loads(payload_json)
        except Exception:
            # Unreadable payload: leave it untouched and move on.
            continue

        raw_articles = cluster.get("articles", []) or []
        unique = _dedup_articles([item for item in raw_articles if isinstance(item, dict)])
        if len(unique) == len(raw_articles):
            # Nothing was removed; no rewrite needed for this cluster.
            continue

        cluster = dict(cluster)
        cluster["articles"] = unique
        if not opts.dry_run:
            store.upsert_clusters([cluster], topic=topic or cluster.get("topic", "other"))
        changed += 1

        # Periodic progress line for long runs.
        if changed % 25 == 0:
            print(f"updated={changed} processed={scanned}")

    print({"total_scanned": scanned, "updated": changed, "dry_run": opts.dry_run})
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
# Script entry guard: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|