Browse Source

news-mcp: add script to enforce ENTITY_BLACKLIST in stored clusters

Lukas Goldschmidt 1 month ago
parent
commit
faf04d5b2f
1 changed files with 113 additions and 0 deletions
  1. 113 0
      scripts/enforce_news_blacklist.py

+ 113 - 0
scripts/enforce_news_blacklist.py

@@ -0,0 +1,113 @@
+from __future__ import annotations
+
+"""One-off maintenance: enforce ENTITY_BLACKLIST patterns inside stored clusters.
+
+This script rewrites clusters in news-mcp's SQLite DB so that:
+- payload.entities: remove entities matching ENTITY_BLACKLIST patterns
+- payload.keywords: remove keywords matching ENTITY_BLACKLIST patterns
+- payload.topic: if topic matches ENTITY_BLACKLIST patterns, set topic='other'
+
+It mirrors the matching approach used by news-mcp (case-insensitive fnmatch).
+
+Usage examples:
+  ./.venv/bin/python scripts/enforce_news_blacklist.py --dry-run --limit 200
+  ./.venv/bin/python scripts/enforce_news_blacklist.py --limit 1000
+"""
+
+import argparse
+import fnmatch
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+
+from news_mcp.config import DB_PATH, NEWS_ENTITY_BLACKLIST
+from news_mcp.storage.sqlite_store import SQLiteClusterStore
+
+
+def _matches_blacklist(value: str, blacklist: list[str]) -> bool:
+    key = str(value).strip().lower()
+    if not key:
+        return True
+    return any(fnmatch.fnmatchcase(key, pattern) for pattern in blacklist)
+
+
def enforce(cluster: dict, blacklist: list[str]) -> tuple[dict, bool]:
    """Return a copy of *cluster* with blacklisted values removed.

    The input dict is not mutated; a shallow copy is modified instead.

    - ``entities`` / ``keywords``: items for which ``_matches_blacklist``
      returns True are dropped. Both keys are always present as lists in the
      returned dict, even if they were missing or ``None`` in the input.
    - ``topic``: replaced by ``'other'`` when it matches the blacklist.

    Args:
        cluster: Cluster payload dict.
        blacklist: Shell-style glob patterns.

    Returns:
        ``(new_cluster, changed)`` where ``changed`` is True only if at least
        one entity/keyword was removed or the topic was actually replaced.
    """
    cleaned = dict(cluster)
    changed = False

    for field in ("entities", "keywords"):
        items = cluster.get(field, []) or []
        kept = [
            item for item in items
            if not _matches_blacklist(str(item), blacklist)
        ]
        if len(kept) != len(items):
            changed = True
        cleaned[field] = kept

    topic = cluster.get("topic") or "other"
    # A topic that is already (or defaults to) 'other' cannot be improved by
    # rewriting it to 'other'; flagging it as changed would make every run
    # rewrite the same clusters again whenever a pattern matches "other".
    if topic != "other" and _matches_blacklist(topic, blacklist):
        cleaned["topic"] = "other"
        changed = True

    return cleaned, changed
+
+
def main() -> None:
    """CLI entry point: scan stored clusters and rewrite blacklisted ones.

    Reads ``cluster_id``, ``topic`` and ``payload`` from the ``clusters``
    table (oldest ``updated_at`` first), runs ``enforce`` on each decoded
    payload and, unless ``--dry-run`` is given, writes changed clusters back
    through ``SQLiteClusterStore.upsert_clusters``. Progress and a final
    summary dict are printed to stdout; unreadable rows are reported on
    stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description="Enforce news-mcp ENTITY_BLACKLIST patterns in stored clusters"
    )
    parser.add_argument("--db", type=Path, default=DB_PATH, help="Path to the news sqlite DB")
    parser.add_argument("--limit", type=int, default=None, help="Optional max number of clusters to scan")
    parser.add_argument("--dry-run", action="store_true", help="Compute changes without writing them")
    args = parser.parse_args()

    # A negative limit used to slice rows off the *end* of the result set
    # (rows[:-n]), which is never what the caller meant.
    if args.limit is not None and args.limit < 0:
        parser.error("--limit must be non-negative")

    if not NEWS_ENTITY_BLACKLIST:
        print("ENTITY_BLACKLIST is empty; nothing to do.")
        return

    store = SQLiteClusterStore(args.db)

    # Let SQLite apply the limit instead of loading the whole table first.
    # A negative LIMIT means "no upper bound" in SQLite; an omitted or zero
    # --limit keeps its previous meaning of "scan everything".
    sql_limit = args.limit if args.limit else -1

    with store._conn() as conn:  # noqa: SLF001
        rows = conn.execute(
            "SELECT cluster_id, topic, payload FROM clusters "
            "ORDER BY updated_at ASC LIMIT ?",
            (sql_limit,),
        ).fetchall()

    total = 0
    updated = 0

    print(f"starting blacklist enforcement: clusters={len(rows)} dry_run={args.dry_run}")

    for cluster_id, topic, payload_json in rows:
        total += 1

        # One corrupt row should not abort the whole maintenance run.
        try:
            cluster = json.loads(payload_json)
        except (TypeError, ValueError) as exc:
            print(f"skipping cluster {cluster_id}: unreadable payload ({exc})", file=sys.stderr)
            continue
        if not isinstance(cluster, dict):
            print(f"skipping cluster {cluster_id}: payload is not a JSON object", file=sys.stderr)
            continue

        new_cluster, changed = enforce(cluster, NEWS_ENTITY_BLACKLIST)
        if not changed:
            continue

        if not args.dry_run:
            # NOTE(review): the row's stored topic wins over the rewritten
            # payload topic here, so a blacklisted topic column would be
            # passed through unchanged. Presumably intentional (only
            # payload.topic is rewritten) - confirm against upsert_clusters.
            store.upsert_clusters([new_cluster], topic=topic or new_cluster.get("topic", "other"))
        updated += 1

        # mild progress
        if updated % 25 == 0:
            print(f"updated={updated} (processed={total})")

    print({"total_scanned": total, "updated": updated, "dry_run": args.dry_run})


if __name__ == "__main__":
    main()