# enforce_news_blacklist.py
  1. from __future__ import annotations
"""One-off maintenance: enforce ENTITY_BLACKLIST patterns inside stored clusters.

This script rewrites clusters in news-mcp's SQLite DB so that:

- payload.entities: entities matching ENTITY_BLACKLIST patterns are removed
- payload.keywords: keywords matching ENTITY_BLACKLIST patterns are removed
- payload.topic: if the topic matches an ENTITY_BLACKLIST pattern, it is set to 'other'

It mirrors the matching approach used by news-mcp (case-insensitive fnmatch).

Usage examples:

    ././.venv/bin/python scripts/enforce_news_blacklist.py --dry-run --limit 200
    ././.venv/bin/python scripts/enforce_news_blacklist.py --limit 1000
"""
  12. import argparse
  13. import fnmatch
  14. import json
  15. import sys
  16. from pathlib import Path
  17. from typing import Any
  18. ROOT = Path(__file__).resolve().parents[1]
  19. sys.path.insert(0, str(ROOT))
  20. from news_mcp.config import DB_PATH, NEWS_ENTITY_BLACKLIST
  21. from news_mcp.storage.sqlite_store import SQLiteClusterStore
  22. def _matches_blacklist(value: str, blacklist: list[str]) -> bool:
  23. key = str(value).strip().lower()
  24. if not key:
  25. return True
  26. return any(fnmatch.fnmatchcase(key, pattern) for pattern in blacklist)
  27. def enforce(cluster: dict, blacklist: list[str]) -> tuple[dict, bool]:
  28. changed = False
  29. ents = cluster.get("entities", []) or []
  30. new_ents = []
  31. for e in ents:
  32. if _matches_blacklist(str(e), blacklist):
  33. changed = True
  34. continue
  35. new_ents.append(e)
  36. cluster = dict(cluster)
  37. cluster["entities"] = new_ents
  38. kws = cluster.get("keywords", []) or []
  39. new_kws = []
  40. for k in kws:
  41. if _matches_blacklist(str(k), blacklist):
  42. changed = True
  43. continue
  44. new_kws.append(k)
  45. cluster["keywords"] = new_kws
  46. topic = cluster.get("topic") or "other"
  47. if topic and _matches_blacklist(topic, blacklist):
  48. cluster["topic"] = "other"
  49. changed = True
  50. return cluster, changed
  51. def main() -> None:
  52. parser = argparse.ArgumentParser(description="Enforce news-mcp ENTITY_BLACKLIST patterns in stored clusters")
  53. parser.add_argument("--db", type=Path, default=DB_PATH, help="Path to the news sqlite DB")
  54. parser.add_argument("--limit", type=int, default=None, help="Optional max number of clusters to scan")
  55. parser.add_argument("--dry-run", action="store_true", help="Compute changes without writing them")
  56. args = parser.parse_args()
  57. if not NEWS_ENTITY_BLACKLIST:
  58. print("ENTITY_BLACKLIST is empty; nothing to do.")
  59. return
  60. store = SQLiteClusterStore(args.db)
  61. with store._conn() as conn: # noqa: SLF001
  62. cur = conn.execute(
  63. "SELECT cluster_id, topic, payload FROM clusters ORDER BY updated_at ASC"
  64. )
  65. rows = cur.fetchall()
  66. rows = rows[: args.limit] if args.limit else rows
  67. total = 0
  68. updated = 0
  69. print(f"starting blacklist enforcement: clusters={len(rows)} dry_run={args.dry_run}")
  70. for cluster_id, topic, payload_json in rows:
  71. total += 1
  72. cluster = json.loads(payload_json)
  73. new_cluster, changed = enforce(cluster, NEWS_ENTITY_BLACKLIST)
  74. if not changed:
  75. continue
  76. if not args.dry_run:
  77. store.upsert_clusters([new_cluster], topic=topic or new_cluster.get("topic", "other"))
  78. updated += 1
  79. # mild progress
  80. if updated % 25 == 0:
  81. print(f"updated={updated} (processed={total})")
  82. print({"total_scanned": total, "updated": updated, "dry_run": args.dry_run})
# Run the maintenance task only when executed as a script, not on import.
if __name__ == "__main__":
    main()