Преглед изворног кода

news-mcp: add embeddings backfill script

Lukas Goldschmidt пре 1 месец
родитељ
комит
024b2350e7
4 измењених фајлова са 104 додато и 0 уклоњено
  1. 2 0
      PROJECT.md
  2. 13 0
      README.md
  3. 3 0
      news_mcp/dedup/cluster.py
  4. 86 0
      scripts/backfill_news_embeddings.py

+ 2 - 0
PROJECT.md

@@ -9,6 +9,7 @@ Provide a signal-extraction MCP server that converts RSS into **deduplicated, en
 - RSS fetch (breakingthenews.net)
 - v1 dedup via fuzzy title similarity
 - optional Ollama embeddings path for clustering (when `NEWS_EMBEDDINGS_ENABLED=true`)
+- optional embeddings backfill script for precomputing cluster vectors in SQLite
 - Groq enrichment (topic/entities/sentiment/keywords)
 - Tools expose semantic queries over cached clusters
 
@@ -30,3 +31,4 @@ Provide a signal-extraction MCP server that converts RSS into **deduplicated, en
 - Server exposes tool surface with valid schemas
 - Caching prevents repeated Groq calls for unchanged clusters
 - Embeddings remain optional: Ollama is tried first when enabled, otherwise the heuristic path stays active
+- Embeddings backfill script exists to precompute vectors for cluster rows stored before the server restart

+ 13 - 0
README.md

@@ -162,4 +162,17 @@ For one-off cleanup, run:
 This enforces `ENTITY_BLACKLIST` inside stored clusters by removing matching
 entries from `payload.entities` and `payload.keywords` and (if needed) setting
 `payload.topic = "other"`.
+
+## Embeddings backfill (optional)
+
+If `NEWS_EMBEDDINGS_ENABLED=true`, you can precompute cluster embeddings for
+older rows before restarting the server:
+
+```bash
+./.venv/bin/python scripts/backfill_news_embeddings.py --dry-run --limit 200
+./.venv/bin/python scripts/backfill_news_embeddings.py --limit 1000
+```
+
+This stores a cluster-level `embedding` and `embedding_model` inside the SQLite
+payload so the Ollama-first clustering path has data ready to use.
 ```

+ 3 - 0
news_mcp/dedup/cluster.py

@@ -94,6 +94,7 @@ def dedup_and_cluster_articles(
 
             key = f"{topic}|{_normalize_title(title)}"
             cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
+            cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
             clusters.append(
                 {
                     "cluster_id": cid,
@@ -107,6 +108,8 @@ def dedup_and_cluster_articles(
                     "articles": [a],
                     "first_seen": a["timestamp"],
                     "last_updated": a["timestamp"],
+                    "embedding": cluster_embedding,
+                    "embedding_model": "ollama:nomic-embed-text" if cluster_embedding else None,
                 }
             )
 

+ 86 - 0
scripts/backfill_news_embeddings.py

@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+"""Backfill cluster embeddings into news-mcp's SQLite store.
+
+This precomputes a cluster-level embedding for older rows so the optional
+Ollama-first clustering path has data to work with before live traffic resumes.
+
+Usage:
+  ./.venv/bin/python scripts/backfill_news_embeddings.py --dry-run --limit 200
+  ./.venv/bin/python scripts/backfill_news_embeddings.py --limit 1000
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+
+from news_mcp.config import DB_PATH, OLLAMA_BASE_URL, OLLAMA_EMBEDDING_MODEL, NEWS_EMBEDDINGS_ENABLED
+from news_mcp.dedup.embedding_support import ollama_embed
+from news_mcp.storage.sqlite_store import SQLiteClusterStore
+
+
+def _cluster_text(cluster: dict) -> str:
+    parts = [cluster.get("headline", ""), cluster.get("summary", "") or ""]
+    return "\n".join(p for p in parts if p).strip()
+
+
def main() -> None:
    """Backfill a cluster-level embedding into each stored cluster payload.

    Reads all clusters from the SQLite store (oldest first), computes an
    embedding via Ollama for rows that do not yet carry one, and writes the
    enriched payload back unless ``--dry-run`` is given. Prints a summary
    dict with ``total_scanned`` / ``updated`` / ``skipped`` counts.
    """
    parser = argparse.ArgumentParser(description="Backfill embeddings for stored news clusters")
    parser.add_argument("--db", type=Path, default=DB_PATH)
    parser.add_argument("--limit", type=int, default=None, help="Optional maximum number of rows to process")
    parser.add_argument("--dry-run", action="store_true", help="Do not write back changes")
    args = parser.parse_args()

    if not NEWS_EMBEDDINGS_ENABLED:
        print("NEWS_EMBEDDINGS_ENABLED is false; nothing to backfill.")
        return

    store = SQLiteClusterStore(args.db)
    # Reaching into the store's private connection is acceptable for a
    # one-off maintenance script; oldest rows first so progress is resumable.
    with store._conn() as conn:  # noqa: SLF001 - one-off maintenance script
        rows = conn.execute("SELECT cluster_id, topic, payload FROM clusters ORDER BY updated_at ASC").fetchall()

    if args.limit is not None:
        rows = rows[: args.limit]

    total = 0
    updated = 0
    skipped = 0

    print(f"starting embeddings backfill: clusters={len(rows)} dry_run={args.dry_run} model={OLLAMA_EMBEDDING_MODEL} url={OLLAMA_BASE_URL}")

    for cluster_id, topic, payload_json in rows:
        total += 1
        try:
            cluster = json.loads(payload_json)
        except (TypeError, ValueError):
            # Narrowed from a bare ``except Exception``: a NULL payload raises
            # TypeError, malformed JSON raises JSONDecodeError (a ValueError).
            skipped += 1
            continue

        if cluster.get("embedding"):
            # Already backfilled (or written by the live path); nothing to do.
            continue

        text = _cluster_text(cluster)
        if not text:
            # No headline/summary to embed — skip instead of sending an empty
            # prompt to Ollama (wasted request, meaningless vector).
            skipped += 1
            continue

        emb = ollama_embed(text)
        if not emb:
            skipped += 1
            continue

        cluster = dict(cluster)
        cluster["embedding"] = emb
        cluster["embedding_model"] = f"ollama:{OLLAMA_EMBEDDING_MODEL}"

        if not args.dry_run:
            store.upsert_clusters([cluster], topic=topic or cluster.get("topic", "other"))
        updated += 1

        if updated % 25 == 0:
            print(f"updated={updated} processed={total}")

    print({"total_scanned": total, "updated": updated, "skipped": skipped, "dry_run": args.dry_run})
+
+
# Script entry point: run the backfill only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()