Преглед изворног кода

news-mcp: add embeddings backfill script

Lukas Goldschmidt пре 1 месец
родитељ
комит
024b2350e7
4 измењених фајлова са 104 додато и 0 уклоњено
  1. 2 0
      PROJECT.md
  2. 13 0
      README.md
  3. 3 0
      news_mcp/dedup/cluster.py
  4. 86 0
      scripts/backfill_news_embeddings.py

+ 2 - 0
PROJECT.md

@@ -9,6 +9,7 @@ Provide a signal-extraction MCP server that converts RSS into **deduplicated, en
 - RSS fetch (breakingthenews.net)
 - v1 dedup via fuzzy title similarity
 - optional Ollama embeddings path for clustering (when `NEWS_EMBEDDINGS_ENABLED=true`)
+- optional embeddings backfill script for precomputing cluster vectors in SQLite
 - Groq enrichment (topic/entities/sentiment/keywords)
 - Tools expose semantic queries over cached clusters
 
@@ -30,3 +31,4 @@ Provide a signal-extraction MCP server that converts RSS into **deduplicated, en
 - Server exposes tool surface with valid schemas
 - Caching prevents repeated Groq calls for unchanged clusters
 - Embeddings remain optional: Ollama is tried first when enabled, otherwise the heuristic path stays active
+- Embeddings backfill script exists to precompute vectors for cluster rows stored before the server restart

+ 13 - 0
README.md

@@ -162,4 +162,17 @@ For one-off cleanup, run:
 This enforces `ENTITY_BLACKLIST` inside stored clusters by removing matching
 entries from `payload.entities` and `payload.keywords` and (if needed) setting
 `payload.topic = "other"`.
+
+## Embeddings backfill (optional)
+
+If `NEWS_EMBEDDINGS_ENABLED=true`, you can precompute cluster embeddings for
+older rows before restarting the server:
+
+```bash
+./.venv/bin/python scripts/backfill_news_embeddings.py --dry-run --limit 200
+./.venv/bin/python scripts/backfill_news_embeddings.py --limit 1000
+```
+
+This stores a cluster-level `embedding` and `embedding_model` inside the SQLite
+payload so the Ollama-first clustering path has data ready to use.
 ```

+ 3 - 0
news_mcp/dedup/cluster.py

@@ -94,6 +94,7 @@ def dedup_and_cluster_articles(
 
             key = f"{topic}|{_normalize_title(title)}"
             cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
+            cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
             clusters.append(
                 {
                     "cluster_id": cid,
@@ -107,6 +108,8 @@ def dedup_and_cluster_articles(
                     "articles": [a],
                     "first_seen": a["timestamp"],
                     "last_updated": a["timestamp"],
+                    "embedding": cluster_embedding,
+                    "embedding_model": "ollama:nomic-embed-text" if cluster_embedding else None,
                 }
             )
 

+ 86 - 0
scripts/backfill_news_embeddings.py

@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+"""Backfill cluster embeddings into news-mcp's SQLite store.
+
+This precomputes a cluster-level embedding for older rows so the optional
+Ollama-first clustering path has data to work with before live traffic resumes.
+
+Usage:
+  ./.venv/bin/python scripts/backfill_news_embeddings.py --dry-run --limit 200
+  ./.venv/bin/python scripts/backfill_news_embeddings.py --limit 1000
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+
+from news_mcp.config import DB_PATH, OLLAMA_BASE_URL, OLLAMA_EMBEDDING_MODEL, NEWS_EMBEDDINGS_ENABLED
+from news_mcp.dedup.embedding_support import ollama_embed
+from news_mcp.storage.sqlite_store import SQLiteClusterStore
+
+
+def _cluster_text(cluster: dict) -> str:
+    parts = [cluster.get("headline", ""), cluster.get("summary", "") or ""]
+    return "\n".join(p for p in parts if p).strip()
+
+
def main() -> None:
    """Backfill a cluster-level embedding into each stored cluster payload.

    Reads all clusters from the SQLite store (oldest first), computes an
    embedding via Ollama for rows that do not yet carry one, and writes the
    enriched payload back unless ``--dry-run`` is given. Prints a summary
    dict with ``total_scanned`` / ``updated`` / ``skipped`` counts.
    """
    parser = argparse.ArgumentParser(description="Backfill embeddings for stored news clusters")
    parser.add_argument("--db", type=Path, default=DB_PATH)
    parser.add_argument("--limit", type=int, default=None, help="Optional maximum number of rows to process")
    parser.add_argument("--dry-run", action="store_true", help="Do not write back changes")
    args = parser.parse_args()

    if not NEWS_EMBEDDINGS_ENABLED:
        print("NEWS_EMBEDDINGS_ENABLED is false; nothing to backfill.")
        return

    store = SQLiteClusterStore(args.db)
    # Reaching into the store's private connection is acceptable for a
    # one-off maintenance script; oldest rows first so progress is resumable.
    with store._conn() as conn:  # noqa: SLF001 - one-off maintenance script
        rows = conn.execute("SELECT cluster_id, topic, payload FROM clusters ORDER BY updated_at ASC").fetchall()

    if args.limit is not None:
        rows = rows[: args.limit]

    total = 0
    updated = 0
    skipped = 0

    print(f"starting embeddings backfill: clusters={len(rows)} dry_run={args.dry_run} model={OLLAMA_EMBEDDING_MODEL} url={OLLAMA_BASE_URL}")

    for cluster_id, topic, payload_json in rows:
        total += 1
        try:
            cluster = json.loads(payload_json)
        except (TypeError, ValueError):
            # Narrowed from a bare ``except Exception``: a NULL payload raises
            # TypeError, malformed JSON raises JSONDecodeError (a ValueError).
            skipped += 1
            continue

        if cluster.get("embedding"):
            # Already backfilled (or written by the live path); nothing to do.
            continue

        text = _cluster_text(cluster)
        if not text:
            # No headline/summary to embed — skip instead of sending an empty
            # prompt to Ollama (wasted request, meaningless vector).
            skipped += 1
            continue

        emb = ollama_embed(text)
        if not emb:
            skipped += 1
            continue

        cluster = dict(cluster)
        cluster["embedding"] = emb
        cluster["embedding_model"] = f"ollama:{OLLAMA_EMBEDDING_MODEL}"

        if not args.dry_run:
            store.upsert_clusters([cluster], topic=topic or cluster.get("topic", "other"))
        updated += 1

        if updated % 25 == 0:
            print(f"updated={updated} processed={total}")

    print({"total_scanned": total, "updated": updated, "skipped": skipped, "dry_run": args.dry_run})
+
+
# Script entry point: run the backfill only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()