Преглед изворног кода

news-mcp: add optional ollama embedding clustering

Lukas Goldschmidt пре 1 месец
родитељ
комит
8b935134d7
5 измењених фајлова са 77 додато и 2 уклоњено
  1. 5 0
      .env.example
  2. 3 0
      PROJECT.md
  3. 2 0
      README.md
  4. 36 2
      news_mcp/dedup/cluster.py
  5. 31 0
      news_mcp/dedup/embedding_support.py

+ 5 - 0
.env.example

@@ -16,6 +16,11 @@ GROQ_DEBUG=false
 GROQ_ENRICH_OTHER_ONLY=false
 GROQ_MAX_CLUSTERS_PER_REFRESH=20
 
+# Embeddings (optional, Ollama-first when enabled)
+NEWS_EMBEDDINGS_ENABLED=false
+OLLAMA_BASE_URL=http://127.0.0.1:11434
+OLLAMA_EMBEDDING_MODEL=nomic-embed-text
+
 # Feeds
 NEWS_FEED_URL=https://breakingthenews.net/news-feed.xml
 NEWS_FEED_URLS=

+ 3 - 0
PROJECT.md

@@ -8,6 +8,7 @@ Provide a signal-extraction MCP server that converts RSS into **deduplicated, en
 - SQLite cache for clusters + Groq summary caches
 - RSS fetch (breakingthenews.net)
 - v1 dedup via fuzzy title similarity
+- optional Ollama embeddings path for clustering (when `NEWS_EMBEDDINGS_ENABLED=true`)
 - Groq enrichment (topic/entities/sentiment/keywords)
 - Tools expose semantic queries over cached clusters
 
@@ -16,6 +17,7 @@ Provide a signal-extraction MCP server that converts RSS into **deduplicated, en
 - `get_events_for_entity(entity, limit)`
 - `get_event_summary(event_id)`
 - `detect_emerging_topics(limit)`
+- `get_related_entities(subject, timeframe, limit)`
 
 ## Refresh & caching
 - Background refresh every `NEWS_REFRESH_INTERVAL_SECONDS` (default 900s)
@@ -27,3 +29,4 @@ Provide a signal-extraction MCP server that converts RSS into **deduplicated, en
 - Tests pass offline (dedup/storage unit tests)
 - Server exposes tool surface with valid schemas
 - Caching prevents repeated Groq calls for unchanged clusters
+- Embeddings remain optional: when enabled, Ollama is tried first with fallback to the heuristic path if it is unavailable; when disabled, only the heuristic path runs

+ 2 - 0
README.md

@@ -76,6 +76,8 @@ Key variables:
 - `OLLAMA_BASE_URL` / `OLLAMA_URL` (default `http://127.0.0.1:11434`)
 - `OLLAMA_EMBEDDING_MODEL` (default `nomic-embed-text`)
 
+When embeddings are enabled, news-mcp tries Ollama first and falls back to the existing heuristic clustering path if Ollama is unavailable.
+
 ## Live extraction smoke test
 
 Run a standardized, fabricated extraction test against the currently configured provider/model:

+ 36 - 2
news_mcp/dedup/cluster.py

@@ -3,6 +3,8 @@ from __future__ import annotations
 from typing import Any, Dict, List, Tuple
 
 from news_mcp.sources.news_feeds import normalize_topic_from_title
+from news_mcp.dedup.embedding_support import CandidateRules, cluster_is_candidate, cosine_similarity, ollama_embed
+from news_mcp.config import NEWS_EMBEDDINGS_ENABLED
 
 import re
 from difflib import SequenceMatcher
@@ -20,6 +22,11 @@ def _title_similarity(a: str, b: str) -> float:
     return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
 
 
+def _cluster_text(a: Dict[str, Any]) -> str:
+    parts = [a.get("title", ""), a.get("summary", "") or ""]
+    return "\n".join(p for p in parts if p).strip()
+
+
 def dedup_and_cluster_articles(
     articles: List[Dict[str, Any]],
     similarity_threshold: float = 0.87,
@@ -32,10 +39,23 @@ def dedup_and_cluster_articles(
     """
 
     by_topic: Dict[str, List[Dict[str, Any]]] = {}
+    embedding_cache: Dict[str, list[float]] = {}
+
+    def _embedding_for_text(text: str) -> list[float] | None:
+        if not NEWS_EMBEDDINGS_ENABLED:
+            return None
+        if text in embedding_cache:
+            return embedding_cache[text]
+        emb = ollama_embed(text)
+        if emb:
+            embedding_cache[text] = emb
+        return emb
 
     for a in articles:
         title = a["title"]
         topic = normalize_topic_from_title(title)
+        article_text = _cluster_text(a)
+        article_embedding = _embedding_for_text(article_text)
 
         by_topic.setdefault(topic, [])
         clusters = by_topic[topic]
@@ -43,12 +63,26 @@ def dedup_and_cluster_articles(
         best_idx: int | None = None
         best_sim = 0.0
         for idx, c in enumerate(clusters):
-            sim = _title_similarity(title, c.get("headline", ""))
+            if NEWS_EMBEDDINGS_ENABLED:
+                if not cluster_is_candidate(a, c, rules=CandidateRules(require_topic_match=False), article_topic=topic):
+                    continue
+                cluster_text = _cluster_text(c.get("articles", [{}])[0]) if c.get("articles") else c.get("headline", "")
+                cluster_embedding = _embedding_for_text(cluster_text)
+                if article_embedding and cluster_embedding:
+                    sim = cosine_similarity(article_embedding, cluster_embedding)
+                else:
+                    sim = _title_similarity(title, c.get("headline", ""))
+            else:
+                sim = _title_similarity(title, c.get("headline", ""))
             if sim > best_sim:
                 best_sim = sim
                 best_idx = idx
 
-        if best_idx is not None and best_sim >= similarity_threshold:
+        threshold = similarity_threshold
+        if NEWS_EMBEDDINGS_ENABLED:
+            threshold = max(similarity_threshold, 0.82)
+
+        if best_idx is not None and best_sim >= threshold:
             c = clusters[best_idx]
             c["articles"].append(a)
             if a["source"] not in c["sources"]:

+ 31 - 0
news_mcp/dedup/embedding_support.py

@@ -2,9 +2,13 @@ from __future__ import annotations
 
 from dataclasses import dataclass
 from datetime import datetime, timezone, timedelta
+import json
+import urllib.request
 from math import sqrt
 from typing import Any
 
+from news_mcp.config import NEWS_EMBEDDINGS_ENABLED, OLLAMA_BASE_URL, OLLAMA_EMBEDDING_MODEL
+
 
 @dataclass(frozen=True)
 class CandidateRules:
@@ -79,3 +83,30 @@ def cluster_is_candidate(
             return False
 
     return True
+
+
+def ollama_embed(text: str, timeout: float = 20.0) -> list[float] | None:
+    """Best-effort Ollama embedding call; returns None on any failure.
+
+    Embeddings are intentionally optional. The caller should fall back to the
+    heuristic path when this returns None.
+    """
+
+    if not NEWS_EMBEDDINGS_ENABLED:
+        return None
+    payload = json.dumps({"model": OLLAMA_EMBEDDING_MODEL, "prompt": text}).encode("utf-8")
+    req = urllib.request.Request(
+        f"{OLLAMA_BASE_URL.rstrip('/')}/api/embeddings",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+        method="POST",
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=timeout) as resp:
+            data = json.loads(resp.read().decode("utf-8"))
+            emb = data.get("embedding")
+            if isinstance(emb, list) and emb:
+                return [float(x) for x in emb]
+    except Exception:
+        return None
+    return None