1 miesiąc temu · 8b935134d7
--- a/.env.example
+++ b/.env.example
@@ -16,6 +16,11 @@ GROQ_DEBUG=false
 
				 GROQ_ENRICH_OTHER_ONLY=false
			
 
				 GROQ_MAX_CLUSTERS_PER_REFRESH=20
			
 
				 
			
 
				+# Embeddings (optional, Ollama-first when enabled)
			
 
				+NEWS_EMBEDDINGS_ENABLED=false
			
 
				+OLLAMA_BASE_URL=http://127.0.0.1:11434
			
 
				+OLLAMA_EMBEDDING_MODEL=nomic-embed-text
			
 
				+
			
 
				 # Feeds
			
 
				 NEWS_FEED_URL=https://breakingthenews.net/news-feed.xml
			
 
				 NEWS_FEED_URLS=
			
--- a/PROJECT.md
+++ b/PROJECT.md
@@ -8,6 +8,7 @@ Provide a signal-extraction MCP server that converts RSS into **deduplicated, en
 
				 - SQLite cache for clusters + Groq summary caches
			
 
				 - RSS fetch (breakingthenews.net)
			
 
				 - v1 dedup via fuzzy title similarity
			
 
				+- optional Ollama embeddings path for clustering (when `NEWS_EMBEDDINGS_ENABLED=true`)
			
 
				 - Groq enrichment (topic/entities/sentiment/keywords)
			
 
				 - Tools expose semantic queries over cached clusters
			
 
				 
			
@@ -16,6 +17,7 @@ Provide a signal-extraction MCP server that converts RSS into **deduplicated, en
 
				 - `get_events_for_entity(entity, limit)`
			
 
				 - `get_event_summary(event_id)`
			
 
				 - `detect_emerging_topics(limit)`
			
 
				+- `get_related_entities(subject, timeframe, limit)`
			
 
				 
			
 
				 ## Refresh & caching
			
 
				 - Background refresh every `NEWS_REFRESH_INTERVAL_SECONDS` (default 900s)
			
@@ -27,3 +29,4 @@ Provide a signal-extraction MCP server that converts RSS into **deduplicated, en
 
				 - Tests pass offline (dedup/storage unit tests)
			
 
				 - Server exposes tool surface with valid schemas
			
 
				 - Caching prevents repeated Groq calls for unchanged clusters
			
 
				+- Embeddings remain optional: Ollama is tried first when enabled, otherwise the heuristic path stays active
			
--- a/README.md
+++ b/README.md
@@ -76,6 +76,8 @@ Key variables:
 
				 - `OLLAMA_BASE_URL` / `OLLAMA_URL` (default `http://127.0.0.1:11434`)
			
 
				 - `OLLAMA_EMBEDDING_MODEL` (default `nomic-embed-text`)
			
 
				 
			
 
				+When embeddings are enabled, news-mcp tries Ollama first and falls back to the existing heuristic clustering path if Ollama is unavailable.
			
 
				+
			
 
				 ## Live extraction smoke test
			
 
				 
			
 
				 Run a standardized, fabricated extraction test against the currently configured provider/model:
			
--- a/news_mcp/dedup/cluster.py
+++ b/news_mcp/dedup/cluster.py
@@ -3,6 +3,8 @@ from __future__ import annotations
 
				 from typing import Any, Dict, List, Tuple
			
 
				 
			
 
				 from news_mcp.sources.news_feeds import normalize_topic_from_title
			
 
				+from news_mcp.dedup.embedding_support import CandidateRules, cluster_is_candidate, cosine_similarity, ollama_embed
			
 
				+from news_mcp.config import NEWS_EMBEDDINGS_ENABLED
			
 
				 
			
 
				 import re
			
 
				 from difflib import SequenceMatcher
			
@@ -20,6 +22,11 @@ def _title_similarity(a: str, b: str) -> float:
 
				     return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
			
 
				 
			
 
				 
			
 
				+def _cluster_text(a: Dict[str, Any]) -> str:
			
 
				+    parts = [a.get("title", ""), a.get("summary", "") or ""]
			
 
				+    return "\n".join(p for p in parts if p).strip()
			
 
				+
			
 
				+
			
 
				 def dedup_and_cluster_articles(
			
 
				     articles: List[Dict[str, Any]],
			
 
				     similarity_threshold: float = 0.87,
			
@@ -32,10 +39,23 @@ def dedup_and_cluster_articles(
 
				     """
			
 
				 
			
 
				     by_topic: Dict[str, List[Dict[str, Any]]] = {}
			
 
				+    embedding_cache: Dict[str, list[float]] = {}
			
 
				+
			
 
				def _embedding_for_text(text: str) -> list[float] | None:
				    """Return the embedding for ``text``, memoised in ``embedding_cache``.

				    Returns None when embeddings are disabled or when ``ollama_embed``
				    produced no embedding for this text.
				    """
				    if not NEWS_EMBEDDINGS_ENABLED:
				        return None
				    if text not in embedding_cache:
				        # Cache failures too (as an empty list) so an unreachable or slow
				        # Ollama is asked at most once per distinct text in this run,
				        # instead of once per article/cluster comparison.
				        embedding_cache[text] = ollama_embed(text) or []
				    # An empty list is the "no embedding available" marker.
				    return embedding_cache[text] or None
			
 
				 
			
 
				     for a in articles:
			
 
				         title = a["title"]
			
 
				         topic = normalize_topic_from_title(title)
			
 
				+        article_text = _cluster_text(a)
			
 
				+        article_embedding = _embedding_for_text(article_text)
			
 
				 
			
 
				         by_topic.setdefault(topic, [])
			
 
				         clusters = by_topic[topic]
			
@@ -43,12 +63,26 @@ def dedup_and_cluster_articles(
 
				         best_idx: int | None = None
			
 
				         best_sim = 0.0
			
 
				         for idx, c in enumerate(clusters):
			
 
				-            sim = _title_similarity(title, c.get("headline", ""))
			
 
				+            if NEWS_EMBEDDINGS_ENABLED:
			
 
				+                if not cluster_is_candidate(a, c, rules=CandidateRules(require_topic_match=False), article_topic=topic):
			
 
				+                    continue
			
 
				+                cluster_text = _cluster_text(c.get("articles", [{}])[0]) if c.get("articles") else c.get("headline", "")
			
 
				+                cluster_embedding = _embedding_for_text(cluster_text)
			
 
				+                if article_embedding and cluster_embedding:
			
 
				+                    sim = cosine_similarity(article_embedding, cluster_embedding)
			
 
				+                else:
			
 
				+                    sim = _title_similarity(title, c.get("headline", ""))
			
 
				+            else:
			
 
				+                sim = _title_similarity(title, c.get("headline", ""))
			
 
				             if sim > best_sim:
			
 
				                 best_sim = sim
			
 
				                 best_idx = idx
			
 
				 
			
 
				-        if best_idx is not None and best_sim >= similarity_threshold:
			
 
				+        threshold = similarity_threshold
			
 
				+        if NEWS_EMBEDDINGS_ENABLED:
			
 
				+            threshold = max(similarity_threshold, 0.82)
			
 
				+
			
 
				+        if best_idx is not None and best_sim >= threshold:
			
 
				             c = clusters[best_idx]
			
 
				             c["articles"].append(a)
			
 
				             if a["source"] not in c["sources"]:
			
--- a/news_mcp/dedup/embedding_support.py
+++ b/news_mcp/dedup/embedding_support.py
@@ -2,9 +2,13 @@ from __future__ import annotations
 
				 
			
 
				 from dataclasses import dataclass
			
 
				 from datetime import datetime, timezone, timedelta
			
 
				+import json
			
 
				+import urllib.request
			
 
				 from math import sqrt
			
 
				 from typing import Any
			
 
				 
			
 
				+from news_mcp.config import NEWS_EMBEDDINGS_ENABLED, OLLAMA_BASE_URL, OLLAMA_EMBEDDING_MODEL
			
 
				+
			
 
				 
			
 
				 @dataclass(frozen=True)
			
 
				 class CandidateRules:
			
@@ -79,3 +83,30 @@ def cluster_is_candidate(
 
				             return False
			
 
				 
			
 
				     return True
			
 
				+
			
 
				+
			
 
				def ollama_embed(text: str, timeout: float = 20.0) -> list[float] | None:
				    """Best-effort Ollama embedding call; returns None on any failure.

				    Embeddings are intentionally optional. The caller should fall back to the
				    heuristic path when this returns None.

				    Args:
				        text: Text to embed. Blank text is never sent to the server.
				        timeout: Seconds passed to ``urllib.request.urlopen``.

				    Returns:
				        The embedding as a list of floats, or None when embeddings are
				        disabled, ``text`` is blank, the request or decoding fails, or the
				        response carries no non-empty ``embedding`` list.
				    """
				    import logging

				    if not NEWS_EMBEDDINGS_ENABLED:
				        return None
				    if not text or not text.strip():
				        # Nothing to embed; skip the network round trip.
				        return None

				    payload = json.dumps({"model": OLLAMA_EMBEDDING_MODEL, "prompt": text}).encode("utf-8")
				    req = urllib.request.Request(
				        f"{OLLAMA_BASE_URL.rstrip('/')}/api/embeddings",
				        data=payload,
				        headers={"Content-Type": "application/json"},
				        method="POST",
				    )
				    try:
				        with urllib.request.urlopen(req, timeout=timeout) as resp:
				            data = json.loads(resp.read().decode("utf-8"))
				        # Guard the shape explicitly rather than relying on AttributeError.
				        emb = data.get("embedding") if isinstance(data, dict) else None
				        if isinstance(emb, list) and emb:
				            return [float(x) for x in emb]
				    except Exception as exc:
				        # Deliberately broad: any transport, decoding or conversion error
				        # means "no embedding". Log it so the fallback is diagnosable.
				        logging.getLogger(__name__).debug("Ollama embedding request failed: %s", exc)
				        return None
				    return None