před 1 týdnem · 0cacab6ac8
--- a/news_mcp/dedup/cluster.py
+++ b/news_mcp/dedup/cluster.py
@@ -92,33 +92,74 @@ DEFAULT_JACCARD_THRESHOLD = 0.55
 
				 
			
 
				 
			
 
				 def _signals(article: Dict[str, Any], cluster: Dict[str, Any]) -> dict:
			
 
				-    """Per-pair similarity signals (title, jaccard, embedding cosine)."""
			
 
				+    """Per-pair similarity signals (title, jaccard, embedding cosine).
			
 
				+
			
 
				+    Compares the article against ALL articles in the cluster and returns the
			
 
				+    best (max) signal across all comparisons.  The cosine signal uses the
			
 
				+    cluster-level embedding; title and jaccard are computed per-article and
			
 
				+    the maximum is returned so that a match against any cluster member counts.
			
 
				+    """
			
 
				     a_title = str(article.get("title") or "")
			
 
				     c_title = str(cluster.get("headline") or "")
			
 
				-
			
 
				-    title_sim = _title_similarity(a_title, c_title) if a_title and c_title else 0.0
			
 
				-
			
 
				-    a_text = _cluster_text(article)
			
 
				-    c_text_seed = (cluster.get("articles") or [{}])[0]
			
 
				-    c_text = _cluster_text(c_text_seed) if c_text_seed else c_title
			
 
				-    jaccard = _jaccard(_tokens(a_text), _tokens(c_text)) if a_text and c_text else 0.0
			
 
				-
			
 
				     a_emb = article.get("_embedding")
			
 
				     c_emb = cluster.get("embedding")
			
 
				     cosine = cosine_similarity(a_emb, c_emb) if a_emb and c_emb else 0.0
			
 
				 
			
 
				-    return {"title": title_sim, "jaccard": jaccard, "cosine": cosine}
			
 
				-
			
 
				+    best_title = 0.0
			
 
				+    best_jaccard = 0.0
			
 
				+    a_text = _cluster_text(article)
			
 
				+    a_toks = _tokens(a_text) if a_text else set()
			
 
				 
			
 
				-def _is_match(signals: dict, *, embeddings_enabled: bool) -> tuple[bool, str, float]:
			
 
				-    """Decide whether two items should merge based on the strongest signal."""
			
 
				+    # Compare against every article in the cluster, take the best scores.
			
 
				+    cluster_articles = cluster.get("articles") or ([{"title": c_title}] if c_title else [])
			
 
				+    for ca in cluster_articles:
			
 
				+        if not isinstance(ca, dict):
			
 
				+            continue
			
 
				+        # title signal
			
 
				+        ca_title = str(ca.get("title") or "")
			
 
				+        if a_title and ca_title:
			
 
				+            t = _title_similarity(a_title, ca_title)
			
 
				+            if t > best_title:
			
 
				+                best_title = t
			
 
				+        # jaccard signal
			
 
				+        ca_text = _cluster_text(ca)
			
 
				+        if a_text and ca_text:
			
 
				+            j = _jaccard(a_toks, _tokens(ca_text))
			
 
				+            if j > best_jaccard:
			
 
				+                best_jaccard = j
			
 
				+        # early exit: if both title and jaccard are already very high
			
 
				+        if best_title >= 0.95 and best_jaccard >= 0.80:
			
 
				+            break
			
 
				+
			
 
				+    return {"title": best_title, "jaccard": best_jaccard, "cosine": cosine}
			
 
				+
			
 
				+
			
 
				+def _is_match(
			
 
				+    signals: dict,
			
 
				+    *,
			
 
				+    embeddings_enabled: bool,
			
 
				+    title_threshold: float = DEFAULT_TITLE_THRESHOLD,
			
 
				+    jaccard_threshold: float = DEFAULT_JACCARD_THRESHOLD,
			
 
				+) -> tuple[bool, str, float]:
			
 
				+    """Decide whether two items should merge based on the strongest signal.
			
 
				+
			
 
				+    Cascade: cosine (if embeddings enabled) → title → jaccard → consensus.
			
 
				+    Returns (matched, signal_name, signal_value).
			
 
				+    """
			
 
				     cosine_threshold = NEWS_EMBEDDING_SIMILARITY_THRESHOLD
			
 
				     if embeddings_enabled and signals["cosine"] >= cosine_threshold:
			
 
				         return True, "cosine", signals["cosine"]
			
 
				-    if signals["title"] >= DEFAULT_TITLE_THRESHOLD:
			
 
				+    if signals["title"] >= title_threshold:
			
 
				         return True, "title", signals["title"]
			
 
				-    if signals["jaccard"] >= DEFAULT_JACCARD_THRESHOLD:
			
 
				+    if signals["jaccard"] >= jaccard_threshold:
			
 
				         return True, "jaccard", signals["jaccard"]
			
 
				+    if (
			
 
				+        embeddings_enabled
			
 
				+        and signals["cosine"] >= 0.80
			
 
				+        and (signals["jaccard"] >= 0.30 or signals["title"] >= 0.55)
			
 
				+    ):
			
 
				+        val = (signals["cosine"] + max(signals["jaccard"], signals["title"])) / 2.0
			
 
				+        return True, "consensus", val
			
 
				     return False, "none", 0.0
			
 
				 
			
 
				 
			
@@ -388,28 +429,12 @@ def dedup_and_cluster_articles(
 
				         best_signal_value = 0.0
			
 
				         for idx, c in enumerate(clusters):
			
 
				             sigs = _signals(a_with_emb, c)
			
 
				-            local_match = False
			
 
				-            if NEWS_EMBEDDINGS_ENABLED and sigs["cosine"] >= NEWS_EMBEDDING_SIMILARITY_THRESHOLD:
			
 
				-                local_match = True
			
 
				-                signal_name, signal_value = "cosine", sigs["cosine"]
			
 
				-            elif sigs["title"] >= title_threshold:
			
 
				-                local_match = True
			
 
				-                signal_name, signal_value = "title", sigs["title"]
			
 
				-            elif sigs["jaccard"] >= DEFAULT_JACCARD_THRESHOLD:
			
 
				-                local_match = True
			
 
				-                signal_name, signal_value = "jaccard", sigs["jaccard"]
			
 
				-            elif (
			
 
				-                NEWS_EMBEDDINGS_ENABLED
			
 
				-                and sigs["cosine"] >= 0.80
			
 
				-                and (sigs["jaccard"] >= 0.30 or sigs["title"] >= 0.55)
			
 
				-            ):
			
 
				-                local_match = True
			
 
				-                signal_name = "consensus"
			
 
				-                signal_value = (sigs["cosine"] + max(sigs["jaccard"], sigs["title"])) / 2.0
			
 
				-            else:
			
 
				-                signal_name, signal_value = "none", max(sigs["title"], sigs["jaccard"], sigs["cosine"])
			
 
				-
			
 
				-            if local_match and signal_value > best_signal_value:
			
 
				+            matched, signal_name, signal_value = _is_match(
			
 
				+                sigs,
			
 
				+                embeddings_enabled=NEWS_EMBEDDINGS_ENABLED,
			
 
				+                title_threshold=title_threshold,
			
 
				+            )
			
 
				+            if matched and signal_value > best_signal_value:
			
 
				                 best_idx = idx
			
 
				                 best_signal_name = signal_name
			
 
				                 best_signal_value = signal_value
			
@@ -422,6 +447,11 @@ def dedup_and_cluster_articles(
 
				             if a.get("source") and a["source"] not in c["sources"]:
			
 
				                 c["sources"].append(a["source"])
			
 
				             c["last_updated"] = max(str(c.get("last_updated", "")), str(a.get("timestamp", "")))
			
 
				+            # Update cluster embedding to the new article's embedding so later
			
 
				+            # comparisons can match against the most recently added content.
			
 
				+            if NEWS_EMBEDDINGS_ENABLED and article_embedding is not None:
			
 
				+                c["embedding"] = article_embedding
			
 
				+                c["embedding_model"] = "ollama:nomic-embed-text"
			
 
				             c.setdefault("_merge_signals", []).append(
			
 
				                 {"signal": best_signal_name, "value": round(best_signal_value, 3)}
			
 
				             )
			
--- a/news_mcp/jobs/poller.py
+++ b/news_mcp/jobs/poller.py
@@ -23,7 +23,7 @@ from news_mcp.config import (
 
				     NEWS_CLUSTER_MAX_AGE_HOURS,
			
 
				     llm_concurrency,
			
 
				 )
			
 
				-from news_mcp.dedup.cluster import dedup_and_cluster_articles
			
 
				+from news_mcp.dedup.cluster import dedup_and_cluster_articles, _cluster_is_within_age_window
			
 
				 from news_mcp.enrichment.enrich import enrich_cluster
			
 
				 from news_mcp.enrichment.llm_enrich import classify_cluster_llm
			
 
				 from news_mcp.sources.news_feeds import fetch_news_articles
			
@@ -149,29 +149,8 @@ async def _enrich_topic_clusters(
 
				 
			
 
				 
			
 
				 def _cluster_age_ok(cluster: dict, max_age_hours: float) -> bool:
			
 
				-    """Check whether a cluster's last_updated is within the merge window."""
			
 
				-    if max_age_hours <= 0:
			
 
				-        return True
			
 
				-    ts_str = cluster.get("last_updated") or cluster.get("timestamp") or ""
			
 
				-    if not ts_str:
			
 
				-        return True
			
 
				-    try:
			
 
				-        s = str(ts_str).replace("Z", "+00:00")
			
 
				-        dt = datetime.fromisoformat(s)
			
 
				-        if dt.tzinfo is None:
			
 
				-            dt = dt.replace(tzinfo=timezone.utc)
			
 
				-        dt = dt.astimezone(timezone.utc)
			
 
				-    except Exception:
			
 
				-        try:
			
 
				-            from email.utils import parsedate_to_datetime
			
 
				-            dt = parsedate_to_datetime(str(ts_str))
			
 
				-            if dt.tzinfo is None:
			
 
				-                dt = dt.replace(tzinfo=timezone.utc)
			
 
				-            dt = dt.astimezone(timezone.utc)
			
 
				-        except Exception:
			
 
				-            return True
			
 
				-    cutoff = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
			
 
				-    return dt >= cutoff
			
 
				+    """Deprecated alias — use _cluster_is_within_age_window from cluster.py."""
			
 
				+    return _cluster_is_within_age_window(cluster, max_age_hours=max_age_hours)
			
 
				 
			
 
				 
			
 
				 async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None: