1 Minggu lalu · 0cacab6ac8
--- a/news_mcp/dedup/cluster.py
+++ b/news_mcp/dedup/cluster.py
@@ -92,33 +92,74 @@ DEFAULT_JACCARD_THRESHOLD = 0.55
 
															 def _signals(article: Dict[str, Any], cluster: Dict[str, Any]) -> dict:
														
 
															-    """Per-pair similarity signals (title, jaccard, embedding cosine)."""
														
 
															+    """Per-pair similarity signals (title, jaccard, embedding cosine).
														
 
															+
														
 
															+    Compares the article against ALL articles in the cluster and returns the
														
 
															+    best (max) signal across all comparisons.  The cosine signal uses the
														
 
															+    cluster-level embedding; title and jaccard are computed per-article and
														
 
															+    the maximum is returned so that a match against any cluster member counts.
														
 
															+    """
														
 
															     a_title = str(article.get("title") or "")
														
 
															     c_title = str(cluster.get("headline") or "")
														
 
															-
														
 
															-    title_sim = _title_similarity(a_title, c_title) if a_title and c_title else 0.0
														
 
															-
														
 
															-    a_text = _cluster_text(article)
														
 
															-    c_text_seed = (cluster.get("articles") or [{}])[0]
														
 
															-    c_text = _cluster_text(c_text_seed) if c_text_seed else c_title
														
 
															-    jaccard = _jaccard(_tokens(a_text), _tokens(c_text)) if a_text and c_text else 0.0
														
 
															-
														
 
															     a_emb = article.get("_embedding")
														
 
															     c_emb = cluster.get("embedding")
														
 
															     cosine = cosine_similarity(a_emb, c_emb) if a_emb and c_emb else 0.0
														
 
															-    return {"title": title_sim, "jaccard": jaccard, "cosine": cosine}
														
 
															-
														
 
															+    best_title = 0.0
														
 
															+    best_jaccard = 0.0
														
 
															+    a_text = _cluster_text(article)
														
 
															+    a_toks = _tokens(a_text) if a_text else set()
														
 
															-def _is_match(signals: dict, *, embeddings_enabled: bool) -> tuple[bool, str, float]:
														
 
															-    """Decide whether two items should merge based on the strongest signal."""
														
 
															+    # Compare against every article in the cluster, take the best scores.
														
 
															+    cluster_articles = cluster.get("articles") or ([{"title": c_title}] if c_title else [])
														
 
															+    for ca in cluster_articles:
														
 
															+        if not isinstance(ca, dict):
														
 
															+            continue
														
 
															+        # title signal
														
 
															+        ca_title = str(ca.get("title") or "")
														
 
															+        if a_title and ca_title:
														
 
															+            t = _title_similarity(a_title, ca_title)
														
 
															+            if t > best_title:
														
 
															+                best_title = t
														
 
															+        # jaccard signal
														
 
															+        ca_text = _cluster_text(ca)
														
 
															+        if a_text and ca_text:
														
 
															+            j = _jaccard(a_toks, _tokens(ca_text))
														
 
															+            if j > best_jaccard:
														
 
															+                best_jaccard = j
														
 
															+        # early exit: if both title and jaccard are already very high
														
 
															+        if best_title >= 0.95 and best_jaccard >= 0.80:
														
 
															+            break
														
 
															+
														
 
															+    return {"title": best_title, "jaccard": best_jaccard, "cosine": cosine}
														
 
															+
														
 
															+
														
 
															+def _is_match(
														
 
															+    signals: dict,
														
 
															+    *,
														
 
															+    embeddings_enabled: bool,
														
 
															+    title_threshold: float = DEFAULT_TITLE_THRESHOLD,
														
 
															+    jaccard_threshold: float = DEFAULT_JACCARD_THRESHOLD,
														
 
															+) -> tuple[bool, str, float]:
														
 
															+    """Decide whether two items should merge based on the strongest signal.
														
 
															+
														
 
															+    Cascade: cosine (if embeddings enabled) → title → jaccard → consensus.
														
 
															+    Returns (matched, signal_name, signal_value).
														
 
															+    """
														
 
															     cosine_threshold = NEWS_EMBEDDING_SIMILARITY_THRESHOLD
														
 
															     if embeddings_enabled and signals["cosine"] >= cosine_threshold:
														
 
															         return True, "cosine", signals["cosine"]
														
 
															-    if signals["title"] >= DEFAULT_TITLE_THRESHOLD:
														
 
															+    if signals["title"] >= title_threshold:
														
 
															         return True, "title", signals["title"]
														
 
															-    if signals["jaccard"] >= DEFAULT_JACCARD_THRESHOLD:
														
 
															+    if signals["jaccard"] >= jaccard_threshold:
														
 
															         return True, "jaccard", signals["jaccard"]
														
 
															+    if (
														
 
															+        embeddings_enabled
														
 
															+        and signals["cosine"] >= 0.80
														
 
															+        and (signals["jaccard"] >= 0.30 or signals["title"] >= 0.55)
														
 
															+    ):
														
 
															+        val = (signals["cosine"] + max(signals["jaccard"], signals["title"])) / 2.0
														
 
															+        return True, "consensus", val
														
 
															     return False, "none", 0.0
														
@@ -388,28 +429,12 @@ def dedup_and_cluster_articles(
 
															         best_signal_value = 0.0
														
 
															         for idx, c in enumerate(clusters):
														
 
															             sigs = _signals(a_with_emb, c)
														
 
															-            local_match = False
														
 
															-            if NEWS_EMBEDDINGS_ENABLED and sigs["cosine"] >= NEWS_EMBEDDING_SIMILARITY_THRESHOLD:
														
 
															-                local_match = True
														
 
															-                signal_name, signal_value = "cosine", sigs["cosine"]
														
 
															-            elif sigs["title"] >= title_threshold:
														
 
															-                local_match = True
														
 
															-                signal_name, signal_value = "title", sigs["title"]
														
 
															-            elif sigs["jaccard"] >= DEFAULT_JACCARD_THRESHOLD:
														
 
															-                local_match = True
														
 
															-                signal_name, signal_value = "jaccard", sigs["jaccard"]
														
 
															-            elif (
														
 
															-                NEWS_EMBEDDINGS_ENABLED
														
 
															-                and sigs["cosine"] >= 0.80
														
 
															-                and (sigs["jaccard"] >= 0.30 or sigs["title"] >= 0.55)
														
 
															-            ):
														
 
															-                local_match = True
														
 
															-                signal_name = "consensus"
														
 
															-                signal_value = (sigs["cosine"] + max(sigs["jaccard"], sigs["title"])) / 2.0
														
 
															-            else:
														
 
															-                signal_name, signal_value = "none", max(sigs["title"], sigs["jaccard"], sigs["cosine"])
														
 
															-
														
 
															-            if local_match and signal_value > best_signal_value:
														
 
															+            matched, signal_name, signal_value = _is_match(
														
 
															+                sigs,
														
 
															+                embeddings_enabled=NEWS_EMBEDDINGS_ENABLED,
														
 
															+                title_threshold=title_threshold,
														
 
															+            )
														
 
															+            if matched and signal_value > best_signal_value:
														
 
															                 best_idx = idx
														
 
															                 best_signal_name = signal_name
														
 
															                 best_signal_value = signal_value
														
@@ -422,6 +447,11 @@ def dedup_and_cluster_articles(
 
															             if a.get("source") and a["source"] not in c["sources"]:
														
 
															                 c["sources"].append(a["source"])
														
 
															             c["last_updated"] = max(str(c.get("last_updated", "")), str(a.get("timestamp", "")))
														
 
															+            # Update cluster embedding to the new article's embedding so later
														
 
															+            # comparisons can match against the most recently added content.
														
 
															+            if NEWS_EMBEDDINGS_ENABLED and article_embedding is not None:
														
 
															+                c["embedding"] = article_embedding
														
 
															+                c["embedding_model"] = "ollama:nomic-embed-text"
														
 
															             c.setdefault("_merge_signals", []).append(
														
 
															                 {"signal": best_signal_name, "value": round(best_signal_value, 3)}
														
 
															             )
														
--- a/news_mcp/jobs/poller.py
+++ b/news_mcp/jobs/poller.py
@@ -23,7 +23,7 @@ from news_mcp.config import (
 
															     NEWS_CLUSTER_MAX_AGE_HOURS,
														
 
															     llm_concurrency,
														
 
															 )
														
 
															-from news_mcp.dedup.cluster import dedup_and_cluster_articles
														
 
															+from news_mcp.dedup.cluster import dedup_and_cluster_articles, _cluster_is_within_age_window
														
 
															 from news_mcp.enrichment.enrich import enrich_cluster
														
 
															 from news_mcp.enrichment.llm_enrich import classify_cluster_llm
														
 
															 from news_mcp.sources.news_feeds import fetch_news_articles
														
@@ -149,29 +149,8 @@ async def _enrich_topic_clusters(
 
															 def _cluster_age_ok(cluster: dict, max_age_hours: float) -> bool:
														
 
															-    """Check whether a cluster's last_updated is within the merge window."""
														
 
															-    if max_age_hours <= 0:
														
 
															-        return True
														
 
															-    ts_str = cluster.get("last_updated") or cluster.get("timestamp") or ""
														
 
															-    if not ts_str:
														
 
															-        return True
														
 
															-    try:
														
 
															-        s = str(ts_str).replace("Z", "+00:00")
														
 
															-        dt = datetime.fromisoformat(s)
														
 
															-        if dt.tzinfo is None:
														
 
															-            dt = dt.replace(tzinfo=timezone.utc)
														
 
															-        dt = dt.astimezone(timezone.utc)
														
 
															-    except Exception:
														
 
															-        try:
														
 
															-            from email.utils import parsedate_to_datetime
														
 
															-            dt = parsedate_to_datetime(str(ts_str))
														
 
															-            if dt.tzinfo is None:
														
 
															-                dt = dt.replace(tzinfo=timezone.utc)
														
 
															-            dt = dt.astimezone(timezone.utc)
														
 
															-        except Exception:
														
 
															-            return True
														
 
															-    cutoff = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
														
 
															-    return dt >= cutoff
														
 
															+    """Deprecated alias — use _cluster_is_within_age_window from cluster.py."""
														
 
															+    return _cluster_is_within_age_window(cluster, max_age_hours=max_age_hours)
														
 
															 async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None: