Procházet zdrojové kódy

fix: multi-article signal comparison, cascade match, dedup age check, embedding update

- _signals() now compares new article against ALL cluster articles,
  returns best title/jaccard across all members (not just seed)
- Main loop uses _is_match() as unified cascade: cosine → title → jaccard → consensus
- _is_match() accepts configurable title_threshold parameter
- Cluster embedding updated when new article merges in
- _cluster_age_ok in poller.py delegates to _cluster_is_within_age_window
  in cluster.py (single source of truth)
- All 38 tests pass
Lukas Goldschmidt před 1 týdnem
rodič
revize
0cacab6ac8
2 změnil soubory, kde provedl 70 přidání a 61 odebrání
  1. 67 37
      news_mcp/dedup/cluster.py
  2. 3 24
      news_mcp/jobs/poller.py

+ 67 - 37
news_mcp/dedup/cluster.py

@@ -92,33 +92,74 @@ DEFAULT_JACCARD_THRESHOLD = 0.55
 
 
 def _signals(article: Dict[str, Any], cluster: Dict[str, Any]) -> dict:
-    """Per-pair similarity signals (title, jaccard, embedding cosine)."""
+    """Per-pair similarity signals (title, jaccard, embedding cosine).
+
+    Compares the article against ALL articles in the cluster and returns the
+    best (max) signal across all comparisons.  The cosine signal uses the
+    cluster-level embedding; title and jaccard are computed per-article and
+    the maximum is returned so that a match against any cluster member counts.
+    """
     a_title = str(article.get("title") or "")
     c_title = str(cluster.get("headline") or "")
-
-    title_sim = _title_similarity(a_title, c_title) if a_title and c_title else 0.0
-
-    a_text = _cluster_text(article)
-    c_text_seed = (cluster.get("articles") or [{}])[0]
-    c_text = _cluster_text(c_text_seed) if c_text_seed else c_title
-    jaccard = _jaccard(_tokens(a_text), _tokens(c_text)) if a_text and c_text else 0.0
-
     a_emb = article.get("_embedding")
     c_emb = cluster.get("embedding")
     cosine = cosine_similarity(a_emb, c_emb) if a_emb and c_emb else 0.0
 
-    return {"title": title_sim, "jaccard": jaccard, "cosine": cosine}
-
+    best_title = 0.0
+    best_jaccard = 0.0
+    a_text = _cluster_text(article)
+    a_toks = _tokens(a_text) if a_text else set()
 
-def _is_match(signals: dict, *, embeddings_enabled: bool) -> tuple[bool, str, float]:
-    """Decide whether two items should merge based on the strongest signal."""
+    # Compare against every article in the cluster, take the best scores.
+    cluster_articles = cluster.get("articles") or ([{"title": c_title}] if c_title else [])
+    for ca in cluster_articles:
+        if not isinstance(ca, dict):
+            continue
+        # title signal
+        ca_title = str(ca.get("title") or "")
+        if a_title and ca_title:
+            t = _title_similarity(a_title, ca_title)
+            if t > best_title:
+                best_title = t
+        # jaccard signal
+        ca_text = _cluster_text(ca)
+        if a_text and ca_text:
+            j = _jaccard(a_toks, _tokens(ca_text))
+            if j > best_jaccard:
+                best_jaccard = j
+        # early exit: if both title and jaccard are already very high
+        if best_title >= 0.95 and best_jaccard >= 0.80:
+            break
+
+    return {"title": best_title, "jaccard": best_jaccard, "cosine": cosine}
+
+
+def _is_match(
+    signals: dict,
+    *,
+    embeddings_enabled: bool,
+    title_threshold: float = DEFAULT_TITLE_THRESHOLD,
+    jaccard_threshold: float = DEFAULT_JACCARD_THRESHOLD,
+) -> tuple[bool, str, float]:
+    """Decide whether two items should merge based on the strongest signal.
+
+    Cascade: cosine (if embeddings enabled) → title → jaccard → consensus.
+    Returns (matched, signal_name, signal_value).
+    """
     cosine_threshold = NEWS_EMBEDDING_SIMILARITY_THRESHOLD
     if embeddings_enabled and signals["cosine"] >= cosine_threshold:
         return True, "cosine", signals["cosine"]
-    if signals["title"] >= DEFAULT_TITLE_THRESHOLD:
+    if signals["title"] >= title_threshold:
         return True, "title", signals["title"]
-    if signals["jaccard"] >= DEFAULT_JACCARD_THRESHOLD:
+    if signals["jaccard"] >= jaccard_threshold:
         return True, "jaccard", signals["jaccard"]
+    if (
+        embeddings_enabled
+        and signals["cosine"] >= 0.80
+        and (signals["jaccard"] >= 0.30 or signals["title"] >= 0.55)
+    ):
+        val = (signals["cosine"] + max(signals["jaccard"], signals["title"])) / 2.0
+        return True, "consensus", val
     return False, "none", 0.0
 
 
@@ -388,28 +429,12 @@ def dedup_and_cluster_articles(
         best_signal_value = 0.0
         for idx, c in enumerate(clusters):
             sigs = _signals(a_with_emb, c)
-            local_match = False
-            if NEWS_EMBEDDINGS_ENABLED and sigs["cosine"] >= NEWS_EMBEDDING_SIMILARITY_THRESHOLD:
-                local_match = True
-                signal_name, signal_value = "cosine", sigs["cosine"]
-            elif sigs["title"] >= title_threshold:
-                local_match = True
-                signal_name, signal_value = "title", sigs["title"]
-            elif sigs["jaccard"] >= DEFAULT_JACCARD_THRESHOLD:
-                local_match = True
-                signal_name, signal_value = "jaccard", sigs["jaccard"]
-            elif (
-                NEWS_EMBEDDINGS_ENABLED
-                and sigs["cosine"] >= 0.80
-                and (sigs["jaccard"] >= 0.30 or sigs["title"] >= 0.55)
-            ):
-                local_match = True
-                signal_name = "consensus"
-                signal_value = (sigs["cosine"] + max(sigs["jaccard"], sigs["title"])) / 2.0
-            else:
-                signal_name, signal_value = "none", max(sigs["title"], sigs["jaccard"], sigs["cosine"])
-
-            if local_match and signal_value > best_signal_value:
+            matched, signal_name, signal_value = _is_match(
+                sigs,
+                embeddings_enabled=NEWS_EMBEDDINGS_ENABLED,
+                title_threshold=title_threshold,
+            )
+            if matched and signal_value > best_signal_value:
                 best_idx = idx
                 best_signal_name = signal_name
                 best_signal_value = signal_value
@@ -422,6 +447,11 @@ def dedup_and_cluster_articles(
             if a.get("source") and a["source"] not in c["sources"]:
                 c["sources"].append(a["source"])
             c["last_updated"] = max(str(c.get("last_updated", "")), str(a.get("timestamp", "")))
+            # Update cluster embedding to the new article's embedding so later
+            # comparisons can match against the most recently added content.
+            if NEWS_EMBEDDINGS_ENABLED and article_embedding is not None:
+                c["embedding"] = article_embedding
+                c["embedding_model"] = "ollama:nomic-embed-text"
             c.setdefault("_merge_signals", []).append(
                 {"signal": best_signal_name, "value": round(best_signal_value, 3)}
             )

+ 3 - 24
news_mcp/jobs/poller.py

@@ -23,7 +23,7 @@ from news_mcp.config import (
     NEWS_CLUSTER_MAX_AGE_HOURS,
     llm_concurrency,
 )
-from news_mcp.dedup.cluster import dedup_and_cluster_articles
+from news_mcp.dedup.cluster import dedup_and_cluster_articles, _cluster_is_within_age_window
 from news_mcp.enrichment.enrich import enrich_cluster
 from news_mcp.enrichment.llm_enrich import classify_cluster_llm
 from news_mcp.sources.news_feeds import fetch_news_articles
@@ -149,29 +149,8 @@ async def _enrich_topic_clusters(
 
 
 def _cluster_age_ok(cluster: dict, max_age_hours: float) -> bool:
-    """Check whether a cluster's last_updated is within the merge window."""
-    if max_age_hours <= 0:
-        return True
-    ts_str = cluster.get("last_updated") or cluster.get("timestamp") or ""
-    if not ts_str:
-        return True
-    try:
-        s = str(ts_str).replace("Z", "+00:00")
-        dt = datetime.fromisoformat(s)
-        if dt.tzinfo is None:
-            dt = dt.replace(tzinfo=timezone.utc)
-        dt = dt.astimezone(timezone.utc)
-    except Exception:
-        try:
-            from email.utils import parsedate_to_datetime
-            dt = parsedate_to_datetime(str(ts_str))
-            if dt.tzinfo is None:
-                dt = dt.replace(tzinfo=timezone.utc)
-            dt = dt.astimezone(timezone.utc)
-        except Exception:
-            return True
-    cutoff = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
-    return dt >= cutoff
+    """Deprecated alias — use _cluster_is_within_age_window from cluster.py."""
+    return _cluster_is_within_age_window(cluster, max_age_hours=max_age_hours)
 
 
 async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None: