Ver código fonte

fix: cross-topic dedup pass to merge duplicate clusters from different feeds

Problem: same event from different feeds gets different article keys,
lands in separate clusters with different stable IDs. Orphan merge
only catches shared article keys, so these duplicates persist.

Fix: added _merge_duplicate_clusters() — a cross-topic post-clustering
pass that merges clusters with title_similarity >= 0.90 AND at least
one shared entity. Uses Union-Find for transitive merging. Merges
articles, sources, entities, keywords, and the first_seen/last_updated
timestamps, then recomputes the stable ID.

Runs after per-topic orphan merge, before enrichment. O(n^2) in
cluster count per cycle (typically <100), negligible cost.
Lukas Goldschmidt 1 semana atrás
pai
commit
7981f483f0
1 arquivos alterados com 135 adições e 0 exclusões
  1. 135 0
      news_mcp/dedup/cluster.py

+ 135 - 0
news_mcp/dedup/cluster.py

@@ -493,9 +493,144 @@ def dedup_and_cluster_articles(
             c["cluster_id"] = _stable_cluster_id(topic, c.get("articles", []) or [])
         by_topic[topic] = clusters
 
+    # Cross-topic dedup: merge clusters with overlapping headlines and entities
+    by_topic = _merge_duplicate_clusters(by_topic)
+
     # Strip the internal merge audit trail before returning
     for clusters in by_topic.values():
         for c in clusters:
             c.pop("_merge_signals", None)
 
     return {topic: clusters for topic, clusters in by_topic.items()}
+
def _merge_duplicate_clusters(
    by_topic: Dict[str, List[Dict[str, Any]]],
) -> Dict[str, List[Dict[str, Any]]]:
    """Merge clusters, across all topics, that describe the same event.

    Catches the case where the same event arrives from different feeds with
    different article keys and therefore lands in separate clusters with
    different stable IDs.

    Two clusters are linked when their headlines score at least 0.90 with
    ``_title_similarity`` AND they share at least one entity (entities are
    compared stripped and lower-cased). Linking is transitive (union-find):
    if A matches B and B matches C, all three end up in one cluster. The
    criteria are intentionally conservative to avoid merging distinct events.

    Within each linked group, the cluster with the most sources (ties broken
    by article count) is kept as the base. Articles, sources, entities and
    keywords of the other members are appended to it without duplicates,
    ``first_seen`` / ``last_updated`` are widened to cover every member, and
    ``cluster_id`` is recomputed from the merged article list.

    Args:
        by_topic: Mapping of topic name to that topic's list of cluster
            dicts. Keys read from each cluster: ``headline``, ``entities``,
            ``sources``, ``articles``, ``keywords``, ``first_seen`` and
            ``last_updated``.

    Returns:
        A mapping of topic name to cluster list. Every topic present in
        ``by_topic`` is kept as a key, even when all of its clusters were
        absorbed into a cluster filed under another topic. With zero or one
        cluster in total, ``by_topic`` itself is returned unchanged.

    Note:
        Base cluster dicts are updated in place; absorbed clusters are
        dropped from the result but are not themselves modified.
    """
    # Flatten to (topic, cluster) pairs so clusters can be compared across topics.
    flat: list[tuple[str, dict]] = [
        (topic, cluster)
        for topic, clusters in by_topic.items()
        for cluster in clusters
    ]

    n = len(flat)
    if n <= 1:
        return by_topic

    # Union-find over cluster indices, with path halving in find().
    parent = list(range(n))

    def find(x: int) -> int:
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    def union(a: int, b: int) -> None:
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[ra] = rb

    # Pre-extract normalized entity sets and headlines once per cluster.
    entity_sets: list[set[str]] = []
    headlines: list[str] = []
    for _, cluster in flat:
        entity_sets.append({
            str(e).strip().lower()
            for e in (cluster.get("entities", []) or [])
            if str(e).strip()
        })
        headlines.append(str(cluster.get("headline", "") or ""))

    # Pairwise comparison: O(n^2) in the number of clusters, not articles.
    TITLE_THRESHOLD = 0.90
    for i in range(n):
        # A cluster with no entities or a blank headline can never satisfy
        # the merge criteria. The blank-headline guard also avoids linking
        # two headline-less clusters should the similarity helper score
        # empty-vs-empty as identical (helper not visible here).
        if not entity_sets[i] or not headlines[i].strip():
            continue
        for j in range(i + 1, n):
            # Cheap checks first: shared entity, non-blank headline, and not
            # already linked through an earlier pair.
            if entity_sets[i].isdisjoint(entity_sets[j]):
                continue
            if not headlines[j].strip():
                continue
            if find(i) == find(j):
                continue
            # Headlines must be near-identical, not merely related.
            if _title_similarity(headlines[i], headlines[j]) < TITLE_THRESHOLD:
                continue
            union(i, j)

    # Group cluster indices by their union-find root.
    components: dict[int, list[int]] = {}
    for idx in range(n):
        components.setdefault(find(idx), []).append(idx)

    # Seed with every input topic so a topic whose clusters were all
    # absorbed elsewhere (or that was empty) keeps its key in the result.
    merged_by_topic: Dict[str, List[Dict[str, Any]]] = {
        topic: [] for topic in by_topic
    }

    for members in components.values():
        # Base cluster: most sources, then most articles.
        best_idx = max(members, key=lambda i: (
            len(flat[i][1].get("sources", []) or []),
            len(flat[i][1].get("articles", []) or []),
        ))
        base_topic, base = flat[best_idx]

        if len(members) == 1:
            merged_by_topic[base_topic].append(base)
            continue

        # Start from copies of the base's lists, then fold the others in.
        all_articles: list[dict] = list(base.get("articles", []) or [])
        all_sources: list[str] = list(base.get("sources", []) or [])
        all_entities: list[str] = list(base.get("entities", []) or [])
        all_keywords: list[str] = list(base.get("keywords", []) or [])
        first_seen = base.get("first_seen", "")
        last_updated = base.get("last_updated", "")
        seen_article_keys = {_article_key(a) for a in all_articles}
        seen_entities = {str(e).strip().lower() for e in all_entities}
        seen_keywords = {str(k).strip().lower() for k in all_keywords}

        for m_idx in members:
            if m_idx == best_idx:
                continue
            other = flat[m_idx][1]

            # Articles: dedup by article key.
            for article in other.get("articles", []) or []:
                key = _article_key(article)
                if key not in seen_article_keys:
                    all_articles.append(article)
                    seen_article_keys.add(key)

            # Sources: exact-match dedup, order preserved.
            for source in other.get("sources", []) or []:
                if source not in all_sources:
                    all_sources.append(source)

            # Entities and keywords: case-insensitive dedup, keeping the
            # first spelling encountered.
            for entity in other.get("entities", []) or []:
                norm = str(entity).strip().lower()
                if norm not in seen_entities:
                    all_entities.append(entity)
                    seen_entities.add(norm)
            for keyword in other.get("keywords", []) or []:
                norm = str(keyword).strip().lower()
                if norm not in seen_keywords:
                    all_keywords.append(keyword)
                    seen_keywords.add(norm)

            # Timestamps: keep the earliest first_seen and latest
            # last_updated. NOTE(review): relies on plain </> comparison,
            # i.e. assumes the values order chronologically (e.g. ISO-8601
            # strings in one timezone) — confirm against the producer.
            fs = other.get("first_seen", "")
            if fs and (not first_seen or fs < first_seen):
                first_seen = fs
            lu = other.get("last_updated", "")
            if lu and (not last_updated or lu > last_updated):
                last_updated = lu

        base["articles"] = all_articles
        base["sources"] = all_sources
        base["entities"] = all_entities
        base["keywords"] = all_keywords
        base["first_seen"] = first_seen
        base["last_updated"] = last_updated
        # Derive the ID from the topic bucket the cluster is filed under,
        # the same way the caller does, rather than from an optional
        # "topic" field on the cluster dict.
        base["cluster_id"] = _stable_cluster_id(base_topic, all_articles)
        merged_by_topic[base_topic].append(base)

    return merged_by_topic