пре 1 недеља · 7981f483f0
--- a/news_mcp/dedup/cluster.py
+++ b/news_mcp/dedup/cluster.py
@@ -493,9 +493,144 @@ def dedup_and_cluster_articles(
 
															             c["cluster_id"] = _stable_cluster_id(topic, c.get("articles", []) or [])
														
 
															         by_topic[topic] = clusters
														
 
															+    # Cross-topic dedup: merge clusters with overlapping headlines and entities
														
 
															+    by_topic = _merge_duplicate_clusters(by_topic)
														
 
															+
														
 
															     # Strip the internal merge audit trail before returning
														
 
															     for clusters in by_topic.values():
														
 
															         for c in clusters:
														
 
															             c.pop("_merge_signals", None)
														
 
															     return {topic: clusters for topic, clusters in by_topic.items()}
														
 
															+
														
 
															+def _merge_duplicate_clusters(
														
 
															+    by_topic: Dict[str, List[Dict[str, Any]]],
														
 
															+) -> Dict[str, List[Dict[str, Any]]]:
														
 
															+    """Cross-topic dedup: merge clusters whose headlines and entities overlap.
														
 
															+
														
 
															+    Catches the case where the same event arrives from different feeds with
														
 
															+    different article keys, lands in separate clusters with different stable
														
 
															+    IDs, but has nearly identical headlines and shared entities.
														
 
															+
														
 
															+    Merge criteria: title_similarity >= 0.90 AND at least one shared entity.
														
 
															+    This is intentionally conservative to avoid merging distinct events.
														
 
															+    """
														
 
															+    # Flatten all clusters with their topic
														
 
															+    all_clusters: list[tuple[str, dict]] = []
														
 
															+    for topic, clusters in by_topic.items():
														
 
															+        for c in clusters:
														
 
															+            all_clusters.append((topic, c))
														
 
															+
														
 
															+    n = len(all_clusters)
														
 
															+    if n <= 1:
														
 
															+        return by_topic
														
 
															+
														
 
															+    # Union-Find
														
 
															+    parent = list(range(n))
														
 
															+
														
 
															+    def find(x: int) -> int:
														
 
															+        while parent[x] != x:
														
 
															+            parent[x] = parent[parent[x]]
														
 
															+            x = parent[x]
														
 
															+        return x
														
 
															+
														
 
															+    def union(a: int, b: int) -> None:
														
 
															+        ra, rb = find(a), find(b)
														
 
															+        if ra != rb:
														
 
															+            parent[ra] = rb
														
 
															+
														
 
															+    # Pre-extract normalized entity sets for each cluster
														
 
															+    cluster_ent_sets: list[set[str]] = []
														
 
															+    cluster_heads: list[str] = []
														
 
															+    for _, c in all_clusters:
														
 
															+        ents = {str(e).strip().lower() for e in (c.get("entities", []) or []) if str(e).strip()}
														
 
															+        cluster_ent_sets.append(ents)
														
 
															+        cluster_heads.append(str(c.get("headline", "") or ""))
														
 
															+
														
 
															+    # Compare pairs — O(n^2) but n is small (clusters per cycle, not articles)
														
 
															+    TITLE_THRESHOLD = 0.90
														
 
															+    for i in range(n):
														
 
															+        for j in range(i + 1, n):
														
 
															+            # Quick skip: if headlines are completely different, no need for entity check
														
 
															+            if _title_similarity(cluster_heads[i], cluster_heads[j]) < TITLE_THRESHOLD:
														
 
															+                continue
														
 
															+            # Check entity overlap (at least one shared entity)
														
 
															+            if not (cluster_ent_sets[i] & cluster_ent_sets[j]):
														
 
															+                continue
														
 
															+            union(i, j)
														
 
															+
														
 
															+    # Group by component
														
 
															+    components: dict[int, list[int]] = {}
														
 
															+    for idx in range(n):
														
 
															+        root = find(idx)
														
 
															+        components.setdefault(root, []).append(idx)
														
 
															+
														
 
															+    # Merge each component
														
 
															+    merged_by_topic: Dict[str, List[Dict[str, Any]]] = {}
														
 
															+    for root, members in components.items():
														
 
															+        # Pick the base cluster (the one with the most sources, then most articles)
														
 
															+        best_idx = max(members, key=lambda i: (
														
 
															+            len(all_clusters[i][1].get("sources", []) or []),
														
 
															+            len(all_clusters[i][1].get("articles", []) or []),
														
 
															+        ))
														
 
															+        base_topic, base = all_clusters[best_idx]
														
 
															+
														
 
															+        if len(members) == 1:
														
 
															+            merged_by_topic.setdefault(base_topic, []).append(base)
														
 
															+            continue
														
 
															+
														
 
															+        # Merge all clusters in this component into base
														
 
															+        all_articles: list[dict] = list(base.get("articles", []) or [])
														
 
															+        all_sources: list[str] = list(base.get("sources", []) or [])
														
 
															+        all_entities: list[str] = list(base.get("entities", []) or [])
														
 
															+        all_keywords: list[str] = list(base.get("keywords", []) or [])
														
 
															+        first_seen = base.get("first_seen", "")
														
 
															+        last_updated = base.get("last_updated", "")
														
 
															+        existing_article_keys = {_article_key(a) for a in all_articles}
														
 
															+        existing_ent_lower = {str(e).strip().lower() for e in all_entities}
														
 
															+        existing_kw_lower = {str(k).strip().lower() for k in all_keywords}
														
 
															+
														
 
															+        for m_idx in members:
														
 
															+            if m_idx == best_idx:
														
 
															+                continue
														
 
															+            other = all_clusters[m_idx][1]
														
 
															+            # Merge articles (dedup by key)
														
 
															+            for a in other.get("articles", []) or []:
														
 
															+                ak = _article_key(a)
														
 
															+                if ak not in existing_article_keys:
														
 
															+                    all_articles.append(a)
														
 
															+                    existing_article_keys.add(ak)
														
 
															+            # Merge sources
														
 
															+            for s in other.get("sources", []) or []:
														
 
															+                if s not in all_sources:
														
 
															+                    all_sources.append(s)
														
 
															+            # Merge entities (dedup case-insensitive)
														
 
															+            for e in other.get("entities", []) or []:
														
 
															+                el = str(e).strip().lower()
														
 
															+                if el not in existing_ent_lower:
														
 
															+                    all_entities.append(e)
														
 
															+                    existing_ent_lower.add(el)
														
 
															+            # Merge keywords (dedup case-insensitive)
														
 
															+            for k in other.get("keywords", []) or []:
														
 
															+                kl = str(k).strip().lower()
														
 
															+                if kl not in existing_kw_lower:
														
 
															+                    all_keywords.append(k)
														
 
															+                    existing_kw_lower.add(kl)
														
 
															+            # Timestamps
														
 
															+            fs = other.get("first_seen", "")
														
 
															+            if fs and (not first_seen or fs < first_seen):
														
 
															+                first_seen = fs
														
 
															+            lu = other.get("last_updated", "")
														
 
															+            if lu and (not last_updated or lu > last_updated):
														
 
															+                last_updated = lu
														
 
															+
														
 
															+        base["articles"] = all_articles
														
 
															+        base["sources"] = all_sources
														
 
															+        base["entities"] = all_entities
														
 
															+        base["keywords"] = all_keywords
														
 
															+        base["first_seen"] = first_seen
														
 
															+        base["last_updated"] = last_updated
														
 
															+        base["cluster_id"] = _stable_cluster_id(base.get("topic", "other"), all_articles)
														
 
															+        merged_by_topic.setdefault(base_topic, []).append(base)
														
 
															+
														
 
															+    return merged_by_topic