Forráskód Böngészése

fix: keep cluster_id stable across cycles — never recompute after creation

Lukas Goldschmidt 1 hete
szülő
commit
fab4b5ec31
2 módosított fájl, 17 hozzáadás és 12 törlés
  1. 12 4
      news_mcp/dedup/cluster.py
  2. 5 8
      test_news_mcp.py

+ 12 - 4
news_mcp/dedup/cluster.py

@@ -362,7 +362,9 @@ def _merge_orphan_clusters(
         base["sources"] = all_sources
         base["first_seen"] = first_seen
         base["last_updated"] = last_updated
-        base["cluster_id"] = _stable_cluster_id(base.get("topic", "other"), all_articles)
+        # Keep base's existing ID so the enrichment cache (keyed by cluster_id)
+        # survives the merge; only a missing "cluster_id" key gets a new one.
+        base.setdefault("cluster_id", _stable_cluster_id(base.get("topic", "other"), all_articles))
         merged.append(base)
 
     return merged
@@ -488,9 +490,13 @@ def dedup_and_cluster_articles(
     for topic, clusters in by_topic.items():
         # Merge orphans (clusters that share articles)
         clusters = _merge_orphan_clusters(clusters)
-        # Recompute stable IDs from the final article sets
+        # Assign stable IDs only to clusters that don't already have one.
+        # Pre-seeded clusters from the DB carry their original cluster_id —
+        # keeping it stable across cycles so the enrichment cache (keyed by
+        # cluster_id) continues to work even after new articles are merged in.
         for c in clusters:
-            c["cluster_id"] = _stable_cluster_id(topic, c.get("articles", []) or [])
+            if not c.get("cluster_id"):
+                c["cluster_id"] = _stable_cluster_id(topic, c.get("articles", []) or [])
         by_topic[topic] = clusters
 
     # Cross-topic dedup: merge clusters with overlapping headlines and entities
@@ -630,7 +636,9 @@ def _merge_duplicate_clusters(
         base["keywords"] = all_keywords
         base["first_seen"] = first_seen
         base["last_updated"] = last_updated
-        base["cluster_id"] = _stable_cluster_id(base.get("topic", "other"), all_articles)
+        # Keep base's existing ID so the enrichment cache (keyed by cluster_id)
+        # survives the merge; only a missing "cluster_id" key gets a new one.
+        base.setdefault("cluster_id", _stable_cluster_id(base.get("topic", "other"), all_articles))
         merged_by_topic.setdefault(base_topic, []).append(base)
 
     return merged_by_topic

+ 5 - 8
test_news_mcp.py

@@ -964,14 +964,11 @@ def test_cross_cycle_merge_topic_mismatch():
         f"{[c['headline'] for c in all_clusters]}"
     )
 
-    # The surviving cluster must carry the *same* cluster_id regardless of
-    # which topic wins, i.e. cluster_id is now purely article-key based.
-    from news_mcp.dedup.cluster import _stable_cluster_id
-    expected_cid = _stable_cluster_id(
-        "other",
-        [{"title": "Hegseth says US will keep pressure on Iran", "url": url}],
-    )
-    assert all_clusters[0]["cluster_id"] == expected_cid
+    # The surviving cluster must carry the *same* cluster_id from the
+    # pre-seeded DB cluster, even after absorbing new articles.
+    # cluster_id is set once at creation and never recomputed, so the
+    # enrichment cache (keyed by cluster_id) survives across cycles.
+    assert all_clusters[0]["cluster_id"] == "old-id"
 
     # The existing article must still be in the merged cluster.
     article_urls = [a["url"] for a in all_clusters[0]["articles"]]