|
|
@@ -362,7 +362,9 @@ def _merge_orphan_clusters(
|
|
|
base["sources"] = all_sources
|
|
|
base["first_seen"] = first_seen
|
|
|
base["last_updated"] = last_updated
|
|
|
- base["cluster_id"] = _stable_cluster_id(base.get("topic", "other"), all_articles)
|
|
|
+ # Keep the base cluster's original ID so the enrichment cache
|
|
|
+ # (keyed by cluster_id) survives the merge.
|
|
|
+ base.setdefault("cluster_id", _stable_cluster_id(base.get("topic", "other"), all_articles))
|
|
|
merged.append(base)
|
|
|
|
|
|
return merged
|
|
|
@@ -488,9 +490,13 @@ def dedup_and_cluster_articles(
|
|
|
for topic, clusters in by_topic.items():
|
|
|
# Merge orphans (clusters that share articles)
|
|
|
clusters = _merge_orphan_clusters(clusters)
|
|
|
- # Recompute stable IDs from the final article sets
|
|
|
+ # Assign stable IDs only to clusters that don't already have one.
|
|
|
+ # Pre-seeded clusters from the DB carry their original cluster_id —
|
|
|
+ # keeping it stable across cycles so the enrichment cache (keyed by
|
|
|
+ # cluster_id) continues to work even after new articles are merged in.
|
|
|
for c in clusters:
|
|
|
- c["cluster_id"] = _stable_cluster_id(topic, c.get("articles", []) or [])
|
|
|
+ if not c.get("cluster_id"):
|
|
|
+ c["cluster_id"] = _stable_cluster_id(topic, c.get("articles", []) or [])
|
|
|
by_topic[topic] = clusters
|
|
|
|
|
|
# Cross-topic dedup: merge clusters with overlapping headlines and entities
|
|
|
@@ -630,7 +636,9 @@ def _merge_duplicate_clusters(
|
|
|
base["keywords"] = all_keywords
|
|
|
base["first_seen"] = first_seen
|
|
|
base["last_updated"] = last_updated
|
|
|
- base["cluster_id"] = _stable_cluster_id(base.get("topic", "other"), all_articles)
|
|
|
+ # Keep the base cluster's original ID so the enrichment cache
|
|
|
+ # (keyed by cluster_id) survives the merge.
|
|
|
+ base.setdefault("cluster_id", _stable_cluster_id(base.get("topic", "other"), all_articles))
|
|
|
merged_by_topic.setdefault(base_topic, []).append(base)
|
|
|
|
|
|
return merged_by_topic
|