|
@@ -909,3 +909,83 @@ def test_preseed_merge_into_existing_cluster():
|
|
|
# Should have exactly 1 cluster (the existing one, now with 2 articles)
|
|
# Should have exactly 1 cluster (the existing one, now with 2 articles)
|
|
|
assert len(all_clusters) == 1, f"Expected 1 cluster, got {len(all_clusters)}: {[c['headline'] for c in all_clusters]}"
|
|
assert len(all_clusters) == 1, f"Expected 1 cluster, got {len(all_clusters)}: {[c['headline'] for c in all_clusters]}"
|
|
|
assert len(all_clusters[0]["articles"]) == 2
|
|
assert len(all_clusters[0]["articles"]) == 2
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
def test_cross_cycle_merge_topic_mismatch():
    """Regression: a re-polled article merges despite a topic mismatch.

    The same article shows up in two polling cycles. The stored cluster
    carries an enriched topic ("crypto") that differs from the heuristic
    topic of the incoming article. Before the fix, the topic was part of
    the cluster_id hash and existing clusters were bucketed by enriched
    topic, so the mismatch silently produced two DB rows instead of one.
    """
    from news_mcp.dedup import cluster as dc
    from news_mcp.dedup.cluster import _stable_cluster_id

    # Values shared by the stored cluster and the re-polled article.
    headline = "Hegseth says US will keep pressure on Iran"
    source_name = "Breaking The News"
    published = "Sat, 30 May 2026 13:00:00 GMT"
    url = (
        "https://breakingthenews.net/Article/"
        "Hegseth-says-US-will-keep-pressure-on-Iran/66401647"
    )

    stored_article = {
        "title": headline,
        "url": url,
        "source": source_name,
        "timestamp": published,
        "summary": "",
    }

    stored_cluster = {
        "cluster_id": "old-id",
        # Enriched topic from an earlier LLM pass -- deliberately *not*
        # what normalize_topic_from_title would give for this headline.
        "topic": "crypto",
        "headline": headline,
        "summary": "",
        "sources": [source_name],
        "timestamp": published,
        # Recent update time, measured against max_age_hours=4 below.
        "last_updated": datetime.now(timezone.utc).isoformat(),
        "first_seen": published,
        "articles": [stored_article],
        "entities": ["Pete Hegseth", "Iran"],
        "sentiment": "negative",
        "sentimentScore": -0.5,
        "importance": 0.1,
    }

    # Next polling cycle: the identical article comes in again. Its
    # heuristic topic (normalize_topic_from_title) is "other" because no
    # keyword matches, so it disagrees with the stored "crypto" topic.
    new_article = {
        **stored_article,
        # feed_url is used for per-feed hash tracking
        "feed_url": "https://breakingthenews.net/news-feed.xml",
        "importance": 0.11,
    }

    clustered = dc.dedup_and_cluster_articles(
        [new_article],
        existing_clusters=[stored_cluster],
        max_age_hours=4,
    )

    # Flatten the per-key cluster lists into one list.
    all_clusters = []
    for group in clustered.values():
        all_clusters.extend(group)

    # Exactly one cluster must remain: the incoming article folds into
    # the stored one. Prior to the fix there were two clusters with
    # different cluster_ids because the topic mismatch blocked matching.
    assert len(all_clusters) == 1, (
        f"Expected 1 cluster, got {len(all_clusters)}: "
        f"{[c['headline'] for c in all_clusters]}"
    )
    survivor = all_clusters[0]

    # The survivor's cluster_id must be the same whichever topic wins,
    # i.e. the id is now derived purely from the article key.
    expected_cid = _stable_cluster_id(
        "other",
        [{"title": headline, "url": url}],
    )
    assert survivor["cluster_id"] == expected_cid

    # The stored article must still be present after the merge.
    article_urls = [entry["url"] for entry in survivor["articles"]]
    assert url in article_urls
|