from __future__ import annotations from contextlib import contextmanager import tempfile from pathlib import Path from datetime import datetime, timezone from news_mcp.dedup.cluster import dedup_and_cluster_articles from news_mcp.storage.sqlite_store import SQLiteClusterStore from news_mcp.enrichment.importance import compute_importance from news_mcp.enrichment.llm_enrich import _filter_entities, _matches_blacklist from news_mcp.entity_normalize import normalize_query, normalize_entities from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt from news_mcp.trends_resolution import resolve_entity_via_trends from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency def _article(title: str, url: str = None, source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"): if url is None: import hashlib url = f"https.example.com/{hashlib.md5(title.encode()).hexdigest()[:10]}" return { "title": title, "url": url, "source": source, "timestamp": ts, "summary": "summary text", } def test_dedup_merges_similar_titles(): articles = [ _article("Trump warns Iran war could spread"), _article("Trump warns Iran conflict could spread"), _article("Unrelated sports result"), ] clustered = dedup_and_cluster_articles(articles, similarity_threshold=0.75) # We expect the Trump/Iran items to be merged into one cluster in the same topic bucket. total_clusters = sum(len(v) for v in clustered.values()) assert total_clusters == 2 def test_sqlite_feed_hash_roundtrip(): with tempfile.TemporaryDirectory() as td: db = Path(td) / "news.sqlite" store = SQLiteClusterStore(db) assert store.get_feed_hash("breakingthenews") is None store.set_feed_hash("breakingthenews", "abc123") assert store.get_feed_hash("breakingthenews") == "abc123" def test_sqlite_summary_cache_roundtrip(): with tempfile.TemporaryDirectory() as td: db = Path(td) / "news.sqlite" store = SQLiteClusterStore(db) # Upsert a base cluster first. store.upsert_clusters([ { "cluster_id": "cid1", "headline": "Headline", "summary": "Summary", "entities": ["Iran"], "sentiment": "negative", "importance": 0.5, "sources": ["BreakingTheNews"], "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT", "articles": [], "first_seen": "Mon, 30 Mar 2026 12:00:00 GMT", "last_updated": "Mon, 30 Mar 2026 12:00:00 GMT", } ], topic="other") store.upsert_cluster_summary( "cid1", { "headline": "Headline", "mergedSummary": "Merged summary", "keyFacts": ["Fact 1"], "sources": ["BreakingTheNews"], }, ) cached = store.get_cluster_summary("cid1", ttl_hours=24) assert cached is not None assert cached["mergedSummary"] == "Merged summary" assert cached["keyFacts"] == ["Fact 1"] def test_sqlite_summary_cache_does_not_create_placeholder_row(): with tempfile.TemporaryDirectory() as td: db = Path(td) / "news.sqlite" store = SQLiteClusterStore(db) store.upsert_cluster_summary( "missing", { "headline": "Missing", "mergedSummary": "Summary", "keyFacts": [], "sources": [], }, ) assert store.get_cluster_by_id("missing") is None assert store.get_cluster_summary("missing", ttl_hours=24) is None def test_prune_clusters_deletes_rows_older_than_retention(): with tempfile.TemporaryDirectory() as td: db = Path(td) / "news.sqlite" store = SQLiteClusterStore(db) store.upsert_clusters([ { "cluster_id": "fresh", "headline": "Fresh", "summary": "Fresh summary", "entities": ["Bitcoin"], "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT", "articles": [], }, { "cluster_id": "stale", "headline": "Stale", "summary": "Stale summary", "entities": ["Iran"], "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT", "articles": [], }, ], topic="other") with store._conn() as conn: conn.execute( "UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2025-01-01T00:00:00+00:00", "stale"), ) deleted = store.prune_clusters(retention_days=30) assert deleted == 1 assert store.get_cluster_by_id("stale") is None assert store.get_cluster_by_id("fresh") is not None assert store.get_prune_state(pruning_enabled=True, retention_days=30, interval_hours=24)["last_prune_at"] is not None def test_prune_if_due_skips_deletes_when_pruning_disabled(): with tempfile.TemporaryDirectory() as td: db = Path(td) / "news.sqlite" store = SQLiteClusterStore(db) store.upsert_clusters([ { "cluster_id": "stale", "headline": "Stale", "summary": "Stale summary", "entities": ["Iran"], "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT", "articles": [], } ], topic="other") with store._conn() as conn: conn.execute( "UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2025-01-01T00:00:00+00:00", "stale"), ) result = store.prune_if_due(pruning_enabled=False, retention_days=30, interval_hours=24) assert result["enabled"] is False assert result["deleted"] == 0 assert store.get_cluster_by_id("stale") is not None def test_get_latest_clusters_orders_by_updated_at_before_limit(): with tempfile.TemporaryDirectory() as td: db = Path(td) / "news.sqlite" store = SQLiteClusterStore(db) store.upsert_clusters( [ { "cluster_id": "old", "headline": "Old", "summary": "Old summary", "entities": ["Iran"], "timestamp": "Wed, 01 Apr 2026 09:00:00 GMT", "articles": [], }, { "cluster_id": "new", "headline": "New", "summary": "New summary", "entities": ["Bitcoin"], "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT", "articles": [], }, ], topic="crypto", ) with store._conn() as conn: conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2025-01-01T00:00:00+00:00", "new")) conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2026-01-01T00:00:00+00:00", "old")) latest = store.get_latest_clusters(topic="crypto", ttl_hours=24 * 365, limit=1) assert len(latest) == 1 assert latest[0]["cluster_id"] == "new" def test_get_entity_metadata_prefers_mid_scoped_row(): with tempfile.TemporaryDirectory() as td: db = Path(td) / "news.sqlite" store = SQLiteClusterStore(db) store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid=None, sources=["local"]) store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid="/m/Bitcoin", sources=["trends"]) store.record_entity_request("Bitcoin", mid="/m/Bitcoin") meta = store.get_entity_metadata("Bitcoin") assert meta is not None assert meta["mid"] == "/m/Bitcoin" def test_blacklist_filters_entities_case_insensitively(): entities = ["Bloomberg", "Reuters", "bloomberg", "CoinDesk"] filtered = _filter_entities(entities, blacklist=["bloomberg"]) assert filtered == ["Reuters", "CoinDesk"] def test_blacklist_supports_wildcards(): assert _matches_blacklist("Bloomberg Economics", blacklist=["bloomberg*"]) assert _matches_blacklist("bloomberg", blacklist=["*berg"]) assert not _matches_blacklist("Reuters", blacklist=["bloomberg*"]) def test_query_normalization_keeps_common_shorthand_working(): assert normalize_query("btc") == "Bitcoin" assert normalize_query("Trump") == "Donald Trump" assert normalize_query("nvidia") == "nvidia" def test_entity_normalization_deduplicates_aliases(): assert normalize_entities(["btc", "Bitcoin", "BTC", "Ethereum"]) == ["Bitcoin", "Ethereum"] def test_load_prompt_reads_prompt_files(): text = load_prompt("extract_entities.prompt") assert "Return STRICT JSON" in text def test_resolve_entity_falls_back_cleanly_when_provider_unavailable(monkeypatch): import news_mcp.trends_resolution as trends_resolution trends_resolution.resolve_entity_via_trends.cache_clear() trends_resolution._provider.cache_clear() monkeypatch.setattr(trends_resolution, "_provider", lambda: None) resolved = resolve_entity_via_trends("btc") assert resolved["normalized"] == "Bitcoin" assert resolved["canonical_label"] == "Bitcoin" assert resolved["mid"] is None assert resolved["candidates"] == [] assert resolved["source"] == "fallback" trends_resolution.resolve_entity_via_trends.cache_clear() def test_sort_clusters_by_recency_prefers_newer_timestamp_over_importance(): clusters = [ {"headline": "older", "timestamp": "Wed, 01 Apr 2026 10:00:00 GMT", "importance": 0.9}, {"headline": "newer", "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT", "importance": 0.1}, ] sorted_clusters = _sort_clusters_by_recency(clusters) assert [c["headline"] for c in sorted_clusters] == ["newer", "older"] def test_build_extraction_prompt_is_stable_without_blacklist(): cluster = { "headline": "Bloomberg reports Bitcoin rallies after US rate comments", "summary": "A report from Bloomberg says Bitcoin moved higher after comments from the Fed.", "articles": [], } prompt = build_extraction_prompt(cluster) assert "Bloomberg reports Bitcoin rallies" in prompt assert "Do NOT return empty entities" in prompt assert "Bloomberg" in prompt # present in the input, not filtered here def test_call_llm_dispatches_to_selected_provider(monkeypatch): async def fake_groq(model, messages, response_json=True): return '{"ok": true, "provider": "groq"}' async def fake_openai(model, messages, response_json=True): return '{"ok": true, "provider": "openai"}' monkeypatch.setattr("news_mcp.llm._call_groq", fake_groq) monkeypatch.setattr("news_mcp.llm._call_openai", fake_openai) import asyncio groq = asyncio.run(call_llm("groq", "x", "sys", "user")) openai = asyncio.run(call_llm("openai", "x", "sys", "user")) assert '"provider": "groq"' in groq assert '"provider": "openai"' in openai def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch): import news_mcp.jobs.poller as poller import hashlib from news_mcp.config import NEWS_FEED_URL, NEWS_FEED_URLS calls = {"fetch": 0, "cluster": 0, "enrich": 0, "classify": 0} rss_urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()] or [NEWS_FEED_URL] material = "\n".join( [ "Bitcoin rallies|https://example.com/a|Wed, 01 Apr 2026 12:00:00 GMT", ] ) expected_hash = hashlib.sha1(material.encode("utf-8")).hexdigest() async def fake_to_thread(fn, limit): calls["fetch"] += 1 return [ { "title": "Bitcoin rallies", "url": "https://example.com/a", "source": "Src", "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT", "summary": "summary", } ] def fake_cluster(articles): calls["cluster"] += 1 return { "crypto": [ { "cluster_id": "cid", "headline": "Bitcoin rallies", "summary": "summary", "entities": [], "sentiment": "neutral", "importance": 0.0, "sources": ["Src"], "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT", "articles": [], } ] } def fake_enrich(cluster): calls["enrich"] += 1 return cluster async def fake_classify(cluster): calls["classify"] += 1 return cluster class DummyStore: def __init__(self, *args, **kwargs): self.meta = {} self.feed_hash = expected_hash @contextmanager def _conn(self): class _Conn: def execute(self, *args, **kwargs): return None yield _Conn() def get_feed_hash(self, feed_key): return self.feed_hash def set_feed_hash(self, feed_key, last_hash): self.feed_hash = last_hash def set_feed_state(self, feed_key, last_hash, item_count): self.feed_hash = last_hash def get_enabled_feed_urls(self, feed_urls): return feed_urls def get_cluster_by_id(self, cluster_id): return None def upsert_clusters(self, clusters, topic): self.meta["upserted"] = (len(clusters), topic) def prune_if_due(self, **kwargs): self.meta["prune"] = kwargs return {"deleted": 0} def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500): return [] def set_meta(self, key, value): self.meta[key] = value monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore) async def _mock_fetch(limit, url_list=None): calls["fetch"] += 1 return [{"title": "Bitcoin rallies", "url": "https://example.com/a", "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT"}] monkeypatch.setattr(poller, "fetch_news_articles", _mock_fetch) monkeypatch.setattr(poller.asyncio, "to_thread", fake_to_thread) monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster) monkeypatch.setattr(poller, "enrich_cluster", fake_enrich) monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify) poller.store = None async def run_once(): await poller.refresh_clusters(topic=None, limit=80) import asyncio asyncio.run(run_once()) assert calls["fetch"] == 1 assert calls["cluster"] == 0 assert calls["enrich"] == 0 assert calls["classify"] == 0 def test_importance_prefers_llm_signal(): # Two clusters with same coverage but different sentiment magnitude. base = { "sources": ["A", "B"], "articles": [{}, {}], "sentiment": "neutral", "sentimentScore": 0.0, } pos = dict(base, sentimentScore=0.9) neg = dict(base, sentimentScore=-0.8) imp_base = compute_importance(base) imp_pos = compute_importance(pos) imp_neg = compute_importance(neg) assert imp_pos >= imp_base assert imp_neg >= imp_base # --------------------------------------------------------------------------- # Regression tests for the May 2026 correctness pass # --------------------------------------------------------------------------- def test_classify_cluster_llm_uses_llm_topic_and_drops_invalid_ones(monkeypatch): """The LLM-extracted topic must propagate to the returned cluster, but free-form / hallucinated topic strings must be coerced into the allowed set so they never reach the SQL row column verbatim.""" import asyncio from news_mcp.enrichment import llm_enrich async def fake_extraction(cluster): return { "topic": "regulation", "entities": ["SEC"], "sentiment": "neutral", "sentimentScore": 0.0, "keywords": ["enforcement"], } monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction) monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None}) cluster = {"cluster_id": "x", "headline": "SEC fines firm", "summary": "...", "topic": "other"} out = asyncio.run(llm_enrich.classify_cluster_llm(cluster)) assert out["topic"] == "regulation" # Hallucinated topic is rejected; we fall back to the input cluster's # heuristic topic when it is one of the allowed ones. async def fake_extraction_garbage(cluster): return { "topic": "geopolitics-and-stuff", "entities": ["NATO"], "sentiment": "neutral", "sentimentScore": 0.0, "keywords": [], } monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction_garbage) cluster = {"cluster_id": "y", "headline": "NATO meets", "summary": "...", "topic": "macro"} out = asyncio.run(llm_enrich.classify_cluster_llm(cluster)) assert out["topic"] == "macro" # heuristic fallback # When neither the LLM nor the heuristic gives a valid label -> "other". cluster = {"cluster_id": "z", "headline": "...", "summary": "...", "topic": "geopolitics-bucket"} out = asyncio.run(llm_enrich.classify_cluster_llm(cluster)) assert out["topic"] == "other" def test_classify_cluster_llm_normalizes_aliases_before_blacklist(monkeypatch): """Regression: previously ``_filter_entities`` ran before ``normalize_entities``, so blacklisting "bitcoin" missed entries the LLM returned as the alias "btc". Order is now normalize -> blacklist.""" import asyncio from news_mcp.enrichment import llm_enrich async def fake_extraction(cluster): return { "topic": "crypto", "entities": ["btc", "Reuters"], "sentiment": "neutral", "sentimentScore": 0.0, "keywords": ["btc rally", "Reuters"], } monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction) monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None}) monkeypatch.setattr(llm_enrich, "NEWS_ENTITY_BLACKLIST", ["bitcoin"]) cluster = {"cluster_id": "x", "headline": "BTC up", "summary": "...", "topic": "crypto"} out = asyncio.run(llm_enrich.classify_cluster_llm(cluster)) # "btc" became "Bitcoin" via aliasing, then was filtered out by the # blacklist. "Reuters" survives (not blacklisted in this test). assert "Bitcoin" not in out["entities"] assert "btc" not in [e.lower() for e in out["entities"]] assert "Reuters" in out["entities"] def test_dedup_uses_jaccard_when_titles_diverge(): """Composite similarity: even with embeddings off, two articles whose titles share only some tokens should still merge if their content (token overlap) is high enough.""" from news_mcp.dedup import cluster as dc # Titles differ heavily; bodies overlap heavily -> Jaccard should catch. articles = [ { "title": "Iran tension rises", "url": "https://example.com/a", "source": "A", "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT", "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.", }, { "title": "Trump issues stark warning over Tehran", "url": "https://example.com/b", "source": "B", "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT", "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.", }, ] clustered = dc.dedup_and_cluster_articles(articles) total = sum(len(v) for v in clustered.values()) assert total == 1, f"Expected 1 merged cluster via Jaccard signal, got {total}" def test_dedup_does_not_merge_unrelated_articles(): """Negative control: cluster is robust against false-positives even with the more permissive multi-signal merging.""" from news_mcp.dedup import cluster as dc articles = [ { "title": "Bitcoin hits new high", "url": "https://example.com/a", "source": "A", "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT", "summary": "Bitcoin reached a record high amid rising demand.", }, { "title": "Local sports team wins", "url": "https://example.com/b", "source": "B", "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT", "summary": "The local team won the regional championship.", }, ] clustered = dc.dedup_and_cluster_articles(articles) total = sum(len(v) for v in clustered.values()) assert total == 2 def test_get_all_feed_states_returns_all_rows(): """Health endpoint regression: the writer keys feed state with a hashed multi-feed key, so the old hardcoded ``get_feed_state("breakingthenews")`` always returned None. Verify the bulk getter works.""" import tempfile from pathlib import Path with tempfile.TemporaryDirectory() as td: db = Path(td) / "news.sqlite" store = SQLiteClusterStore(db) store.set_feed_hash("newsfeeds:abc123", "hash1") store.set_feed_hash("newsfeeds:def456", "hash2") all_states = store.get_all_feed_states() assert len(all_states) == 2 keys = {s["feed_key"] for s in all_states} assert keys == {"newsfeeds:abc123", "newsfeeds:def456"} def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch): """Regression: the SQL row-column ``topic`` previously locked in the headline-heuristic value (which is ``other`` for most stories) and ignored the LLM's classification stored in the payload. Verify the upsert now uses the post-enrichment topic so SQL filtering and dashboard groupings see the real classification.""" import asyncio import news_mcp.jobs.poller as poller captured = {"upserts": []} class DummyStore: def __init__(self, *args, **kwargs): pass @contextmanager def _conn(self): class _Conn: def execute(self, *args, **kwargs): return None yield _Conn() def get_feed_hash(self, feed_key): return None def set_feed_hash(self, feed_key, last_hash): pass def set_feed_state(self, feed_key, last_hash, item_count): pass def get_enabled_feed_urls(self, feed_urls): return feed_urls def get_cluster_by_id(self, cluster_id): return None def upsert_clusters(self, clusters, topic): # Capture the topic the poller chose for each cluster. for c in clusters: captured["upserts"].append({"row_topic": topic, "payload_topic": c.get("topic"), "cluster_id": c.get("cluster_id")}) def prune_if_due(self, **kwargs): return {"deleted": 0} def get_failed_enrichment_clusters(self, max_retries=3): return [] def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500): return [] def set_meta(self, key, value): pass def set_feed_state(self, feed_key, last_hash, item_count): pass def fake_cluster(articles, similarity_threshold=None, existing_clusters=None, max_age_hours=0): # Heuristic put it in "other" (no crypto/macro/regulation/ai keywords # in the title for the heuristic matcher — title above does have # "law"-adjacent words but not the specific tokens it matches). return { "other": [ { "cluster_id": "cid", "headline": "SEC fines firm", "summary": "...", "topic": "other", "entities": [], "sentiment": "neutral", "importance": 0.0, "sources": ["S"], "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT", "articles": [], } ] } def fake_enrich(cluster): return cluster async def fake_classify(cluster): # The LLM thinks it's regulation -> the SQL row column must reflect that. out = dict(cluster) out["topic"] = "regulation" out["entities"] = ["SEC"] out["entityResolutions"] = [] out["sentiment"] = "neutral" out["sentimentScore"] = 0.0 out["keywords"] = [] return out monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore) async def _mock_fetch2(limit, url_list=None): return [ {"title": "SEC fines firm", "url": "https://example.com/a", "source": "S", "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT", "summary": "..."}, ] monkeypatch.setattr(poller, "fetch_news_articles", _mock_fetch2) monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster) monkeypatch.setattr(poller, "enrich_cluster", fake_enrich) monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify) asyncio.run(poller.refresh_clusters(topic=None, limit=10)) assert captured["upserts"], "Expected at least one upsert call" # The poller first stores raw clusters (topic=heuristic), then enriched # clusters (topic=post-LLM). The enriched upsert is the one whose row_topic # reflects the LLM classification. enriched_upserts = [u for u in captured["upserts"] if u["row_topic"] == "regulation"] assert enriched_upserts, ( f"Expected at least one upsert with row_topic='regulation', " f"got topics: {[u['row_topic'] for u in captured['upserts']]}" ) upsert = enriched_upserts[0] assert upsert["row_topic"] == "regulation", ( f"Expected SQL row topic to follow the LLM's classification 'regulation', got {upsert['row_topic']!r}" ) assert upsert["payload_topic"] == "regulation" # --------------------------------------------------------------------------- # v1.3 — Stable cluster IDs, orphan merge, temporal gating # --------------------------------------------------------------------------- def test_stable_cluster_id_is_order_independent(): """Two articles about the same event should always get the same cluster_id, regardless of which article is processed first.""" from news_mcp.dedup import cluster as dc art_a = { "title": "Bitcoin Surges Past $100K", "url": "https://example.com/btc-100k", "source": "Reuters", "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT", "summary": "Bitcoin reached $100,000 for the first time.", } art_b = { "title": "BTC Breaks $100,000 Barrier", "url": "https://example.com/btc-100k", "source": "Bloomberg", "timestamp": "Mon, 30 Mar 2026 12:05:00 GMT", "summary": "Bitcoin topped the $100,000 level.", } # Process A first clustered_ab = dc.dedup_and_cluster_articles([art_a, art_b]) # Process B first clustered_ba = dc.dedup_and_cluster_articles([art_b, art_a]) # Both orderings must produce the same cluster_id(s) ids_ab = sorted(c["cluster_id"] for clusters in clustered_ab.values() for c in clusters) ids_ba = sorted(c["cluster_id"] for clusters in clustered_ba.values() for c in clusters) assert ids_ab == ids_ba, f"Cluster IDs depend on order: {ids_ab} vs {ids_ba}" def test_orphan_merge_deduplicates_shared_articles(): """When two clusters end up with overlapping article sets (e.g. because embeddings were temporarily unavailable), the post-clustering merge pass should combine them into one.""" from news_mcp.dedup.cluster import _merge_orphan_clusters clusters = [ { "cluster_id": "aaa", "topic": "crypto", "headline": "Bitcoin surges", "articles": [ {"title": "Bitcoin surges", "url": "https://example.com/btc", "source": "A"}, ], "sources": ["A"], "first_seen": "T1", "last_updated": "T1", }, { "cluster_id": "bbb", "topic": "crypto", "headline": "BTC up", "articles": [ {"title": "BTC up", "url": "https://example.com/btc", "source": "B"}, ], "sources": ["B"], "first_seen": "T2", "last_updated": "T2", }, ] merged = _merge_orphan_clusters(clusters) assert len(merged) == 1, f"Expected 1 merged cluster, got {len(merged)}" assert set(merged[0]["sources"]) == {"A", "B"} def test_orphan_merge_preserves_distinct_clusters(): """Clusters with no shared articles must remain independent.""" from news_mcp.dedup.cluster import _merge_orphan_clusters clusters = [ { "cluster_id": "aaa", "topic": "crypto", "headline": "Bitcoin surges", "articles": [ {"title": "Bitcoin surges", "url": "https://example.com/btc", "source": "A"}, ], "sources": ["A"], "first_seen": "T1", "last_updated": "T1", }, { "cluster_id": "bbb", "topic": "crypto", "headline": "Ethereum merge", "articles": [ {"title": "Ethereum merge", "url": "https://example.com/eth", "source": "B"}, ], "sources": ["B"], "first_seen": "T2", "last_updated": "T2", }, ] merged = _merge_orphan_clusters(clusters) assert len(merged) == 2 def test_stable_id_same_for_different_titles_same_url(): """Two articles with the same URL but different titles (e.g. corrected headline) must produce the same cluster_id.""" from news_mcp.dedup.cluster import _stable_cluster_id arts_a = [ {"title": "Fed Raises Rates", "url": "https://example.com/fed-rates"}, ] arts_b = [ {"title": "Federal Reserve Increases Interest Rates", "url": "https://example.com/fed-rates"}, ] id_a = _stable_cluster_id("macro", arts_a) id_b = _stable_cluster_id("macro", arts_b) assert id_a == id_b, f"Same URL must give same cluster_id: {id_a} vs {id_b}" def test_temporal_gate_excludes_stale_clusters(): """Clusters older than max_age_hours should not be candidates for merging.""" from news_mcp.dedup.cluster import _cluster_is_within_age_window old_cluster = { "cluster_id": "old", "topic": "crypto", "last_updated": "2025-01-01T00:00:00+00:00", "articles": [], } assert not _cluster_is_within_age_window(old_cluster, max_age_hours=4) recent_cluster = { "cluster_id": "recent", "topic": "crypto", "last_updated": datetime.now(timezone.utc).isoformat(), "articles": [], } assert _cluster_is_within_age_window(recent_cluster, max_age_hours=4) # max_age_hours=0 means no limit assert _cluster_is_within_age_window(old_cluster, max_age_hours=0) def test_preseed_merge_into_existing_cluster(): """When existing_clusters is provided, a new article that matches should merge into the existing cluster instead of creating a new one.""" from news_mcp.dedup import cluster as dc existing = [{ "cluster_id": "existing-1", "topic": "other", "headline": "Trump warns Iran war could spread across Middle East", "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.", "sources": ["Reuters"], "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT", "last_updated": datetime.now(timezone.utc).isoformat(), "first_seen": "Mon, 30 Mar 2026 12:00:00 GMT", "articles": [ { "title": "Trump warns Iran war could spread across Middle East", "url": "https://example.com/trump-iran", "source": "Reuters", "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT", "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.", } ], "entities": [], "sentiment": "neutral", "importance": 0.0, }] new_article = { "title": "Trump warns Iran conflict could spread across Middle East", "url": "https://example.com/trump-iran-2", "source": "Bloomberg", "timestamp": "Mon, 30 Mar 2026 13:00:00 GMT", "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.", } # Use a low title threshold so Jaccard can catch the merge clustered = dc.dedup_and_cluster_articles( [new_article], similarity_threshold=0.75, existing_clusters=existing, max_age_hours=4, ) all_clusters = [c for clusters in clustered.values() for c in clusters] # Should have exactly 1 cluster (the existing one, now with 2 articles) assert len(all_clusters) == 1, f"Expected 1 cluster, got {len(all_clusters)}: {[c['headline'] for c in all_clusters]}" assert len(all_clusters[0]["articles"]) == 2