test_news_mcp.py 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979
  1. from __future__ import annotations
  2. from contextlib import contextmanager
  3. import tempfile
  4. from pathlib import Path
  5. from datetime import datetime, timezone
  6. from news_mcp.dedup.cluster import dedup_and_cluster_articles
  7. from news_mcp.storage.sqlite_store import SQLiteClusterStore
  8. from news_mcp.enrichment.importance import compute_importance
  9. from news_mcp.enrichment.llm_enrich import _filter_entities, _matches_blacklist
  10. from news_mcp.entity_normalize import normalize_query, normalize_entities
  11. from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt
  12. from news_mcp.trends_resolution import resolve_entity_via_trends
  13. from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency
  14. def _article(title: str, url: str = None, source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
  15. if url is None:
  16. import hashlib
  17. url = f"https.example.com/{hashlib.md5(title.encode()).hexdigest()[:10]}"
  18. return {
  19. "title": title,
  20. "url": url,
  21. "source": source,
  22. "timestamp": ts,
  23. "summary": "summary text",
  24. }
  25. def test_dedup_merges_similar_titles():
  26. articles = [
  27. _article("Trump warns Iran war could spread"),
  28. _article("Trump warns Iran conflict could spread"),
  29. _article("Unrelated sports result"),
  30. ]
  31. clustered = dedup_and_cluster_articles(articles, similarity_threshold=0.75)
  32. # We expect the Trump/Iran items to be merged into one cluster in the same topic bucket.
  33. total_clusters = sum(len(v) for v in clustered.values())
  34. assert total_clusters == 2
  35. def test_sqlite_feed_hash_roundtrip():
  36. with tempfile.TemporaryDirectory() as td:
  37. db = Path(td) / "news.sqlite"
  38. store = SQLiteClusterStore(db)
  39. assert store.get_feed_hash("breakingthenews") is None
  40. store.set_feed_hash("breakingthenews", "abc123")
  41. assert store.get_feed_hash("breakingthenews") == "abc123"
  42. def test_sqlite_summary_cache_roundtrip():
  43. with tempfile.TemporaryDirectory() as td:
  44. db = Path(td) / "news.sqlite"
  45. store = SQLiteClusterStore(db)
  46. # Upsert a base cluster first.
  47. store.upsert_clusters([
  48. {
  49. "cluster_id": "cid1",
  50. "headline": "Headline",
  51. "summary": "Summary",
  52. "entities": ["Iran"],
  53. "sentiment": "negative",
  54. "importance": 0.5,
  55. "sources": ["BreakingTheNews"],
  56. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  57. "articles": [],
  58. "first_seen": "Mon, 30 Mar 2026 12:00:00 GMT",
  59. "last_updated": "Mon, 30 Mar 2026 12:00:00 GMT",
  60. }
  61. ], topic="other")
  62. store.upsert_cluster_summary(
  63. "cid1",
  64. {
  65. "headline": "Headline",
  66. "mergedSummary": "Merged summary",
  67. "keyFacts": ["Fact 1"],
  68. "sources": ["BreakingTheNews"],
  69. },
  70. )
  71. cached = store.get_cluster_summary("cid1", ttl_hours=24)
  72. assert cached is not None
  73. assert cached["mergedSummary"] == "Merged summary"
  74. assert cached["keyFacts"] == ["Fact 1"]
  75. def test_sqlite_summary_cache_does_not_create_placeholder_row():
  76. with tempfile.TemporaryDirectory() as td:
  77. db = Path(td) / "news.sqlite"
  78. store = SQLiteClusterStore(db)
  79. store.upsert_cluster_summary(
  80. "missing",
  81. {
  82. "headline": "Missing",
  83. "mergedSummary": "Summary",
  84. "keyFacts": [],
  85. "sources": [],
  86. },
  87. )
  88. assert store.get_cluster_by_id("missing") is None
  89. assert store.get_cluster_summary("missing", ttl_hours=24) is None
  90. def test_prune_clusters_deletes_rows_older_than_retention():
  91. with tempfile.TemporaryDirectory() as td:
  92. db = Path(td) / "news.sqlite"
  93. store = SQLiteClusterStore(db)
  94. store.upsert_clusters([
  95. {
  96. "cluster_id": "fresh",
  97. "headline": "Fresh",
  98. "summary": "Fresh summary",
  99. "entities": ["Bitcoin"],
  100. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  101. "articles": [],
  102. },
  103. {
  104. "cluster_id": "stale",
  105. "headline": "Stale",
  106. "summary": "Stale summary",
  107. "entities": ["Iran"],
  108. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  109. "articles": [],
  110. },
  111. ], topic="other")
  112. with store._conn() as conn:
  113. conn.execute(
  114. "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
  115. ("2025-01-01T00:00:00+00:00", "stale"),
  116. )
  117. deleted = store.prune_clusters(retention_days=30)
  118. assert deleted == 1
  119. assert store.get_cluster_by_id("stale") is None
  120. assert store.get_cluster_by_id("fresh") is not None
  121. assert store.get_prune_state(pruning_enabled=True, retention_days=30, interval_hours=24)["last_prune_at"] is not None
  122. def test_prune_if_due_skips_deletes_when_pruning_disabled():
  123. with tempfile.TemporaryDirectory() as td:
  124. db = Path(td) / "news.sqlite"
  125. store = SQLiteClusterStore(db)
  126. store.upsert_clusters([
  127. {
  128. "cluster_id": "stale",
  129. "headline": "Stale",
  130. "summary": "Stale summary",
  131. "entities": ["Iran"],
  132. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  133. "articles": [],
  134. }
  135. ], topic="other")
  136. with store._conn() as conn:
  137. conn.execute(
  138. "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
  139. ("2025-01-01T00:00:00+00:00", "stale"),
  140. )
  141. result = store.prune_if_due(pruning_enabled=False, retention_days=30, interval_hours=24)
  142. assert result["enabled"] is False
  143. assert result["deleted"] == 0
  144. assert store.get_cluster_by_id("stale") is not None
  145. def test_get_latest_clusters_orders_by_updated_at_before_limit():
  146. with tempfile.TemporaryDirectory() as td:
  147. db = Path(td) / "news.sqlite"
  148. store = SQLiteClusterStore(db)
  149. store.upsert_clusters(
  150. [
  151. {
  152. "cluster_id": "old",
  153. "headline": "Old",
  154. "summary": "Old summary",
  155. "entities": ["Iran"],
  156. "timestamp": "Wed, 01 Apr 2026 09:00:00 GMT",
  157. "articles": [],
  158. },
  159. {
  160. "cluster_id": "new",
  161. "headline": "New",
  162. "summary": "New summary",
  163. "entities": ["Bitcoin"],
  164. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  165. "articles": [],
  166. },
  167. ],
  168. topic="crypto",
  169. )
  170. with store._conn() as conn:
  171. conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2025-01-01T00:00:00+00:00", "new"))
  172. conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2026-01-01T00:00:00+00:00", "old"))
  173. latest = store.get_latest_clusters(topic="crypto", ttl_hours=24 * 365, limit=1)
  174. assert len(latest) == 1
  175. assert latest[0]["cluster_id"] == "new"
  176. def test_get_entity_metadata_prefers_mid_scoped_row():
  177. with tempfile.TemporaryDirectory() as td:
  178. db = Path(td) / "news.sqlite"
  179. store = SQLiteClusterStore(db)
  180. store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid=None, sources=["local"])
  181. store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid="/m/Bitcoin", sources=["trends"])
  182. store.record_entity_request("Bitcoin", mid="/m/Bitcoin")
  183. meta = store.get_entity_metadata("Bitcoin")
  184. assert meta is not None
  185. assert meta["mid"] == "/m/Bitcoin"
  186. def test_blacklist_filters_entities_case_insensitively():
  187. entities = ["Bloomberg", "Reuters", "bloomberg", "CoinDesk"]
  188. filtered = _filter_entities(entities, blacklist=["bloomberg"])
  189. assert filtered == ["Reuters", "CoinDesk"]
  190. def test_blacklist_supports_wildcards():
  191. assert _matches_blacklist("Bloomberg Economics", blacklist=["bloomberg*"])
  192. assert _matches_blacklist("bloomberg", blacklist=["*berg"])
  193. assert not _matches_blacklist("Reuters", blacklist=["bloomberg*"])
  194. def test_query_normalization_keeps_common_shorthand_working():
  195. assert normalize_query("btc") == "Bitcoin"
  196. assert normalize_query("Trump") == "Donald Trump"
  197. assert normalize_query("nvidia") == "nvidia"
  198. def test_entity_normalization_deduplicates_aliases():
  199. assert normalize_entities(["btc", "Bitcoin", "BTC", "Ethereum"]) == ["Bitcoin", "Ethereum"]
  200. def test_load_prompt_reads_prompt_files():
  201. text = load_prompt("extract_entities.prompt")
  202. assert "Return STRICT JSON" in text
  203. def test_resolve_entity_falls_back_cleanly_when_provider_unavailable(monkeypatch):
  204. import news_mcp.trends_resolution as trends_resolution
  205. trends_resolution.resolve_entity_via_trends.cache_clear()
  206. trends_resolution._provider.cache_clear()
  207. monkeypatch.setattr(trends_resolution, "_provider", lambda: None)
  208. resolved = resolve_entity_via_trends("btc")
  209. assert resolved["normalized"] == "Bitcoin"
  210. assert resolved["canonical_label"] == "Bitcoin"
  211. assert resolved["mid"] is None
  212. assert resolved["candidates"] == []
  213. assert resolved["source"] == "fallback"
  214. trends_resolution.resolve_entity_via_trends.cache_clear()
  215. def test_sort_clusters_by_recency_prefers_newer_timestamp_over_importance():
  216. clusters = [
  217. {"headline": "older", "timestamp": "2026-04-01T10:00:00+00:00", "importance": 0.9},
  218. {"headline": "newer", "timestamp": "2026-04-01T11:00:00+00:00", "importance": 0.1},
  219. ]
  220. sorted_clusters = _sort_clusters_by_recency(clusters)
  221. assert [c["headline"] for c in sorted_clusters] == ["newer", "older"]
  222. def test_call_llm_dispatches_to_selected_provider(monkeypatch):
  223. async def fake_groq(model, messages, response_json=True):
  224. return '{"ok": true, "provider": "groq"}'
  225. async def fake_openai(model, messages, response_json=True):
  226. return '{"ok": true, "provider": "openai"}'
  227. monkeypatch.setattr("news_mcp.llm._call_groq", fake_groq)
  228. monkeypatch.setattr("news_mcp.llm._call_openai", fake_openai)
  229. import asyncio
  230. groq = asyncio.run(call_llm("groq", "x", "sys", "user"))
  231. openai = asyncio.run(call_llm("openai", "x", "sys", "user"))
  232. assert '"provider": "groq"' in groq
  233. assert '"provider": "openai"' in openai
  234. def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
  235. import news_mcp.jobs.poller as poller
  236. import hashlib
  237. from news_mcp.config import NEWS_FEED_URL, NEWS_FEED_URLS
  238. calls = {"fetch": 0, "cluster": 0, "enrich": 0, "classify": 0}
  239. rss_urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()] or [NEWS_FEED_URL]
  240. material = "\n".join(
  241. [
  242. "Bitcoin rallies|https://example.com/a|Wed, 01 Apr 2026 12:00:00 GMT",
  243. ]
  244. )
  245. expected_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
  246. async def fake_to_thread(fn, limit):
  247. calls["fetch"] += 1
  248. return [
  249. {
  250. "title": "Bitcoin rallies",
  251. "url": "https://example.com/a",
  252. "source": "Src",
  253. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  254. "summary": "summary",
  255. }
  256. ]
  257. def fake_cluster(articles):
  258. calls["cluster"] += 1
  259. return {
  260. "crypto": [
  261. {
  262. "cluster_id": "cid",
  263. "headline": "Bitcoin rallies",
  264. "summary": "summary",
  265. "entities": [],
  266. "sentiment": "neutral",
  267. "importance": 0.0,
  268. "sources": ["Src"],
  269. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  270. "articles": [],
  271. }
  272. ]
  273. }
  274. def fake_enrich(cluster):
  275. calls["enrich"] += 1
  276. return cluster
  277. async def fake_classify(cluster):
  278. calls["classify"] += 1
  279. return cluster
  280. class DummyStore:
  281. def __init__(self, *args, **kwargs):
  282. self.meta = {}
  283. self.feed_hash = expected_hash
  284. @contextmanager
  285. def _conn(self):
  286. class _Conn:
  287. def execute(self, *args, **kwargs):
  288. return None
  289. yield _Conn()
  290. def get_feed_hash(self, feed_key):
  291. return self.feed_hash
  292. def set_feed_hash(self, feed_key, last_hash):
  293. self.feed_hash = last_hash
  294. def set_feed_state(self, feed_key, last_hash, item_count):
  295. self.feed_hash = last_hash
  296. def get_enabled_feed_urls(self, feed_urls):
  297. return feed_urls
  298. def get_cluster_by_id(self, cluster_id):
  299. return None
  300. def upsert_clusters(self, clusters, topic):
  301. self.meta["upserted"] = (len(clusters), topic)
  302. def prune_if_due(self, **kwargs):
  303. self.meta["prune"] = kwargs
  304. return {"deleted": 0}
  305. def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
  306. return []
  307. def set_meta(self, key, value):
  308. self.meta[key] = value
  309. monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore)
  310. async def _mock_fetch(limit, url_list=None):
  311. calls["fetch"] += 1
  312. return [{"title": "Bitcoin rallies", "url": "https://example.com/a", "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT"}]
  313. monkeypatch.setattr(poller, "fetch_news_articles", _mock_fetch)
  314. monkeypatch.setattr(poller.asyncio, "to_thread", fake_to_thread)
  315. monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster)
  316. monkeypatch.setattr(poller, "enrich_cluster", fake_enrich)
  317. monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify)
  318. poller.store = None
  319. async def run_once():
  320. await poller.refresh_clusters(topic=None, limit=80)
  321. import asyncio
  322. asyncio.run(run_once())
  323. assert calls["fetch"] == 1
  324. assert calls["cluster"] == 0
  325. assert calls["enrich"] == 0
  326. assert calls["classify"] == 0
  327. def test_importance_prefers_llm_signal():
  328. # Two clusters with same coverage but different sentiment magnitude.
  329. base = {
  330. "sources": ["A", "B"],
  331. "articles": [{}, {}],
  332. "sentiment": "neutral",
  333. "sentimentScore": 0.0,
  334. }
  335. pos = dict(base, sentimentScore=0.9)
  336. neg = dict(base, sentimentScore=-0.8)
  337. imp_base = compute_importance(base)
  338. imp_pos = compute_importance(pos)
  339. imp_neg = compute_importance(neg)
  340. assert imp_pos >= imp_base
  341. assert imp_neg >= imp_base
  342. # ---------------------------------------------------------------------------
  343. # Regression tests for the May 2026 correctness pass
  344. # ---------------------------------------------------------------------------
  345. def test_classify_cluster_llm_uses_llm_topic_and_drops_invalid_ones(monkeypatch):
  346. """The LLM-extracted topic must propagate to the returned cluster, but
  347. free-form / hallucinated topic strings must be coerced into the allowed
  348. set so they never reach the SQL row column verbatim."""
  349. import asyncio
  350. from news_mcp.enrichment import llm_enrich
  351. async def fake_extraction(cluster):
  352. return {
  353. "topic": "regulation",
  354. "entities": ["SEC"],
  355. "sentiment": "neutral",
  356. "sentimentScore": 0.0,
  357. "keywords": ["enforcement"],
  358. }
  359. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction)
  360. monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None})
  361. cluster = {"cluster_id": "x", "headline": "SEC fines firm", "summary": "...", "topic": "other"}
  362. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  363. assert out["topic"] == "regulation"
  364. # Hallucinated topic is rejected; we fall back to the input cluster's
  365. # heuristic topic when it is one of the allowed ones.
  366. async def fake_extraction_garbage(cluster):
  367. return {
  368. "topic": "geopolitics-and-stuff",
  369. "entities": ["NATO"],
  370. "sentiment": "neutral",
  371. "sentimentScore": 0.0,
  372. "keywords": [],
  373. }
  374. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction_garbage)
  375. cluster = {"cluster_id": "y", "headline": "NATO meets", "summary": "...", "topic": "macro"}
  376. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  377. assert out["topic"] == "macro" # heuristic fallback
  378. # When neither the LLM nor the heuristic gives a valid label -> "other".
  379. cluster = {"cluster_id": "z", "headline": "...", "summary": "...", "topic": "geopolitics-bucket"}
  380. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  381. assert out["topic"] == "other"
  382. def test_classify_cluster_llm_normalizes_aliases_before_blacklist(monkeypatch):
  383. """Regression: previously ``_filter_entities`` ran before
  384. ``normalize_entities``, so blacklisting "bitcoin" missed entries the LLM
  385. returned as the alias "btc". Order is now normalize -> blacklist."""
  386. import asyncio
  387. from news_mcp.enrichment import llm_enrich
  388. async def fake_extraction(cluster):
  389. return {
  390. "topic": "crypto",
  391. "entities": ["btc", "Reuters"],
  392. "sentiment": "neutral",
  393. "sentimentScore": 0.0,
  394. "keywords": ["btc rally", "Reuters"],
  395. }
  396. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction)
  397. monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None})
  398. monkeypatch.setattr(llm_enrich, "NEWS_ENTITY_BLACKLIST", ["bitcoin"])
  399. cluster = {"cluster_id": "x", "headline": "BTC up", "summary": "...", "topic": "crypto"}
  400. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  401. # "btc" became "Bitcoin" via aliasing, then was filtered out by the
  402. # blacklist. "Reuters" survives (not blacklisted in this test).
  403. assert "Bitcoin" not in out["entities"]
  404. assert "btc" not in [e.lower() for e in out["entities"]]
  405. assert "Reuters" in out["entities"]
  406. def test_dedup_uses_jaccard_when_titles_diverge():
  407. """Composite similarity: even with embeddings off, two articles whose
  408. titles share only some tokens should still merge if their content (token
  409. overlap) is high enough."""
  410. from news_mcp.dedup import cluster as dc
  411. # Titles differ heavily; bodies overlap heavily -> Jaccard should catch.
  412. articles = [
  413. {
  414. "title": "Iran tension rises",
  415. "url": "https://example.com/a",
  416. "source": "A",
  417. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  418. "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
  419. },
  420. {
  421. "title": "Trump issues stark warning over Tehran",
  422. "url": "https://example.com/b",
  423. "source": "B",
  424. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  425. "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
  426. },
  427. ]
  428. clustered = dc.dedup_and_cluster_articles(articles)
  429. total = sum(len(v) for v in clustered.values())
  430. assert total == 1, f"Expected 1 merged cluster via Jaccard signal, got {total}"
  431. def test_dedup_does_not_merge_unrelated_articles():
  432. """Negative control: cluster is robust against false-positives even with
  433. the more permissive multi-signal merging."""
  434. from news_mcp.dedup import cluster as dc
  435. articles = [
  436. {
  437. "title": "Bitcoin hits new high",
  438. "url": "https://example.com/a",
  439. "source": "A",
  440. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  441. "summary": "Bitcoin reached a record high amid rising demand.",
  442. },
  443. {
  444. "title": "Local sports team wins",
  445. "url": "https://example.com/b",
  446. "source": "B",
  447. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  448. "summary": "The local team won the regional championship.",
  449. },
  450. ]
  451. clustered = dc.dedup_and_cluster_articles(articles)
  452. total = sum(len(v) for v in clustered.values())
  453. assert total == 2
  454. def test_get_all_feed_states_returns_all_rows():
  455. """Health endpoint regression: the writer keys feed state with a hashed
  456. multi-feed key, so the old hardcoded ``get_feed_state("breakingthenews")``
  457. always returned None. Verify the bulk getter works."""
  458. import tempfile
  459. from pathlib import Path
  460. with tempfile.TemporaryDirectory() as td:
  461. db = Path(td) / "news.sqlite"
  462. store = SQLiteClusterStore(db)
  463. store.set_feed_hash("newsfeeds:abc123", "hash1")
  464. store.set_feed_hash("newsfeeds:def456", "hash2")
  465. all_states = store.get_all_feed_states()
  466. assert len(all_states) == 2
  467. keys = {s["feed_key"] for s in all_states}
  468. assert keys == {"newsfeeds:abc123", "newsfeeds:def456"}
  469. def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
  470. """Regression: the SQL row-column ``topic`` previously locked in the
  471. headline-heuristic value (which is ``other`` for most stories) and ignored
  472. the LLM's classification stored in the payload. Verify the upsert now uses
  473. the post-enrichment topic so SQL filtering and dashboard groupings see the
  474. real classification."""
  475. import asyncio
  476. import news_mcp.jobs.poller as poller
  477. captured = {"upserts": []}
  478. class DummyStore:
  479. def __init__(self, *args, **kwargs):
  480. pass
  481. @contextmanager
  482. def _conn(self):
  483. class _Conn:
  484. def execute(self, *args, **kwargs):
  485. return None
  486. yield _Conn()
  487. def get_feed_hash(self, feed_key):
  488. return None
  489. def set_feed_hash(self, feed_key, last_hash):
  490. pass
  491. def set_feed_state(self, feed_key, last_hash, item_count):
  492. pass
  493. def get_enabled_feed_urls(self, feed_urls):
  494. return feed_urls
  495. def get_cluster_by_id(self, cluster_id):
  496. return None
  497. def upsert_clusters(self, clusters, topic):
  498. # Capture the topic the poller chose for each cluster.
  499. for c in clusters:
  500. captured["upserts"].append({"row_topic": topic, "payload_topic": c.get("topic"), "cluster_id": c.get("cluster_id")})
  501. def prune_if_due(self, **kwargs):
  502. return {"deleted": 0}
  503. def get_failed_enrichment_clusters(self, max_retries=3):
  504. return []
  505. def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
  506. return []
  507. def set_meta(self, key, value):
  508. pass
  509. def set_feed_state(self, feed_key, last_hash, item_count):
  510. pass
  511. def fake_cluster(articles, similarity_threshold=None, existing_clusters=None, max_age_hours=0):
  512. # Heuristic put it in "other" (no crypto/macro/regulation/ai keywords
  513. # in the title for the heuristic matcher — title above does have
  514. # "law"-adjacent words but not the specific tokens it matches).
  515. return {
  516. "other": [
  517. {
  518. "cluster_id": "cid",
  519. "headline": "SEC fines firm",
  520. "summary": "...",
  521. "topic": "other",
  522. "entities": [],
  523. "sentiment": "neutral",
  524. "importance": 0.0,
  525. "sources": ["S"],
  526. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  527. "articles": [],
  528. }
  529. ]
  530. }
  531. def fake_enrich(cluster):
  532. return cluster
  533. async def fake_classify(cluster):
  534. # The LLM thinks it's regulation -> the SQL row column must reflect that.
  535. out = dict(cluster)
  536. out["topic"] = "regulation"
  537. out["entities"] = ["SEC"]
  538. out["entityResolutions"] = []
  539. out["sentiment"] = "neutral"
  540. out["sentimentScore"] = 0.0
  541. out["keywords"] = []
  542. return out
  543. monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore)
  544. async def _mock_fetch2(limit, url_list=None):
  545. return [
  546. {"title": "SEC fines firm", "url": "https://example.com/a", "source": "S",
  547. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT", "summary": "..."},
  548. ]
  549. monkeypatch.setattr(poller, "fetch_news_articles", _mock_fetch2)
  550. monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster)
  551. monkeypatch.setattr(poller, "enrich_cluster", fake_enrich)
  552. monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify)
  553. asyncio.run(poller.refresh_clusters(topic=None, limit=10))
  554. assert captured["upserts"], "Expected at least one upsert call"
  555. # The poller first stores raw clusters (topic=heuristic), then enriched
  556. # clusters (topic=post-LLM). The enriched upsert is the one whose row_topic
  557. # reflects the LLM classification.
  558. enriched_upserts = [u for u in captured["upserts"] if u["row_topic"] == "regulation"]
  559. assert enriched_upserts, (
  560. f"Expected at least one upsert with row_topic='regulation', "
  561. f"got topics: {[u['row_topic'] for u in captured['upserts']]}"
  562. )
  563. upsert = enriched_upserts[0]
  564. assert upsert["row_topic"] == "regulation", (
  565. f"Expected SQL row topic to follow the LLM's classification 'regulation', got {upsert['row_topic']!r}"
  566. )
  567. assert upsert["payload_topic"] == "regulation"
  568. # ---------------------------------------------------------------------------
  569. # v1.3 — Stable cluster IDs, orphan merge, temporal gating
  570. # ---------------------------------------------------------------------------
  571. def test_stable_cluster_id_is_order_independent():
  572. """Two articles about the same event should always get the same cluster_id,
  573. regardless of which article is processed first."""
  574. from news_mcp.dedup import cluster as dc
  575. art_a = {
  576. "title": "Bitcoin Surges Past $100K",
  577. "url": "https://example.com/btc-100k",
  578. "source": "Reuters",
  579. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  580. "summary": "Bitcoin reached $100,000 for the first time.",
  581. }
  582. art_b = {
  583. "title": "BTC Breaks $100,000 Barrier",
  584. "url": "https://example.com/btc-100k",
  585. "source": "Bloomberg",
  586. "timestamp": "Mon, 30 Mar 2026 12:05:00 GMT",
  587. "summary": "Bitcoin topped the $100,000 level.",
  588. }
  589. # Process A first
  590. clustered_ab = dc.dedup_and_cluster_articles([art_a, art_b])
  591. # Process B first
  592. clustered_ba = dc.dedup_and_cluster_articles([art_b, art_a])
  593. # Both orderings must produce the same cluster_id(s)
  594. ids_ab = sorted(c["cluster_id"] for clusters in clustered_ab.values() for c in clusters)
  595. ids_ba = sorted(c["cluster_id"] for clusters in clustered_ba.values() for c in clusters)
  596. assert ids_ab == ids_ba, f"Cluster IDs depend on order: {ids_ab} vs {ids_ba}"
  597. def test_orphan_merge_deduplicates_shared_articles():
  598. """When two clusters end up with overlapping article sets (e.g. because
  599. embeddings were temporarily unavailable), the post-clustering merge pass
  600. should combine them into one."""
  601. from news_mcp.dedup.cluster import _merge_orphan_clusters
  602. clusters = [
  603. {
  604. "cluster_id": "aaa",
  605. "topic": "crypto",
  606. "headline": "Bitcoin surges",
  607. "articles": [
  608. {"title": "Bitcoin surges", "url": "https://example.com/btc", "source": "A"},
  609. ],
  610. "sources": ["A"],
  611. "first_seen": "T1",
  612. "last_updated": "T1",
  613. },
  614. {
  615. "cluster_id": "bbb",
  616. "topic": "crypto",
  617. "headline": "BTC up",
  618. "articles": [
  619. {"title": "BTC up", "url": "https://example.com/btc", "source": "B"},
  620. ],
  621. "sources": ["B"],
  622. "first_seen": "T2",
  623. "last_updated": "T2",
  624. },
  625. ]
  626. merged = _merge_orphan_clusters(clusters)
  627. assert len(merged) == 1, f"Expected 1 merged cluster, got {len(merged)}"
  628. assert set(merged[0]["sources"]) == {"A", "B"}
  629. def test_orphan_merge_preserves_distinct_clusters():
  630. """Clusters with no shared articles must remain independent."""
  631. from news_mcp.dedup.cluster import _merge_orphan_clusters
  632. clusters = [
  633. {
  634. "cluster_id": "aaa",
  635. "topic": "crypto",
  636. "headline": "Bitcoin surges",
  637. "articles": [
  638. {"title": "Bitcoin surges", "url": "https://example.com/btc", "source": "A"},
  639. ],
  640. "sources": ["A"],
  641. "first_seen": "T1",
  642. "last_updated": "T1",
  643. },
  644. {
  645. "cluster_id": "bbb",
  646. "topic": "crypto",
  647. "headline": "Ethereum merge",
  648. "articles": [
  649. {"title": "Ethereum merge", "url": "https://example.com/eth", "source": "B"},
  650. ],
  651. "sources": ["B"],
  652. "first_seen": "T2",
  653. "last_updated": "T2",
  654. },
  655. ]
  656. merged = _merge_orphan_clusters(clusters)
  657. assert len(merged) == 2
  658. def test_stable_id_same_for_different_titles_same_url():
  659. """Two articles with the same URL but different titles (e.g. corrected
  660. headline) must produce the same cluster_id."""
  661. from news_mcp.dedup.cluster import _stable_cluster_id
  662. arts_a = [
  663. {"title": "Fed Raises Rates", "url": "https://example.com/fed-rates"},
  664. ]
  665. arts_b = [
  666. {"title": "Federal Reserve Increases Interest Rates", "url": "https://example.com/fed-rates"},
  667. ]
  668. id_a = _stable_cluster_id("macro", arts_a)
  669. id_b = _stable_cluster_id("macro", arts_b)
  670. assert id_a == id_b, f"Same URL must give same cluster_id: {id_a} vs {id_b}"
  def test_temporal_gate_excludes_stale_clusters():
      """Exercise the age-window predicate that gates merge candidates.

      A cluster last updated on 2025-01-01 must be rejected with a 4-hour
      window, a cluster updated just now must be accepted, and
      ``max_age_hours=0`` must accept the stale cluster as well.
      """
      from news_mcp.dedup.cluster import _cluster_is_within_age_window

      def _empty_cluster(cluster_id, last_updated):
          # Article-less cluster; only ``last_updated`` varies between cases.
          return {
              "cluster_id": cluster_id,
              "topic": "crypto",
              "last_updated": last_updated,
              "articles": [],
          }

      stale = _empty_cluster("old", "2025-01-01T00:00:00+00:00")
      assert not _cluster_is_within_age_window(stale, max_age_hours=4)

      fresh = _empty_cluster("recent", datetime.now(timezone.utc).isoformat())
      assert _cluster_is_within_age_window(fresh, max_age_hours=4)

      # A window of 0 hours means "no limit", so even the stale cluster passes.
      assert _cluster_is_within_age_window(stale, max_age_hours=0)
  def test_preseed_merge_into_existing_cluster():
      """A matching new article joins a pre-seeded cluster.

      With ``existing_clusters`` supplied, the incoming Bloomberg article
      must be merged into the existing Reuters cluster instead of starting
      a second cluster.
      """
      from news_mcp.dedup import cluster as dc

      seeded_at = "Mon, 30 Mar 2026 12:00:00 GMT"
      seeded_headline = "Trump warns Iran war could spread across Middle East"
      shared_summary = (
          "Trump warns Iran war could spread across the Middle East amid rising tensions."
      )

      seeded_article = {
          "title": seeded_headline,
          "url": "https://example.com/trump-iran",
          "source": "Reuters",
          "timestamp": seeded_at,
          "summary": shared_summary,
      }
      seeded_cluster = {
          "cluster_id": "existing-1",
          "topic": "other",
          "headline": seeded_headline,
          "summary": shared_summary,
          "sources": ["Reuters"],
          "timestamp": seeded_at,
          # Stamped with the current time; presumably this keeps the cluster
          # inside the max_age_hours window passed below.
          "last_updated": datetime.now(timezone.utc).isoformat(),
          "first_seen": seeded_at,
          "articles": [seeded_article],
          "entities": [],
          "sentiment": "neutral",
          "importance": 0.0,
      }

      # Same story, one word different in the title, different URL and source.
      incoming_article = {
          "title": "Trump warns Iran conflict could spread across Middle East",
          "url": "https://example.com/trump-iran-2",
          "source": "Bloomberg",
          "timestamp": "Mon, 30 Mar 2026 13:00:00 GMT",
          "summary": shared_summary,
      }

      # NOTE(review): the original author's comment says this threshold is
      # low enough for Jaccard title matching to catch the merge; confirm
      # against dedup_and_cluster_articles.
      clustered = dc.dedup_and_cluster_articles(
          [incoming_article],
          similarity_threshold=0.75,
          existing_clusters=[seeded_cluster],
          max_age_hours=4,
      )

      # Flatten the per-key cluster lists into a single list.
      all_clusters = []
      for clusters in clustered.values():
          all_clusters.extend(clusters)

      # Exactly one cluster survives: the seeded one, now holding both articles.
      assert len(all_clusters) == 1, f"Expected 1 cluster, got {len(all_clusters)}: {[c['headline'] for c in all_clusters]}"
      assert len(all_clusters[0]["articles"]) == 2
  def test_cross_cycle_merge_topic_mismatch():
      """Regression: a re-polled article merges despite a topic mismatch.

      The stored cluster carries an enriched topic that differs from the
      heuristic topic of the identical article arriving in the next cycle.
      Previously the cluster_id hash included the topic AND existing
      clusters were bucketed by enriched topic, so the mismatch silently
      produced two rows in the DB.
      """
      from news_mcp.dedup import cluster as dc
      from news_mcp.dedup.cluster import _stable_cluster_id

      url = (
          "https://breakingthenews.net/Article/"
          "Hegseth-says-US-will-keep-pressure-on-Iran/66401647"
      )
      headline = "Hegseth says US will keep pressure on Iran"
      published = "Sat, 30 May 2026 13:00:00 GMT"
      publisher = "Breaking The News"

      stored_article = {
          "title": headline,
          "url": url,
          "source": publisher,
          "timestamp": published,
          "summary": "",
      }
      stored_cluster = {
          "cluster_id": "old-id",
          # Enriched topic from a prior LLM pass, deliberately *different*
          # from what normalize_topic_from_title would return for the headline.
          "topic": "crypto",
          "headline": headline,
          "summary": "",
          "sources": [publisher],
          "timestamp": published,
          "last_updated": datetime.now(timezone.utc).isoformat(),
          "first_seen": published,
          "articles": [stored_article],
          "entities": ["Pete Hegseth", "Iran"],
          "sentiment": "negative",
          "sentimentScore": -0.5,
          "importance": 0.1,
      }

      # The same article shows up again in the next polling cycle.  Its
      # heuristic topic (normalize_topic_from_title) is "other" because no
      # keyword matches, which differs from the stored "crypto" topic.
      repolled_article = {
          "title": headline,
          "url": url,
          "source": publisher,
          "timestamp": published,
          "summary": "",
          # feed_url is used for per-feed hash tracking
          "feed_url": "https://breakingthenews.net/news-feed.xml",
          "importance": 0.11,
      }

      clustered = dc.dedup_and_cluster_articles(
          [repolled_article],
          existing_clusters=[stored_cluster],
          max_age_hours=4,
      )

      # Flatten the per-key cluster lists into a single list.
      all_clusters = []
      for clusters in clustered.values():
          all_clusters.extend(clusters)

      # Exactly one cluster must come out: the new article merges into the
      # stored one.  Before the fix the topic mismatch blocked the match and
      # two clusters with different cluster_ids were produced.
      assert len(all_clusters) == 1, (
          f"Expected 1 cluster, got {len(all_clusters)}: "
          f"{[c['headline'] for c in all_clusters]}"
      )

      # The survivor's cluster_id must be the same whichever topic wins,
      # i.e. the id is now derived purely from the article key.
      expected_cid = _stable_cluster_id(
          "other",
          [{"title": headline, "url": url}],
      )
      survivor = all_clusters[0]
      assert survivor["cluster_id"] == expected_cid

      # The original article must still be present in the merged cluster.
      article_urls = [a["url"] for a in survivor["articles"]]
      assert url in article_urls