test_news_mcp.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
  1. from __future__ import annotations
  2. from contextlib import contextmanager
  3. import tempfile
  4. from pathlib import Path
  5. from news_mcp.dedup.cluster import dedup_and_cluster_articles
  6. from news_mcp.storage.sqlite_store import SQLiteClusterStore
  7. from news_mcp.enrichment.importance import compute_importance
  8. from news_mcp.enrichment.llm_enrich import _filter_entities, _matches_blacklist
  9. from news_mcp.entity_normalize import normalize_query, normalize_entities
  10. from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt
  11. from news_mcp.trends_resolution import resolve_entity_via_trends
  12. from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency
  13. def _article(title: str, url: str = "https://example.com/x", source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
  14. return {
  15. "title": title,
  16. "url": url,
  17. "source": source,
  18. "timestamp": ts,
  19. "summary": "summary text",
  20. }
  21. def test_dedup_merges_similar_titles():
  22. articles = [
  23. _article("Trump warns Iran war could spread"),
  24. _article("Trump warns Iran conflict could spread"),
  25. _article("Unrelated sports result"),
  26. ]
  27. clustered = dedup_and_cluster_articles(articles, similarity_threshold=0.75)
  28. # We expect the Trump/Iran items to be merged into one cluster in the same topic bucket.
  29. total_clusters = sum(len(v) for v in clustered.values())
  30. assert total_clusters == 2
  31. def test_sqlite_feed_hash_roundtrip():
  32. with tempfile.TemporaryDirectory() as td:
  33. db = Path(td) / "news.sqlite"
  34. store = SQLiteClusterStore(db)
  35. assert store.get_feed_hash("breakingthenews") is None
  36. store.set_feed_hash("breakingthenews", "abc123")
  37. assert store.get_feed_hash("breakingthenews") == "abc123"
  38. def test_sqlite_summary_cache_roundtrip():
  39. with tempfile.TemporaryDirectory() as td:
  40. db = Path(td) / "news.sqlite"
  41. store = SQLiteClusterStore(db)
  42. # Upsert a base cluster first.
  43. store.upsert_clusters([
  44. {
  45. "cluster_id": "cid1",
  46. "headline": "Headline",
  47. "summary": "Summary",
  48. "entities": ["Iran"],
  49. "sentiment": "negative",
  50. "importance": 0.5,
  51. "sources": ["BreakingTheNews"],
  52. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  53. "articles": [],
  54. "first_seen": "Mon, 30 Mar 2026 12:00:00 GMT",
  55. "last_updated": "Mon, 30 Mar 2026 12:00:00 GMT",
  56. }
  57. ], topic="other")
  58. store.upsert_cluster_summary(
  59. "cid1",
  60. {
  61. "headline": "Headline",
  62. "mergedSummary": "Merged summary",
  63. "keyFacts": ["Fact 1"],
  64. "sources": ["BreakingTheNews"],
  65. },
  66. )
  67. cached = store.get_cluster_summary("cid1", ttl_hours=24)
  68. assert cached is not None
  69. assert cached["mergedSummary"] == "Merged summary"
  70. assert cached["keyFacts"] == ["Fact 1"]
  71. def test_sqlite_summary_cache_does_not_create_placeholder_row():
  72. with tempfile.TemporaryDirectory() as td:
  73. db = Path(td) / "news.sqlite"
  74. store = SQLiteClusterStore(db)
  75. store.upsert_cluster_summary(
  76. "missing",
  77. {
  78. "headline": "Missing",
  79. "mergedSummary": "Summary",
  80. "keyFacts": [],
  81. "sources": [],
  82. },
  83. )
  84. assert store.get_cluster_by_id("missing") is None
  85. assert store.get_cluster_summary("missing", ttl_hours=24) is None
  86. def test_prune_clusters_deletes_rows_older_than_retention():
  87. with tempfile.TemporaryDirectory() as td:
  88. db = Path(td) / "news.sqlite"
  89. store = SQLiteClusterStore(db)
  90. store.upsert_clusters([
  91. {
  92. "cluster_id": "fresh",
  93. "headline": "Fresh",
  94. "summary": "Fresh summary",
  95. "entities": ["Bitcoin"],
  96. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  97. "articles": [],
  98. },
  99. {
  100. "cluster_id": "stale",
  101. "headline": "Stale",
  102. "summary": "Stale summary",
  103. "entities": ["Iran"],
  104. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  105. "articles": [],
  106. },
  107. ], topic="other")
  108. with store._conn() as conn:
  109. conn.execute(
  110. "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
  111. ("2025-01-01T00:00:00+00:00", "stale"),
  112. )
  113. deleted = store.prune_clusters(retention_days=30)
  114. assert deleted == 1
  115. assert store.get_cluster_by_id("stale") is None
  116. assert store.get_cluster_by_id("fresh") is not None
  117. assert store.get_prune_state(pruning_enabled=True, retention_days=30, interval_hours=24)["last_prune_at"] is not None
  118. def test_prune_if_due_skips_deletes_when_pruning_disabled():
  119. with tempfile.TemporaryDirectory() as td:
  120. db = Path(td) / "news.sqlite"
  121. store = SQLiteClusterStore(db)
  122. store.upsert_clusters([
  123. {
  124. "cluster_id": "stale",
  125. "headline": "Stale",
  126. "summary": "Stale summary",
  127. "entities": ["Iran"],
  128. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  129. "articles": [],
  130. }
  131. ], topic="other")
  132. with store._conn() as conn:
  133. conn.execute(
  134. "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
  135. ("2025-01-01T00:00:00+00:00", "stale"),
  136. )
  137. result = store.prune_if_due(pruning_enabled=False, retention_days=30, interval_hours=24)
  138. assert result["enabled"] is False
  139. assert result["deleted"] == 0
  140. assert store.get_cluster_by_id("stale") is not None
  141. def test_get_latest_clusters_orders_by_updated_at_before_limit():
  142. with tempfile.TemporaryDirectory() as td:
  143. db = Path(td) / "news.sqlite"
  144. store = SQLiteClusterStore(db)
  145. store.upsert_clusters(
  146. [
  147. {
  148. "cluster_id": "old",
  149. "headline": "Old",
  150. "summary": "Old summary",
  151. "entities": ["Iran"],
  152. "timestamp": "Wed, 01 Apr 2026 09:00:00 GMT",
  153. "articles": [],
  154. },
  155. {
  156. "cluster_id": "new",
  157. "headline": "New",
  158. "summary": "New summary",
  159. "entities": ["Bitcoin"],
  160. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  161. "articles": [],
  162. },
  163. ],
  164. topic="crypto",
  165. )
  166. with store._conn() as conn:
  167. conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2025-01-01T00:00:00+00:00", "new"))
  168. conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2026-01-01T00:00:00+00:00", "old"))
  169. latest = store.get_latest_clusters(topic="crypto", ttl_hours=24 * 365, limit=1)
  170. assert len(latest) == 1
  171. assert latest[0]["cluster_id"] == "new"
  172. def test_get_entity_metadata_prefers_mid_scoped_row():
  173. with tempfile.TemporaryDirectory() as td:
  174. db = Path(td) / "news.sqlite"
  175. store = SQLiteClusterStore(db)
  176. store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid=None, sources=["local"])
  177. store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid="/m/Bitcoin", sources=["trends"])
  178. store.record_entity_request("Bitcoin", mid="/m/Bitcoin")
  179. meta = store.get_entity_metadata("Bitcoin")
  180. assert meta is not None
  181. assert meta["mid"] == "/m/Bitcoin"
  182. def test_blacklist_filters_entities_case_insensitively():
  183. entities = ["Bloomberg", "Reuters", "bloomberg", "CoinDesk"]
  184. filtered = _filter_entities(entities, blacklist=["bloomberg"])
  185. assert filtered == ["Reuters", "CoinDesk"]
  186. def test_blacklist_supports_wildcards():
  187. assert _matches_blacklist("Bloomberg Economics", blacklist=["bloomberg*"])
  188. assert _matches_blacklist("bloomberg", blacklist=["*berg"])
  189. assert not _matches_blacklist("Reuters", blacklist=["bloomberg*"])
  190. def test_query_normalization_keeps_common_shorthand_working():
  191. assert normalize_query("btc") == "Bitcoin"
  192. assert normalize_query("Trump") == "Donald Trump"
  193. assert normalize_query("nvidia") == "nvidia"
  194. def test_entity_normalization_deduplicates_aliases():
  195. assert normalize_entities(["btc", "Bitcoin", "BTC", "Ethereum"]) == ["Bitcoin", "Ethereum"]
  196. def test_load_prompt_reads_prompt_files():
  197. text = load_prompt("extract_entities.prompt")
  198. assert "Return STRICT JSON" in text
  199. def test_resolve_entity_falls_back_cleanly_when_provider_unavailable(monkeypatch):
  200. import news_mcp.trends_resolution as trends_resolution
  201. trends_resolution.resolve_entity_via_trends.cache_clear()
  202. trends_resolution._provider.cache_clear()
  203. monkeypatch.setattr(trends_resolution, "_provider", lambda: None)
  204. resolved = resolve_entity_via_trends("btc")
  205. assert resolved["normalized"] == "Bitcoin"
  206. assert resolved["canonical_label"] == "Bitcoin"
  207. assert resolved["mid"] is None
  208. assert resolved["candidates"] == []
  209. assert resolved["source"] == "fallback"
  210. trends_resolution.resolve_entity_via_trends.cache_clear()
  211. def test_sort_clusters_by_recency_prefers_newer_timestamp_over_importance():
  212. clusters = [
  213. {"headline": "older", "timestamp": "Wed, 01 Apr 2026 10:00:00 GMT", "importance": 0.9},
  214. {"headline": "newer", "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT", "importance": 0.1},
  215. ]
  216. sorted_clusters = _sort_clusters_by_recency(clusters)
  217. assert [c["headline"] for c in sorted_clusters] == ["newer", "older"]
  218. def test_build_extraction_prompt_is_stable_without_blacklist():
  219. cluster = {
  220. "headline": "Bloomberg reports Bitcoin rallies after US rate comments",
  221. "summary": "A report from Bloomberg says Bitcoin moved higher after comments from the Fed.",
  222. "articles": [],
  223. }
  224. prompt = build_extraction_prompt(cluster)
  225. assert "Bloomberg reports Bitcoin rallies" in prompt
  226. assert "Do NOT return empty entities" in prompt
  227. assert "Bloomberg" in prompt # present in the input, not filtered here
  228. def test_call_llm_dispatches_to_selected_provider(monkeypatch):
  229. async def fake_groq(model, messages, response_json=True):
  230. return '{"ok": true, "provider": "groq"}'
  231. async def fake_openai(model, messages, response_json=True):
  232. return '{"ok": true, "provider": "openai"}'
  233. monkeypatch.setattr("news_mcp.llm._call_groq", fake_groq)
  234. monkeypatch.setattr("news_mcp.llm._call_openai", fake_openai)
  235. import asyncio
  236. groq = asyncio.run(call_llm("groq", "x", "sys", "user"))
  237. openai = asyncio.run(call_llm("openai", "x", "sys", "user"))
  238. assert '"provider": "groq"' in groq
  239. assert '"provider": "openai"' in openai
  240. def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
  241. import news_mcp.jobs.poller as poller
  242. import hashlib
  243. from news_mcp.config import NEWS_FEED_URL, NEWS_FEED_URLS
  244. calls = {"fetch": 0, "cluster": 0, "enrich": 0, "classify": 0}
  245. rss_urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()] or [NEWS_FEED_URL]
  246. material = "\n".join(
  247. [
  248. "Bitcoin rallies|https://example.com/a|Wed, 01 Apr 2026 12:00:00 GMT",
  249. ]
  250. )
  251. expected_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
  252. async def fake_to_thread(fn, limit):
  253. calls["fetch"] += 1
  254. return [
  255. {
  256. "title": "Bitcoin rallies",
  257. "url": "https://example.com/a",
  258. "source": "Src",
  259. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  260. "summary": "summary",
  261. }
  262. ]
  263. def fake_cluster(articles):
  264. calls["cluster"] += 1
  265. return {
  266. "crypto": [
  267. {
  268. "cluster_id": "cid",
  269. "headline": "Bitcoin rallies",
  270. "summary": "summary",
  271. "entities": [],
  272. "sentiment": "neutral",
  273. "importance": 0.0,
  274. "sources": ["Src"],
  275. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  276. "articles": [],
  277. }
  278. ]
  279. }
  280. def fake_enrich(cluster):
  281. calls["enrich"] += 1
  282. return cluster
  283. async def fake_classify(cluster):
  284. calls["classify"] += 1
  285. return cluster
  286. class DummyStore:
  287. def __init__(self, *args, **kwargs):
  288. self.meta = {}
  289. self.feed_hash = expected_hash
  290. @contextmanager
  291. def _conn(self):
  292. class _Conn:
  293. def execute(self, *args, **kwargs):
  294. return None
  295. yield _Conn()
  296. def get_feed_hash(self, feed_key):
  297. return self.feed_hash
  298. def set_feed_hash(self, feed_key, last_hash):
  299. self.feed_hash = last_hash
  300. def set_feed_state(self, feed_key, last_hash, item_count):
  301. self.feed_hash = last_hash
  302. def get_cluster_by_id(self, cluster_id):
  303. return None
  304. def upsert_clusters(self, clusters, topic):
  305. self.meta["upserted"] = (len(clusters), topic)
  306. def prune_if_due(self, **kwargs):
  307. self.meta["prune"] = kwargs
  308. return {"deleted": 0}
  309. def set_meta(self, key, value):
  310. self.meta[key] = value
  311. monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore)
  312. async def _mock_fetch(limit):
  313. calls["fetch"] += 1
  314. return [{"title": "Bitcoin rallies", "url": "https://example.com/a", "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT"}]
  315. monkeypatch.setattr(poller, "fetch_news_articles", _mock_fetch)
  316. monkeypatch.setattr(poller.asyncio, "to_thread", fake_to_thread)
  317. monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster)
  318. monkeypatch.setattr(poller, "enrich_cluster", fake_enrich)
  319. monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify)
  320. poller.store = None
  321. async def run_once():
  322. await poller.refresh_clusters(topic=None, limit=80)
  323. import asyncio
  324. asyncio.run(run_once())
  325. assert calls["fetch"] == 1
  326. assert calls["cluster"] == 0
  327. assert calls["enrich"] == 0
  328. assert calls["classify"] == 0
  329. def test_importance_prefers_llm_signal():
  330. # Two clusters with same coverage but different sentiment magnitude.
  331. base = {
  332. "sources": ["A", "B"],
  333. "articles": [{}, {}],
  334. "sentiment": "neutral",
  335. "sentimentScore": 0.0,
  336. }
  337. pos = dict(base, sentimentScore=0.9)
  338. neg = dict(base, sentimentScore=-0.8)
  339. imp_base = compute_importance(base)
  340. imp_pos = compute_importance(pos)
  341. imp_neg = compute_importance(neg)
  342. assert imp_pos >= imp_base
  343. assert imp_neg >= imp_base
  344. # ---------------------------------------------------------------------------
  345. # Regression tests for the May 2026 correctness pass
  346. # ---------------------------------------------------------------------------
  347. def test_classify_cluster_llm_uses_llm_topic_and_drops_invalid_ones(monkeypatch):
  348. """The LLM-extracted topic must propagate to the returned cluster, but
  349. free-form / hallucinated topic strings must be coerced into the allowed
  350. set so they never reach the SQL row column verbatim."""
  351. import asyncio
  352. from news_mcp.enrichment import llm_enrich
  353. async def fake_extraction(cluster):
  354. return {
  355. "topic": "regulation",
  356. "entities": ["SEC"],
  357. "sentiment": "neutral",
  358. "sentimentScore": 0.0,
  359. "keywords": ["enforcement"],
  360. }
  361. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction)
  362. monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None})
  363. cluster = {"cluster_id": "x", "headline": "SEC fines firm", "summary": "...", "topic": "other"}
  364. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  365. assert out["topic"] == "regulation"
  366. # Hallucinated topic is rejected; we fall back to the input cluster's
  367. # heuristic topic when it is one of the allowed ones.
  368. async def fake_extraction_garbage(cluster):
  369. return {
  370. "topic": "geopolitics-and-stuff",
  371. "entities": ["NATO"],
  372. "sentiment": "neutral",
  373. "sentimentScore": 0.0,
  374. "keywords": [],
  375. }
  376. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction_garbage)
  377. cluster = {"cluster_id": "y", "headline": "NATO meets", "summary": "...", "topic": "macro"}
  378. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  379. assert out["topic"] == "macro" # heuristic fallback
  380. # When neither the LLM nor the heuristic gives a valid label -> "other".
  381. cluster = {"cluster_id": "z", "headline": "...", "summary": "...", "topic": "geopolitics-bucket"}
  382. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  383. assert out["topic"] == "other"
  384. def test_classify_cluster_llm_normalizes_aliases_before_blacklist(monkeypatch):
  385. """Regression: previously ``_filter_entities`` ran before
  386. ``normalize_entities``, so blacklisting "bitcoin" missed entries the LLM
  387. returned as the alias "btc". Order is now normalize -> blacklist."""
  388. import asyncio
  389. from news_mcp.enrichment import llm_enrich
  390. async def fake_extraction(cluster):
  391. return {
  392. "topic": "crypto",
  393. "entities": ["btc", "Reuters"],
  394. "sentiment": "neutral",
  395. "sentimentScore": 0.0,
  396. "keywords": ["btc rally", "Reuters"],
  397. }
  398. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction)
  399. monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None})
  400. monkeypatch.setattr(llm_enrich, "NEWS_ENTITY_BLACKLIST", ["bitcoin"])
  401. cluster = {"cluster_id": "x", "headline": "BTC up", "summary": "...", "topic": "crypto"}
  402. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  403. # "btc" became "Bitcoin" via aliasing, then was filtered out by the
  404. # blacklist. "Reuters" survives (not blacklisted in this test).
  405. assert "Bitcoin" not in out["entities"]
  406. assert "btc" not in [e.lower() for e in out["entities"]]
  407. assert "Reuters" in out["entities"]
  408. def test_dedup_uses_jaccard_when_titles_diverge():
  409. """Composite similarity: even with embeddings off, two articles whose
  410. titles share only some tokens should still merge if their content (token
  411. overlap) is high enough."""
  412. from news_mcp.dedup import cluster as dc
  413. # Titles differ heavily; bodies overlap heavily -> Jaccard should catch.
  414. articles = [
  415. {
  416. "title": "Iran tension rises",
  417. "url": "https://example.com/a",
  418. "source": "A",
  419. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  420. "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
  421. },
  422. {
  423. "title": "Trump issues stark warning over Tehran",
  424. "url": "https://example.com/b",
  425. "source": "B",
  426. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  427. "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
  428. },
  429. ]
  430. clustered = dc.dedup_and_cluster_articles(articles)
  431. total = sum(len(v) for v in clustered.values())
  432. assert total == 1, f"Expected 1 merged cluster via Jaccard signal, got {total}"
  433. def test_dedup_does_not_merge_unrelated_articles():
  434. """Negative control: cluster is robust against false-positives even with
  435. the more permissive multi-signal merging."""
  436. from news_mcp.dedup import cluster as dc
  437. articles = [
  438. {
  439. "title": "Bitcoin hits new high",
  440. "url": "https://example.com/a",
  441. "source": "A",
  442. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  443. "summary": "Bitcoin reached a record high amid rising demand.",
  444. },
  445. {
  446. "title": "Local sports team wins",
  447. "url": "https://example.com/b",
  448. "source": "B",
  449. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  450. "summary": "The local team won the regional championship.",
  451. },
  452. ]
  453. clustered = dc.dedup_and_cluster_articles(articles)
  454. total = sum(len(v) for v in clustered.values())
  455. assert total == 2
  456. def test_get_all_feed_states_returns_all_rows():
  457. """Health endpoint regression: the writer keys feed state with a hashed
  458. multi-feed key, so the old hardcoded ``get_feed_state("breakingthenews")``
  459. always returned None. Verify the bulk getter works."""
  460. import tempfile
  461. from pathlib import Path
  462. with tempfile.TemporaryDirectory() as td:
  463. db = Path(td) / "news.sqlite"
  464. store = SQLiteClusterStore(db)
  465. store.set_feed_hash("newsfeeds:abc123", "hash1")
  466. store.set_feed_hash("newsfeeds:def456", "hash2")
  467. all_states = store.get_all_feed_states()
  468. assert len(all_states) == 2
  469. keys = {s["feed_key"] for s in all_states}
  470. assert keys == {"newsfeeds:abc123", "newsfeeds:def456"}
  471. def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
  472. """Regression: the SQL row-column ``topic`` previously locked in the
  473. headline-heuristic value (which is ``other`` for most stories) and ignored
  474. the LLM's classification stored in the payload. Verify the upsert now uses
  475. the post-enrichment topic so SQL filtering and dashboard groupings see the
  476. real classification."""
  477. import asyncio
  478. import news_mcp.jobs.poller as poller
  479. captured = {"upserts": []}
  480. class DummyStore:
  481. def __init__(self, *args, **kwargs):
  482. pass
  483. @contextmanager
  484. def _conn(self):
  485. class _Conn:
  486. def execute(self, *args, **kwargs):
  487. return None
  488. yield _Conn()
  489. def get_feed_hash(self, feed_key):
  490. return None
  491. def set_feed_hash(self, feed_key, last_hash):
  492. pass
  493. def set_feed_state(self, feed_key, last_hash, item_count):
  494. pass
  495. def get_cluster_by_id(self, cluster_id):
  496. return None
  497. def upsert_clusters(self, clusters, topic):
  498. # Capture the topic the poller chose for each cluster.
  499. for c in clusters:
  500. captured["upserts"].append({"row_topic": topic, "payload_topic": c.get("topic"), "cluster_id": c.get("cluster_id")})
  501. def prune_if_due(self, **kwargs):
  502. return {"deleted": 0}
  503. def get_failed_enrichment_clusters(self, max_retries=3):
  504. return []
  505. def set_meta(self, key, value):
  506. pass
  507. def set_feed_state(self, feed_key, last_hash, item_count):
  508. pass
  509. def fake_cluster(articles):
  510. # Heuristic put it in "other" (no crypto/macro/regulation/ai keywords
  511. # in the title for the heuristic matcher — title above does have
  512. # "law"-adjacent words but not the specific tokens it matches).
  513. return {
  514. "other": [
  515. {
  516. "cluster_id": "cid",
  517. "headline": "SEC fines firm",
  518. "summary": "...",
  519. "topic": "other",
  520. "entities": [],
  521. "sentiment": "neutral",
  522. "importance": 0.0,
  523. "sources": ["S"],
  524. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  525. "articles": [],
  526. }
  527. ]
  528. }
  529. def fake_enrich(cluster):
  530. return cluster
  531. async def fake_classify(cluster):
  532. # The LLM thinks it's regulation -> the SQL row column must reflect that.
  533. out = dict(cluster)
  534. out["topic"] = "regulation"
  535. out["entities"] = ["SEC"]
  536. out["entityResolutions"] = []
  537. out["sentiment"] = "neutral"
  538. out["sentimentScore"] = 0.0
  539. out["keywords"] = []
  540. return out
  541. monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore)
  542. async def _mock_fetch2(limit):
  543. return [
  544. {"title": "SEC fines firm", "url": "https://example.com/a", "source": "S",
  545. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT", "summary": "..."},
  546. ]
  547. monkeypatch.setattr(poller, "fetch_news_articles", _mock_fetch2)
  548. monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster)
  549. monkeypatch.setattr(poller, "enrich_cluster", fake_enrich)
  550. monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify)
  551. asyncio.run(poller.refresh_clusters(topic=None, limit=10))
  552. assert captured["upserts"], "Expected at least one upsert call"
  553. # The poller first stores raw clusters (topic=heuristic), then enriched
  554. # clusters (topic=post-LLM). The enriched upsert is the one whose row_topic
  555. # reflects the LLM classification.
  556. enriched_upserts = [u for u in captured["upserts"] if u["row_topic"] == "regulation"]
  557. assert enriched_upserts, (
  558. f"Expected at least one upsert with row_topic='regulation', "
  559. f"got topics: {[u['row_topic'] for u in captured['upserts']]}"
  560. )
  561. upsert = enriched_upserts[0]
  562. assert upsert["row_topic"] == "regulation", (
  563. f"Expected SQL row topic to follow the LLM's classification 'regulation', got {upsert['row_topic']!r}"
  564. )
  565. assert upsert["payload_topic"] == "regulation"