test_news_mcp.py 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708
  1. from __future__ import annotations
  2. from contextlib import contextmanager
  3. import tempfile
  4. from pathlib import Path
  5. from news_mcp.dedup.cluster import dedup_and_cluster_articles
  6. from news_mcp.storage.sqlite_store import SQLiteClusterStore
  7. from news_mcp.enrichment.importance import compute_importance
  8. from news_mcp.enrichment.llm_enrich import _filter_entities, _matches_blacklist
  9. from news_mcp.entity_normalize import normalize_query, normalize_entities
  10. from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt
  11. from news_mcp.trends_resolution import resolve_entity_via_trends
  12. from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency
  13. def _article(title: str, url: str = "https://example.com/x", source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
  14. return {
  15. "title": title,
  16. "url": url,
  17. "source": source,
  18. "timestamp": ts,
  19. "summary": "summary text",
  20. }
  21. def test_dedup_merges_similar_titles():
  22. articles = [
  23. _article("Trump warns Iran war could spread"),
  24. _article("Trump warns Iran conflict could spread"),
  25. _article("Unrelated sports result"),
  26. ]
  27. clustered = dedup_and_cluster_articles(articles, similarity_threshold=0.75)
  28. # We expect the Trump/Iran items to be merged into one cluster in the same topic bucket.
  29. total_clusters = sum(len(v) for v in clustered.values())
  30. assert total_clusters == 2
  31. def test_sqlite_feed_hash_roundtrip():
  32. with tempfile.TemporaryDirectory() as td:
  33. db = Path(td) / "news.sqlite"
  34. store = SQLiteClusterStore(db)
  35. assert store.get_feed_hash("breakingthenews") is None
  36. store.set_feed_hash("breakingthenews", "abc123")
  37. assert store.get_feed_hash("breakingthenews") == "abc123"
  38. def test_sqlite_summary_cache_roundtrip():
  39. with tempfile.TemporaryDirectory() as td:
  40. db = Path(td) / "news.sqlite"
  41. store = SQLiteClusterStore(db)
  42. # Upsert a base cluster first.
  43. store.upsert_clusters([
  44. {
  45. "cluster_id": "cid1",
  46. "headline": "Headline",
  47. "summary": "Summary",
  48. "entities": ["Iran"],
  49. "sentiment": "negative",
  50. "importance": 0.5,
  51. "sources": ["BreakingTheNews"],
  52. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  53. "articles": [],
  54. "first_seen": "Mon, 30 Mar 2026 12:00:00 GMT",
  55. "last_updated": "Mon, 30 Mar 2026 12:00:00 GMT",
  56. }
  57. ], topic="other")
  58. store.upsert_cluster_summary(
  59. "cid1",
  60. {
  61. "headline": "Headline",
  62. "mergedSummary": "Merged summary",
  63. "keyFacts": ["Fact 1"],
  64. "sources": ["BreakingTheNews"],
  65. },
  66. )
  67. cached = store.get_cluster_summary("cid1", ttl_hours=24)
  68. assert cached is not None
  69. assert cached["mergedSummary"] == "Merged summary"
  70. assert cached["keyFacts"] == ["Fact 1"]
  71. def test_sqlite_summary_cache_does_not_create_placeholder_row():
  72. with tempfile.TemporaryDirectory() as td:
  73. db = Path(td) / "news.sqlite"
  74. store = SQLiteClusterStore(db)
  75. store.upsert_cluster_summary(
  76. "missing",
  77. {
  78. "headline": "Missing",
  79. "mergedSummary": "Summary",
  80. "keyFacts": [],
  81. "sources": [],
  82. },
  83. )
  84. assert store.get_cluster_by_id("missing") is None
  85. assert store.get_cluster_summary("missing", ttl_hours=24) is None
  86. def test_prune_clusters_deletes_rows_older_than_retention():
  87. with tempfile.TemporaryDirectory() as td:
  88. db = Path(td) / "news.sqlite"
  89. store = SQLiteClusterStore(db)
  90. store.upsert_clusters([
  91. {
  92. "cluster_id": "fresh",
  93. "headline": "Fresh",
  94. "summary": "Fresh summary",
  95. "entities": ["Bitcoin"],
  96. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  97. "articles": [],
  98. },
  99. {
  100. "cluster_id": "stale",
  101. "headline": "Stale",
  102. "summary": "Stale summary",
  103. "entities": ["Iran"],
  104. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  105. "articles": [],
  106. },
  107. ], topic="other")
  108. with store._conn() as conn:
  109. conn.execute(
  110. "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
  111. ("2025-01-01T00:00:00+00:00", "stale"),
  112. )
  113. deleted = store.prune_clusters(retention_days=30)
  114. assert deleted == 1
  115. assert store.get_cluster_by_id("stale") is None
  116. assert store.get_cluster_by_id("fresh") is not None
  117. assert store.get_prune_state(pruning_enabled=True, retention_days=30, interval_hours=24)["last_prune_at"] is not None
  118. def test_prune_if_due_skips_deletes_when_pruning_disabled():
  119. with tempfile.TemporaryDirectory() as td:
  120. db = Path(td) / "news.sqlite"
  121. store = SQLiteClusterStore(db)
  122. store.upsert_clusters([
  123. {
  124. "cluster_id": "stale",
  125. "headline": "Stale",
  126. "summary": "Stale summary",
  127. "entities": ["Iran"],
  128. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  129. "articles": [],
  130. }
  131. ], topic="other")
  132. with store._conn() as conn:
  133. conn.execute(
  134. "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
  135. ("2025-01-01T00:00:00+00:00", "stale"),
  136. )
  137. result = store.prune_if_due(pruning_enabled=False, retention_days=30, interval_hours=24)
  138. assert result["enabled"] is False
  139. assert result["deleted"] == 0
  140. assert store.get_cluster_by_id("stale") is not None
  141. def test_get_latest_clusters_orders_by_updated_at_before_limit():
  142. with tempfile.TemporaryDirectory() as td:
  143. db = Path(td) / "news.sqlite"
  144. store = SQLiteClusterStore(db)
  145. store.upsert_clusters(
  146. [
  147. {
  148. "cluster_id": "old",
  149. "headline": "Old",
  150. "summary": "Old summary",
  151. "entities": ["Iran"],
  152. "timestamp": "Wed, 01 Apr 2026 09:00:00 GMT",
  153. "articles": [],
  154. },
  155. {
  156. "cluster_id": "new",
  157. "headline": "New",
  158. "summary": "New summary",
  159. "entities": ["Bitcoin"],
  160. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  161. "articles": [],
  162. },
  163. ],
  164. topic="crypto",
  165. )
  166. with store._conn() as conn:
  167. conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2025-01-01T00:00:00+00:00", "new"))
  168. conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2026-01-01T00:00:00+00:00", "old"))
  169. latest = store.get_latest_clusters(topic="crypto", ttl_hours=24 * 365, limit=1)
  170. assert len(latest) == 1
  171. assert latest[0]["cluster_id"] == "new"
  172. def test_get_entity_metadata_prefers_mid_scoped_row():
  173. with tempfile.TemporaryDirectory() as td:
  174. db = Path(td) / "news.sqlite"
  175. store = SQLiteClusterStore(db)
  176. store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid=None, sources=["local"])
  177. store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid="/m/Bitcoin", sources=["trends"])
  178. store.record_entity_request("Bitcoin", mid="/m/Bitcoin")
  179. meta = store.get_entity_metadata("Bitcoin")
  180. assert meta is not None
  181. assert meta["mid"] == "/m/Bitcoin"
  182. def test_blacklist_filters_entities_case_insensitively():
  183. entities = ["Bloomberg", "Reuters", "bloomberg", "CoinDesk"]
  184. filtered = _filter_entities(entities, blacklist=["bloomberg"])
  185. assert filtered == ["Reuters", "CoinDesk"]
  186. def test_blacklist_supports_wildcards():
  187. assert _matches_blacklist("Bloomberg Economics", blacklist=["bloomberg*"])
  188. assert _matches_blacklist("bloomberg", blacklist=["*berg"])
  189. assert not _matches_blacklist("Reuters", blacklist=["bloomberg*"])
  190. def test_query_normalization_keeps_common_shorthand_working():
  191. assert normalize_query("btc") == "Bitcoin"
  192. assert normalize_query("Trump") == "Donald Trump"
  193. assert normalize_query("nvidia") == "nvidia"
  194. def test_entity_normalization_deduplicates_aliases():
  195. assert normalize_entities(["btc", "Bitcoin", "BTC", "Ethereum"]) == ["Bitcoin", "Ethereum"]
  196. def test_load_prompt_reads_prompt_files():
  197. text = load_prompt("extract_entities.prompt")
  198. assert "Return STRICT JSON" in text
  199. def test_resolve_entity_falls_back_cleanly_when_provider_unavailable(monkeypatch):
  200. import news_mcp.trends_resolution as trends_resolution
  201. trends_resolution.resolve_entity_via_trends.cache_clear()
  202. trends_resolution._provider.cache_clear()
  203. monkeypatch.setattr(trends_resolution, "_provider", lambda: None)
  204. resolved = resolve_entity_via_trends("btc")
  205. assert resolved["normalized"] == "Bitcoin"
  206. assert resolved["canonical_label"] == "Bitcoin"
  207. assert resolved["mid"] is None
  208. assert resolved["candidates"] == []
  209. assert resolved["source"] == "fallback"
  210. trends_resolution.resolve_entity_via_trends.cache_clear()
  211. def test_sort_clusters_by_recency_prefers_newer_timestamp_over_importance():
  212. clusters = [
  213. {"headline": "older", "timestamp": "Wed, 01 Apr 2026 10:00:00 GMT", "importance": 0.9},
  214. {"headline": "newer", "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT", "importance": 0.1},
  215. ]
  216. sorted_clusters = _sort_clusters_by_recency(clusters)
  217. assert [c["headline"] for c in sorted_clusters] == ["newer", "older"]
  218. def test_build_extraction_prompt_is_stable_without_blacklist():
  219. cluster = {
  220. "headline": "Bloomberg reports Bitcoin rallies after US rate comments",
  221. "summary": "A report from Bloomberg says Bitcoin moved higher after comments from the Fed.",
  222. "articles": [],
  223. }
  224. prompt = build_extraction_prompt(cluster)
  225. assert "Bloomberg reports Bitcoin rallies" in prompt
  226. assert "Do NOT return empty entities" in prompt
  227. assert "Bloomberg" in prompt # present in the input, not filtered here
  228. def test_call_llm_dispatches_to_selected_provider(monkeypatch):
  229. async def fake_groq(model, messages, response_json=True):
  230. return '{"ok": true, "provider": "groq"}'
  231. async def fake_openai(model, messages, response_json=True):
  232. return '{"ok": true, "provider": "openai"}'
  233. monkeypatch.setattr("news_mcp.llm._call_groq", fake_groq)
  234. monkeypatch.setattr("news_mcp.llm._call_openai", fake_openai)
  235. import asyncio
  236. groq = asyncio.run(call_llm("groq", "x", "sys", "user"))
  237. openai = asyncio.run(call_llm("openai", "x", "sys", "user"))
  238. assert '"provider": "groq"' in groq
  239. assert '"provider": "openai"' in openai
  240. def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
  241. import news_mcp.jobs.poller as poller
  242. import hashlib
  243. from news_mcp.config import NEWS_FEED_URL, NEWS_FEED_URLS
  244. calls = {"fetch": 0, "cluster": 0, "enrich": 0, "classify": 0}
  245. rss_urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()] or [NEWS_FEED_URL]
  246. material = "\n".join(
  247. [
  248. "Bitcoin rallies|https://example.com/a|Wed, 01 Apr 2026 12:00:00 GMT",
  249. ]
  250. )
  251. expected_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
  252. async def fake_to_thread(fn, limit):
  253. calls["fetch"] += 1
  254. return [
  255. {
  256. "title": "Bitcoin rallies",
  257. "url": "https://example.com/a",
  258. "source": "Src",
  259. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  260. "summary": "summary",
  261. }
  262. ]
  263. def fake_cluster(articles):
  264. calls["cluster"] += 1
  265. return {
  266. "crypto": [
  267. {
  268. "cluster_id": "cid",
  269. "headline": "Bitcoin rallies",
  270. "summary": "summary",
  271. "entities": [],
  272. "sentiment": "neutral",
  273. "importance": 0.0,
  274. "sources": ["Src"],
  275. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  276. "articles": [],
  277. }
  278. ]
  279. }
  280. def fake_enrich(cluster):
  281. calls["enrich"] += 1
  282. return cluster
  283. async def fake_classify(cluster):
  284. calls["classify"] += 1
  285. return cluster
  286. class DummyStore:
  287. def __init__(self, *args, **kwargs):
  288. self.meta = {}
  289. self.feed_hash = expected_hash
  290. @contextmanager
  291. def _conn(self):
  292. class _Conn:
  293. def execute(self, *args, **kwargs):
  294. return None
  295. yield _Conn()
  296. def get_feed_hash(self, feed_key):
  297. return self.feed_hash
  298. def set_feed_hash(self, feed_key, last_hash):
  299. self.feed_hash = last_hash
  300. def set_feed_state(self, feed_key, last_hash, item_count):
  301. self.feed_hash = last_hash
  302. def get_enabled_feed_urls(self, feed_urls):
  303. return feed_urls
  304. def get_cluster_by_id(self, cluster_id):
  305. return None
  306. def upsert_clusters(self, clusters, topic):
  307. self.meta["upserted"] = (len(clusters), topic)
  308. def prune_if_due(self, **kwargs):
  309. self.meta["prune"] = kwargs
  310. return {"deleted": 0}
  311. def set_meta(self, key, value):
  312. self.meta[key] = value
  313. monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore)
  314. async def _mock_fetch(limit, url_list=None):
  315. calls["fetch"] += 1
  316. return [{"title": "Bitcoin rallies", "url": "https://example.com/a", "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT"}]
  317. monkeypatch.setattr(poller, "fetch_news_articles", _mock_fetch)
  318. monkeypatch.setattr(poller.asyncio, "to_thread", fake_to_thread)
  319. monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster)
  320. monkeypatch.setattr(poller, "enrich_cluster", fake_enrich)
  321. monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify)
  322. poller.store = None
  323. async def run_once():
  324. await poller.refresh_clusters(topic=None, limit=80)
  325. import asyncio
  326. asyncio.run(run_once())
  327. assert calls["fetch"] == 1
  328. assert calls["cluster"] == 0
  329. assert calls["enrich"] == 0
  330. assert calls["classify"] == 0
  331. def test_importance_prefers_llm_signal():
  332. # Two clusters with same coverage but different sentiment magnitude.
  333. base = {
  334. "sources": ["A", "B"],
  335. "articles": [{}, {}],
  336. "sentiment": "neutral",
  337. "sentimentScore": 0.0,
  338. }
  339. pos = dict(base, sentimentScore=0.9)
  340. neg = dict(base, sentimentScore=-0.8)
  341. imp_base = compute_importance(base)
  342. imp_pos = compute_importance(pos)
  343. imp_neg = compute_importance(neg)
  344. assert imp_pos >= imp_base
  345. assert imp_neg >= imp_base
# ---------------------------------------------------------------------------
# Regression tests for the May 2026 correctness pass
# ---------------------------------------------------------------------------
  349. def test_classify_cluster_llm_uses_llm_topic_and_drops_invalid_ones(monkeypatch):
  350. """The LLM-extracted topic must propagate to the returned cluster, but
  351. free-form / hallucinated topic strings must be coerced into the allowed
  352. set so they never reach the SQL row column verbatim."""
  353. import asyncio
  354. from news_mcp.enrichment import llm_enrich
  355. async def fake_extraction(cluster):
  356. return {
  357. "topic": "regulation",
  358. "entities": ["SEC"],
  359. "sentiment": "neutral",
  360. "sentimentScore": 0.0,
  361. "keywords": ["enforcement"],
  362. }
  363. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction)
  364. monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None})
  365. cluster = {"cluster_id": "x", "headline": "SEC fines firm", "summary": "...", "topic": "other"}
  366. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  367. assert out["topic"] == "regulation"
  368. # Hallucinated topic is rejected; we fall back to the input cluster's
  369. # heuristic topic when it is one of the allowed ones.
  370. async def fake_extraction_garbage(cluster):
  371. return {
  372. "topic": "geopolitics-and-stuff",
  373. "entities": ["NATO"],
  374. "sentiment": "neutral",
  375. "sentimentScore": 0.0,
  376. "keywords": [],
  377. }
  378. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction_garbage)
  379. cluster = {"cluster_id": "y", "headline": "NATO meets", "summary": "...", "topic": "macro"}
  380. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  381. assert out["topic"] == "macro" # heuristic fallback
  382. # When neither the LLM nor the heuristic gives a valid label -> "other".
  383. cluster = {"cluster_id": "z", "headline": "...", "summary": "...", "topic": "geopolitics-bucket"}
  384. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  385. assert out["topic"] == "other"
  386. def test_classify_cluster_llm_normalizes_aliases_before_blacklist(monkeypatch):
  387. """Regression: previously ``_filter_entities`` ran before
  388. ``normalize_entities``, so blacklisting "bitcoin" missed entries the LLM
  389. returned as the alias "btc". Order is now normalize -> blacklist."""
  390. import asyncio
  391. from news_mcp.enrichment import llm_enrich
  392. async def fake_extraction(cluster):
  393. return {
  394. "topic": "crypto",
  395. "entities": ["btc", "Reuters"],
  396. "sentiment": "neutral",
  397. "sentimentScore": 0.0,
  398. "keywords": ["btc rally", "Reuters"],
  399. }
  400. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction)
  401. monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None})
  402. monkeypatch.setattr(llm_enrich, "NEWS_ENTITY_BLACKLIST", ["bitcoin"])
  403. cluster = {"cluster_id": "x", "headline": "BTC up", "summary": "...", "topic": "crypto"}
  404. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  405. # "btc" became "Bitcoin" via aliasing, then was filtered out by the
  406. # blacklist. "Reuters" survives (not blacklisted in this test).
  407. assert "Bitcoin" not in out["entities"]
  408. assert "btc" not in [e.lower() for e in out["entities"]]
  409. assert "Reuters" in out["entities"]
  410. def test_dedup_uses_jaccard_when_titles_diverge():
  411. """Composite similarity: even with embeddings off, two articles whose
  412. titles share only some tokens should still merge if their content (token
  413. overlap) is high enough."""
  414. from news_mcp.dedup import cluster as dc
  415. # Titles differ heavily; bodies overlap heavily -> Jaccard should catch.
  416. articles = [
  417. {
  418. "title": "Iran tension rises",
  419. "url": "https://example.com/a",
  420. "source": "A",
  421. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  422. "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
  423. },
  424. {
  425. "title": "Trump issues stark warning over Tehran",
  426. "url": "https://example.com/b",
  427. "source": "B",
  428. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  429. "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
  430. },
  431. ]
  432. clustered = dc.dedup_and_cluster_articles(articles)
  433. total = sum(len(v) for v in clustered.values())
  434. assert total == 1, f"Expected 1 merged cluster via Jaccard signal, got {total}"
  435. def test_dedup_does_not_merge_unrelated_articles():
  436. """Negative control: cluster is robust against false-positives even with
  437. the more permissive multi-signal merging."""
  438. from news_mcp.dedup import cluster as dc
  439. articles = [
  440. {
  441. "title": "Bitcoin hits new high",
  442. "url": "https://example.com/a",
  443. "source": "A",
  444. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  445. "summary": "Bitcoin reached a record high amid rising demand.",
  446. },
  447. {
  448. "title": "Local sports team wins",
  449. "url": "https://example.com/b",
  450. "source": "B",
  451. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  452. "summary": "The local team won the regional championship.",
  453. },
  454. ]
  455. clustered = dc.dedup_and_cluster_articles(articles)
  456. total = sum(len(v) for v in clustered.values())
  457. assert total == 2
  458. def test_get_all_feed_states_returns_all_rows():
  459. """Health endpoint regression: the writer keys feed state with a hashed
  460. multi-feed key, so the old hardcoded ``get_feed_state("breakingthenews")``
  461. always returned None. Verify the bulk getter works."""
  462. import tempfile
  463. from pathlib import Path
  464. with tempfile.TemporaryDirectory() as td:
  465. db = Path(td) / "news.sqlite"
  466. store = SQLiteClusterStore(db)
  467. store.set_feed_hash("newsfeeds:abc123", "hash1")
  468. store.set_feed_hash("newsfeeds:def456", "hash2")
  469. all_states = store.get_all_feed_states()
  470. assert len(all_states) == 2
  471. keys = {s["feed_key"] for s in all_states}
  472. assert keys == {"newsfeeds:abc123", "newsfeeds:def456"}
  473. def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
  474. """Regression: the SQL row-column ``topic`` previously locked in the
  475. headline-heuristic value (which is ``other`` for most stories) and ignored
  476. the LLM's classification stored in the payload. Verify the upsert now uses
  477. the post-enrichment topic so SQL filtering and dashboard groupings see the
  478. real classification."""
  479. import asyncio
  480. import news_mcp.jobs.poller as poller
  481. captured = {"upserts": []}
  482. class DummyStore:
  483. def __init__(self, *args, **kwargs):
  484. pass
  485. @contextmanager
  486. def _conn(self):
  487. class _Conn:
  488. def execute(self, *args, **kwargs):
  489. return None
  490. yield _Conn()
  491. def get_feed_hash(self, feed_key):
  492. return None
  493. def set_feed_hash(self, feed_key, last_hash):
  494. pass
  495. def set_feed_state(self, feed_key, last_hash, item_count):
  496. pass
  497. def get_enabled_feed_urls(self, feed_urls):
  498. return feed_urls
  499. def get_cluster_by_id(self, cluster_id):
  500. return None
  501. def upsert_clusters(self, clusters, topic):
  502. # Capture the topic the poller chose for each cluster.
  503. for c in clusters:
  504. captured["upserts"].append({"row_topic": topic, "payload_topic": c.get("topic"), "cluster_id": c.get("cluster_id")})
  505. def prune_if_due(self, **kwargs):
  506. return {"deleted": 0}
  507. def get_failed_enrichment_clusters(self, max_retries=3):
  508. return []
  509. def set_meta(self, key, value):
  510. pass
  511. def set_feed_state(self, feed_key, last_hash, item_count):
  512. pass
  513. def fake_cluster(articles):
  514. # Heuristic put it in "other" (no crypto/macro/regulation/ai keywords
  515. # in the title for the heuristic matcher — title above does have
  516. # "law"-adjacent words but not the specific tokens it matches).
  517. return {
  518. "other": [
  519. {
  520. "cluster_id": "cid",
  521. "headline": "SEC fines firm",
  522. "summary": "...",
  523. "topic": "other",
  524. "entities": [],
  525. "sentiment": "neutral",
  526. "importance": 0.0,
  527. "sources": ["S"],
  528. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  529. "articles": [],
  530. }
  531. ]
  532. }
  533. def fake_enrich(cluster):
  534. return cluster
  535. async def fake_classify(cluster):
  536. # The LLM thinks it's regulation -> the SQL row column must reflect that.
  537. out = dict(cluster)
  538. out["topic"] = "regulation"
  539. out["entities"] = ["SEC"]
  540. out["entityResolutions"] = []
  541. out["sentiment"] = "neutral"
  542. out["sentimentScore"] = 0.0
  543. out["keywords"] = []
  544. return out
  545. monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore)
  546. async def _mock_fetch2(limit, url_list=None):
  547. return [
  548. {"title": "SEC fines firm", "url": "https://example.com/a", "source": "S",
  549. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT", "summary": "..."},
  550. ]
  551. monkeypatch.setattr(poller, "fetch_news_articles", _mock_fetch2)
  552. monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster)
  553. monkeypatch.setattr(poller, "enrich_cluster", fake_enrich)
  554. monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify)
  555. asyncio.run(poller.refresh_clusters(topic=None, limit=10))
  556. assert captured["upserts"], "Expected at least one upsert call"
  557. # The poller first stores raw clusters (topic=heuristic), then enriched
  558. # clusters (topic=post-LLM). The enriched upsert is the one whose row_topic
  559. # reflects the LLM classification.
  560. enriched_upserts = [u for u in captured["upserts"] if u["row_topic"] == "regulation"]
  561. assert enriched_upserts, (
  562. f"Expected at least one upsert with row_topic='regulation', "
  563. f"got topics: {[u['row_topic'] for u in captured['upserts']]}"
  564. )
  565. upsert = enriched_upserts[0]
  566. assert upsert["row_topic"] == "regulation", (
  567. f"Expected SQL row topic to follow the LLM's classification 'regulation', got {upsert['row_topic']!r}"
  568. )
  569. assert upsert["payload_topic"] == "regulation"