"""Tests for news_mcp: dedup/clustering, the SQLite store, enrichment, and the poller."""
from __future__ import annotations

import tempfile
from pathlib import Path

from news_mcp.dedup.cluster import dedup_and_cluster_articles
from news_mcp.enrichment.importance import compute_importance
from news_mcp.enrichment.llm_enrich import _filter_entities, _matches_blacklist
from news_mcp.entity_normalize import normalize_entities, normalize_query
from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt
from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency
from news_mcp.storage.sqlite_store import SQLiteClusterStore
from news_mcp.trends_resolution import resolve_entity_via_trends
  12. def _article(title: str, url: str = "https://example.com/x", source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
  13. return {
  14. "title": title,
  15. "url": url,
  16. "source": source,
  17. "timestamp": ts,
  18. "summary": "summary text",
  19. }
  20. def test_dedup_merges_similar_titles():
  21. articles = [
  22. _article("Trump warns Iran war could spread"),
  23. _article("Trump warns Iran conflict could spread"),
  24. _article("Unrelated sports result"),
  25. ]
  26. clustered = dedup_and_cluster_articles(articles, similarity_threshold=0.75)
  27. # We expect the Trump/Iran items to be merged into one cluster in the same topic bucket.
  28. total_clusters = sum(len(v) for v in clustered.values())
  29. assert total_clusters == 2
  30. def test_sqlite_feed_hash_roundtrip():
  31. with tempfile.TemporaryDirectory() as td:
  32. db = Path(td) / "news.sqlite"
  33. store = SQLiteClusterStore(db)
  34. assert store.get_feed_hash("breakingthenews") is None
  35. store.set_feed_hash("breakingthenews", "abc123")
  36. assert store.get_feed_hash("breakingthenews") == "abc123"
  37. def test_sqlite_summary_cache_roundtrip():
  38. with tempfile.TemporaryDirectory() as td:
  39. db = Path(td) / "news.sqlite"
  40. store = SQLiteClusterStore(db)
  41. # Upsert a base cluster first.
  42. store.upsert_clusters([
  43. {
  44. "cluster_id": "cid1",
  45. "headline": "Headline",
  46. "summary": "Summary",
  47. "entities": ["Iran"],
  48. "sentiment": "negative",
  49. "importance": 0.5,
  50. "sources": ["BreakingTheNews"],
  51. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  52. "articles": [],
  53. "first_seen": "Mon, 30 Mar 2026 12:00:00 GMT",
  54. "last_updated": "Mon, 30 Mar 2026 12:00:00 GMT",
  55. }
  56. ], topic="other")
  57. store.upsert_cluster_summary(
  58. "cid1",
  59. {
  60. "headline": "Headline",
  61. "mergedSummary": "Merged summary",
  62. "keyFacts": ["Fact 1"],
  63. "sources": ["BreakingTheNews"],
  64. },
  65. )
  66. cached = store.get_cluster_summary("cid1", ttl_hours=24)
  67. assert cached is not None
  68. assert cached["mergedSummary"] == "Merged summary"
  69. assert cached["keyFacts"] == ["Fact 1"]
  70. def test_sqlite_summary_cache_does_not_create_placeholder_row():
  71. with tempfile.TemporaryDirectory() as td:
  72. db = Path(td) / "news.sqlite"
  73. store = SQLiteClusterStore(db)
  74. store.upsert_cluster_summary(
  75. "missing",
  76. {
  77. "headline": "Missing",
  78. "mergedSummary": "Summary",
  79. "keyFacts": [],
  80. "sources": [],
  81. },
  82. )
  83. assert store.get_cluster_by_id("missing") is None
  84. assert store.get_cluster_summary("missing", ttl_hours=24) is None
  85. def test_prune_clusters_deletes_rows_older_than_retention():
  86. with tempfile.TemporaryDirectory() as td:
  87. db = Path(td) / "news.sqlite"
  88. store = SQLiteClusterStore(db)
  89. store.upsert_clusters([
  90. {
  91. "cluster_id": "fresh",
  92. "headline": "Fresh",
  93. "summary": "Fresh summary",
  94. "entities": ["Bitcoin"],
  95. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  96. "articles": [],
  97. },
  98. {
  99. "cluster_id": "stale",
  100. "headline": "Stale",
  101. "summary": "Stale summary",
  102. "entities": ["Iran"],
  103. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  104. "articles": [],
  105. },
  106. ], topic="other")
  107. with store._conn() as conn:
  108. conn.execute(
  109. "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
  110. ("2025-01-01T00:00:00+00:00", "stale"),
  111. )
  112. deleted = store.prune_clusters(retention_days=30)
  113. assert deleted == 1
  114. assert store.get_cluster_by_id("stale") is None
  115. assert store.get_cluster_by_id("fresh") is not None
  116. assert store.get_prune_state(pruning_enabled=True, retention_days=30, interval_hours=24)["last_prune_at"] is not None
  117. def test_prune_if_due_skips_deletes_when_pruning_disabled():
  118. with tempfile.TemporaryDirectory() as td:
  119. db = Path(td) / "news.sqlite"
  120. store = SQLiteClusterStore(db)
  121. store.upsert_clusters([
  122. {
  123. "cluster_id": "stale",
  124. "headline": "Stale",
  125. "summary": "Stale summary",
  126. "entities": ["Iran"],
  127. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  128. "articles": [],
  129. }
  130. ], topic="other")
  131. with store._conn() as conn:
  132. conn.execute(
  133. "UPDATE clusters SET updated_at=? WHERE cluster_id=?",
  134. ("2025-01-01T00:00:00+00:00", "stale"),
  135. )
  136. result = store.prune_if_due(pruning_enabled=False, retention_days=30, interval_hours=24)
  137. assert result["enabled"] is False
  138. assert result["deleted"] == 0
  139. assert store.get_cluster_by_id("stale") is not None
  140. def test_get_latest_clusters_orders_by_updated_at_before_limit():
  141. with tempfile.TemporaryDirectory() as td:
  142. db = Path(td) / "news.sqlite"
  143. store = SQLiteClusterStore(db)
  144. store.upsert_clusters(
  145. [
  146. {
  147. "cluster_id": "old",
  148. "headline": "Old",
  149. "summary": "Old summary",
  150. "entities": ["Iran"],
  151. "timestamp": "Wed, 01 Apr 2026 09:00:00 GMT",
  152. "articles": [],
  153. },
  154. {
  155. "cluster_id": "new",
  156. "headline": "New",
  157. "summary": "New summary",
  158. "entities": ["Bitcoin"],
  159. "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
  160. "articles": [],
  161. },
  162. ],
  163. topic="crypto",
  164. )
  165. with store._conn() as conn:
  166. conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2025-01-01T00:00:00+00:00", "new"))
  167. conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", ("2026-01-01T00:00:00+00:00", "old"))
  168. latest = store.get_latest_clusters(topic="crypto", ttl_hours=24 * 365, limit=1)
  169. assert len(latest) == 1
  170. assert latest[0]["cluster_id"] == "new"
  171. def test_get_entity_metadata_prefers_mid_scoped_row():
  172. with tempfile.TemporaryDirectory() as td:
  173. db = Path(td) / "news.sqlite"
  174. store = SQLiteClusterStore(db)
  175. store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid=None, sources=["local"])
  176. store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid="/m/Bitcoin", sources=["trends"])
  177. store.record_entity_request("Bitcoin", mid="/m/Bitcoin")
  178. meta = store.get_entity_metadata("Bitcoin")
  179. assert meta is not None
  180. assert meta["mid"] == "/m/Bitcoin"
  181. def test_blacklist_filters_entities_case_insensitively():
  182. entities = ["Bloomberg", "Reuters", "bloomberg", "CoinDesk"]
  183. filtered = _filter_entities(entities, blacklist=["bloomberg"])
  184. assert filtered == ["Reuters", "CoinDesk"]
  185. def test_blacklist_supports_wildcards():
  186. assert _matches_blacklist("Bloomberg Economics", blacklist=["bloomberg*"])
  187. assert _matches_blacklist("bloomberg", blacklist=["*berg"])
  188. assert not _matches_blacklist("Reuters", blacklist=["bloomberg*"])
  189. def test_query_normalization_keeps_common_shorthand_working():
  190. assert normalize_query("btc") == "Bitcoin"
  191. assert normalize_query("Trump") == "Donald Trump"
  192. assert normalize_query("nvidia") == "nvidia"
  193. def test_entity_normalization_deduplicates_aliases():
  194. assert normalize_entities(["btc", "Bitcoin", "BTC", "Ethereum"]) == ["Bitcoin", "Ethereum"]
  195. def test_load_prompt_reads_prompt_files():
  196. text = load_prompt("extract_entities.prompt")
  197. assert "Return STRICT JSON" in text
  198. def test_resolve_entity_falls_back_cleanly_when_provider_unavailable(monkeypatch):
  199. import news_mcp.trends_resolution as trends_resolution
  200. trends_resolution.resolve_entity_via_trends.cache_clear()
  201. trends_resolution._provider.cache_clear()
  202. monkeypatch.setattr(trends_resolution, "_provider", lambda: None)
  203. resolved = resolve_entity_via_trends("btc")
  204. assert resolved["normalized"] == "Bitcoin"
  205. assert resolved["canonical_label"] == "Bitcoin"
  206. assert resolved["mid"] is None
  207. assert resolved["candidates"] == []
  208. assert resolved["source"] == "fallback"
  209. trends_resolution.resolve_entity_via_trends.cache_clear()
  210. def test_sort_clusters_by_recency_prefers_newer_timestamp_over_importance():
  211. clusters = [
  212. {"headline": "older", "timestamp": "Wed, 01 Apr 2026 10:00:00 GMT", "importance": 0.9},
  213. {"headline": "newer", "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT", "importance": 0.1},
  214. ]
  215. sorted_clusters = _sort_clusters_by_recency(clusters)
  216. assert [c["headline"] for c in sorted_clusters] == ["newer", "older"]
  217. def test_build_extraction_prompt_is_stable_without_blacklist():
  218. cluster = {
  219. "headline": "Bloomberg reports Bitcoin rallies after US rate comments",
  220. "summary": "A report from Bloomberg says Bitcoin moved higher after comments from the Fed.",
  221. "articles": [],
  222. }
  223. prompt = build_extraction_prompt(cluster)
  224. assert "Bloomberg reports Bitcoin rallies" in prompt
  225. assert "Do NOT return empty entities" in prompt
  226. assert "Bloomberg" in prompt # present in the input, not filtered here
  227. def test_call_llm_dispatches_to_selected_provider(monkeypatch):
  228. async def fake_groq(model, messages, response_json=True):
  229. return '{"ok": true, "provider": "groq"}'
  230. async def fake_openai(model, messages, response_json=True):
  231. return '{"ok": true, "provider": "openai"}'
  232. monkeypatch.setattr("news_mcp.llm._call_groq", fake_groq)
  233. monkeypatch.setattr("news_mcp.llm._call_openai", fake_openai)
  234. import asyncio
  235. groq = asyncio.run(call_llm("groq", "x", "sys", "user"))
  236. openai = asyncio.run(call_llm("openai", "x", "sys", "user"))
  237. assert '"provider": "groq"' in groq
  238. assert '"provider": "openai"' in openai
  239. def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
  240. import news_mcp.jobs.poller as poller
  241. import hashlib
  242. from news_mcp.config import NEWS_FEED_URL, NEWS_FEED_URLS
  243. calls = {"fetch": 0, "cluster": 0, "enrich": 0, "classify": 0}
  244. rss_urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()] or [NEWS_FEED_URL]
  245. material = "\n".join(
  246. [
  247. "Bitcoin rallies|https://example.com/a|Wed, 01 Apr 2026 12:00:00 GMT",
  248. ]
  249. )
  250. expected_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
  251. async def fake_to_thread(fn, limit):
  252. calls["fetch"] += 1
  253. return [
  254. {
  255. "title": "Bitcoin rallies",
  256. "url": "https://example.com/a",
  257. "source": "Src",
  258. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  259. "summary": "summary",
  260. }
  261. ]
  262. def fake_cluster(articles):
  263. calls["cluster"] += 1
  264. return {
  265. "crypto": [
  266. {
  267. "cluster_id": "cid",
  268. "headline": "Bitcoin rallies",
  269. "summary": "summary",
  270. "entities": [],
  271. "sentiment": "neutral",
  272. "importance": 0.0,
  273. "sources": ["Src"],
  274. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  275. "articles": [],
  276. }
  277. ]
  278. }
  279. def fake_enrich(cluster):
  280. calls["enrich"] += 1
  281. return cluster
  282. async def fake_classify(cluster):
  283. calls["classify"] += 1
  284. return cluster
  285. class DummyStore:
  286. def __init__(self, *args, **kwargs):
  287. self.meta = {}
  288. self.feed_hash = expected_hash
  289. def get_feed_hash(self, feed_key):
  290. return self.feed_hash
  291. def set_feed_hash(self, feed_key, last_hash):
  292. self.feed_hash = last_hash
  293. def get_cluster_by_id(self, cluster_id):
  294. return None
  295. def upsert_clusters(self, clusters, topic):
  296. self.meta["upserted"] = (len(clusters), topic)
  297. def prune_if_due(self, **kwargs):
  298. self.meta["prune"] = kwargs
  299. return {"deleted": 0}
  300. def set_meta(self, key, value):
  301. self.meta[key] = value
  302. monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore)
  303. monkeypatch.setattr(poller, "fetch_news_articles", lambda limit: [{"title": "Bitcoin rallies", "url": "https://example.com/a", "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT"}])
  304. monkeypatch.setattr(poller.asyncio, "to_thread", fake_to_thread)
  305. monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster)
  306. monkeypatch.setattr(poller, "enrich_cluster", fake_enrich)
  307. monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify)
  308. poller.store = None
  309. async def run_once():
  310. await poller.refresh_clusters(topic=None, limit=80)
  311. import asyncio
  312. asyncio.run(run_once())
  313. assert calls["fetch"] == 1
  314. assert calls["cluster"] == 0
  315. assert calls["enrich"] == 0
  316. assert calls["classify"] == 0
  317. def test_importance_prefers_llm_signal():
  318. # Two clusters with same coverage but different sentiment magnitude.
  319. base = {
  320. "sources": ["A", "B"],
  321. "articles": [{}, {}],
  322. "sentiment": "neutral",
  323. "sentimentScore": 0.0,
  324. }
  325. pos = dict(base, sentimentScore=0.9)
  326. neg = dict(base, sentimentScore=-0.8)
  327. imp_base = compute_importance(base)
  328. imp_pos = compute_importance(pos)
  329. imp_neg = compute_importance(neg)
  330. assert imp_pos >= imp_base
  331. assert imp_neg >= imp_base
# ---------------------------------------------------------------------------
# Regression tests for the May 2026 correctness pass
# ---------------------------------------------------------------------------
  335. def test_classify_cluster_llm_uses_llm_topic_and_drops_invalid_ones(monkeypatch):
  336. """The LLM-extracted topic must propagate to the returned cluster, but
  337. free-form / hallucinated topic strings must be coerced into the allowed
  338. set so they never reach the SQL row column verbatim."""
  339. import asyncio
  340. from news_mcp.enrichment import llm_enrich
  341. async def fake_extraction(cluster):
  342. return {
  343. "topic": "regulation",
  344. "entities": ["SEC"],
  345. "sentiment": "neutral",
  346. "sentimentScore": 0.0,
  347. "keywords": ["enforcement"],
  348. }
  349. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction)
  350. monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None})
  351. cluster = {"cluster_id": "x", "headline": "SEC fines firm", "summary": "...", "topic": "other"}
  352. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  353. assert out["topic"] == "regulation"
  354. # Hallucinated topic is rejected; we fall back to the input cluster's
  355. # heuristic topic when it is one of the allowed ones.
  356. async def fake_extraction_garbage(cluster):
  357. return {
  358. "topic": "geopolitics-and-stuff",
  359. "entities": ["NATO"],
  360. "sentiment": "neutral",
  361. "sentimentScore": 0.0,
  362. "keywords": [],
  363. }
  364. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction_garbage)
  365. cluster = {"cluster_id": "y", "headline": "NATO meets", "summary": "...", "topic": "macro"}
  366. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  367. assert out["topic"] == "macro" # heuristic fallback
  368. # When neither the LLM nor the heuristic gives a valid label -> "other".
  369. cluster = {"cluster_id": "z", "headline": "...", "summary": "...", "topic": "geopolitics-bucket"}
  370. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  371. assert out["topic"] == "other"
  372. def test_classify_cluster_llm_normalizes_aliases_before_blacklist(monkeypatch):
  373. """Regression: previously ``_filter_entities`` ran before
  374. ``normalize_entities``, so blacklisting "bitcoin" missed entries the LLM
  375. returned as the alias "btc". Order is now normalize -> blacklist."""
  376. import asyncio
  377. from news_mcp.enrichment import llm_enrich
  378. async def fake_extraction(cluster):
  379. return {
  380. "topic": "crypto",
  381. "entities": ["btc", "Reuters"],
  382. "sentiment": "neutral",
  383. "sentimentScore": 0.0,
  384. "keywords": ["btc rally", "Reuters"],
  385. }
  386. monkeypatch.setattr(llm_enrich, "call_extraction", fake_extraction)
  387. monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None})
  388. monkeypatch.setattr(llm_enrich, "NEWS_ENTITY_BLACKLIST", ["bitcoin"])
  389. cluster = {"cluster_id": "x", "headline": "BTC up", "summary": "...", "topic": "crypto"}
  390. out = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
  391. # "btc" became "Bitcoin" via aliasing, then was filtered out by the
  392. # blacklist. "Reuters" survives (not blacklisted in this test).
  393. assert "Bitcoin" not in out["entities"]
  394. assert "btc" not in [e.lower() for e in out["entities"]]
  395. assert "Reuters" in out["entities"]
  396. def test_dedup_uses_jaccard_when_titles_diverge():
  397. """Composite similarity: even with embeddings off, two articles whose
  398. titles share only some tokens should still merge if their content (token
  399. overlap) is high enough."""
  400. from news_mcp.dedup import cluster as dc
  401. # Titles differ heavily; bodies overlap heavily -> Jaccard should catch.
  402. articles = [
  403. {
  404. "title": "Iran tension rises",
  405. "url": "https://example.com/a",
  406. "source": "A",
  407. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  408. "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
  409. },
  410. {
  411. "title": "Trump issues stark warning over Tehran",
  412. "url": "https://example.com/b",
  413. "source": "B",
  414. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  415. "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
  416. },
  417. ]
  418. clustered = dc.dedup_and_cluster_articles(articles)
  419. total = sum(len(v) for v in clustered.values())
  420. assert total == 1, f"Expected 1 merged cluster via Jaccard signal, got {total}"
  421. def test_dedup_does_not_merge_unrelated_articles():
  422. """Negative control: cluster is robust against false-positives even with
  423. the more permissive multi-signal merging."""
  424. from news_mcp.dedup import cluster as dc
  425. articles = [
  426. {
  427. "title": "Bitcoin hits new high",
  428. "url": "https://example.com/a",
  429. "source": "A",
  430. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  431. "summary": "Bitcoin reached a record high amid rising demand.",
  432. },
  433. {
  434. "title": "Local sports team wins",
  435. "url": "https://example.com/b",
  436. "source": "B",
  437. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  438. "summary": "The local team won the regional championship.",
  439. },
  440. ]
  441. clustered = dc.dedup_and_cluster_articles(articles)
  442. total = sum(len(v) for v in clustered.values())
  443. assert total == 2
  444. def test_get_all_feed_states_returns_all_rows():
  445. """Health endpoint regression: the writer keys feed state with a hashed
  446. multi-feed key, so the old hardcoded ``get_feed_state("breakingthenews")``
  447. always returned None. Verify the bulk getter works."""
  448. import tempfile
  449. from pathlib import Path
  450. with tempfile.TemporaryDirectory() as td:
  451. db = Path(td) / "news.sqlite"
  452. store = SQLiteClusterStore(db)
  453. store.set_feed_hash("newsfeeds:abc123", "hash1")
  454. store.set_feed_hash("newsfeeds:def456", "hash2")
  455. all_states = store.get_all_feed_states()
  456. assert len(all_states) == 2
  457. keys = {s["feed_key"] for s in all_states}
  458. assert keys == {"newsfeeds:abc123", "newsfeeds:def456"}
  459. def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
  460. """Regression: the SQL row-column ``topic`` previously locked in the
  461. headline-heuristic value (which is ``other`` for most stories) and ignored
  462. the LLM's classification stored in the payload. Verify the upsert now uses
  463. the post-enrichment topic so SQL filtering and dashboard groupings see the
  464. real classification."""
  465. import asyncio
  466. import news_mcp.jobs.poller as poller
  467. captured = {"upserts": []}
  468. class DummyStore:
  469. def __init__(self, *args, **kwargs):
  470. pass
  471. def get_feed_hash(self, feed_key):
  472. return None
  473. def set_feed_hash(self, feed_key, last_hash):
  474. pass
  475. def get_cluster_by_id(self, cluster_id):
  476. return None
  477. def upsert_clusters(self, clusters, topic):
  478. # Capture the topic the poller chose for each cluster.
  479. for c in clusters:
  480. captured["upserts"].append({"row_topic": topic, "payload_topic": c.get("topic"), "cluster_id": c.get("cluster_id")})
  481. def prune_if_due(self, **kwargs):
  482. return {"deleted": 0}
  483. def set_meta(self, key, value):
  484. pass
  485. async def fake_to_thread(fn, limit):
  486. return [
  487. {"title": "SEC fines firm", "url": "https://example.com/a", "source": "S", "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT", "summary": "..."},
  488. ]
  489. def fake_cluster(articles):
  490. # Heuristic put it in "other" (no crypto/macro/regulation/ai keywords
  491. # in the title for the heuristic matcher — title above does have
  492. # "law"-adjacent words but not the specific tokens it matches).
  493. return {
  494. "other": [
  495. {
  496. "cluster_id": "cid",
  497. "headline": "SEC fines firm",
  498. "summary": "...",
  499. "topic": "other",
  500. "entities": [],
  501. "sentiment": "neutral",
  502. "importance": 0.0,
  503. "sources": ["S"],
  504. "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
  505. "articles": [],
  506. }
  507. ]
  508. }
  509. def fake_enrich(cluster):
  510. return cluster
  511. async def fake_classify(cluster):
  512. # The LLM thinks it's regulation -> the SQL row column must reflect that.
  513. out = dict(cluster)
  514. out["topic"] = "regulation"
  515. out["entities"] = ["SEC"]
  516. out["entityResolutions"] = []
  517. out["sentiment"] = "neutral"
  518. out["sentimentScore"] = 0.0
  519. out["keywords"] = []
  520. return out
  521. monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore)
  522. monkeypatch.setattr(poller, "fetch_news_articles", lambda limit: [])
  523. monkeypatch.setattr(poller.asyncio, "to_thread", fake_to_thread)
  524. monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster)
  525. monkeypatch.setattr(poller, "enrich_cluster", fake_enrich)
  526. monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify)
  527. asyncio.run(poller.refresh_clusters(topic=None, limit=10))
  528. assert captured["upserts"], "Expected at least one upsert call"
  529. upsert = captured["upserts"][0]
  530. assert upsert["row_topic"] == "regulation", (
  531. f"Expected SQL row topic to follow the LLM's classification 'regulation', got {upsert['row_topic']!r}"
  532. )
  533. assert upsert["payload_topic"] == "regulation"