# test_news_mcp.py (4.7 KB)

  1. from __future__ import annotations
  2. import tempfile
  3. from pathlib import Path
  4. from news_mcp.dedup.cluster import dedup_and_cluster_articles
  5. from news_mcp.storage.sqlite_store import SQLiteClusterStore
  6. from news_mcp.enrichment.importance import compute_importance
  7. from news_mcp.enrichment.llm_enrich import _filter_entities
  8. from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt
  9. def _article(title: str, url: str = "https://example.com/x", source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
  10. return {
  11. "title": title,
  12. "url": url,
  13. "source": source,
  14. "timestamp": ts,
  15. "summary": "summary text",
  16. }
  17. def test_dedup_merges_similar_titles():
  18. articles = [
  19. _article("Trump warns Iran war could spread"),
  20. _article("Trump warns Iran conflict could spread"),
  21. _article("Unrelated sports result"),
  22. ]
  23. clustered = dedup_and_cluster_articles(articles, similarity_threshold=0.75)
  24. # We expect the Trump/Iran items to be merged into one cluster in the same topic bucket.
  25. total_clusters = sum(len(v) for v in clustered.values())
  26. assert total_clusters == 2
  27. def test_sqlite_feed_hash_roundtrip():
  28. with tempfile.TemporaryDirectory() as td:
  29. db = Path(td) / "news.sqlite"
  30. store = SQLiteClusterStore(db)
  31. assert store.get_feed_hash("breakingthenews") is None
  32. store.set_feed_hash("breakingthenews", "abc123")
  33. assert store.get_feed_hash("breakingthenews") == "abc123"
  34. def test_sqlite_summary_cache_roundtrip():
  35. with tempfile.TemporaryDirectory() as td:
  36. db = Path(td) / "news.sqlite"
  37. store = SQLiteClusterStore(db)
  38. # Upsert a base cluster first.
  39. store.upsert_clusters([
  40. {
  41. "cluster_id": "cid1",
  42. "headline": "Headline",
  43. "summary": "Summary",
  44. "entities": ["Iran"],
  45. "sentiment": "negative",
  46. "importance": 0.5,
  47. "sources": ["BreakingTheNews"],
  48. "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
  49. "articles": [],
  50. "first_seen": "Mon, 30 Mar 2026 12:00:00 GMT",
  51. "last_updated": "Mon, 30 Mar 2026 12:00:00 GMT",
  52. }
  53. ], topic="other")
  54. store.upsert_cluster_summary(
  55. "cid1",
  56. {
  57. "headline": "Headline",
  58. "mergedSummary": "Merged summary",
  59. "keyFacts": ["Fact 1"],
  60. "sources": ["BreakingTheNews"],
  61. },
  62. )
  63. cached = store.get_cluster_summary("cid1", ttl_hours=24)
  64. assert cached is not None
  65. assert cached["mergedSummary"] == "Merged summary"
  66. assert cached["keyFacts"] == ["Fact 1"]
  67. def test_blacklist_filters_entities_case_insensitively():
  68. entities = ["Bloomberg", "Reuters", "bloomberg", "CoinDesk"]
  69. filtered = _filter_entities(entities, blacklist=["bloomberg"])
  70. assert filtered == ["Reuters", "CoinDesk"]
  71. def test_load_prompt_reads_prompt_files():
  72. text = load_prompt("extract_entities.prompt")
  73. assert "Return STRICT JSON" in text
  74. def test_build_extraction_prompt_is_stable_without_blacklist():
  75. cluster = {
  76. "headline": "Bloomberg reports Bitcoin rallies after US rate comments",
  77. "summary": "A report from Bloomberg says Bitcoin moved higher after comments from the Fed.",
  78. "articles": [],
  79. }
  80. prompt = build_extraction_prompt(cluster)
  81. assert "Bloomberg reports Bitcoin rallies" in prompt
  82. assert "Do NOT return empty entities" in prompt
  83. assert "Bloomberg" in prompt # present in the input, not filtered here
  84. def test_call_llm_dispatches_to_selected_provider(monkeypatch):
  85. async def fake_groq(model, messages, response_json=True):
  86. return '{"ok": true, "provider": "groq"}'
  87. async def fake_openai(model, messages, response_json=True):
  88. return '{"ok": true, "provider": "openai"}'
  89. monkeypatch.setattr("news_mcp.llm._call_groq", fake_groq)
  90. monkeypatch.setattr("news_mcp.llm._call_openai", fake_openai)
  91. import asyncio
  92. groq = asyncio.run(call_llm("groq", "x", "sys", "user"))
  93. openai = asyncio.run(call_llm("openai", "x", "sys", "user"))
  94. assert '"provider": "groq"' in groq
  95. assert '"provider": "openai"' in openai
  96. def test_importance_prefers_llm_signal():
  97. # Two clusters with same coverage but different sentiment magnitude.
  98. base = {
  99. "sources": ["A", "B"],
  100. "articles": [{}, {}],
  101. "sentiment": "neutral",
  102. "sentimentScore": 0.0,
  103. }
  104. pos = dict(base, sentimentScore=0.9)
  105. neg = dict(base, sentimentScore=-0.8)
  106. imp_base = compute_importance(base)
  107. imp_pos = compute_importance(pos)
  108. imp_neg = compute_importance(neg)
  109. assert imp_pos >= imp_base
  110. assert imp_neg >= imp_base