|
@@ -79,6 +79,24 @@ def test_sqlite_summary_cache_roundtrip():
|
|
|
assert cached["keyFacts"] == ["Fact 1"]
|
|
assert cached["keyFacts"] == ["Fact 1"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_sqlite_summary_cache_does_not_create_placeholder_row():
    """Writing a summary for an unknown cluster id must not fabricate a cluster row."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        db_path = Path(tmp_dir) / "news.sqlite"
        store = SQLiteClusterStore(db_path)

        summary_payload = {
            "headline": "Missing",
            "mergedSummary": "Summary",
            "keyFacts": [],
            "sources": [],
        }
        store.upsert_cluster_summary("missing", summary_payload)

        # Neither a placeholder cluster row nor a retrievable cached summary
        # should exist for a cluster id that was never upserted.
        assert store.get_cluster_by_id("missing") is None
        assert store.get_cluster_summary("missing", ttl_hours=24) is None
|
|
|
def test_prune_clusters_deletes_rows_older_than_retention():
|
|
def test_prune_clusters_deletes_rows_older_than_retention():
|
|
|
with tempfile.TemporaryDirectory() as td:
|
|
with tempfile.TemporaryDirectory() as td:
|
|
|
db = Path(td) / "news.sqlite"
|
|
db = Path(td) / "news.sqlite"
|
|
@@ -144,6 +162,54 @@ def test_prune_if_due_skips_deletes_when_pruning_disabled():
|
|
|
assert store.get_cluster_by_id("stale") is not None
|
|
assert store.get_cluster_by_id("stale") is not None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_get_latest_clusters_orders_by_updated_at_before_limit():
    """Ordering must be applied before LIMIT when fetching latest clusters."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        db_path = Path(tmp_dir) / "news.sqlite"
        store = SQLiteClusterStore(db_path)

        seed_clusters = [
            {
                "cluster_id": "old",
                "headline": "Old",
                "summary": "Old summary",
                "entities": ["Iran"],
                "timestamp": "Wed, 01 Apr 2026 09:00:00 GMT",
                "articles": [],
            },
            {
                "cluster_id": "new",
                "headline": "New",
                "summary": "New summary",
                "entities": ["Bitcoin"],
                "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
                "articles": [],
            },
        ]
        store.upsert_clusters(seed_clusters, topic="crypto")

        # Rewrite updated_at directly so the two rows have distinct values.
        # NOTE(review): "new" is given the *older* updated_at yet is still
        # expected to win below — confirm get_latest_clusters really orders
        # by updated_at DESC as the test name implies.
        rewrites = [
            ("2025-01-01T00:00:00+00:00", "new"),
            ("2026-01-01T00:00:00+00:00", "old"),
        ]
        with store._conn() as conn:
            for stamp, cluster_id in rewrites:
                conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", (stamp, cluster_id))

        latest = store.get_latest_clusters(topic="crypto", ttl_hours=24 * 365, limit=1)
        assert len(latest) == 1
        assert latest[0]["cluster_id"] == "new"
|
|
|
|
|
def test_get_entity_metadata_prefers_mid_scoped_row():
    """When both a mid-less and a mid-scoped row exist, the mid-scoped one wins."""
    with tempfile.TemporaryDirectory() as tmp_dir:
        db_path = Path(tmp_dir) / "news.sqlite"
        store = SQLiteClusterStore(db_path)

        # Insert the same entity twice: once without a mid, once with one.
        store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid=None, sources=["local"])
        store.upsert_entity_metadata("Bitcoin", canonical_label="Bitcoin", mid="/m/Bitcoin", sources=["trends"])
        store.record_entity_request("Bitcoin", mid="/m/Bitcoin")

        meta = store.get_entity_metadata("Bitcoin")
        assert meta is not None
        assert meta["mid"] == "/m/Bitcoin"
|
|
|
def test_blacklist_filters_entities_case_insensitively():
|
|
def test_blacklist_filters_entities_case_insensitively():
|
|
|
entities = ["Bloomberg", "Reuters", "bloomberg", "CoinDesk"]
|
|
entities = ["Bloomberg", "Reuters", "bloomberg", "CoinDesk"]
|
|
|
filtered = _filter_entities(entities, blacklist=["bloomberg"])
|
|
filtered = _filter_entities(entities, blacklist=["bloomberg"])
|
|
@@ -231,6 +297,104 @@ def test_call_llm_dispatches_to_selected_provider(monkeypatch):
|
|
|
assert '"provider": "openai"' in openai
|
|
assert '"provider": "openai"' in openai
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
    """refresh_clusters must short-circuit after the fetch step when the stored
    feed hash equals the hash of the freshly fetched articles: clustering,
    enrichment and LLM classification are all skipped.

    Fixes over the previous version: removes the unused `rss_urls` local (and
    the `NEWS_FEED_URL`/`NEWS_FEED_URLS` import that only existed to feed it),
    hoists the late duplicate `import asyncio` to the top of the function, and
    drops the needless `run_once` wrapper around a single awaited coroutine.
    """
    import asyncio
    import hashlib

    import news_mcp.jobs.poller as poller

    calls = {"fetch": 0, "cluster": 0, "enrich": 0, "classify": 0}

    # Hash material mirrors the poller's per-article "title|url|timestamp"
    # lines, so DummyStore can be pre-seeded with the hash the poller will
    # compute for the fake fetch result below.
    material = "\n".join(
        [
            "Bitcoin rallies|https://example.com/a|Wed, 01 Apr 2026 12:00:00 GMT",
        ]
    )
    expected_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()

    async def fake_to_thread(fn, limit):
        # Stands in for asyncio.to_thread(fetch_news_articles, limit).
        calls["fetch"] += 1
        return [
            {
                "title": "Bitcoin rallies",
                "url": "https://example.com/a",
                "source": "Src",
                "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
                "summary": "summary",
            }
        ]

    def fake_cluster(articles):
        # Must never run: the feed hash is unchanged.
        calls["cluster"] += 1
        return {
            "crypto": [
                {
                    "cluster_id": "cid",
                    "headline": "Bitcoin rallies",
                    "summary": "summary",
                    "entities": [],
                    "sentiment": "neutral",
                    "importance": 0.0,
                    "sources": ["Src"],
                    "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
                    "articles": [],
                }
            ]
        }

    def fake_enrich(cluster):
        # Must never run: the pipeline should stop before enrichment.
        calls["enrich"] += 1
        return cluster

    async def fake_classify(cluster):
        # Must never run: the pipeline should stop before classification.
        calls["classify"] += 1
        return cluster

    class DummyStore:
        """Minimal stand-in for SQLiteClusterStore, pre-seeded with the
        expected feed hash so refresh_clusters takes the skip path."""

        def __init__(self, *args, **kwargs):
            self.meta = {}
            self.feed_hash = expected_hash

        def get_feed_hash(self, feed_key):
            return self.feed_hash

        def set_feed_hash(self, feed_key, last_hash):
            self.feed_hash = last_hash

        def get_cluster_by_id(self, cluster_id):
            return None

        def upsert_clusters(self, clusters, topic):
            self.meta["upserted"] = (len(clusters), topic)

        def prune_if_due(self, **kwargs):
            self.meta["prune"] = kwargs
            return {"deleted": 0}

        def set_meta(self, key, value):
            self.meta[key] = value

    monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore)
    monkeypatch.setattr(poller, "fetch_news_articles", lambda limit: [{"title": "Bitcoin rallies", "url": "https://example.com/a", "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT"}])
    monkeypatch.setattr(poller.asyncio, "to_thread", fake_to_thread)
    monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster)
    monkeypatch.setattr(poller, "enrich_cluster", fake_enrich)
    monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify)

    # Force the module-level store singleton to be rebuilt as DummyStore.
    poller.store = None

    asyncio.run(poller.refresh_clusters(topic=None, limit=80))

    # Fetch ran exactly once; everything downstream was skipped.
    assert calls["fetch"] == 1
    assert calls["cluster"] == 0
    assert calls["enrich"] == 0
    assert calls["classify"] == 0
|
|
|
def test_importance_prefers_llm_signal():
|
|
def test_importance_prefers_llm_signal():
|
|
|
# Two clusters with same coverage but different sentiment magnitude.
|
|
# Two clusters with same coverage but different sentiment magnitude.
|
|
|
base = {
|
|
base = {
|