| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273 |
- from __future__ import annotations
- import tempfile
- from pathlib import Path
- from news_mcp.dedup.cluster import dedup_and_cluster_articles
- from news_mcp.storage.sqlite_store import SQLiteClusterStore
- def _article(title: str, url: str = "https://example.com/x", source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
- return {
- "title": title,
- "url": url,
- "source": source,
- "timestamp": ts,
- "summary": "summary text",
- }
def test_dedup_merges_similar_titles():
    """Two near-identical headlines plus one unrelated headline give two clusters."""
    titles = (
        "Trump warns Iran war could spread",
        "Trump warns Iran conflict could spread",
        "Unrelated sports result",
    )
    # NOTE(review): all three fixtures share the default URL from _article;
    # confirm the dedup step is not expected to key on URL.
    articles = [_article(title) for title in titles]

    clustered = dedup_and_cluster_articles(articles, similarity_threshold=0.75)

    # The intent is that the two Trump/Iran items collapse into one cluster.
    # Only the total number of clusters across all values is checked here.
    cluster_count = 0
    for bucket in clustered.values():
        cluster_count += len(bucket)
    assert cluster_count == 2
def test_sqlite_feed_hash_roundtrip():
    """A feed hash reads as None before it is set and as the stored value after."""
    feed = "breakingthenews"
    with tempfile.TemporaryDirectory() as tmp_dir:
        # Use a database file inside a throwaway directory for this test.
        store = SQLiteClusterStore(Path(tmp_dir, "news.sqlite"))

        assert store.get_feed_hash(feed) is None

        store.set_feed_hash(feed, "abc123")
        assert store.get_feed_hash(feed) == "abc123"
def test_sqlite_summary_cache_roundtrip():
    """A cluster summary written to the store is returned by a later lookup."""
    stamp = "Mon, 30 Mar 2026 12:00:00 GMT"
    base_cluster = {
        "cluster_id": "cid1",
        "headline": "Headline",
        "summary": "Summary",
        "entities": ["Iran"],
        "sentiment": "negative",
        "importance": 0.5,
        "sources": ["BreakingTheNews"],
        "timestamp": stamp,
        "articles": [],
        "first_seen": stamp,
        "last_updated": stamp,
    }
    summary_payload = {
        "headline": "Headline",
        "mergedSummary": "Merged summary",
        "keyFacts": ["Fact 1"],
        "sources": ["BreakingTheNews"],
    }

    with tempfile.TemporaryDirectory() as tmp_dir:
        store = SQLiteClusterStore(Path(tmp_dir) / "news.sqlite")

        # The base cluster is upserted before its summary is attached.
        store.upsert_clusters([base_cluster], topic="other")
        store.upsert_cluster_summary("cid1", summary_payload)

        cached = store.get_cluster_summary("cid1", ttl_hours=24)

        assert cached is not None
        assert cached["mergedSummary"] == "Merged summary"
        assert cached["keyFacts"] == ["Fact 1"]
|