# Repository: lucky/news-mcp
from __future__ import annotations

from contextlib import contextmanager
import tempfile
from pathlib import Path
from datetime import datetime, timezone

from news_mcp.dedup.cluster import dedup_and_cluster_articles
from news_mcp.storage.sqlite_store import SQLiteClusterStore
from news_mcp.enrichment.importance import compute_importance
from news_mcp.enrichment.llm_enrich import _filter_entities, _matches_blacklist
from news_mcp.entity_normalize import normalize_query, normalize_entities
from news_mcp.llm import build_extraction_prompt, call_llm, load_prompt
from news_mcp.trends_resolution import resolve_entity_via_trends
from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency


def _article(title: str, url: str = None, source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
    if url is None:
        import hashlib
        url = f"https.example.com/{hashlib.md5(title.encode()).hexdigest()[:10]}"
    return {
        "title": title,
        "url": url,
        "source": source,
        "timestamp": ts,
        "summary": "summary text",
    }


def test_dedup_merges_similar_titles():
    """Near-duplicate headlines share a cluster; an unrelated one stays apart."""
    titles = (
        "Trump warns Iran war could spread",
        "Trump warns Iran conflict could spread",
        "Unrelated sports result",
    )
    buckets = dedup_and_cluster_articles(
        [_article(title) for title in titles],
        similarity_threshold=0.75,
    )
    # The two Trump/Iran headlines are expected to merge, leaving two
    # clusters in total across all topic buckets.
    cluster_count = 0
    for bucket in buckets.values():
        cluster_count += len(bucket)
    assert cluster_count == 2


def test_sqlite_feed_hash_roundtrip():
    """A feed hash is absent until written, then reads back unchanged."""
    feed_key = "breakingthenews"
    with tempfile.TemporaryDirectory() as tmp_dir:
        store = SQLiteClusterStore(Path(tmp_dir) / "news.sqlite")

        initial = store.get_feed_hash(feed_key)
        assert initial is None

        store.set_feed_hash(feed_key, "abc123")
        stored = store.get_feed_hash(feed_key)
        assert stored == "abc123"


def test_sqlite_summary_cache_roundtrip():
    """A summary written for an existing cluster is served back from the cache."""
    seen_at = "Mon, 30 Mar 2026 12:00:00 GMT"
    base_cluster = {
        "cluster_id": "cid1",
        "headline": "Headline",
        "summary": "Summary",
        "entities": ["Iran"],
        "sentiment": "negative",
        "importance": 0.5,
        "sources": ["BreakingTheNews"],
        "timestamp": seen_at,
        "articles": [],
        "first_seen": seen_at,
        "last_updated": seen_at,
    }
    summary_payload = {
        "headline": "Headline",
        "mergedSummary": "Merged summary",
        "keyFacts": ["Fact 1"],
        "sources": ["BreakingTheNews"],
    }

    with tempfile.TemporaryDirectory() as tmp_dir:
        store = SQLiteClusterStore(Path(tmp_dir) / "news.sqlite")
        # Upsert the base cluster first; the summary is attached afterwards.
        store.upsert_clusters([base_cluster], topic="other")
        store.upsert_cluster_summary("cid1", summary_payload)

        cached = store.get_cluster_summary("cid1", ttl_hours=24)
        assert cached is not None
        assert cached["mergedSummary"] == "Merged summary"
        assert cached["keyFacts"] == ["Fact 1"]


def test_sqlite_summary_cache_does_not_create_placeholder_row():
    """Writing a summary for an unknown cluster id must leave no row behind."""
    orphan_summary = {
        "headline": "Missing",
        "mergedSummary": "Summary",
        "keyFacts": [],
        "sources": [],
    }
    with tempfile.TemporaryDirectory() as tmp_dir:
        store = SQLiteClusterStore(Path(tmp_dir) / "news.sqlite")
        store.upsert_cluster_summary("missing", orphan_summary)

        cluster_row = store.get_cluster_by_id("missing")
        assert cluster_row is None
        summary_row = store.get_cluster_summary("missing", ttl_hours=24)
        assert summary_row is None


def test_prune_clusters_deletes_rows_older_than_retention():
    """Pruning deletes the backdated cluster and keeps the untouched one."""
    fresh_cluster = {
        "cluster_id": "fresh",
        "headline": "Fresh",
        "summary": "Fresh summary",
        "entities": ["Bitcoin"],
        "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
        "articles": [],
    }
    stale_cluster = {
        "cluster_id": "stale",
        "headline": "Stale",
        "summary": "Stale summary",
        "entities": ["Iran"],
        "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
        "articles": [],
    }
    backdate_sql = "UPDATE clusters SET updated_at=? WHERE cluster_id=?"

    with tempfile.TemporaryDirectory() as tmp_dir:
        store = SQLiteClusterStore(Path(tmp_dir) / "news.sqlite")
        store.upsert_clusters([fresh_cluster, stale_cluster], topic="other")

        # Rewrite the "stale" row's updated_at directly to an old date.
        with store._conn() as conn:
            conn.execute(backdate_sql, ("2025-01-01T00:00:00+00:00", "stale"))

        removed = store.prune_clusters(retention_days=30)

        assert removed == 1
        assert store.get_cluster_by_id("stale") is None
        assert store.get_cluster_by_id("fresh") is not None
        prune_state = store.get_prune_state(pruning_enabled=True, retention_days=30, interval_hours=24)
        assert prune_state["last_prune_at"] is not None


def test_prune_if_due_skips_deletes_when_pruning_disabled():
    """With pruning disabled, even a backdated cluster must survive."""
    stale_cluster = {
        "cluster_id": "stale",
        "headline": "Stale",
        "summary": "Stale summary",
        "entities": ["Iran"],
        "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
        "articles": [],
    }
    backdate_sql = "UPDATE clusters SET updated_at=? WHERE cluster_id=?"

    with tempfile.TemporaryDirectory() as tmp_dir:
        store = SQLiteClusterStore(Path(tmp_dir) / "news.sqlite")
        store.upsert_clusters([stale_cluster], topic="other")

        # Rewrite the row's updated_at directly to an old date.
        with store._conn() as conn:
            conn.execute(backdate_sql, ("2025-01-01T00:00:00+00:00", "stale"))

        outcome = store.prune_if_due(pruning_enabled=False, retention_days=30, interval_hours=24)

        assert outcome["enabled"] is False
        assert outcome["deleted"] == 0
        assert store.get_cluster_by_id("stale") is not None


def test_get_latest_clusters_orders_by_updated_at_before_limit():
    """With two clusters stored and ``limit=1``, the "new" cluster is returned."""
    older_cluster = {
        "cluster_id": "old",
        "headline": "Old",
        "summary": "Old summary",
        "entities": ["Iran"],
        "timestamp": "Wed, 01 Apr 2026 09:00:00 GMT",
        "articles": [],
    }
    newer_cluster = {
        "cluster_id": "new",
        "headline": "New",
        "summary": "New summary",
        "entities": ["Bitcoin"],
        "timestamp": "Wed, 01 Apr 2026 11:00:00 GMT",
        "articles": [],
    }
    # (updated_at, cluster_id) overrides, applied in this order.
    # NOTE(review): "new" receives the *older* updated_at while carrying the
    # newer payload timestamp, yet it is expected to come first -- confirm
    # which column the store's ordering is meant to follow.
    overrides = (
        ("2025-01-01T00:00:00+00:00", "new"),
        ("2026-01-01T00:00:00+00:00", "old"),
    )

    with tempfile.TemporaryDirectory() as tmp_dir:
        store = SQLiteClusterStore(Path(tmp_dir) / "news.sqlite")
        store.upsert_clusters([older_cluster, newer_cluster], topic="crypto")

        with store._conn() as conn:
            for params in overrides:
                conn.execute("UPDATE clusters SET updated_at=? WHERE cluster_id=?", params)

        latest = store.get_latest_clusters(topic="crypto", ttl_hours=24 * 365, limit=1)
        assert len(latest) == 1
        assert latest[0]["cluster_id"] == "new"


def test_get_entity_metadata_prefers_mid_scoped_row():
    """When a mid-less and a mid-scoped row both exist, lookup returns the mid one."""
    entity = "Bitcoin"
    entity_mid = "/m/Bitcoin"
    with tempfile.TemporaryDirectory() as tmp_dir:
        store = SQLiteClusterStore(Path(tmp_dir) / "news.sqlite")
        # Same entity stored twice: first without a mid, then with one.
        for row_mid, row_sources in ((None, ["local"]), (entity_mid, ["trends"])):
            store.upsert_entity_metadata(entity, canonical_label="Bitcoin", mid=row_mid, sources=row_sources)
        store.record_entity_request(entity, mid=entity_mid)

        metadata = store.get_entity_metadata(entity)
        assert metadata is not None
        assert metadata["mid"] == "/m/Bitcoin"


def test_blacklist_filters_entities_case_insensitively():
    """A lowercase blacklist entry removes every casing of that entity."""
    candidates = ["Bloomberg", "Reuters", "bloomberg", "CoinDesk"]
    expected = ["Reuters", "CoinDesk"]
    kept = _filter_entities(candidates, blacklist=["bloomberg"])
    assert kept == expected


def test_blacklist_supports_wildcards():
    """``*`` patterns match as prefix or suffix wildcards, ignoring case."""
    # (entity, blacklist patterns, should it match?)
    cases = (
        ("Bloomberg Economics", ["bloomberg*"], True),
        ("bloomberg", ["*berg"], True),
        ("Reuters", ["bloomberg*"], False),
    )
    for entity, patterns, should_match in cases:
        matched = _matches_blacklist(entity, blacklist=patterns)
        if should_match:
            assert matched
        else:
            assert not matched


def test_query_normalization_keeps_common_shorthand_working():
    """Known shorthands expand to canonical names; unknown queries pass through."""
    expectations = (
        ("btc", "Bitcoin"),
        ("Trump", "Donald Trump"),
        ("nvidia", "nvidia"),
    )
    for query, canonical in expectations:
        assert normalize_query(query) == canonical


def test_entity_normalization_deduplicates_aliases():
    """Aliases of one entity collapse into a single canonical entry."""
    raw_entities = ["btc", "Bitcoin", "BTC", "Ethereum"]
    normalized = normalize_entities(raw_entities)
    assert normalized == ["Bitcoin", "Ethereum"]


def test_load_prompt_reads_prompt_files():
    """The entity-extraction prompt loads and contains its strict-JSON instruction."""
    prompt_text = load_prompt("extract_entities.prompt")
    marker = "Return STRICT JSON"
    assert marker in prompt_text


def test_resolve_entity_falls_back_cleanly_when_provider_unavailable(monkeypatch):
    """With no trends provider, resolution returns the local fallback result.

    The resolver's cache is cleared before the call and again afterwards. The
    final clear runs in a ``finally`` block so a failing assertion cannot
    leave the fallback result cached for later tests.
    """
    import news_mcp.trends_resolution as trends_resolution

    # Start from empty caches so neither an earlier resolution nor an earlier
    # provider lookup can mask the patched provider.
    trends_resolution.resolve_entity_via_trends.cache_clear()
    trends_resolution._provider.cache_clear()
    monkeypatch.setattr(trends_resolution, "_provider", lambda: None)

    try:
        resolved = resolve_entity_via_trends("btc")

        assert resolved["normalized"] == "Bitcoin"
        assert resolved["canonical_label"] == "Bitcoin"
        assert resolved["mid"] is None
        assert resolved["candidates"] == []
        assert resolved["source"] == "fallback"
    finally:
        trends_resolution.resolve_entity_via_trends.cache_clear()


def test_sort_clusters_by_recency_prefers_newer_timestamp_over_importance():
    """A newer, low-importance cluster sorts ahead of an older, high-importance one."""
    older = {"headline": "older", "timestamp": "2026-04-01T10:00:00+00:00", "importance": 0.9}
    newer = {"headline": "newer", "timestamp": "2026-04-01T11:00:00+00:00", "importance": 0.1}

    ordered = _sort_clusters_by_recency([older, newer])

    headlines = [entry["headline"] for entry in ordered]
    assert headlines == ["newer", "older"]


def test_call_llm_dispatches_to_selected_provider(monkeypatch):
    """``call_llm`` forwards to the backend that matches the provider name."""
    import asyncio

    async def groq_stub(model, messages, response_json=True):
        return '{"ok": true, "provider": "groq"}'

    async def openai_stub(model, messages, response_json=True):
        return '{"ok": true, "provider": "openai"}'

    # Replace both provider backends with stubs that identify themselves.
    monkeypatch.setattr("news_mcp.llm._call_groq", groq_stub)
    monkeypatch.setattr("news_mcp.llm._call_openai", openai_stub)

    groq_reply = asyncio.run(call_llm("groq", "x", "sys", "user"))
    openai_reply = asyncio.run(call_llm("openai", "x", "sys", "user"))

    assert '"provider": "groq"' in groq_reply
    assert '"provider": "openai"' in openai_reply


def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
    """An unchanged feed hash must short-circuit the refresh pipeline.

    The dummy store reports a stored feed hash equal to the SHA-1 of the
    single fetched item, so ``refresh_clusters`` is expected to fetch exactly
    once and never reach clustering, enrichment or classification.
    """
    import asyncio
    import hashlib

    import news_mcp.jobs.poller as poller

    # Per-stage call counters, bumped by the stubs below.
    calls = {"fetch": 0, "cluster": 0, "enrich": 0, "classify": 0}

    # One "title|url|timestamp" line per fetched item.
    # NOTE(review): presumably mirrors the poller's own hash material --
    # verify against the poller if this test starts failing.
    material = "\n".join(
        [
            "Bitcoin rallies|https://example.com/a|Wed, 01 Apr 2026 12:00:00 GMT",
        ]
    )
    expected_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()

    async def fake_to_thread(fn, limit):
        calls["fetch"] += 1
        return [
            {
                "title": "Bitcoin rallies",
                "url": "https://example.com/a",
                "source": "Src",
                "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
                "summary": "summary",
            }
        ]

    def fake_cluster(articles):
        calls["cluster"] += 1
        return {
            "crypto": [
                {
                    "cluster_id": "cid",
                    "headline": "Bitcoin rallies",
                    "summary": "summary",
                    "entities": [],
                    "sentiment": "neutral",
                    "importance": 0.0,
                    "sources": ["Src"],
                    "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
                    "articles": [],
                }
            ]
        }

    def fake_enrich(cluster):
        calls["enrich"] += 1
        return cluster

    async def fake_classify(cluster):
        calls["classify"] += 1
        return cluster

    class DummyStore:
        """In-memory stand-in for the store that already holds ``expected_hash``."""

        def __init__(self, *args, **kwargs):
            self.meta = {}
            self.feed_hash = expected_hash

        @contextmanager
        def _conn(self):
            class _Conn:
                def execute(self, *args, **kwargs):
                    return None

            yield _Conn()

        def get_feed_hash(self, feed_key):
            return self.feed_hash

        def set_feed_hash(self, feed_key, last_hash):
            self.feed_hash = last_hash

        def set_feed_state(self, feed_key, last_hash, item_count):
            self.feed_hash = last_hash

        def get_enabled_feed_urls(self, feed_urls):
            return feed_urls

        def get_cluster_by_id(self, cluster_id):
            return None

        def upsert_clusters(self, clusters, topic):
            self.meta["upserted"] = (len(clusters), topic)

        def prune_if_due(self, **kwargs):
            self.meta["prune"] = kwargs
            return {"deleted": 0}

        def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
            return []

        def set_meta(self, key, value):
            self.meta[key] = value

    monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore)

    async def _mock_fetch(limit, url_list=None):
        calls["fetch"] += 1
        return [{"title": "Bitcoin rallies", "url": "https://example.com/a", "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT"}]

    # Both fetch paths are stubbed; each one bumps the same "fetch" counter.
    monkeypatch.setattr(poller, "fetch_news_articles", _mock_fetch)
    monkeypatch.setattr(poller.asyncio, "to_thread", fake_to_thread)
    monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster)
    monkeypatch.setattr(poller, "enrich_cluster", fake_enrich)
    monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify)

    # NOTE(review): assigned directly, so this value is not restored when the
    # test ends -- confirm whether other tests rely on it.
    poller.store = None

    async def run_once():
        await poller.refresh_clusters(topic=None, limit=80)

    asyncio.run(run_once())

    assert calls["fetch"] == 1
    assert calls["cluster"] == 0
    assert calls["enrich"] == 0
    assert calls["classify"] == 0


def test_importance_prefers_llm_signal():
    """A strong sentiment score, positive or negative, never lowers importance."""
    # Same coverage for every variant; only the sentiment magnitude differs.
    neutral = {
        "sources": ["A", "B"],
        "articles": [{}, {}],
        "sentiment": "neutral",
        "sentimentScore": 0.0,
    }
    strongly_positive = dict(neutral, sentimentScore=0.9)
    strongly_negative = dict(neutral, sentimentScore=-0.8)

    baseline = compute_importance(neutral)
    positive_score = compute_importance(strongly_positive)
    negative_score = compute_importance(strongly_negative)

    for score in (positive_score, negative_score):
        assert score >= baseline


# ---------------------------------------------------------------------------
# Regression tests for the May 2026 correctness pass
# ---------------------------------------------------------------------------


def test_classify_cluster_llm_uses_llm_topic_and_drops_invalid_ones(monkeypatch):
    """A valid LLM topic is adopted by the returned cluster, while a
    free-form / hallucinated topic string is coerced into the allowed set so
    it never reaches the SQL row column verbatim."""
    import asyncio

    from news_mcp.enrichment import llm_enrich

    def _extraction_stub(topic, entities, keywords):
        """Build an async ``call_extraction`` replacement with a fixed reply."""

        async def _extract(cluster):
            return {
                "topic": topic,
                "entities": list(entities),
                "sentiment": "neutral",
                "sentimentScore": 0.0,
                "keywords": list(keywords),
            }

        return _extract

    def _classified_topic(cluster):
        classified = asyncio.run(llm_enrich.classify_cluster_llm(cluster))
        return classified["topic"]

    monkeypatch.setattr(llm_enrich, "call_extraction", _extraction_stub("regulation", ["SEC"], ["enforcement"]))
    monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None})

    # Case 1: the LLM's topic is an allowed label and wins over the input one.
    sec_cluster = {"cluster_id": "x", "headline": "SEC fines firm", "summary": "...", "topic": "other"}
    assert _classified_topic(sec_cluster) == "regulation"

    # Case 2: a hallucinated topic is rejected and the input cluster's own
    # heuristic topic is used, because that one is an allowed label.
    monkeypatch.setattr(llm_enrich, "call_extraction", _extraction_stub("geopolitics-and-stuff", ["NATO"], []))
    nato_cluster = {"cluster_id": "y", "headline": "NATO meets", "summary": "...", "topic": "macro"}
    assert _classified_topic(nato_cluster) == "macro"  # heuristic fallback

    # Case 3: neither the LLM nor the heuristic yields a valid label -> "other".
    invalid_cluster = {"cluster_id": "z", "headline": "...", "summary": "...", "topic": "geopolitics-bucket"}
    assert _classified_topic(invalid_cluster) == "other"


def test_classify_cluster_llm_normalizes_aliases_before_blacklist(monkeypatch):
    """Regression: ``_filter_entities`` used to run before
    ``normalize_entities``, so a "bitcoin" blacklist entry missed entities the
    LLM returned under the alias "btc". The order is now normalize -> blacklist."""
    import asyncio

    from news_mcp.enrichment import llm_enrich

    async def extraction_stub(cluster):
        reply = {
            "topic": "crypto",
            "entities": ["btc", "Reuters"],
            "sentiment": "neutral",
            "sentimentScore": 0.0,
            "keywords": ["btc rally", "Reuters"],
        }
        return reply

    monkeypatch.setattr(llm_enrich, "call_extraction", extraction_stub)
    monkeypatch.setattr(llm_enrich, "resolve_entity_via_trends", lambda e: {"normalized": e, "canonical_label": e, "mid": None})
    monkeypatch.setattr(llm_enrich, "NEWS_ENTITY_BLACKLIST", ["bitcoin"])

    btc_cluster = {"cluster_id": "x", "headline": "BTC up", "summary": "...", "topic": "crypto"}
    classified = asyncio.run(llm_enrich.classify_cluster_llm(btc_cluster))
    entities = classified["entities"]

    # "btc" is aliased to "Bitcoin" and then removed by the blacklist, while
    # "Reuters" is not blacklisted in this test and must survive.
    assert "Bitcoin" not in entities
    lowered = [name.lower() for name in entities]
    assert "btc" not in lowered
    assert "Reuters" in entities


def test_dedup_uses_jaccard_when_titles_diverge():
    """Composite similarity: two articles whose titles share few tokens should
    still merge when their content (token overlap) is high enough, even with
    embeddings off."""
    from news_mcp.dedup import cluster as dc

    # Identical bodies under very different titles -> Jaccard should catch it.
    shared_summary = "Trump warns Iran war could spread across the Middle East amid rising tensions."
    published = "Mon, 30 Mar 2026 12:00:00 GMT"
    first = {
        "title": "Iran tension rises",
        "url": "https://example.com/a",
        "source": "A",
        "timestamp": published,
        "summary": shared_summary,
    }
    second = {
        "title": "Trump issues stark warning over Tehran",
        "url": "https://example.com/b",
        "source": "B",
        "timestamp": published,
        "summary": shared_summary,
    }

    buckets = dc.dedup_and_cluster_articles([first, second])
    total = 0
    for bucket in buckets.values():
        total += len(bucket)
    assert total == 1, f"Expected 1 merged cluster via Jaccard signal, got {total}"


def test_dedup_does_not_merge_unrelated_articles():
    """Negative control: the more permissive multi-signal merging must not
    produce false positives for unrelated stories."""
    from news_mcp.dedup import cluster as dc

    published = "Mon, 30 Mar 2026 12:00:00 GMT"
    crypto_story = {
        "title": "Bitcoin hits new high",
        "url": "https://example.com/a",
        "source": "A",
        "timestamp": published,
        "summary": "Bitcoin reached a record high amid rising demand.",
    }
    sports_story = {
        "title": "Local sports team wins",
        "url": "https://example.com/b",
        "source": "B",
        "timestamp": published,
        "summary": "The local team won the regional championship.",
    }

    buckets = dc.dedup_and_cluster_articles([crypto_story, sports_story])
    total = 0
    for bucket in buckets.values():
        total += len(bucket)
    assert total == 2


def test_get_all_feed_states_returns_all_rows():
    """Health endpoint regression: feed state is written under a hashed
    multi-feed key, so the old hardcoded ``get_feed_state("breakingthenews")``
    always returned None. Check that the bulk getter returns every row."""
    written = (
        ("newsfeeds:abc123", "hash1"),
        ("newsfeeds:def456", "hash2"),
    )
    with tempfile.TemporaryDirectory() as tmp_dir:
        store = SQLiteClusterStore(Path(tmp_dir) / "news.sqlite")
        for feed_key, feed_hash in written:
            store.set_feed_hash(feed_key, feed_hash)

        states = store.get_all_feed_states()
        assert len(states) == 2
        returned_keys = {state["feed_key"] for state in states}
        assert returned_keys == {"newsfeeds:abc123", "newsfeeds:def456"}


def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
    """Regression: the SQL row-column ``topic`` previously locked in the
    headline-heuristic value (which is ``other`` for most stories) and ignored
    the LLM's classification stored in the payload. Verify the upsert now uses
    the post-enrichment topic so SQL filtering and dashboard groupings see the
    real classification."""
    import asyncio

    import news_mcp.jobs.poller as poller

    # Every cluster handed to ``upsert_clusters`` is recorded here together
    # with the row topic the poller passed for it.
    captured = {"upserts": []}

    class DummyStore:
        """No-op store stand-in that only records ``upsert_clusters`` calls."""

        def __init__(self, *args, **kwargs):
            pass

        @contextmanager
        def _conn(self):
            class _Conn:
                def execute(self, *args, **kwargs):
                    return None

            yield _Conn()

        def get_feed_hash(self, feed_key):
            # No stored hash, so the feed is never reported as unchanged.
            return None

        def set_feed_hash(self, feed_key, last_hash):
            pass

        def set_feed_state(self, feed_key, last_hash, item_count):
            pass

        def get_enabled_feed_urls(self, feed_urls):
            return feed_urls

        def get_cluster_by_id(self, cluster_id):
            return None

        def upsert_clusters(self, clusters, topic):
            # Capture the topic the poller chose for each cluster.
            for c in clusters:
                captured["upserts"].append({"row_topic": topic, "payload_topic": c.get("topic"), "cluster_id": c.get("cluster_id")})

        def prune_if_due(self, **kwargs):
            return {"deleted": 0}

        def get_failed_enrichment_clusters(self, max_retries=3):
            return []

        def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
            return []

        def set_meta(self, key, value):
            pass

    def fake_cluster(articles, similarity_threshold=None, existing_clusters=None, max_age_hours=0):
        # Stand-in for the heuristic clustering step: the cluster is filed
        # under "other", the bucket the LLM classification must override.
        return {
            "other": [
                {
                    "cluster_id": "cid",
                    "headline": "SEC fines firm",
                    "summary": "...",
                    "topic": "other",
                    "entities": [],
                    "sentiment": "neutral",
                    "importance": 0.0,
                    "sources": ["S"],
                    "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT",
                    "articles": [],
                }
            ]
        }

    def fake_enrich(cluster):
        return cluster

    async def fake_classify(cluster):
        # The LLM thinks it's regulation -> the SQL row column must reflect that.
        out = dict(cluster)
        out["topic"] = "regulation"
        out["entities"] = ["SEC"]
        out["entityResolutions"] = []
        out["sentiment"] = "neutral"
        out["sentimentScore"] = 0.0
        out["keywords"] = []
        return out

    monkeypatch.setattr(poller, "SQLiteClusterStore", DummyStore)

    async def _mock_fetch2(limit, url_list=None):
        return [
            {"title": "SEC fines firm", "url": "https://example.com/a", "source": "S",
             "timestamp": "Wed, 01 Apr 2026 12:00:00 GMT", "summary": "..."},
        ]

    monkeypatch.setattr(poller, "fetch_news_articles", _mock_fetch2)
    monkeypatch.setattr(poller, "dedup_and_cluster_articles", fake_cluster)
    monkeypatch.setattr(poller, "enrich_cluster", fake_enrich)
    monkeypatch.setattr(poller, "classify_cluster_llm", fake_classify)

    asyncio.run(poller.refresh_clusters(topic=None, limit=10))

    assert captured["upserts"], "Expected at least one upsert call"
    # The poller first stores raw clusters (topic=heuristic), then enriched
    # clusters (topic=post-LLM).  The enriched upsert is the one whose row_topic
    # reflects the LLM classification.
    enriched_upserts = [u for u in captured["upserts"] if u["row_topic"] == "regulation"]
    assert enriched_upserts, (
        f"Expected at least one upsert with row_topic='regulation', "
        f"got topics: {[u['row_topic'] for u in captured['upserts']]}"
    )
    upsert = enriched_upserts[0]
    assert upsert["row_topic"] == "regulation", (
        f"Expected SQL row topic to follow the LLM's classification 'regulation', got {upsert['row_topic']!r}"
    )
    assert upsert["payload_topic"] == "regulation"


# ---------------------------------------------------------------------------
# v1.3 — Stable cluster IDs, orphan merge, temporal gating
# ---------------------------------------------------------------------------


def test_stable_cluster_id_is_order_independent():
    """Clustering the same two articles in either order must yield the same
    cluster_id(s)."""
    from news_mcp.dedup import cluster as dc

    reuters_article = {
        "title": "Bitcoin Surges Past $100K",
        "url": "https://example.com/btc-100k",
        "source": "Reuters",
        "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
        "summary": "Bitcoin reached $100,000 for the first time.",
    }
    bloomberg_article = {
        "title": "BTC Breaks $100,000 Barrier",
        "url": "https://example.com/btc-100k",
        "source": "Bloomberg",
        "timestamp": "Mon, 30 Mar 2026 12:05:00 GMT",
        "summary": "Bitcoin topped the $100,000 level.",
    }

    def _sorted_ids(buckets):
        found = []
        for bucket in buckets.values():
            for entry in bucket:
                found.append(entry["cluster_id"])
        found.sort()
        return found

    # Run the Reuters-first ordering, then the Bloomberg-first ordering.
    forward = dc.dedup_and_cluster_articles([reuters_article, bloomberg_article])
    backward = dc.dedup_and_cluster_articles([bloomberg_article, reuters_article])

    ids_ab = _sorted_ids(forward)
    ids_ba = _sorted_ids(backward)
    assert ids_ab == ids_ba, f"Cluster IDs depend on order: {ids_ab} vs {ids_ba}"


def test_orphan_merge_deduplicates_shared_articles():
    """Two clusters whose article sets overlap (e.g. because embeddings were
    temporarily unavailable) must be combined by the post-clustering merge
    pass."""
    from news_mcp.dedup.cluster import _merge_orphan_clusters

    # Both clusters reference the same article URL.
    shared_url = "https://example.com/btc"
    first = {
        "cluster_id": "aaa",
        "topic": "crypto",
        "headline": "Bitcoin surges",
        "articles": [
            {"title": "Bitcoin surges", "url": shared_url, "source": "A"},
        ],
        "sources": ["A"],
        "first_seen": "T1",
        "last_updated": "T1",
    }
    second = {
        "cluster_id": "bbb",
        "topic": "crypto",
        "headline": "BTC up",
        "articles": [
            {"title": "BTC up", "url": shared_url, "source": "B"},
        ],
        "sources": ["B"],
        "first_seen": "T2",
        "last_updated": "T2",
    }

    merged = _merge_orphan_clusters([first, second])
    assert len(merged) == 1, f"Expected 1 merged cluster, got {len(merged)}"
    combined_sources = set(merged[0]["sources"])
    assert combined_sources == {"A", "B"}


def test_orphan_merge_preserves_distinct_clusters():
    """Clusters that share no article must come out of the merge pass untouched.

    The two fixtures differ in headline, article URL and source, so the
    orphan merge is expected to return both of them.
    """
    from news_mcp.dedup.cluster import _merge_orphan_clusters

    bitcoin = dict(
        cluster_id="aaa",
        topic="crypto",
        headline="Bitcoin surges",
        articles=[
            dict(title="Bitcoin surges", url="https://example.com/btc", source="A"),
        ],
        sources=["A"],
        first_seen="T1",
        last_updated="T1",
    )
    ethereum = dict(
        cluster_id="bbb",
        topic="crypto",
        headline="Ethereum merge",
        articles=[
            dict(title="Ethereum merge", url="https://example.com/eth", source="B"),
        ],
        sources=["B"],
        first_seen="T2",
        last_updated="T2",
    )

    survivors = _merge_orphan_clusters([bitcoin, ethereum])
    assert len(survivors) == 2


def test_stable_id_same_for_different_titles_same_url():
    """The cluster id must not change when only the headline of a URL changes.

    Two single-article lists point at the same URL but carry different titles
    (as with a corrected headline); both must yield the same id under the
    "macro" topic.
    """
    from news_mcp.dedup.cluster import _stable_cluster_id

    shared_url = "https://example.com/fed-rates"
    headlines = (
        "Fed Raises Rates",
        "Federal Reserve Increases Interest Rates",
    )

    # One id per headline, computed in the order listed above.
    id_a, id_b = (
        _stable_cluster_id("macro", [{"title": headline, "url": shared_url}])
        for headline in headlines
    )
    assert id_a == id_b, f"Same URL must give same cluster_id: {id_a} vs {id_b}"


def test_temporal_gate_excludes_stale_clusters():
    """Check the age-window predicate used to pick merge candidates.

    With a 4-hour window, a cluster last updated on 2025-01-01 must be
    rejected and one updated "now" (UTC) must be accepted.  Passing
    max_age_hours=0 is expected to lift the limit, so the old cluster is
    accepted again.
    """
    from news_mcp.dedup.cluster import _cluster_is_within_age_window as within_window

    def _empty_cluster(cluster_id, last_updated):
        # Minimal article-less "crypto" cluster; only last_updated varies
        # between the fixtures in a way the test cares about.
        return {
            "cluster_id": cluster_id,
            "topic": "crypto",
            "last_updated": last_updated,
            "articles": [],
        }

    stale = _empty_cluster("old", "2025-01-01T00:00:00+00:00")
    assert not within_window(stale, max_age_hours=4)

    fresh = _empty_cluster("recent", datetime.now(timezone.utc).isoformat())
    assert within_window(fresh, max_age_hours=4)

    # A zero window disables the age check entirely.
    assert within_window(stale, max_age_hours=0)


def test_preseed_merge_into_existing_cluster():
    """A matching new article joins a pre-seeded cluster instead of forming its own.

    One existing Reuters cluster (updated "now") is passed via
    existing_clusters together with a Bloomberg article whose title differs
    by a single word and whose summary is identical.  The result must contain
    exactly one cluster holding two articles.
    """
    from news_mcp.dedup import cluster as dc

    seeded_headline = "Trump warns Iran war could spread across Middle East"
    seeded_summary = "Trump warns Iran war could spread across the Middle East amid rising tensions."
    seeded_at = "Mon, 30 Mar 2026 12:00:00 GMT"

    seeded_article = {
        "title": seeded_headline,
        "url": "https://example.com/trump-iran",
        "source": "Reuters",
        "timestamp": seeded_at,
        "summary": seeded_summary,
    }
    seeded_cluster = {
        "cluster_id": "existing-1",
        "topic": "other",
        "headline": seeded_headline,
        "summary": seeded_summary,
        "sources": ["Reuters"],
        "timestamp": seeded_at,
        # Stamped with the current time so the 4-hour window below applies
        # to a cluster that was just updated.
        "last_updated": datetime.now(timezone.utc).isoformat(),
        "first_seen": seeded_at,
        "articles": [seeded_article],
        "entities": [],
        "sentiment": "neutral",
        "importance": 0.0,
    }

    incoming = {
        "title": "Trump warns Iran conflict could spread across Middle East",
        "url": "https://example.com/trump-iran-2",
        "source": "Bloomberg",
        "timestamp": "Mon, 30 Mar 2026 13:00:00 GMT",
        "summary": seeded_summary,
    }

    # Threshold pinned at 0.75 -- intended to be low enough for the title
    # (Jaccard) comparison to accept the reworded headline.
    clustered = dc.dedup_and_cluster_articles(
        [incoming],
        existing_clusters=[seeded_cluster],
        max_age_hours=4,
        similarity_threshold=0.75,
    )

    # Flatten the per-key cluster lists into one list.
    all_clusters = []
    for bucket in clustered.values():
        all_clusters.extend(bucket)

    # Only the pre-seeded cluster may remain, now carrying both articles.
    assert len(all_clusters) == 1, f"Expected 1 cluster, got {len(all_clusters)}: {[c['headline'] for c in all_clusters]}"
    survivor = all_clusters[0]
    assert len(survivor["articles"]) == 2


def test_cross_cycle_merge_topic_mismatch():
    """Regression: a re-fetched article merges even when topics disagree.

    The stored cluster carries topic "crypto" (standing in for a topic
    assigned by an earlier enrichment pass), while the identical article
    arrives again in a later polling cycle.  The bug being guarded against:
    cluster_id used to include the topic in its hash and existing clusters
    were bucketed by enriched topic, so a mismatch silently produced two DB
    rows.  The test requires a single cluster whose id equals the stable id
    of the article and which still contains the original URL.
    """
    from news_mcp.dedup import cluster as dc

    url = "https://breakingthenews.net/Article/Hegseth-says-US-will-keep-pressure-on-Iran/66401647"
    headline = "Hegseth says US will keep pressure on Iran"
    outlet = "Breaking The News"
    published = "Sat, 30 May 2026 13:00:00 GMT"

    # Fields shared by the stored article and the one re-delivered later.
    article_fields = {
        "title": headline,
        "url": url,
        "source": outlet,
        "timestamp": published,
        "summary": "",
    }

    existing = [{
        "cluster_id": "old-id",
        # Deliberately *not* the topic the headline heuristic
        # (normalize_topic_from_title) is expected to assign.
        "topic": "crypto",
        "headline": headline,
        "summary": "",
        "sources": [outlet],
        "timestamp": published,
        "last_updated": datetime.now(timezone.utc).isoformat(),
        "first_seen": published,
        "articles": [dict(article_fields)],
        "entities": ["Pete Hegseth", "Iran"],
        "sentiment": "negative",
        "sentimentScore": -0.5,
        "importance": 0.1,
    }]

    # Next cycle: the same article shows up again.  Its heuristic topic is
    # expected to be "other" (no keyword match), unlike the stored "crypto".
    new_article = {
        **article_fields,
        # feed_url is used for per-feed hash tracking
        "feed_url": "https://breakingthenews.net/news-feed.xml",
        "importance": 0.11,
    }

    clustered = dc.dedup_and_cluster_articles(
        [new_article],
        max_age_hours=4,
        existing_clusters=existing,
    )

    # Flatten the per-key cluster lists into one list.
    all_clusters = []
    for bucket in clustered.values():
        all_clusters.extend(bucket)

    # Exactly one cluster may come back; prior to the fix the topic mismatch
    # blocked the match and two clusters with different ids were produced.
    assert len(all_clusters) == 1, (
        f"Expected 1 cluster, got {len(all_clusters)}: "
        f"{[c['headline'] for c in all_clusters]}"
    )

    # Whichever topic wins, the id must be the one derived from the article
    # alone, i.e. cluster_id no longer depends on the topic.
    from news_mcp.dedup.cluster import _stable_cluster_id
    expected_cid = _stable_cluster_id("other", [{"title": headline, "url": url}])
    assert all_clusters[0]["cluster_id"] == expected_cid

    # The originally stored article must survive the merge.
    article_urls = [article["url"] for article in all_clusters[0]["articles"]]
    assert url in article_urls