пре 1 недеља · 2e0e9643b7
--- a/news_mcp/config.py
+++ b/news_mcp/config.py
@@ -48,6 +48,10 @@ OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", os.getenv("OLLAMA_URL", "http://1
 
															 OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
														
 
															 NEWS_EMBEDDING_SIMILARITY_THRESHOLD = float(os.getenv("NEWS_EMBEDDING_SIMILARITY_THRESHOLD", "0.885"))
														
 
															+# Cluster merge window: how far back (hours) to load existing clusters from
														
 
															+# the DB for cross-cycle merging.  0 = disabled (no cross-cycle merge); negative = no age filter (the poller then queries the store with ttl_hours=72).
														
 
															+NEWS_CLUSTER_MAX_AGE_HOURS = float(os.getenv("NEWS_CLUSTER_MAX_AGE_HOURS", "4"))
														
 
															+
														
 
															 NEWS_REFRESH_INTERVAL_SECONDS = int(os.getenv("NEWS_REFRESH_INTERVAL_SECONDS", "900"))
														
 
															 NEWS_BACKGROUND_REFRESH_ENABLED = os.getenv("NEWS_BACKGROUND_REFRESH_ENABLED", "true").lower() == "true"
														
 
															 NEWS_BACKGROUND_REFRESH_ON_START = os.getenv("NEWS_BACKGROUND_REFRESH_ON_START", "true").lower() == "true"
														
--- a/news_mcp/dedup/cluster.py
+++ b/news_mcp/dedup/cluster.py
@@ -3,11 +3,16 @@ from __future__ import annotations
 
															 import asyncio
														
 
															 import hashlib
														
 
															 import re
														
 
															+from datetime import datetime, timezone, timedelta
														
 
															 from difflib import SequenceMatcher
														
 
															 from typing import Any, Dict, List
														
 
															 from urllib.parse import urlparse
														
 
															-from news_mcp.config import NEWS_EMBEDDINGS_ENABLED, NEWS_EMBEDDING_SIMILARITY_THRESHOLD
														
 
															+from news_mcp.config import (
														
 
															+    NEWS_EMBEDDINGS_ENABLED,
														
 
															+    NEWS_EMBEDDING_SIMILARITY_THRESHOLD,
														
 
															+    NEWS_CLUSTER_MAX_AGE_HOURS,
														
 
															+)
														
 
															 from news_mcp.dedup.embedding_support import cosine_similarity, ollama_embed
														
 
															 from news_mcp.sources.news_feeds import normalize_topic_from_title
														
@@ -117,6 +122,60 @@ def _is_match(signals: dict, *, embeddings_enabled: bool) -> tuple[bool, str, fl
 
															     return False, "none", 0.0
														
 
															+# ---------------------------------------------------------------------------
														
 
															+# Stable cluster ID
														
 
															+# ---------------------------------------------------------------------------
														
 
															+
														
 
															+def _stable_cluster_id(topic: str, articles: List[Dict[str, Any]]) -> str:
														
 
															+    """Deterministic cluster ID derived from the topic and the sorted set of
														
 
															+    article keys.  Using the minimum key (lexicographic) as the seed ensures
														
 
															+    that no matter which article arrives first, the same set of articles always
														
 
															+    maps to the same cluster_id."""
														
 
															+    keys = sorted(_article_key(a) for a in articles if _article_key(a))
														
 
															+    if not keys:
														
 
															+        # Degenerate fallback — single article with empty url and title
														
 
															+        return hashlib.sha1(topic.encode("utf-8")).hexdigest()
														
 
															+    seed = keys[0]
														
 
															+    return hashlib.sha1(f"{topic}|{seed}".encode("utf-8")).hexdigest()
														
 
															+
														
 
															+
														
 
															+# ---------------------------------------------------------------------------
														
 
															+# Temporal gating
														
 
															+# ---------------------------------------------------------------------------
														
 
															+
														
 
															+def _parse_ts(ts_str: str) -> datetime | None:
														
 
															+    if not ts_str:
														
 
															+        return None
														
 
															+    try:
														
 
															+        s = str(ts_str).replace("Z", "+00:00")
														
 
															+        dt = datetime.fromisoformat(s)
														
 
															+        if dt.tzinfo is None:
														
 
															+            dt = dt.replace(tzinfo=timezone.utc)
														
 
															+        return dt.astimezone(timezone.utc)
														
 
															+    except Exception:
														
 
															+        pass
														
 
															+    try:
														
 
															+        from email.utils import parsedate_to_datetime
														
 
															+        dt = parsedate_to_datetime(str(ts_str))
														
 
															+        if dt.tzinfo is None:
														
 
															+            dt = dt.replace(tzinfo=timezone.utc)
														
 
															+        return dt.astimezone(timezone.utc)
														
 
															+    except Exception:
														
 
															+        return None
														
 
															+
														
 
															+
														
 
															+def _cluster_is_within_age_window(cluster: Dict[str, Any], *, max_age_hours: float) -> bool:
														
 
															+    """Return True if the cluster's last_updated is within the merge window."""
														
 
															+    if max_age_hours <= 0:
														
 
															+        return True  # 0 = no limit
														
 
															+    ts_str = cluster.get("last_updated") or cluster.get("timestamp") or ""
														
 
															+    dt = _parse_ts(ts_str)
														
 
															+    if dt is None:
														
 
															+        return True  # be lenient with unparseable timestamps
														
 
															+    cutoff = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
														
 
															+    return dt >= cutoff
														
 
															+
														
 
															+
														
 
															 # ---------------------------------------------------------------------------
														
 
															 # Embedding pre-computation (async internally)
														
 
															 # ---------------------------------------------------------------------------
														
@@ -175,6 +234,97 @@ def _compute_embeddings_sync(
 
															         return future.result()
														
 
															+# ---------------------------------------------------------------------------
														
 
															+# Orphan merge: detect clusters sharing articles and merge them
														
 
															+# ---------------------------------------------------------------------------
														
 
															+
														
 
															+def _merge_orphan_clusters(
														
 
															+    clusters: List[Dict[str, Any]],
														
 
															+) -> List[Dict[str, Any]]:
														
 
															+    """Post-clustering pass: merge clusters that share article keys.
														
 
															+
														
 
															+    This handles the case where two articles about the same event didn't match
														
 
															+    during the main loop (e.g. embeddings were temporarily unavailable) and
														
 
															+    ended up in separate clusters.  If two clusters share >= 1 article key, we
														
 
															+    merge them into one (keeping the earlier first_seen, recompute the stable
														
 
															+    ID from the union of articles).
														
 
															+    """
														
 
															+    if len(clusters) <= 1:
														
 
															+        return clusters
														
 
															+
														
 
															+    # Build index: article_key -> list of cluster indices
														
 
															+    key_to_indices: dict[str, list[int]] = {}
														
 
															+    for idx, c in enumerate(clusters):
														
 
															+        for a in c.get("articles", []) or []:
														
 
															+            ak = _article_key(a)
														
 
															+            if ak:
														
 
															+                key_to_indices.setdefault(ak, []).append(idx)
														
 
															+
														
 
															+    # Find connected components via Union-Find
														
 
															+    parent = list(range(len(clusters)))
														
 
															+
														
 
															+    def find(x: int) -> int:
														
 
															+        while parent[x] != x:
														
 
															+            parent[x] = parent[parent[x]]
														
 
															+            x = parent[x]
														
 
															+        return x
														
 
															+
														
 
															+    def union(a: int, b: int) -> None:
														
 
															+        ra, rb = find(a), find(b)
														
 
															+        if ra != rb:
														
 
															+            parent[ra] = rb
														
 
															+
														
 
															+    for indices in key_to_indices.values():
														
 
															+        for i in range(1, len(indices)):
														
 
															+            union(indices[0], indices[i])
														
 
															+
														
 
															+    # Group clusters by component
														
 
															+    components: dict[int, list[int]] = {}
														
 
															+    for idx in range(len(clusters)):
														
 
															+        root = find(idx)
														
 
															+        components.setdefault(root, []).append(idx)
														
 
															+
														
 
															+    merged: List[Dict[str, Any]] = []
														
 
															+    for root, members in components.items():
														
 
															+        if len(members) == 1:
														
 
															+            merged.append(clusters[members[0]])
														
 
															+            continue
														
 
															+
														
 
															+        # Merge all clusters in this component
														
 
															+        base = dict(clusters[members[0]])
														
 
															+        all_articles: list[dict] = list(base.get("articles", []) or [])
														
 
															+        all_sources: list[str] = list(base.get("sources", []) or [])
														
 
															+        first_seen = base.get("first_seen", "")
														
 
															+        last_updated = base.get("last_updated", "")
														
 
															+
														
 
															+        for m_idx in members[1:]:
														
 
															+            other = clusters[m_idx]
														
 
															+            existing_keys = {_article_key(a) for a in all_articles}
														
 
															+            for a in other.get("articles", []) or []:
														
 
															+                ak = _article_key(a)
														
 
															+                if ak not in existing_keys:
														
 
															+                    all_articles.append(a)
														
 
															+                    existing_keys.add(ak)
														
 
															+            for s in other.get("sources", []) or []:
														
 
															+                if s not in all_sources:
														
 
															+                    all_sources.append(s)
														
 
															+            fs = other.get("first_seen", "")
														
 
															+            if fs and (not first_seen or fs < first_seen):
														
 
															+                first_seen = fs
														
 
															+            lu = other.get("last_updated", "")
														
 
															+            if lu and (not last_updated or lu > last_updated):
														
 
															+                last_updated = lu
														
 
															+
														
 
															+        base["articles"] = all_articles
														
 
															+        base["sources"] = all_sources
														
 
															+        base["first_seen"] = first_seen
														
 
															+        base["last_updated"] = last_updated
														
 
															+        base["cluster_id"] = _stable_cluster_id(base.get("topic", "other"), all_articles)
														
 
															+        merged.append(base)
														
 
															+
														
 
															+    return merged
														
 
															+
														
 
															+
														
 
															 # ---------------------------------------------------------------------------
														
 
															 # Public API (sync — backward compatible with tests)
														
 
															 # ---------------------------------------------------------------------------
														
@@ -183,16 +333,22 @@ def _compute_embeddings_sync(
 
															 def dedup_and_cluster_articles(
														
 
															     articles: List[Dict[str, Any]],
														
 
															     similarity_threshold: float | None = None,
														
 
															+    *,
														
 
															+    existing_clusters: List[Dict[str, Any]] | None = None,
														
 
															+    max_age_hours: float = 0,
														
 
															 ) -> Dict[str, List[Dict[str, Any]]]:
														
 
															     """Deduplicate raw articles into clusters keyed by topic.
														
 
															-    v1.2: embedding pre-computation is async/concurrent under the hood, but
														
 
															-    this public function remains synchronous for backward compatibility.
														
 
															+    v1.3: stable cluster IDs, temporal gating, and orphan merge.
														
 
															-    A pair merges if ANY signal clears its threshold:
														
 
															-      * title fuzzy ratio
														
 
															-      * token Jaccard over headline+summary
														
 
															-      * Ollama embedding cosine when available
														
 
															+    Args:
														
 
															+        articles: new articles to cluster.
														
 
															+        similarity_threshold: override for the title-similarity threshold.
														
 
															+        existing_clusters: optional list of recent clusters from the DB to
														
 
															+            merge against (cross-cycle merge).  When provided, temporal
														
 
															+            gating via max_age_hours is applied to filter these.
														
 
															+        max_age_hours: only compare against existing_clusters updated within
														
 
															+            this many hours.  0 = no limit (compare against all provided).
														
 
															     """
														
 
															     title_threshold = similarity_threshold if similarity_threshold is not None else DEFAULT_TITLE_THRESHOLD
														
@@ -204,6 +360,14 @@ def dedup_and_cluster_articles(
 
															     by_topic: Dict[str, List[Dict[str, Any]]] = {}
														
 
															+    # Seed with existing clusters (filtered by age window)
														
 
															+    if existing_clusters:
														
 
															+        for c in existing_clusters:
														
 
															+            if not _cluster_is_within_age_window(c, max_age_hours=max_age_hours):
														
 
															+                continue
														
 
															+            topic = c.get("topic", "other") or "other"
														
 
															+            by_topic.setdefault(topic, []).append(dict(c))
														
 
															+
														
 
															     for a in articles:
														
 
															         title = a.get("title") or ""
														
 
															         if not title:
														
@@ -262,8 +426,7 @@ def dedup_and_cluster_articles(
 
															                 {"signal": best_signal_name, "value": round(best_signal_value, 3)}
														
 
															             )
														
 
															         else:
														
 
															-            key = f"{topic}|{_normalize_title(title)}"
														
 
															-            cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
														
 
															+            cid = _stable_cluster_id(topic, [a])
														
 
															             cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
														
 
															             clusters.append(
														
 
															                 {
														
@@ -284,6 +447,15 @@ def dedup_and_cluster_articles(
 
															                 }
														
 
															             )
														
 
															+    # Post-clustering passes per topic
														
 
															+    for topic, clusters in by_topic.items():
														
 
															+        # Merge orphans (clusters that share articles)
														
 
															+        clusters = _merge_orphan_clusters(clusters)
														
 
															+        # Recompute stable IDs from the final article sets
														
 
															+        for c in clusters:
														
 
															+            c["cluster_id"] = _stable_cluster_id(topic, c.get("articles", []) or [])
														
 
															+        by_topic[topic] = clusters
														
 
															+
														
 
															     # Strip the internal merge audit trail before returning
														
 
															     for clusters in by_topic.values():
														
 
															         for c in clusters:
														
--- a/news_mcp/jobs/poller.py
+++ b/news_mcp/jobs/poller.py
@@ -5,7 +5,7 @@ import hashlib
 
															 import logging
														
 
															 import sys
														
 
															 from collections import defaultdict
														
 
															-from datetime import datetime, timezone
														
 
															+from datetime import datetime, timezone, timedelta
														
 
															 from typing import Any, Dict
														
 
															 from news_mcp.config import (
														
@@ -20,6 +20,7 @@ from news_mcp.config import (
 
															     NEWS_PRUNE_INTERVAL_HOURS,
														
 
															     NEWS_PRUNING_ENABLED,
														
 
															     NEWS_RETENTION_DAYS,
														
 
															+    NEWS_CLUSTER_MAX_AGE_HOURS,
														
 
															     llm_concurrency,
														
 
															 )
														
 
															 from news_mcp.dedup.cluster import dedup_and_cluster_articles
														
@@ -147,6 +148,32 @@ async def _enrich_topic_clusters(
 
															     return enriched
														
 
															+def _cluster_age_ok(cluster: dict, max_age_hours: float) -> bool:
														
 
															+    """Check whether a cluster's last_updated is within the merge window."""
														
 
															+    if max_age_hours <= 0:
														
 
															+        return True
														
 
															+    ts_str = cluster.get("last_updated") or cluster.get("timestamp") or ""
														
 
															+    if not ts_str:
														
 
															+        return True
														
 
															+    try:
														
 
															+        s = str(ts_str).replace("Z", "+00:00")
														
 
															+        dt = datetime.fromisoformat(s)
														
 
															+        if dt.tzinfo is None:
														
 
															+            dt = dt.replace(tzinfo=timezone.utc)
														
 
															+        dt = dt.astimezone(timezone.utc)
														
 
															+    except Exception:
														
 
															+        try:
														
 
															+            from email.utils import parsedate_to_datetime
														
 
															+            dt = parsedate_to_datetime(str(ts_str))
														
 
															+            if dt.tzinfo is None:
														
 
															+                dt = dt.replace(tzinfo=timezone.utc)
														
 
															+            dt = dt.astimezone(timezone.utc)
														
 
															+        except Exception:
														
 
															+            return True
														
 
															+    cutoff = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
														
 
															+    return dt >= cutoff
														
 
															+
														
 
															+
														
 
															 async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
														
 
															     logger = logging.getLogger("news_mcp.refresh")
														
 
															     store = SQLiteClusterStore(DB_PATH)
														
@@ -205,9 +232,32 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
 
															     articles = changed_articles
														
 
															     logger.info("refresh clustering start articles=%s topic=%s", len(articles), topic)
														
 
															+
														
 
															+    # Pre-seed with recent clusters from the DB so new articles can merge
														
 
															+    # into existing clusters across polling cycles.
														
 
															+    max_age = NEWS_CLUSTER_MAX_AGE_HOURS
														
 
															+    recent_clusters: list[dict] = []
														
 
															+    if max_age != 0:
														
 
															+        lookback = max_age if max_age > 0 else 72
														
 
															+        all_recent = store.get_latest_clusters_all_topics(
														
 
															+            ttl_hours=lookback,
														
 
															+            limit=500,
														
 
															+        )
														
 
															+        recent_clusters = [c for c in all_recent if _cluster_age_ok(c, max_age)]
														
 
															+        logger.info(
														
 
															+            "refresh pre-seeded existing_clusters=%s max_age_h=%s",
														
 
															+            len(recent_clusters), max_age,
														
 
															+        )
														
 
															+
														
 
															     # Clustering is sync but may do concurrent embedding fetches internally.
														
 
															     # Run off-thread so the event loop stays responsive for MCP tool calls.
														
 
															-    clustered_by_topic = await asyncio.to_thread(dedup_and_cluster_articles, articles)
														
 
															+    clustered_by_topic = await asyncio.to_thread(
														
 
															+        dedup_and_cluster_articles,
														
 
															+        articles,
														
 
															+        None,  # use default similarity_threshold
														
 
															+        existing_clusters=recent_clusters if recent_clusters else None,
														
 
															+        max_age_hours=max_age,
														
 
															+    )
														
 
															     logger.info("refresh clustered topics=%s", list(clustered_by_topic.keys()))
														
 
															     # Build LLM concurrency semaphore from the extract provider's config.
														
--- a/test_news_mcp.py
+++ b/test_news_mcp.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 
															 from contextlib import contextmanager
														
 
															 import tempfile
														
 
															 from pathlib import Path
														
 
															+from datetime import datetime, timezone
														
 
															 from news_mcp.dedup.cluster import dedup_and_cluster_articles
														
 
															 from news_mcp.storage.sqlite_store import SQLiteClusterStore
														
@@ -14,7 +15,10 @@ from news_mcp.trends_resolution import resolve_entity_via_trends
 
															 from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency
														
 
															-def _article(title: str, url: str = "https://example.com/x", source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
														
 
															+def _article(title: str, url: str = None, source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
														
 
															+    if url is None:
														
 
															+        import hashlib
														
 
															+        url = f"https.example.com/{hashlib.md5(title.encode()).hexdigest()[:10]}"
														
 
															     return {
														
 
															         "title": title,
														
 
															         "url": url,
														
@@ -385,6 +389,9 @@ def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
 
															             self.meta["prune"] = kwargs
														
 
															             return {"deleted": 0}
														
 
															+        def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
														
 
															+            return []
														
 
															+
														
 
															         def set_meta(self, key, value):
														
 
															             self.meta[key] = value
														
@@ -637,13 +644,16 @@ def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
 
															         def get_failed_enrichment_clusters(self, max_retries=3):
														
 
															             return []
														
 
															+        def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
														
 
															+            return []
														
 
															+
														
 
															         def set_meta(self, key, value):
														
 
															             pass
														
 
															         def set_feed_state(self, feed_key, last_hash, item_count):
														
 
															             pass
														
 
															-    def fake_cluster(articles):
														
 
															+    def fake_cluster(articles, similarity_threshold=None, existing_clusters=None, max_age_hours=0):
														
 
															         # Heuristic put it in "other" (no crypto/macro/regulation/ai keywords
														
 
															         # in the title for the heuristic matcher — title above does have
														
 
															         # "law"-adjacent words but not the specific tokens it matches).
														
@@ -706,3 +716,196 @@ def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
 
															         f"Expected SQL row topic to follow the LLM's classification 'regulation', got {upsert['row_topic']!r}"
														
 
															     )
														
 
															     assert upsert["payload_topic"] == "regulation"
														
 
															+
														
 
															+
														
 
															+# ---------------------------------------------------------------------------
														
 
															+# v1.3 — Stable cluster IDs, orphan merge, temporal gating
														
 
															+# ---------------------------------------------------------------------------
														
 
															+
														
 
															+
														
 
															+def test_stable_cluster_id_is_order_independent():
														
 
															+    """Two articles about the same event should always get the same cluster_id,
														
 
															+    regardless of which article is processed first."""
														
 
															+    from news_mcp.dedup import cluster as dc
														
 
															+
														
 
															+    art_a = {
														
 
															+        "title": "Bitcoin Surges Past $100K",
														
 
															+        "url": "https://example.com/btc-100k",
														
 
															+        "source": "Reuters",
														
 
															+        "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
														
 
															+        "summary": "Bitcoin reached $100,000 for the first time.",
														
 
															+    }
														
 
															+    art_b = {
														
 
															+        "title": "BTC Breaks $100,000 Barrier",
														
 
															+        "url": "https://example.com/btc-100k",
														
 
															+        "source": "Bloomberg",
														
 
															+        "timestamp": "Mon, 30 Mar 2026 12:05:00 GMT",
														
 
															+        "summary": "Bitcoin topped the $100,000 level.",
														
 
															+    }
														
 
															+
														
 
															+    # Process A first
														
 
															+    clustered_ab = dc.dedup_and_cluster_articles([art_a, art_b])
														
 
															+    # Process B first
														
 
															+    clustered_ba = dc.dedup_and_cluster_articles([art_b, art_a])
														
 
															+
														
 
															+    # Both orderings must produce the same cluster_id(s)
														
 
															+    ids_ab = sorted(c["cluster_id"] for clusters in clustered_ab.values() for c in clusters)
														
 
															+    ids_ba = sorted(c["cluster_id"] for clusters in clustered_ba.values() for c in clusters)
														
 
															+    assert ids_ab == ids_ba, f"Cluster IDs depend on order: {ids_ab} vs {ids_ba}"
														
 
															+
														
 
															+
														
 
															+def test_orphan_merge_deduplicates_shared_articles():
														
 
															+    """When two clusters end up with overlapping article sets (e.g. because
														
 
															+    embeddings were temporarily unavailable), the post-clustering merge pass
														
 
															+    should combine them into one."""
														
 
															+    from news_mcp.dedup.cluster import _merge_orphan_clusters
														
 
															+
														
 
															+    clusters = [
														
 
															+        {
														
 
															+            "cluster_id": "aaa",
														
 
															+            "topic": "crypto",
														
 
															+            "headline": "Bitcoin surges",
														
 
															+            "articles": [
														
 
															+                {"title": "Bitcoin surges", "url": "https://example.com/btc", "source": "A"},
														
 
															+            ],
														
 
															+            "sources": ["A"],
														
 
															+            "first_seen": "T1",
														
 
															+            "last_updated": "T1",
														
 
															+        },
														
 
															+        {
														
 
															+            "cluster_id": "bbb",
														
 
															+            "topic": "crypto",
														
 
															+            "headline": "BTC up",
														
 
															+            "articles": [
														
 
															+                {"title": "BTC up", "url": "https://example.com/btc", "source": "B"},
														
 
															+            ],
														
 
															+            "sources": ["B"],
														
 
															+            "first_seen": "T2",
														
 
															+            "last_updated": "T2",
														
 
															+        },
														
 
															+    ]
														
 
															+    merged = _merge_orphan_clusters(clusters)
														
 
															+    assert len(merged) == 1, f"Expected 1 merged cluster, got {len(merged)}"
														
 
															+    assert set(merged[0]["sources"]) == {"A", "B"}
														
 
															+
														
 
															+
														
 
															+def test_orphan_merge_preserves_distinct_clusters():
														
 
															+    """Clusters with no shared articles must remain independent."""
														
 
															+    from news_mcp.dedup.cluster import _merge_orphan_clusters
														
 
															+
														
 
															+    clusters = [
														
 
															+        {
														
 
															+            "cluster_id": "aaa",
														
 
															+            "topic": "crypto",
														
 
															+            "headline": "Bitcoin surges",
														
 
															+            "articles": [
														
 
															+                {"title": "Bitcoin surges", "url": "https://example.com/btc", "source": "A"},
														
 
															+            ],
														
 
															+            "sources": ["A"],
														
 
															+            "first_seen": "T1",
														
 
															+            "last_updated": "T1",
														
 
															+        },
														
 
															+        {
														
 
															+            "cluster_id": "bbb",
														
 
															+            "topic": "crypto",
														
 
															+            "headline": "Ethereum merge",
														
 
															+            "articles": [
														
 
															+                {"title": "Ethereum merge", "url": "https://example.com/eth", "source": "B"},
														
 
															+            ],
														
 
															+            "sources": ["B"],
														
 
															+            "first_seen": "T2",
														
 
															+            "last_updated": "T2",
														
 
															+        },
														
 
															+    ]
														
 
															+    merged = _merge_orphan_clusters(clusters)
														
 
															+    assert len(merged) == 2
														
 
															+
														
 
															+
														
 
															+def test_stable_id_same_for_different_titles_same_url():
														
 
															+    """Two articles with the same URL but different titles (e.g. corrected
														
 
															+    headline) must produce the same cluster_id."""
														
 
															+    from news_mcp.dedup.cluster import _stable_cluster_id
														
 
															+
														
 
															+    arts_a = [
														
 
															+        {"title": "Fed Raises Rates", "url": "https://example.com/fed-rates"},
														
 
															+    ]
														
 
															+    arts_b = [
														
 
															+        {"title": "Federal Reserve Increases Interest Rates", "url": "https://example.com/fed-rates"},
														
 
															+    ]
														
 
															+    id_a = _stable_cluster_id("macro", arts_a)
														
 
															+    id_b = _stable_cluster_id("macro", arts_b)
														
 
															+    assert id_a == id_b, f"Same URL must give same cluster_id: {id_a} vs {id_b}"
														
 
															+
														
 
															+
														
 
															+def test_temporal_gate_excludes_stale_clusters():
														
 
															+    """Clusters older than max_age_hours should not be candidates for merging."""
														
 
															+    from news_mcp.dedup.cluster import _cluster_is_within_age_window
														
 
															+
														
 
															+    old_cluster = {
														
 
															+        "cluster_id": "old",
														
 
															+        "topic": "crypto",
														
 
															+        "last_updated": "2025-01-01T00:00:00+00:00",
														
 
															+        "articles": [],
														
 
															+    }
														
 
															+    assert not _cluster_is_within_age_window(old_cluster, max_age_hours=4)
														
 
															+
														
 
															+    recent_cluster = {
														
 
															+        "cluster_id": "recent",
														
 
															+        "topic": "crypto",
														
 
															+        "last_updated": datetime.now(timezone.utc).isoformat(),
														
 
															+        "articles": [],
														
 
															+    }
														
 
															+    assert _cluster_is_within_age_window(recent_cluster, max_age_hours=4)
														
 
															+
														
 
															+    # max_age_hours=0 means no limit
														
 
															+    assert _cluster_is_within_age_window(old_cluster, max_age_hours=0)
														
 
															+
														
 
															+
														
 
															+def test_preseed_merge_into_existing_cluster():
														
 
															+    """When existing_clusters is provided, a new article that matches should
														
 
															+    merge into the existing cluster instead of creating a new one."""
														
 
															+    from news_mcp.dedup import cluster as dc
														
 
															+
														
 
															+    existing = [{
														
 
															+        "cluster_id": "existing-1",
														
 
															+        "topic": "other",
														
 
															+        "headline": "Trump warns Iran war could spread across Middle East",
														
 
															+        "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
														
 
															+        "sources": ["Reuters"],
														
 
															+        "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
														
 
															+        "last_updated": datetime.now(timezone.utc).isoformat(),
														
 
															+        "first_seen": "Mon, 30 Mar 2026 12:00:00 GMT",
														
 
															+        "articles": [
														
 
															+            {
														
 
															+                "title": "Trump warns Iran war could spread across Middle East",
														
 
															+                "url": "https://example.com/trump-iran",
														
 
															+                "source": "Reuters",
														
 
															+                "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
														
 
															+                "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
														
 
															+            }
														
 
															+        ],
														
 
															+        "entities": [],
														
 
															+        "sentiment": "neutral",
														
 
															+        "importance": 0.0,
														
 
															+    }]
														
 
															+
														
 
															+    new_article = {
														
 
															+        "title": "Trump warns Iran conflict could spread across Middle East",
														
 
															+        "url": "https://example.com/trump-iran-2",
														
 
															+        "source": "Bloomberg",
														
 
															+        "timestamp": "Mon, 30 Mar 2026 13:00:00 GMT",
														
 
															+        "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
														
 
															+    }
														
 
															+
														
 
															+    # Use a low title threshold so Jaccard can catch the merge
														
 
															+    clustered = dc.dedup_and_cluster_articles(
														
 
															+        [new_article],
														
 
															+        similarity_threshold=0.75,
														
 
															+        existing_clusters=existing,
														
 
															+        max_age_hours=4,
														
 
															+    )
														
 
															+
														
 
															+    all_clusters = [c for clusters in clustered.values() for c in clusters]
														
 
															+    # Should have exactly 1 cluster (the existing one, now with 2 articles)
														
 
															+    assert len(all_clusters) == 1, f"Expected 1 cluster, got {len(all_clusters)}: {[c['headline'] for c in all_clusters]}"
														
 
															+    assert len(all_clusters[0]["articles"]) == 2