преди 1 седмица · 2e0e9643b7
--- a/news_mcp/config.py
+++ b/news_mcp/config.py
@@ -48,6 +48,10 @@ OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", os.getenv("OLLAMA_URL", "http://1
 
				 OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
			
 
				 NEWS_EMBEDDING_SIMILARITY_THRESHOLD = float(os.getenv("NEWS_EMBEDDING_SIMILARITY_THRESHOLD", "0.885"))
			
 
				 
			
 
				+# Cluster merge window: how far back (hours) to load existing clusters from
			
 
				+# the DB for cross-cycle merging.  0 = disabled (no cross-cycle merge); a negative value disables the age limit (the poller then loads clusters from the last 72 hours).
			
 
				+NEWS_CLUSTER_MAX_AGE_HOURS = float(os.getenv("NEWS_CLUSTER_MAX_AGE_HOURS", "4"))
			
 
				+
			
 
				 NEWS_REFRESH_INTERVAL_SECONDS = int(os.getenv("NEWS_REFRESH_INTERVAL_SECONDS", "900"))
			
 
				 NEWS_BACKGROUND_REFRESH_ENABLED = os.getenv("NEWS_BACKGROUND_REFRESH_ENABLED", "true").lower() == "true"
			
 
				 NEWS_BACKGROUND_REFRESH_ON_START = os.getenv("NEWS_BACKGROUND_REFRESH_ON_START", "true").lower() == "true"
			
--- a/news_mcp/dedup/cluster.py
+++ b/news_mcp/dedup/cluster.py
@@ -3,11 +3,16 @@ from __future__ import annotations
 
				 import asyncio
			
 
				 import hashlib
			
 
				 import re
			
 
				+from datetime import datetime, timezone, timedelta
			
 
				 from difflib import SequenceMatcher
			
 
				 from typing import Any, Dict, List
			
 
				 from urllib.parse import urlparse
			
 
				 
			
 
				-from news_mcp.config import NEWS_EMBEDDINGS_ENABLED, NEWS_EMBEDDING_SIMILARITY_THRESHOLD
			
 
				+from news_mcp.config import (
			
 
				+    NEWS_EMBEDDINGS_ENABLED,
			
 
				+    NEWS_EMBEDDING_SIMILARITY_THRESHOLD,
			
 
				+    NEWS_CLUSTER_MAX_AGE_HOURS,
			
 
				+)
			
 
				 from news_mcp.dedup.embedding_support import cosine_similarity, ollama_embed
			
 
				 from news_mcp.sources.news_feeds import normalize_topic_from_title
			
 
				 
			
@@ -117,6 +122,60 @@ def _is_match(signals: dict, *, embeddings_enabled: bool) -> tuple[bool, str, fl
 
				     return False, "none", 0.0
			
 
				 
			
 
				 
			
 
				+# ---------------------------------------------------------------------------
			
 
				+# Stable cluster ID
			
 
				+# ---------------------------------------------------------------------------
			
 
				+
			
 
				+def _stable_cluster_id(topic: str, articles: List[Dict[str, Any]]) -> str:
			
 
				+    """Deterministic cluster ID derived from the topic and the sorted set of
			
 
				+    article keys.  Using the minimum key (lexicographic) as the seed ensures
			
 
				+    that no matter which article arrives first, the same set of articles always
			
 
				+    maps to the same cluster_id."""
			
 
				+    keys = sorted(_article_key(a) for a in articles if _article_key(a))
			
 
				+    if not keys:
			
 
				+        # Degenerate fallback — single article with empty url and title
			
 
				+        return hashlib.sha1(topic.encode("utf-8")).hexdigest()
			
 
				+    seed = keys[0]
			
 
				+    return hashlib.sha1(f"{topic}|{seed}".encode("utf-8")).hexdigest()
			
 
				+
			
 
				+
			
 
				+# ---------------------------------------------------------------------------
			
 
				+# Temporal gating
			
 
				+# ---------------------------------------------------------------------------
			
 
				+
			
 
				+def _parse_ts(ts_str: str) -> datetime | None:
			
 
				+    if not ts_str:
			
 
				+        return None
			
 
				+    try:
			
 
				+        s = str(ts_str).replace("Z", "+00:00")
			
 
				+        dt = datetime.fromisoformat(s)
			
 
				+        if dt.tzinfo is None:
			
 
				+            dt = dt.replace(tzinfo=timezone.utc)
			
 
				+        return dt.astimezone(timezone.utc)
			
 
				+    except Exception:
			
 
				+        pass
			
 
				+    try:
			
 
				+        from email.utils import parsedate_to_datetime
			
 
				+        dt = parsedate_to_datetime(str(ts_str))
			
 
				+        if dt.tzinfo is None:
			
 
				+            dt = dt.replace(tzinfo=timezone.utc)
			
 
				+        return dt.astimezone(timezone.utc)
			
 
				+    except Exception:
			
 
				+        return None
			
 
				+
			
 
				+
			
 
				+def _cluster_is_within_age_window(cluster: Dict[str, Any], *, max_age_hours: float) -> bool:
			
 
				+    """Return True if the cluster's last_updated is within the merge window."""
			
 
				+    if max_age_hours <= 0:
			
 
				+        return True  # 0 = no limit
			
 
				+    ts_str = cluster.get("last_updated") or cluster.get("timestamp") or ""
			
 
				+    dt = _parse_ts(ts_str)
			
 
				+    if dt is None:
			
 
				+        return True  # be lenient with unparseable timestamps
			
 
				+    cutoff = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
			
 
				+    return dt >= cutoff
			
 
				+
			
 
				+
			
 
				 # ---------------------------------------------------------------------------
			
 
				 # Embedding pre-computation (async internally)
			
 
				 # ---------------------------------------------------------------------------
			
@@ -175,6 +234,97 @@ def _compute_embeddings_sync(
 
				         return future.result()
			
 
				 
			
 
				 
			
 
				+# ---------------------------------------------------------------------------
			
 
				+# Orphan merge: detect clusters sharing articles and merge them
			
 
				+# ---------------------------------------------------------------------------
			
 
				+
			
 
				+def _merge_orphan_clusters(
			
 
				+    clusters: List[Dict[str, Any]],
			
 
				+) -> List[Dict[str, Any]]:
			
 
				+    """Post-clustering pass: merge clusters that share article keys.
			
 
				+
			
 
				+    This handles the case where two articles about the same event didn't match
			
 
				+    during the main loop (e.g. embeddings were temporarily unavailable) and
			
 
				+    ended up in separate clusters.  If two clusters share >= 1 article key, we
			
 
				+    merge them into one (keeping the earlier first_seen, recompute the stable
			
 
				+    ID from the union of articles).
			
 
				+    """
			
 
				+    if len(clusters) <= 1:
			
 
				+        return clusters
			
 
				+
			
 
				+    # Build index: article_key -> list of cluster indices
			
 
				+    key_to_indices: dict[str, list[int]] = {}
			
 
				+    for idx, c in enumerate(clusters):
			
 
				+        for a in c.get("articles", []) or []:
			
 
				+            ak = _article_key(a)
			
 
				+            if ak:
			
 
				+                key_to_indices.setdefault(ak, []).append(idx)
			
 
				+
			
 
				+    # Find connected components via Union-Find
			
 
				+    parent = list(range(len(clusters)))
			
 
				+
			
 
				+    def find(x: int) -> int:
			
 
				+        while parent[x] != x:
			
 
				+            parent[x] = parent[parent[x]]
			
 
				+            x = parent[x]
			
 
				+        return x
			
 
				+
			
 
				+    def union(a: int, b: int) -> None:
			
 
				+        ra, rb = find(a), find(b)
			
 
				+        if ra != rb:
			
 
				+            parent[ra] = rb
			
 
				+
			
 
				+    for indices in key_to_indices.values():
			
 
				+        for i in range(1, len(indices)):
			
 
				+            union(indices[0], indices[i])
			
 
				+
			
 
				+    # Group clusters by component
			
 
				+    components: dict[int, list[int]] = {}
			
 
				+    for idx in range(len(clusters)):
			
 
				+        root = find(idx)
			
 
				+        components.setdefault(root, []).append(idx)
			
 
				+
			
 
				+    merged: List[Dict[str, Any]] = []
			
 
				+    for root, members in components.items():
			
 
				+        if len(members) == 1:
			
 
				+            merged.append(clusters[members[0]])
			
 
				+            continue
			
 
				+
			
 
				+        # Merge all clusters in this component
			
 
				+        base = dict(clusters[members[0]])
			
 
				+        all_articles: list[dict] = list(base.get("articles", []) or [])
			
 
				+        all_sources: list[str] = list(base.get("sources", []) or [])
			
 
				+        first_seen = base.get("first_seen", "")
			
 
				+        last_updated = base.get("last_updated", "")
			
 
				+
			
 
				+        for m_idx in members[1:]:
			
 
				+            other = clusters[m_idx]
			
 
				+            existing_keys = {_article_key(a) for a in all_articles}
			
 
				+            for a in other.get("articles", []) or []:
			
 
				+                ak = _article_key(a)
			
 
				+                if ak not in existing_keys:
			
 
				+                    all_articles.append(a)
			
 
				+                    existing_keys.add(ak)
			
 
				+            for s in other.get("sources", []) or []:
			
 
				+                if s not in all_sources:
			
 
				+                    all_sources.append(s)
			
 
				+            fs = other.get("first_seen", "")
			
 
				+            if fs and (not first_seen or fs < first_seen):
			
 
				+                first_seen = fs
			
 
				+            lu = other.get("last_updated", "")
			
 
				+            if lu and (not last_updated or lu > last_updated):
			
 
				+                last_updated = lu
			
 
				+
			
 
				+        base["articles"] = all_articles
			
 
				+        base["sources"] = all_sources
			
 
				+        base["first_seen"] = first_seen
			
 
				+        base["last_updated"] = last_updated
			
 
				+        base["cluster_id"] = _stable_cluster_id(base.get("topic", "other"), all_articles)
			
 
				+        merged.append(base)
			
 
				+
			
 
				+    return merged
			
 
				+
			
 
				+
			
 
				 # ---------------------------------------------------------------------------
			
 
				 # Public API (sync — backward compatible with tests)
			
 
				 # ---------------------------------------------------------------------------
			
@@ -183,16 +333,22 @@ def _compute_embeddings_sync(
 
				 def dedup_and_cluster_articles(
			
 
				     articles: List[Dict[str, Any]],
			
 
				     similarity_threshold: float | None = None,
			
 
				+    *,
			
 
				+    existing_clusters: List[Dict[str, Any]] | None = None,
			
 
				+    max_age_hours: float = 0,
			
 
				 ) -> Dict[str, List[Dict[str, Any]]]:
			
 
				     """Deduplicate raw articles into clusters keyed by topic.
			
 
				 
			
 
				-    v1.2: embedding pre-computation is async/concurrent under the hood, but
			
 
				-    this public function remains synchronous for backward compatibility.
			
 
				+    v1.3: stable cluster IDs, temporal gating, and orphan merge.
			
 
				 
			
 
				-    A pair merges if ANY signal clears its threshold:
			
 
				-      * title fuzzy ratio
			
 
				-      * token Jaccard over headline+summary
			
 
				-      * Ollama embedding cosine when available
			
 
				+    Args:
			
 
				+        articles: new articles to cluster.
			
 
				+        similarity_threshold: override for the title-similarity threshold.
			
 
				+        existing_clusters: optional list of recent clusters from the DB to
			
 
				+            merge against (cross-cycle merge).  When provided, temporal
			
 
				+            gating via max_age_hours is applied to filter these.
			
 
				+        max_age_hours: only compare against existing_clusters updated within
			
 
				+            this many hours.  0 = no limit (compare against all provided).
			
 
				     """
			
 
				 
			
 
				     title_threshold = similarity_threshold if similarity_threshold is not None else DEFAULT_TITLE_THRESHOLD
			
@@ -204,6 +360,14 @@ def dedup_and_cluster_articles(
 
				 
			
 
				     by_topic: Dict[str, List[Dict[str, Any]]] = {}
			
 
				 
			
 
				+    # Seed with existing clusters (filtered by age window)
			
 
				+    if existing_clusters:
			
 
				+        for c in existing_clusters:
			
 
				+            if not _cluster_is_within_age_window(c, max_age_hours=max_age_hours):
			
 
				+                continue
			
 
				+            topic = c.get("topic", "other") or "other"
			
 
				+            by_topic.setdefault(topic, []).append(dict(c))
			
 
				+
			
 
				     for a in articles:
			
 
				         title = a.get("title") or ""
			
 
				         if not title:
			
@@ -262,8 +426,7 @@ def dedup_and_cluster_articles(
 
				                 {"signal": best_signal_name, "value": round(best_signal_value, 3)}
			
 
				             )
			
 
				         else:
			
 
				-            key = f"{topic}|{_normalize_title(title)}"
			
 
				-            cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
			
 
				+            cid = _stable_cluster_id(topic, [a])
			
 
				             cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
			
 
				             clusters.append(
			
 
				                 {
			
@@ -284,6 +447,15 @@ def dedup_and_cluster_articles(
 
				                 }
			
 
				             )
			
 
				 
			
 
				+    # Post-clustering passes per topic
			
 
				+    for topic, clusters in by_topic.items():
			
 
				+        # Merge orphans (clusters that share articles)
			
 
				+        clusters = _merge_orphan_clusters(clusters)
			
 
				+        # Recompute stable IDs from the final article sets
			
 
				+        for c in clusters:
			
 
				+            c["cluster_id"] = _stable_cluster_id(topic, c.get("articles", []) or [])
			
 
				+        by_topic[topic] = clusters
			
 
				+
			
 
				     # Strip the internal merge audit trail before returning
			
 
				     for clusters in by_topic.values():
			
 
				         for c in clusters:
			
--- a/news_mcp/jobs/poller.py
+++ b/news_mcp/jobs/poller.py
@@ -5,7 +5,7 @@ import hashlib
 
				 import logging
			
 
				 import sys
			
 
				 from collections import defaultdict
			
 
				-from datetime import datetime, timezone
			
 
				+from datetime import datetime, timezone, timedelta
			
 
				 from typing import Any, Dict
			
 
				 
			
 
				 from news_mcp.config import (
			
@@ -20,6 +20,7 @@ from news_mcp.config import (
 
				     NEWS_PRUNE_INTERVAL_HOURS,
			
 
				     NEWS_PRUNING_ENABLED,
			
 
				     NEWS_RETENTION_DAYS,
			
 
				+    NEWS_CLUSTER_MAX_AGE_HOURS,
			
 
				     llm_concurrency,
			
 
				 )
			
 
				 from news_mcp.dedup.cluster import dedup_and_cluster_articles
			
@@ -147,6 +148,32 @@ async def _enrich_topic_clusters(
 
				     return enriched
			
 
				 
			
 
				 
			
 
				+def _cluster_age_ok(cluster: dict, max_age_hours: float) -> bool:
			
 
				+    """Check whether a cluster's last_updated is within the merge window."""
			
 
				+    if max_age_hours <= 0:
			
 
				+        return True
			
 
				+    ts_str = cluster.get("last_updated") or cluster.get("timestamp") or ""
			
 
				+    if not ts_str:
			
 
				+        return True
			
 
				+    try:
			
 
				+        s = str(ts_str).replace("Z", "+00:00")
			
 
				+        dt = datetime.fromisoformat(s)
			
 
				+        if dt.tzinfo is None:
			
 
				+            dt = dt.replace(tzinfo=timezone.utc)
			
 
				+        dt = dt.astimezone(timezone.utc)
			
 
				+    except Exception:
			
 
				+        try:
			
 
				+            from email.utils import parsedate_to_datetime
			
 
				+            dt = parsedate_to_datetime(str(ts_str))
			
 
				+            if dt.tzinfo is None:
			
 
				+                dt = dt.replace(tzinfo=timezone.utc)
			
 
				+            dt = dt.astimezone(timezone.utc)
			
 
				+        except Exception:
			
 
				+            return True
			
 
				+    cutoff = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
			
 
				+    return dt >= cutoff
			
 
				+
			
 
				+
			
 
				 async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
			
 
				     logger = logging.getLogger("news_mcp.refresh")
			
 
				     store = SQLiteClusterStore(DB_PATH)
			
@@ -205,9 +232,32 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
 
				 
			
 
				     articles = changed_articles
			
 
				     logger.info("refresh clustering start articles=%s topic=%s", len(articles), topic)
			
 
				+
			
 
				+    # Pre-seed with recent clusters from the DB so new articles can merge
			
 
				+    # into existing clusters across polling cycles.
			
 
				+    max_age = NEWS_CLUSTER_MAX_AGE_HOURS
			
 
				+    recent_clusters: list[dict] = []
			
 
				+    if max_age != 0:
			
 
				+        lookback = max_age if max_age > 0 else 72
			
 
				+        all_recent = store.get_latest_clusters_all_topics(
			
 
				+            ttl_hours=lookback,
			
 
				+            limit=500,
			
 
				+        )
			
 
				+        recent_clusters = [c for c in all_recent if _cluster_age_ok(c, max_age)]
			
 
				+        logger.info(
			
 
				+            "refresh pre-seeded existing_clusters=%s max_age_h=%s",
			
 
				+            len(recent_clusters), max_age,
			
 
				+        )
			
 
				+
			
 
				     # Clustering is sync but may do concurrent embedding fetches internally.
			
 
				     # Run off-thread so the event loop stays responsive for MCP tool calls.
			
 
				-    clustered_by_topic = await asyncio.to_thread(dedup_and_cluster_articles, articles)
			
 
				+    clustered_by_topic = await asyncio.to_thread(
			
 
				+        dedup_and_cluster_articles,
			
 
				+        articles,
			
 
				+        None,  # use default similarity_threshold
			
 
				+        existing_clusters=recent_clusters if recent_clusters else None,
			
 
				+        max_age_hours=max_age,
			
 
				+    )
			
 
				     logger.info("refresh clustered topics=%s", list(clustered_by_topic.keys()))
			
 
				 
			
 
				     # Build LLM concurrency semaphore from the extract provider's config.
			
--- a/test_news_mcp.py
+++ b/test_news_mcp.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 
				 from contextlib import contextmanager
			
 
				 import tempfile
			
 
				 from pathlib import Path
			
 
				+from datetime import datetime, timezone
			
 
				 
			
 
				 from news_mcp.dedup.cluster import dedup_and_cluster_articles
			
 
				 from news_mcp.storage.sqlite_store import SQLiteClusterStore
			
@@ -14,7 +15,10 @@ from news_mcp.trends_resolution import resolve_entity_via_trends
 
				 from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency
			
 
				 
			
 
				 
			
 
				-def _article(title: str, url: str = "https://example.com/x", source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
			
 
				+def _article(title: str, url: str = None, source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
			
 
				+    if url is None:
			
 
				+        import hashlib
			
 
				+        url = f"https.example.com/{hashlib.md5(title.encode()).hexdigest()[:10]}"
			
 
				     return {
			
 
				         "title": title,
			
 
				         "url": url,
			
@@ -385,6 +389,9 @@ def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
 
				             self.meta["prune"] = kwargs
			
 
				             return {"deleted": 0}
			
 
				 
			
 
				+        def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
			
 
				+            return []
			
 
				+
			
 
				         def set_meta(self, key, value):
			
 
				             self.meta[key] = value
			
 
				 
			
@@ -637,13 +644,16 @@ def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
 
				         def get_failed_enrichment_clusters(self, max_retries=3):
			
 
				             return []
			
 
				 
			
 
				+        def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
			
 
				+            return []
			
 
				+
			
 
				         def set_meta(self, key, value):
			
 
				             pass
			
 
				 
			
 
				         def set_feed_state(self, feed_key, last_hash, item_count):
			
 
				             pass
			
 
				 
			
 
				-    def fake_cluster(articles):
			
 
				+    def fake_cluster(articles, similarity_threshold=None, existing_clusters=None, max_age_hours=0):
			
 
				         # Heuristic put it in "other" (no crypto/macro/regulation/ai keywords
			
 
				         # in the title for the heuristic matcher — title above does have
			
 
				         # "law"-adjacent words but not the specific tokens it matches).
			
@@ -706,3 +716,196 @@ def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
 
				         f"Expected SQL row topic to follow the LLM's classification 'regulation', got {upsert['row_topic']!r}"
			
 
				     )
			
 
				     assert upsert["payload_topic"] == "regulation"
			
 
				+
			
 
				+
			
 
				+# ---------------------------------------------------------------------------
			
 
				+# v1.3 — Stable cluster IDs, orphan merge, temporal gating
			
 
				+# ---------------------------------------------------------------------------
			
 
				+
			
 
				+
			
 
				+def test_stable_cluster_id_is_order_independent():
			
 
				+    """Two articles about the same event should always get the same cluster_id,
			
 
				+    regardless of which article is processed first."""
			
 
				+    from news_mcp.dedup import cluster as dc
			
 
				+
			
 
				+    art_a = {
			
 
				+        "title": "Bitcoin Surges Past $100K",
			
 
				+        "url": "https://example.com/btc-100k",
			
 
				+        "source": "Reuters",
			
 
				+        "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
			
 
				+        "summary": "Bitcoin reached $100,000 for the first time.",
			
 
				+    }
			
 
				+    art_b = {
			
 
				+        "title": "BTC Breaks $100,000 Barrier",
			
 
				+        "url": "https://example.com/btc-100k",
			
 
				+        "source": "Bloomberg",
			
 
				+        "timestamp": "Mon, 30 Mar 2026 12:05:00 GMT",
			
 
				+        "summary": "Bitcoin topped the $100,000 level.",
			
 
				+    }
			
 
				+
			
 
				+    # Process A first
			
 
				+    clustered_ab = dc.dedup_and_cluster_articles([art_a, art_b])
			
 
				+    # Process B first
			
 
				+    clustered_ba = dc.dedup_and_cluster_articles([art_b, art_a])
			
 
				+
			
 
				+    # Both orderings must produce the same cluster_id(s)
			
 
				+    ids_ab = sorted(c["cluster_id"] for clusters in clustered_ab.values() for c in clusters)
			
 
				+    ids_ba = sorted(c["cluster_id"] for clusters in clustered_ba.values() for c in clusters)
			
 
				+    assert ids_ab == ids_ba, f"Cluster IDs depend on order: {ids_ab} vs {ids_ba}"
			
 
				+
			
 
				+
			
 
				+def test_orphan_merge_deduplicates_shared_articles():
			
 
				+    """When two clusters end up with overlapping article sets (e.g. because
			
 
				+    embeddings were temporarily unavailable), the post-clustering merge pass
			
 
				+    should combine them into one."""
			
 
				+    from news_mcp.dedup.cluster import _merge_orphan_clusters
			
 
				+
			
 
				+    clusters = [
			
 
				+        {
			
 
				+            "cluster_id": "aaa",
			
 
				+            "topic": "crypto",
			
 
				+            "headline": "Bitcoin surges",
			
 
				+            "articles": [
			
 
				+                {"title": "Bitcoin surges", "url": "https://example.com/btc", "source": "A"},
			
 
				+            ],
			
 
				+            "sources": ["A"],
			
 
				+            "first_seen": "T1",
			
 
				+            "last_updated": "T1",
			
 
				+        },
			
 
				+        {
			
 
				+            "cluster_id": "bbb",
			
 
				+            "topic": "crypto",
			
 
				+            "headline": "BTC up",
			
 
				+            "articles": [
			
 
				+                {"title": "BTC up", "url": "https://example.com/btc", "source": "B"},
			
 
				+            ],
			
 
				+            "sources": ["B"],
			
 
				+            "first_seen": "T2",
			
 
				+            "last_updated": "T2",
			
 
				+        },
			
 
				+    ]
			
 
				+    merged = _merge_orphan_clusters(clusters)
			
 
				+    assert len(merged) == 1, f"Expected 1 merged cluster, got {len(merged)}"
			
 
				+    assert set(merged[0]["sources"]) == {"A", "B"}
			
 
				+
			
 
				+
			
 
				+def test_orphan_merge_preserves_distinct_clusters():
			
 
				+    """Clusters with no shared articles must remain independent."""
			
 
				+    from news_mcp.dedup.cluster import _merge_orphan_clusters
			
 
				+
			
 
				+    clusters = [
			
 
				+        {
			
 
				+            "cluster_id": "aaa",
			
 
				+            "topic": "crypto",
			
 
				+            "headline": "Bitcoin surges",
			
 
				+            "articles": [
			
 
				+                {"title": "Bitcoin surges", "url": "https://example.com/btc", "source": "A"},
			
 
				+            ],
			
 
				+            "sources": ["A"],
			
 
				+            "first_seen": "T1",
			
 
				+            "last_updated": "T1",
			
 
				+        },
			
 
				+        {
			
 
				+            "cluster_id": "bbb",
			
 
				+            "topic": "crypto",
			
 
				+            "headline": "Ethereum merge",
			
 
				+            "articles": [
			
 
				+                {"title": "Ethereum merge", "url": "https://example.com/eth", "source": "B"},
			
 
				+            ],
			
 
				+            "sources": ["B"],
			
 
				+            "first_seen": "T2",
			
 
				+            "last_updated": "T2",
			
 
				+        },
			
 
				+    ]
			
 
				+    merged = _merge_orphan_clusters(clusters)
			
 
				+    assert len(merged) == 2
			
 
				+
			
 
				+
			
 
				+def test_stable_id_same_for_different_titles_same_url():
			
 
				+    """Two articles with the same URL but different titles (e.g. corrected
			
 
				+    headline) must produce the same cluster_id."""
			
 
				+    from news_mcp.dedup.cluster import _stable_cluster_id
			
 
				+
			
 
				+    arts_a = [
			
 
				+        {"title": "Fed Raises Rates", "url": "https://example.com/fed-rates"},
			
 
				+    ]
			
 
				+    arts_b = [
			
 
				+        {"title": "Federal Reserve Increases Interest Rates", "url": "https://example.com/fed-rates"},
			
 
				+    ]
			
 
				+    id_a = _stable_cluster_id("macro", arts_a)
			
 
				+    id_b = _stable_cluster_id("macro", arts_b)
			
 
				+    assert id_a == id_b, f"Same URL must give same cluster_id: {id_a} vs {id_b}"
			
 
				+
			
 
				+
			
 
				+def test_temporal_gate_excludes_stale_clusters():
			
 
				+    """Clusters older than max_age_hours should not be candidates for merging."""
			
 
				+    from news_mcp.dedup.cluster import _cluster_is_within_age_window
			
 
				+
			
 
				+    old_cluster = {
			
 
				+        "cluster_id": "old",
			
 
				+        "topic": "crypto",
			
 
				+        "last_updated": "2025-01-01T00:00:00+00:00",
			
 
				+        "articles": [],
			
 
				+    }
			
 
				+    assert not _cluster_is_within_age_window(old_cluster, max_age_hours=4)
			
 
				+
			
 
				+    recent_cluster = {
			
 
				+        "cluster_id": "recent",
			
 
				+        "topic": "crypto",
			
 
				+        "last_updated": datetime.now(timezone.utc).isoformat(),
			
 
				+        "articles": [],
			
 
				+    }
			
 
				+    assert _cluster_is_within_age_window(recent_cluster, max_age_hours=4)
			
 
				+
			
 
				+    # max_age_hours=0 means no limit
			
 
				+    assert _cluster_is_within_age_window(old_cluster, max_age_hours=0)
			
 
				+
			
 
				+
			
 
				+def test_preseed_merge_into_existing_cluster():
			
 
				+    """When existing_clusters is provided, a new article that matches should
			
 
				+    merge into the existing cluster instead of creating a new one."""
			
 
				+    from news_mcp.dedup import cluster as dc
			
 
				+
			
 
				+    existing = [{
			
 
				+        "cluster_id": "existing-1",
			
 
				+        "topic": "other",
			
 
				+        "headline": "Trump warns Iran war could spread across Middle East",
			
 
				+        "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
			
 
				+        "sources": ["Reuters"],
			
 
				+        "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
			
 
				+        "last_updated": datetime.now(timezone.utc).isoformat(),
			
 
				+        "first_seen": "Mon, 30 Mar 2026 12:00:00 GMT",
			
 
				+        "articles": [
			
 
				+            {
			
 
				+                "title": "Trump warns Iran war could spread across Middle East",
			
 
				+                "url": "https://example.com/trump-iran",
			
 
				+                "source": "Reuters",
			
 
				+                "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
			
 
				+                "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
			
 
				+            }
			
 
				+        ],
			
 
				+        "entities": [],
			
 
				+        "sentiment": "neutral",
			
 
				+        "importance": 0.0,
			
 
				+    }]
			
 
				+
			
 
				+    new_article = {
			
 
				+        "title": "Trump warns Iran conflict could spread across Middle East",
			
 
				+        "url": "https://example.com/trump-iran-2",
			
 
				+        "source": "Bloomberg",
			
 
				+        "timestamp": "Mon, 30 Mar 2026 13:00:00 GMT",
			
 
				+        "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
			
 
				+    }
			
 
				+
			
 
				+    # Use a low title threshold so Jaccard can catch the merge
			
 
				+    clustered = dc.dedup_and_cluster_articles(
			
 
				+        [new_article],
			
 
				+        similarity_threshold=0.75,
			
 
				+        existing_clusters=existing,
			
 
				+        max_age_hours=4,
			
 
				+    )
			
 
				+
			
 
				+    all_clusters = [c for clusters in clustered.values() for c in clusters]
			
 
				+    # Should have exactly 1 cluster (the existing one, now with 2 articles)
			
 
				+    assert len(all_clusters) == 1, f"Expected 1 cluster, got {len(all_clusters)}: {[c['headline'] for c in all_clusters]}"
			
 
				+    assert len(all_clusters[0]["articles"]) == 2