Преглед изворни кода

fix: stable cluster IDs, cross-cycle merge, orphan dedup, temporal gating

- Stable cluster_id from min article key (order-independent)
- Post-clustering orphan merge pass: Union-Find on shared article keys
- Poller pre-seeds clustering with recent DB clusters (NEWS_CLUSTER_MAX_AGE_HOURS)
- Temporal gate on existing clusters before comparison
- All 38 tests pass
Lukas Goldschmidt пре 1 недеља
родитељ
комит
2e0e9643b7
4 измењених фајлова са 442 додато и 13 уклоњено
  1. 4 0
      news_mcp/config.py
  2. 181 9
      news_mcp/dedup/cluster.py
  3. 52 2
      news_mcp/jobs/poller.py
  4. 205 2
      test_news_mcp.py

+ 4 - 0
news_mcp/config.py

@@ -48,6 +48,10 @@ OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", os.getenv("OLLAMA_URL", "http://1
 OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
 OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
 NEWS_EMBEDDING_SIMILARITY_THRESHOLD = float(os.getenv("NEWS_EMBEDDING_SIMILARITY_THRESHOLD", "0.885"))
 NEWS_EMBEDDING_SIMILARITY_THRESHOLD = float(os.getenv("NEWS_EMBEDDING_SIMILARITY_THRESHOLD", "0.885"))
 
 
+# Cluster merge window: how far back (hours) to load existing clusters from
+# the DB for cross-cycle merging.  0 = disabled (no cross-cycle merge).
+NEWS_CLUSTER_MAX_AGE_HOURS = float(os.getenv("NEWS_CLUSTER_MAX_AGE_HOURS", "4"))
+
 NEWS_REFRESH_INTERVAL_SECONDS = int(os.getenv("NEWS_REFRESH_INTERVAL_SECONDS", "900"))
 NEWS_REFRESH_INTERVAL_SECONDS = int(os.getenv("NEWS_REFRESH_INTERVAL_SECONDS", "900"))
 NEWS_BACKGROUND_REFRESH_ENABLED = os.getenv("NEWS_BACKGROUND_REFRESH_ENABLED", "true").lower() == "true"
 NEWS_BACKGROUND_REFRESH_ENABLED = os.getenv("NEWS_BACKGROUND_REFRESH_ENABLED", "true").lower() == "true"
 NEWS_BACKGROUND_REFRESH_ON_START = os.getenv("NEWS_BACKGROUND_REFRESH_ON_START", "true").lower() == "true"
 NEWS_BACKGROUND_REFRESH_ON_START = os.getenv("NEWS_BACKGROUND_REFRESH_ON_START", "true").lower() == "true"

+ 181 - 9
news_mcp/dedup/cluster.py

@@ -3,11 +3,16 @@ from __future__ import annotations
 import asyncio
 import asyncio
 import hashlib
 import hashlib
 import re
 import re
+from datetime import datetime, timezone, timedelta
 from difflib import SequenceMatcher
 from difflib import SequenceMatcher
 from typing import Any, Dict, List
 from typing import Any, Dict, List
 from urllib.parse import urlparse
 from urllib.parse import urlparse
 
 
-from news_mcp.config import NEWS_EMBEDDINGS_ENABLED, NEWS_EMBEDDING_SIMILARITY_THRESHOLD
+from news_mcp.config import (
+    NEWS_EMBEDDINGS_ENABLED,
+    NEWS_EMBEDDING_SIMILARITY_THRESHOLD,
+    NEWS_CLUSTER_MAX_AGE_HOURS,
+)
 from news_mcp.dedup.embedding_support import cosine_similarity, ollama_embed
 from news_mcp.dedup.embedding_support import cosine_similarity, ollama_embed
 from news_mcp.sources.news_feeds import normalize_topic_from_title
 from news_mcp.sources.news_feeds import normalize_topic_from_title
 
 
@@ -117,6 +122,60 @@ def _is_match(signals: dict, *, embeddings_enabled: bool) -> tuple[bool, str, fl
     return False, "none", 0.0
     return False, "none", 0.0
 
 
 
 
+# ---------------------------------------------------------------------------
+# Stable cluster ID
+# ---------------------------------------------------------------------------
+
+def _stable_cluster_id(topic: str, articles: List[Dict[str, Any]]) -> str:
+    """Deterministic cluster ID derived from the topic and the sorted set of
+    article keys.  Only the lexicographically smallest key is hashed (together
+    with the topic) as the seed, so no matter which article arrives first, the
+    same set of articles always maps to the same cluster_id."""
+    keys = sorted(_article_key(a) for a in articles if _article_key(a))
+    if not keys:
+        # Degenerate fallback — no article produced a usable key; hash the topic alone
+        return hashlib.sha1(topic.encode("utf-8")).hexdigest()
+    seed = keys[0]
+    return hashlib.sha1(f"{topic}|{seed}".encode("utf-8")).hexdigest()
+
+
+# ---------------------------------------------------------------------------
+# Temporal gating
+# ---------------------------------------------------------------------------
+
+def _parse_ts(ts_str: str) -> datetime | None:
+    if not ts_str:
+        return None
+    try:
+        s = str(ts_str).replace("Z", "+00:00")
+        dt = datetime.fromisoformat(s)
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt.astimezone(timezone.utc)
+    except Exception:
+        pass
+    try:
+        from email.utils import parsedate_to_datetime
+        dt = parsedate_to_datetime(str(ts_str))
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt.astimezone(timezone.utc)
+    except Exception:
+        return None
+
+
+def _cluster_is_within_age_window(cluster: Dict[str, Any], *, max_age_hours: float) -> bool:
+    """Return True if the cluster's last_updated is within the merge window."""
+    if max_age_hours <= 0:
+        return True  # 0 (or any non-positive value) = no limit
+    ts_str = cluster.get("last_updated") or cluster.get("timestamp") or ""
+    dt = _parse_ts(ts_str)
+    if dt is None:
+        return True  # be lenient with unparseable timestamps
+    cutoff = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
+    return dt >= cutoff
+
+
 # ---------------------------------------------------------------------------
 # ---------------------------------------------------------------------------
 # Embedding pre-computation (async internally)
 # Embedding pre-computation (async internally)
 # ---------------------------------------------------------------------------
 # ---------------------------------------------------------------------------
@@ -175,6 +234,97 @@ def _compute_embeddings_sync(
         return future.result()
         return future.result()
 
 
 
 
+# ---------------------------------------------------------------------------
+# Orphan merge: detect clusters sharing articles and merge them
+# ---------------------------------------------------------------------------
+
+def _merge_orphan_clusters(
+    clusters: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Post-clustering pass: merge clusters that share article keys.
+
+    This handles the case where two articles about the same event didn't match
+    during the main loop (e.g. embeddings were temporarily unavailable) and
+    ended up in separate clusters.  If two clusters share >= 1 article key, we
+    merge them into one, keeping the earliest first_seen and the latest
+    last_updated, and recomputing the stable ID from the union of articles.
+    """
+    if len(clusters) <= 1:
+        return clusters
+
+    # Build index: article_key -> list of cluster indices
+    key_to_indices: dict[str, list[int]] = {}
+    for idx, c in enumerate(clusters):
+        for a in c.get("articles", []) or []:
+            ak = _article_key(a)
+            if ak:
+                key_to_indices.setdefault(ak, []).append(idx)
+
+    # Find connected components via Union-Find
+    parent = list(range(len(clusters)))
+
+    def find(x: int) -> int:
+        while parent[x] != x:
+            parent[x] = parent[parent[x]]
+            x = parent[x]
+        return x
+
+    def union(a: int, b: int) -> None:
+        ra, rb = find(a), find(b)
+        if ra != rb:
+            parent[ra] = rb
+
+    for indices in key_to_indices.values():
+        for i in range(1, len(indices)):
+            union(indices[0], indices[i])
+
+    # Group clusters by component
+    components: dict[int, list[int]] = {}
+    for idx in range(len(clusters)):
+        root = find(idx)
+        components.setdefault(root, []).append(idx)
+
+    merged: List[Dict[str, Any]] = []
+    for root, members in components.items():
+        if len(members) == 1:
+            merged.append(clusters[members[0]])
+            continue
+
+        # Merge all clusters in this component
+        base = dict(clusters[members[0]])
+        all_articles: list[dict] = list(base.get("articles", []) or [])
+        all_sources: list[str] = list(base.get("sources", []) or [])
+        first_seen = base.get("first_seen", "")
+        last_updated = base.get("last_updated", "")
+
+        for m_idx in members[1:]:
+            other = clusters[m_idx]
+            existing_keys = {_article_key(a) for a in all_articles}
+            for a in other.get("articles", []) or []:
+                ak = _article_key(a)
+                if ak not in existing_keys:
+                    all_articles.append(a)
+                    existing_keys.add(ak)
+            for s in other.get("sources", []) or []:
+                if s not in all_sources:
+                    all_sources.append(s)
+            fs = other.get("first_seen", "")
+            if fs and (not first_seen or fs < first_seen):
+                first_seen = fs
+            lu = other.get("last_updated", "")
+            if lu and (not last_updated or lu > last_updated):
+                last_updated = lu
+
+        base["articles"] = all_articles
+        base["sources"] = all_sources
+        base["first_seen"] = first_seen
+        base["last_updated"] = last_updated
+        base["cluster_id"] = _stable_cluster_id(base.get("topic", "other"), all_articles)
+        merged.append(base)
+
+    return merged
+
+
 # ---------------------------------------------------------------------------
 # ---------------------------------------------------------------------------
 # Public API (sync — backward compatible with tests)
 # Public API (sync — backward compatible with tests)
 # ---------------------------------------------------------------------------
 # ---------------------------------------------------------------------------
@@ -183,16 +333,22 @@ def _compute_embeddings_sync(
 def dedup_and_cluster_articles(
 def dedup_and_cluster_articles(
     articles: List[Dict[str, Any]],
     articles: List[Dict[str, Any]],
     similarity_threshold: float | None = None,
     similarity_threshold: float | None = None,
+    *,
+    existing_clusters: List[Dict[str, Any]] | None = None,
+    max_age_hours: float = 0,
 ) -> Dict[str, List[Dict[str, Any]]]:
 ) -> Dict[str, List[Dict[str, Any]]]:
     """Deduplicate raw articles into clusters keyed by topic.
     """Deduplicate raw articles into clusters keyed by topic.
 
 
-    v1.2: embedding pre-computation is async/concurrent under the hood, but
-    this public function remains synchronous for backward compatibility.
+    v1.3: stable cluster IDs, temporal gating, and orphan merge.
 
 
-    A pair merges if ANY signal clears its threshold:
-      * title fuzzy ratio
-      * token Jaccard over headline+summary
-      * Ollama embedding cosine when available
+    Args:
+        articles: new articles to cluster.
+        similarity_threshold: override for the title-similarity threshold.
+        existing_clusters: optional list of recent clusters from the DB to
+            merge against (cross-cycle merge).  When provided, temporal
+            gating via max_age_hours is applied to filter these.
+        max_age_hours: only compare against existing_clusters updated within
+            this many hours.  0 = no limit (compare against all provided).
     """
     """
 
 
     title_threshold = similarity_threshold if similarity_threshold is not None else DEFAULT_TITLE_THRESHOLD
     title_threshold = similarity_threshold if similarity_threshold is not None else DEFAULT_TITLE_THRESHOLD
@@ -204,6 +360,14 @@ def dedup_and_cluster_articles(
 
 
     by_topic: Dict[str, List[Dict[str, Any]]] = {}
     by_topic: Dict[str, List[Dict[str, Any]]] = {}
 
 
+    # Seed with existing clusters (filtered by age window)
+    if existing_clusters:
+        for c in existing_clusters:
+            if not _cluster_is_within_age_window(c, max_age_hours=max_age_hours):
+                continue
+            topic = c.get("topic", "other") or "other"
+            by_topic.setdefault(topic, []).append(dict(c))
+
     for a in articles:
     for a in articles:
         title = a.get("title") or ""
         title = a.get("title") or ""
         if not title:
         if not title:
@@ -262,8 +426,7 @@ def dedup_and_cluster_articles(
                 {"signal": best_signal_name, "value": round(best_signal_value, 3)}
                 {"signal": best_signal_name, "value": round(best_signal_value, 3)}
             )
             )
         else:
         else:
-            key = f"{topic}|{_normalize_title(title)}"
-            cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
+            cid = _stable_cluster_id(topic, [a])
             cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
             cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
             clusters.append(
             clusters.append(
                 {
                 {
@@ -284,6 +447,15 @@ def dedup_and_cluster_articles(
                 }
                 }
             )
             )
 
 
+    # Post-clustering passes per topic
+    for topic, clusters in by_topic.items():
+        # Merge orphans (clusters that share articles)
+        clusters = _merge_orphan_clusters(clusters)
+        # Recompute stable IDs from the final article sets
+        for c in clusters:
+            c["cluster_id"] = _stable_cluster_id(topic, c.get("articles", []) or [])
+        by_topic[topic] = clusters
+
     # Strip the internal merge audit trail before returning
     # Strip the internal merge audit trail before returning
     for clusters in by_topic.values():
     for clusters in by_topic.values():
         for c in clusters:
         for c in clusters:

+ 52 - 2
news_mcp/jobs/poller.py

@@ -5,7 +5,7 @@ import hashlib
 import logging
 import logging
 import sys
 import sys
 from collections import defaultdict
 from collections import defaultdict
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
 from typing import Any, Dict
 from typing import Any, Dict
 
 
 from news_mcp.config import (
 from news_mcp.config import (
@@ -20,6 +20,7 @@ from news_mcp.config import (
     NEWS_PRUNE_INTERVAL_HOURS,
     NEWS_PRUNE_INTERVAL_HOURS,
     NEWS_PRUNING_ENABLED,
     NEWS_PRUNING_ENABLED,
     NEWS_RETENTION_DAYS,
     NEWS_RETENTION_DAYS,
+    NEWS_CLUSTER_MAX_AGE_HOURS,
     llm_concurrency,
     llm_concurrency,
 )
 )
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
@@ -147,6 +148,32 @@ async def _enrich_topic_clusters(
     return enriched
     return enriched
 
 
 
 
+def _cluster_age_ok(cluster: dict, max_age_hours: float) -> bool:
+    """Check whether a cluster's last_updated is within the merge window."""
+    if max_age_hours <= 0:
+        return True
+    ts_str = cluster.get("last_updated") or cluster.get("timestamp") or ""
+    if not ts_str:
+        return True
+    try:
+        s = str(ts_str).replace("Z", "+00:00")
+        dt = datetime.fromisoformat(s)
+        if dt.tzinfo is None:
+            dt = dt.replace(tzinfo=timezone.utc)
+        dt = dt.astimezone(timezone.utc)
+    except Exception:
+        try:
+            from email.utils import parsedate_to_datetime
+            dt = parsedate_to_datetime(str(ts_str))
+            if dt.tzinfo is None:
+                dt = dt.replace(tzinfo=timezone.utc)
+            dt = dt.astimezone(timezone.utc)
+        except Exception:
+            return True
+    cutoff = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
+    return dt >= cutoff
+
+
 async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
 async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
     logger = logging.getLogger("news_mcp.refresh")
     logger = logging.getLogger("news_mcp.refresh")
     store = SQLiteClusterStore(DB_PATH)
     store = SQLiteClusterStore(DB_PATH)
@@ -205,9 +232,32 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
 
 
     articles = changed_articles
     articles = changed_articles
     logger.info("refresh clustering start articles=%s topic=%s", len(articles), topic)
     logger.info("refresh clustering start articles=%s topic=%s", len(articles), topic)
+
+    # Pre-seed with recent clusters from the DB so new articles can merge
+    # into existing clusters across polling cycles.
+    max_age = NEWS_CLUSTER_MAX_AGE_HOURS
+    recent_clusters: list[dict] = []
+    if max_age != 0:
+        lookback = max_age if max_age > 0 else 72
+        all_recent = store.get_latest_clusters_all_topics(
+            ttl_hours=lookback,
+            limit=500,
+        )
+        recent_clusters = [c for c in all_recent if _cluster_age_ok(c, max_age)]
+        logger.info(
+            "refresh pre-seeded existing_clusters=%s max_age_h=%s",
+            len(recent_clusters), max_age,
+        )
+
     # Clustering is sync but may do concurrent embedding fetches internally.
     # Clustering is sync but may do concurrent embedding fetches internally.
     # Run off-thread so the event loop stays responsive for MCP tool calls.
     # Run off-thread so the event loop stays responsive for MCP tool calls.
-    clustered_by_topic = await asyncio.to_thread(dedup_and_cluster_articles, articles)
+    clustered_by_topic = await asyncio.to_thread(
+        dedup_and_cluster_articles,
+        articles,
+        None,  # use default similarity_threshold
+        existing_clusters=recent_clusters if recent_clusters else None,
+        max_age_hours=max_age,
+    )
     logger.info("refresh clustered topics=%s", list(clustered_by_topic.keys()))
     logger.info("refresh clustered topics=%s", list(clustered_by_topic.keys()))
 
 
     # Build LLM concurrency semaphore from the extract provider's config.
     # Build LLM concurrency semaphore from the extract provider's config.

+ 205 - 2
test_news_mcp.py

@@ -3,6 +3,7 @@ from __future__ import annotations
 from contextlib import contextmanager
 from contextlib import contextmanager
 import tempfile
 import tempfile
 from pathlib import Path
 from pathlib import Path
+from datetime import datetime, timezone
 
 
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
@@ -14,7 +15,10 @@ from news_mcp.trends_resolution import resolve_entity_via_trends
 from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency
 from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency
 
 
 
 
-def _article(title: str, url: str = "https://example.com/x", source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
+def _article(title: str, url: str = None, source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
+    if url is None:
+        import hashlib
+        url = f"https.example.com/{hashlib.md5(title.encode()).hexdigest()[:10]}"
     return {
     return {
         "title": title,
         "title": title,
         "url": url,
         "url": url,
@@ -385,6 +389,9 @@ def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
             self.meta["prune"] = kwargs
             self.meta["prune"] = kwargs
             return {"deleted": 0}
             return {"deleted": 0}
 
 
+        def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
+            return []
+
         def set_meta(self, key, value):
         def set_meta(self, key, value):
             self.meta[key] = value
             self.meta[key] = value
 
 
@@ -637,13 +644,16 @@ def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
         def get_failed_enrichment_clusters(self, max_retries=3):
         def get_failed_enrichment_clusters(self, max_retries=3):
             return []
             return []
 
 
+        def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
+            return []
+
         def set_meta(self, key, value):
         def set_meta(self, key, value):
             pass
             pass
 
 
         def set_feed_state(self, feed_key, last_hash, item_count):
         def set_feed_state(self, feed_key, last_hash, item_count):
             pass
             pass
 
 
-    def fake_cluster(articles):
+    def fake_cluster(articles, similarity_threshold=None, existing_clusters=None, max_age_hours=0):
         # Heuristic put it in "other" (no crypto/macro/regulation/ai keywords
         # Heuristic put it in "other" (no crypto/macro/regulation/ai keywords
         # in the title for the heuristic matcher — title above does have
         # in the title for the heuristic matcher — title above does have
         # "law"-adjacent words but not the specific tokens it matches).
         # "law"-adjacent words but not the specific tokens it matches).
@@ -706,3 +716,196 @@ def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
         f"Expected SQL row topic to follow the LLM's classification 'regulation', got {upsert['row_topic']!r}"
         f"Expected SQL row topic to follow the LLM's classification 'regulation', got {upsert['row_topic']!r}"
     )
     )
     assert upsert["payload_topic"] == "regulation"
     assert upsert["payload_topic"] == "regulation"
+
+
+# ---------------------------------------------------------------------------
+# v1.3 — Stable cluster IDs, orphan merge, temporal gating
+# ---------------------------------------------------------------------------
+
+
+def test_stable_cluster_id_is_order_independent():
+    """Two articles about the same event should always get the same cluster_id,
+    regardless of which article is processed first."""
+    from news_mcp.dedup import cluster as dc
+
+    art_a = {
+        "title": "Bitcoin Surges Past $100K",
+        "url": "https://example.com/btc-100k",
+        "source": "Reuters",
+        "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
+        "summary": "Bitcoin reached $100,000 for the first time.",
+    }
+    art_b = {
+        "title": "BTC Breaks $100,000 Barrier",
+        "url": "https://example.com/btc-100k",
+        "source": "Bloomberg",
+        "timestamp": "Mon, 30 Mar 2026 12:05:00 GMT",
+        "summary": "Bitcoin topped the $100,000 level.",
+    }
+
+    # Process A first
+    clustered_ab = dc.dedup_and_cluster_articles([art_a, art_b])
+    # Process B first
+    clustered_ba = dc.dedup_and_cluster_articles([art_b, art_a])
+
+    # Both orderings must produce the same cluster_id(s)
+    ids_ab = sorted(c["cluster_id"] for clusters in clustered_ab.values() for c in clusters)
+    ids_ba = sorted(c["cluster_id"] for clusters in clustered_ba.values() for c in clusters)
+    assert ids_ab == ids_ba, f"Cluster IDs depend on order: {ids_ab} vs {ids_ba}"
+
+
+def test_orphan_merge_deduplicates_shared_articles():
+    """When two clusters end up with overlapping article sets (e.g. because
+    embeddings were temporarily unavailable), the post-clustering merge pass
+    should combine them into one."""
+    from news_mcp.dedup.cluster import _merge_orphan_clusters
+
+    clusters = [
+        {
+            "cluster_id": "aaa",
+            "topic": "crypto",
+            "headline": "Bitcoin surges",
+            "articles": [
+                {"title": "Bitcoin surges", "url": "https://example.com/btc", "source": "A"},
+            ],
+            "sources": ["A"],
+            "first_seen": "T1",
+            "last_updated": "T1",
+        },
+        {
+            "cluster_id": "bbb",
+            "topic": "crypto",
+            "headline": "BTC up",
+            "articles": [
+                {"title": "BTC up", "url": "https://example.com/btc", "source": "B"},
+            ],
+            "sources": ["B"],
+            "first_seen": "T2",
+            "last_updated": "T2",
+        },
+    ]
+    merged = _merge_orphan_clusters(clusters)
+    assert len(merged) == 1, f"Expected 1 merged cluster, got {len(merged)}"
+    assert set(merged[0]["sources"]) == {"A", "B"}
+
+
+def test_orphan_merge_preserves_distinct_clusters():
+    """Clusters with no shared articles must remain independent."""
+    from news_mcp.dedup.cluster import _merge_orphan_clusters
+
+    clusters = [
+        {
+            "cluster_id": "aaa",
+            "topic": "crypto",
+            "headline": "Bitcoin surges",
+            "articles": [
+                {"title": "Bitcoin surges", "url": "https://example.com/btc", "source": "A"},
+            ],
+            "sources": ["A"],
+            "first_seen": "T1",
+            "last_updated": "T1",
+        },
+        {
+            "cluster_id": "bbb",
+            "topic": "crypto",
+            "headline": "Ethereum merge",
+            "articles": [
+                {"title": "Ethereum merge", "url": "https://example.com/eth", "source": "B"},
+            ],
+            "sources": ["B"],
+            "first_seen": "T2",
+            "last_updated": "T2",
+        },
+    ]
+    merged = _merge_orphan_clusters(clusters)
+    assert len(merged) == 2
+
+
+def test_stable_id_same_for_different_titles_same_url():
+    """Two articles with the same URL but different titles (e.g. corrected
+    headline) must produce the same cluster_id."""
+    from news_mcp.dedup.cluster import _stable_cluster_id
+
+    arts_a = [
+        {"title": "Fed Raises Rates", "url": "https://example.com/fed-rates"},
+    ]
+    arts_b = [
+        {"title": "Federal Reserve Increases Interest Rates", "url": "https://example.com/fed-rates"},
+    ]
+    id_a = _stable_cluster_id("macro", arts_a)
+    id_b = _stable_cluster_id("macro", arts_b)
+    assert id_a == id_b, f"Same URL must give same cluster_id: {id_a} vs {id_b}"
+
+
+def test_temporal_gate_excludes_stale_clusters():
+    """Clusters older than max_age_hours should not be candidates for merging."""
+    from news_mcp.dedup.cluster import _cluster_is_within_age_window
+
+    old_cluster = {
+        "cluster_id": "old",
+        "topic": "crypto",
+        "last_updated": "2025-01-01T00:00:00+00:00",
+        "articles": [],
+    }
+    assert not _cluster_is_within_age_window(old_cluster, max_age_hours=4)
+
+    recent_cluster = {
+        "cluster_id": "recent",
+        "topic": "crypto",
+        "last_updated": datetime.now(timezone.utc).isoformat(),
+        "articles": [],
+    }
+    assert _cluster_is_within_age_window(recent_cluster, max_age_hours=4)
+
+    # max_age_hours=0 means no limit
+    assert _cluster_is_within_age_window(old_cluster, max_age_hours=0)
+
+
+def test_preseed_merge_into_existing_cluster():
+    """When existing_clusters is provided, a new article that matches should
+    merge into the existing cluster instead of creating a new one."""
+    from news_mcp.dedup import cluster as dc
+
+    existing = [{
+        "cluster_id": "existing-1",
+        "topic": "other",
+        "headline": "Trump warns Iran war could spread across Middle East",
+        "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
+        "sources": ["Reuters"],
+        "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
+        "last_updated": datetime.now(timezone.utc).isoformat(),
+        "first_seen": "Mon, 30 Mar 2026 12:00:00 GMT",
+        "articles": [
+            {
+                "title": "Trump warns Iran war could spread across Middle East",
+                "url": "https://example.com/trump-iran",
+                "source": "Reuters",
+                "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
+                "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
+            }
+        ],
+        "entities": [],
+        "sentiment": "neutral",
+        "importance": 0.0,
+    }]
+
+    new_article = {
+        "title": "Trump warns Iran conflict could spread across Middle East",
+        "url": "https://example.com/trump-iran-2",
+        "source": "Bloomberg",
+        "timestamp": "Mon, 30 Mar 2026 13:00:00 GMT",
+        "summary": "Trump warns Iran war could spread across the Middle East amid rising tensions.",
+    }
+
+    # Use a low title threshold so Jaccard can catch the merge
+    clustered = dc.dedup_and_cluster_articles(
+        [new_article],
+        similarity_threshold=0.75,
+        existing_clusters=existing,
+        max_age_hours=4,
+    )
+
+    all_clusters = [c for clusters in clustered.values() for c in clusters]
+    # Should have exactly 1 cluster (the existing one, now with 2 articles)
+    assert len(all_clusters) == 1, f"Expected 1 cluster, got {len(all_clusters)}: {[c['headline'] for c in all_clusters]}"
+    assert len(all_clusters[0]["articles"]) == 2