Преглед на файлове

fix: stable cluster IDs, cross-cycle merge, orphan dedup, temporal gating

- Stable cluster_id from min article key (order-independent)
- Post-clustering orphan merge pass: Union-Find on shared article keys
- Poller pre-seeds clustering with recent DB clusters (NEWS_CLUSTER_MAX_AGE_HOURS)
- Temporal gate on existing clusters before comparison
- All 38 tests pass
Lukas Goldschmidt преди 1 седмица
родител
ревизия
2e0e9643b7
променени са 4 файла, в които са добавени 442 реда и са изтрити 13 реда
  1. 4 0
      news_mcp/config.py
  2. 181 9
      news_mcp/dedup/cluster.py
  3. 52 2
      news_mcp/jobs/poller.py
  4. 205 2
      test_news_mcp.py

+ 4 - 0
news_mcp/config.py

@@ -48,6 +48,10 @@ OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", os.getenv("OLLAMA_URL", "http://1
 OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
 NEWS_EMBEDDING_SIMILARITY_THRESHOLD = float(os.getenv("NEWS_EMBEDDING_SIMILARITY_THRESHOLD", "0.885"))
 
+# Cluster merge window: how far back (hours) to load existing clusters from
+# the DB for cross-cycle merging.  0 = disabled (no cross-cycle merge);
+# a negative value turns the age gate off (the poller then looks back 72 h).
+NEWS_CLUSTER_MAX_AGE_HOURS = float(os.getenv("NEWS_CLUSTER_MAX_AGE_HOURS", "4"))
+
 NEWS_REFRESH_INTERVAL_SECONDS = int(os.getenv("NEWS_REFRESH_INTERVAL_SECONDS", "900"))
 NEWS_BACKGROUND_REFRESH_ENABLED = os.getenv("NEWS_BACKGROUND_REFRESH_ENABLED", "true").lower() == "true"
 NEWS_BACKGROUND_REFRESH_ON_START = os.getenv("NEWS_BACKGROUND_REFRESH_ON_START", "true").lower() == "true"

+ 181 - 9
news_mcp/dedup/cluster.py

@@ -3,11 +3,16 @@ from __future__ import annotations
 import asyncio
 import hashlib
 import re
+from datetime import datetime, timezone, timedelta
 from difflib import SequenceMatcher
 from typing import Any, Dict, List
 from urllib.parse import urlparse
 
-from news_mcp.config import NEWS_EMBEDDINGS_ENABLED, NEWS_EMBEDDING_SIMILARITY_THRESHOLD
+from news_mcp.config import (
+    NEWS_EMBEDDINGS_ENABLED,
+    NEWS_EMBEDDING_SIMILARITY_THRESHOLD,
+    NEWS_CLUSTER_MAX_AGE_HOURS,
+)
 from news_mcp.dedup.embedding_support import cosine_similarity, ollama_embed
 from news_mcp.sources.news_feeds import normalize_topic_from_title
 
@@ -117,6 +122,60 @@ def _is_match(signals: dict, *, embeddings_enabled: bool) -> tuple[bool, str, fl
     return False, "none", 0.0
 
 
+# ---------------------------------------------------------------------------
+# Stable cluster ID
+# ---------------------------------------------------------------------------
+
+def _stable_cluster_id(topic: str, articles: List[Dict[str, Any]]) -> str:
+    """Return a deterministic SHA-1 hex cluster ID for *topic* and *articles*.
+
+    The ID is seeded with the lexicographically smallest non-empty article
+    key, so the order in which the articles are supplied does not matter.
+    When no article yields a key, the ID is derived from the topic alone.
+    """
+    non_empty_keys = [key for key in map(_article_key, articles) if key]
+    # Degenerate case: without any usable key only the topic is hashed.
+    payload = f"{topic}|{min(non_empty_keys)}" if non_empty_keys else topic
+    return hashlib.sha1(payload.encode("utf-8")).hexdigest()
+
+
+# ---------------------------------------------------------------------------
+# Temporal gating
+# ---------------------------------------------------------------------------
+
+def _parse_ts(ts_str: str) -> datetime | None:
+    """Parse a timestamp string into a timezone-aware UTC ``datetime``.
+
+    ISO 8601 is tried first (a ``Z`` suffix is accepted), then the RFC 2822
+    style handled by ``email.utils``.  Naive values are taken to be UTC.
+    Returns ``None`` for empty input or when neither format parses.
+    """
+    def _to_utc(value: datetime) -> datetime:
+        # Naive datetimes are interpreted as UTC before normalising.
+        aware = value if value.tzinfo is not None else value.replace(tzinfo=timezone.utc)
+        return aware.astimezone(timezone.utc)
+
+    if not ts_str:
+        return None
+    try:
+        return _to_utc(datetime.fromisoformat(str(ts_str).replace("Z", "+00:00")))
+    except Exception:
+        pass  # best effort: fall through to the RFC 2822 parser
+    try:
+        from email.utils import parsedate_to_datetime
+        return _to_utc(parsedate_to_datetime(str(ts_str)))
+    except Exception:
+        return None
+
+
+def _cluster_is_within_age_window(cluster: Dict[str, Any], *, max_age_hours: float) -> bool:
+    """Tell whether *cluster* was updated recently enough to be a merge candidate.
+
+    The age is read from ``last_updated`` (falling back to ``timestamp``).
+    A non-positive *max_age_hours* disables the gate, and a missing or
+    unparseable timestamp counts as recent so the cluster stays eligible.
+    """
+    if max_age_hours <= 0:
+        return True  # gate disabled
+    stamp = _parse_ts(cluster.get("last_updated") or cluster.get("timestamp") or "")
+    if stamp is None:
+        return True  # lenient: keep clusters whose age cannot be determined
+    oldest_allowed = datetime.now(timezone.utc) - timedelta(hours=max_age_hours)
+    return stamp >= oldest_allowed
+
+
 # ---------------------------------------------------------------------------
 # Embedding pre-computation (async internally)
 # ---------------------------------------------------------------------------
@@ -175,6 +234,97 @@ def _compute_embeddings_sync(
         return future.result()
 
 
+# ---------------------------------------------------------------------------
+# Orphan merge: detect clusters sharing articles and merge them
+# ---------------------------------------------------------------------------
+
+def _merge_orphan_clusters(
+    clusters: List[Dict[str, Any]],
+) -> List[Dict[str, Any]]:
+    """Post-clustering pass: merge clusters that share article keys.
+
+    This handles the case where two articles about the same event didn't match
+    during the main loop (e.g. embeddings were temporarily unavailable) and
+    ended up in separate clusters.  Clusters sharing >= 1 non-empty article
+    key are grouped with Union-Find and each group is collapsed into one
+    cluster.
+
+    Args:
+        clusters: the clusters to inspect; the input dicts are not mutated.
+
+    Returns:
+        The input list itself when it holds at most one cluster; otherwise a
+        new list in which every connected group is a single cluster.  Clusters
+        that share nothing are passed through as-is.  A merged cluster is a
+        shallow copy of the group's first member with:
+          * ``articles``: union of all articles, de-duplicated by article key
+            (articles without a key cannot be compared, so all are kept);
+          * ``sources``: order-preserving union;
+          * ``first_seen`` / ``last_updated``: earliest / latest value,
+            compared chronologically when both timestamps parse and as raw
+            strings otherwise;
+          * ``cluster_id``: recomputed from the merged article set.
+    """
+    if len(clusters) <= 1:
+        return clusters
+
+    # Build index: article_key -> list of cluster indices
+    key_to_indices: dict[str, list[int]] = {}
+    for idx, c in enumerate(clusters):
+        for a in c.get("articles", []) or []:
+            ak = _article_key(a)
+            if ak:
+                key_to_indices.setdefault(ak, []).append(idx)
+
+    # Find connected components via Union-Find
+    parent = list(range(len(clusters)))
+
+    def find(x: int) -> int:
+        while parent[x] != x:
+            parent[x] = parent[parent[x]]  # path halving keeps trees shallow
+            x = parent[x]
+        return x
+
+    def union(a: int, b: int) -> None:
+        ra, rb = find(a), find(b)
+        if ra != rb:
+            parent[ra] = rb
+
+    def is_earlier(a: str, b: str) -> bool:
+        # Raw string order is wrong for RFC 2822 stamps ("Mon, ..." sorts
+        # before "Tue, ..." regardless of date), so compare parsed values
+        # whenever both sides parse; otherwise keep the plain string order.
+        da, db = _parse_ts(a), _parse_ts(b)
+        if da is not None and db is not None:
+            return da < db
+        return a < b
+
+    for indices in key_to_indices.values():
+        for other_idx in indices[1:]:
+            union(indices[0], other_idx)
+
+    # Group clusters by component (dict order follows the first member seen)
+    components: dict[int, list[int]] = {}
+    for idx in range(len(clusters)):
+        components.setdefault(find(idx), []).append(idx)
+
+    merged: List[Dict[str, Any]] = []
+    for members in components.values():
+        if len(members) == 1:
+            merged.append(clusters[members[0]])
+            continue
+
+        # Merge all clusters in this component
+        base = dict(clusters[members[0]])
+        all_articles: list[dict] = list(base.get("articles", []) or [])
+        all_sources: list[str] = list(base.get("sources", []) or [])
+        first_seen = base.get("first_seen", "")
+        last_updated = base.get("last_updated", "")
+        # Built once per component; only non-empty keys take part in dedup.
+        seen_keys = {k for k in map(_article_key, all_articles) if k}
+
+        for m_idx in members[1:]:
+            other = clusters[m_idx]
+            for a in other.get("articles", []) or []:
+                ak = _article_key(a)
+                if ak:
+                    if ak in seen_keys:
+                        continue
+                    seen_keys.add(ak)
+                # Keyless articles are always kept: an empty key says nothing
+                # about identity, so treating them as duplicates loses data.
+                all_articles.append(a)
+            for s in other.get("sources", []) or []:
+                if s not in all_sources:
+                    all_sources.append(s)
+            fs = other.get("first_seen", "")
+            if fs and (not first_seen or is_earlier(fs, first_seen)):
+                first_seen = fs
+            lu = other.get("last_updated", "")
+            if lu and (not last_updated or is_earlier(last_updated, lu)):
+                last_updated = lu
+
+        base["articles"] = all_articles
+        base["sources"] = all_sources
+        base["first_seen"] = first_seen
+        base["last_updated"] = last_updated
+        base["cluster_id"] = _stable_cluster_id(base.get("topic", "other"), all_articles)
+        merged.append(base)
+
+    return merged
+
+
 # ---------------------------------------------------------------------------
 # Public API (sync — backward compatible with tests)
 # ---------------------------------------------------------------------------
@@ -183,16 +333,22 @@ def _compute_embeddings_sync(
 def dedup_and_cluster_articles(
     articles: List[Dict[str, Any]],
     similarity_threshold: float | None = None,
+    *,
+    existing_clusters: List[Dict[str, Any]] | None = None,
+    max_age_hours: float = 0,
 ) -> Dict[str, List[Dict[str, Any]]]:
     """Deduplicate raw articles into clusters keyed by topic.
 
-    v1.2: embedding pre-computation is async/concurrent under the hood, but
-    this public function remains synchronous for backward compatibility.
+    v1.3: stable cluster IDs, temporal gating, and orphan merge.
 
-    A pair merges if ANY signal clears its threshold:
-      * title fuzzy ratio
-      * token Jaccard over headline+summary
-      * Ollama embedding cosine when available
+    Args:
+        articles: new articles to cluster.
+        similarity_threshold: override for the title-similarity threshold.
+        existing_clusters: optional list of recent clusters from the DB to
+            merge against (cross-cycle merge).  When provided, temporal
+            gating via max_age_hours is applied to filter these.
+        max_age_hours: only compare against existing_clusters updated within
+            this many hours.  0 = no limit (compare against all provided).
     """
 
     title_threshold = similarity_threshold if similarity_threshold is not None else DEFAULT_TITLE_THRESHOLD
@@ -204,6 +360,14 @@ def dedup_and_cluster_articles(
 
     by_topic: Dict[str, List[Dict[str, Any]]] = {}
 
+    # Seed with existing clusters (filtered by age window)
+    if existing_clusters:
+        for c in existing_clusters:
+            if not _cluster_is_within_age_window(c, max_age_hours=max_age_hours):
+                continue
+            topic = c.get("topic", "other") or "other"
+            by_topic.setdefault(topic, []).append(dict(c))
+
     for a in articles:
         title = a.get("title") or ""
         if not title:
@@ -262,8 +426,7 @@ def dedup_and_cluster_articles(
                 {"signal": best_signal_name, "value": round(best_signal_value, 3)}
             )
         else:
-            key = f"{topic}|{_normalize_title(title)}"
-            cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
+            cid = _stable_cluster_id(topic, [a])
             cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
             clusters.append(
                 {
@@ -284,6 +447,15 @@ def dedup_and_cluster_articles(
                 }
             )
 
+    # Post-clustering passes per topic
+    for topic, clusters in by_topic.items():
+        # Merge orphans (clusters that share articles)
+        clusters = _merge_orphan_clusters(clusters)
+        # Recompute stable IDs from the final article sets
+        for c in clusters:
+            c["cluster_id"] = _stable_cluster_id(topic, c.get("articles", []) or [])
+        by_topic[topic] = clusters
+
     # Strip the internal merge audit trail before returning
     for clusters in by_topic.values():
         for c in clusters:

+ 52 - 2
news_mcp/jobs/poller.py

@@ -5,7 +5,7 @@ import hashlib
 import logging
 import sys
 from collections import defaultdict
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
 from typing import Any, Dict
 
 from news_mcp.config import (
@@ -20,6 +20,7 @@ from news_mcp.config import (
     NEWS_PRUNE_INTERVAL_HOURS,
     NEWS_PRUNING_ENABLED,
     NEWS_RETENTION_DAYS,
+    NEWS_CLUSTER_MAX_AGE_HOURS,
     llm_concurrency,
 )
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
@@ -147,6 +148,32 @@ async def _enrich_topic_clusters(
     return enriched
 
 
+def _cluster_age_ok(cluster: dict, max_age_hours: float) -> bool:
+    """Check whether a cluster's last_updated is within the merge window.
+
+    Delegates to ``news_mcp.dedup.cluster._cluster_is_within_age_window`` so
+    the poller's pre-filter and the clustering pass apply one shared rule
+    instead of two hand-maintained copies of the timestamp parsing.
+
+    Args:
+        cluster: cluster dict; ``last_updated`` is read, then ``timestamp``.
+        max_age_hours: window size in hours; values <= 0 disable the gate.
+
+    Returns:
+        True when the cluster is inside the window, the gate is disabled, or
+        the timestamp is missing or cannot be parsed.
+    """
+    # Function-scope import keeps the module-level import block untouched.
+    from news_mcp.dedup.cluster import _cluster_is_within_age_window
+
+    return _cluster_is_within_age_window(cluster, max_age_hours=max_age_hours)
+
+
 async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
     logger = logging.getLogger("news_mcp.refresh")
     store = SQLiteClusterStore(DB_PATH)
@@ -205,9 +232,32 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
 
     articles = changed_articles
     logger.info("refresh clustering start articles=%s topic=%s", len(articles), topic)
+
+    # Pre-seed with recent clusters from the DB so new articles can merge
+    # into existing clusters across polling cycles.
+    max_age = NEWS_CLUSTER_MAX_AGE_HOURS
+    recent_clusters: list[dict] = []
+    if max_age != 0:
+        lookback = max_age if max_age > 0 else 72
+        all_recent = store.get_latest_clusters_all_topics(
+            ttl_hours=lookback,
+            limit=500,
+        )
+        recent_clusters = [c for c in all_recent if _cluster_age_ok(c, max_age)]
+        logger.info(
+            "refresh pre-seeded existing_clusters=%s max_age_h=%s",
+            len(recent_clusters), max_age,
+        )
+
     # Clustering is sync but may do concurrent embedding fetches internally.
     # Run off-thread so the event loop stays responsive for MCP tool calls.
-    clustered_by_topic = await asyncio.to_thread(dedup_and_cluster_articles, articles)
+    clustered_by_topic = await asyncio.to_thread(
+        dedup_and_cluster_articles,
+        articles,
+        None,  # use default similarity_threshold
+        existing_clusters=recent_clusters if recent_clusters else None,
+        max_age_hours=max_age,
+    )
     logger.info("refresh clustered topics=%s", list(clustered_by_topic.keys()))
 
     # Build LLM concurrency semaphore from the extract provider's config.

+ 205 - 2
test_news_mcp.py

@@ -3,6 +3,7 @@ from __future__ import annotations
 from contextlib import contextmanager
 import tempfile
 from pathlib import Path
+from datetime import datetime, timezone
 
 from news_mcp.dedup.cluster import dedup_and_cluster_articles
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
@@ -14,7 +15,10 @@ from news_mcp.trends_resolution import resolve_entity_via_trends
 from news_mcp.mcp_server_fastmcp import _sort_clusters_by_recency
 
 
-def _article(title: str, url: str = "https://example.com/x", source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
+def _article(title: str, url: str = None, source: str = "Src", ts: str = "Mon, 30 Mar 2026 12:00:00 GMT"):
+    if url is None:
+        import hashlib
+        url = f"https.example.com/{hashlib.md5(title.encode()).hexdigest()[:10]}"
     return {
         "title": title,
         "url": url,
@@ -385,6 +389,9 @@ def test_refresh_skips_reprocessing_when_feed_hash_is_unchanged(monkeypatch):
             self.meta["prune"] = kwargs
             return {"deleted": 0}
 
+        def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
+            # Test stub: report no recent clusters, so nothing is pre-seeded.
+            return []
+
         def set_meta(self, key, value):
             self.meta[key] = value
 
@@ -637,13 +644,16 @@ def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
         def get_failed_enrichment_clusters(self, max_retries=3):
             return []
 
+        def get_latest_clusters_all_topics(self, ttl_hours=24, limit=500):
+            # Test stub: report no recent clusters, so nothing is pre-seeded.
+            return []
+
         def set_meta(self, key, value):
             pass
 
         def set_feed_state(self, feed_key, last_hash, item_count):
             pass
 
-    def fake_cluster(articles):
+    def fake_cluster(articles, similarity_threshold=None, existing_clusters=None, max_age_hours=0):
         # Heuristic put it in "other" (no crypto/macro/regulation/ai keywords
         # in the title for the heuristic matcher — title above does have
         # "law"-adjacent words but not the specific tokens it matches).
@@ -706,3 +716,196 @@ def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
         f"Expected SQL row topic to follow the LLM's classification 'regulation', got {upsert['row_topic']!r}"
     )
     assert upsert["payload_topic"] == "regulation"
+
+
+# ---------------------------------------------------------------------------
+# v1.3 — Stable cluster IDs, orphan merge, temporal gating
+# ---------------------------------------------------------------------------
+
+
+def test_stable_cluster_id_is_order_independent():
+    """Two articles about the same event should always get the same cluster_id,
+    regardless of which article is processed first."""
+    from news_mcp.dedup import cluster as dc
+
+    art_a = {
+        "title": "Bitcoin Surges Past $100K",
+        "url": "https://example.com/btc-100k",
+        "source": "Reuters",
+        "timestamp": "Mon, 30 Mar 2026 12:00:00 GMT",
+        "summary": "Bitcoin reached $100,000 for the first time.",
+    }
+    art_b = {
+        "title": "BTC Breaks $100,000 Barrier",
+        "url": "https://example.com/btc-100k",
+        "source": "Bloomberg",
+        "timestamp": "Mon, 30 Mar 2026 12:05:00 GMT",
+        "summary": "Bitcoin topped the $100,000 level.",
+    }
+
+    # Process A first
+    clustered_ab = dc.dedup_and_cluster_articles([art_a, art_b])
+    # Process B first
+    clustered_ba = dc.dedup_and_cluster_articles([art_b, art_a])
+
+    # Both orderings must produce the same cluster_id(s)
+    ids_ab = sorted(c["cluster_id"] for clusters in clustered_ab.values() for c in clusters)
+    ids_ba = sorted(c["cluster_id"] for clusters in clustered_ba.values() for c in clusters)
+    assert ids_ab == ids_ba, f"Cluster IDs depend on order: {ids_ab} vs {ids_ba}"
+
+    # The fixtures above share one URL, so the end-to-end check never has to
+    # pick between two different article keys.  Exercise the ID helper
+    # directly with distinct URLs to pin the order-independence itself.
+    art_c = dict(art_b, url="https://example.com/btc-breaks-barrier")
+    id_ac = dc._stable_cluster_id("crypto", [art_a, art_c])
+    id_ca = dc._stable_cluster_id("crypto", [art_c, art_a])
+    assert id_ac == id_ca, f"Stable ID depends on article order: {id_ac} vs {id_ca}"
+
+
+def test_orphan_merge_deduplicates_shared_articles():
+    """When two clusters end up with overlapping article sets (e.g. because
+    embeddings were temporarily unavailable), the post-clustering merge pass
+    should combine them into one, keeping the earliest first_seen, the latest
+    last_updated and a freshly computed cluster_id."""
+    from news_mcp.dedup.cluster import _merge_orphan_clusters
+
+    clusters = [
+        {
+            "cluster_id": "aaa",
+            "topic": "crypto",
+            "headline": "Bitcoin surges",
+            "articles": [
+                {"title": "Bitcoin surges", "url": "https://example.com/btc", "source": "A"},
+            ],
+            "sources": ["A"],
+            "first_seen": "T1",
+            "last_updated": "T1",
+        },
+        {
+            "cluster_id": "bbb",
+            "topic": "crypto",
+            "headline": "BTC up",
+            "articles": [
+                {"title": "BTC up", "url": "https://example.com/btc", "source": "B"},
+            ],
+            "sources": ["B"],
+            "first_seen": "T2",
+            "last_updated": "T2",
+        },
+    ]
+    merged = _merge_orphan_clusters(clusters)
+    assert len(merged) == 1, f"Expected 1 merged cluster, got {len(merged)}"
+    assert set(merged[0]["sources"]) == {"A", "B"}
+    # The merge must widen the time span rather than keep the base's values.
+    assert merged[0]["first_seen"] == "T1"
+    assert merged[0]["last_updated"] == "T2"
+    # The ID is recomputed as a SHA-1 hex digest, not inherited from an input.
+    assert merged[0]["cluster_id"] not in {"aaa", "bbb"}
+    assert len(merged[0]["cluster_id"]) == 40
+    # The inputs must be left untouched.
+    assert clusters[0]["cluster_id"] == "aaa" and clusters[0]["sources"] == ["A"]
+
+
+def test_orphan_merge_preserves_distinct_clusters():
+    """Clusters whose article sets do not overlap must come back unmerged."""
+    from news_mcp.dedup.cluster import _merge_orphan_clusters
+
+    def single_article_cluster(cluster_id, headline, url, source, seen):
+        # One-article cluster fixture; headline doubles as the article title.
+        return {
+            "cluster_id": cluster_id,
+            "topic": "crypto",
+            "headline": headline,
+            "articles": [{"title": headline, "url": url, "source": source}],
+            "sources": [source],
+            "first_seen": seen,
+            "last_updated": seen,
+        }
+
+    unrelated = [
+        single_article_cluster("aaa", "Bitcoin surges", "https://example.com/btc", "A", "T1"),
+        single_article_cluster("bbb", "Ethereum merge", "https://example.com/eth", "B", "T2"),
+    ]
+    result = _merge_orphan_clusters(unrelated)
+    assert len(result) == 2
+
+
+def test_stable_id_same_for_different_titles_same_url():
+    """A headline change (e.g. a correction) on an unchanged URL must not
+    change the cluster_id."""
+    from news_mcp.dedup.cluster import _stable_cluster_id
+
+    shared_url = "https://example.com/fed-rates"
+    headlines = ("Fed Raises Rates", "Federal Reserve Increases Interest Rates")
+    id_a, id_b = (
+        _stable_cluster_id("macro", [{"title": headline, "url": shared_url}])
+        for headline in headlines
+    )
+    assert id_a == id_b, f"Same URL must give same cluster_id: {id_a} vs {id_b}"
+
+
+def test_temporal_gate_excludes_stale_clusters():
+    """Only clusters updated inside the max_age_hours window are merge
+    candidates; a zero window disables the check."""
+    from news_mcp.dedup.cluster import _cluster_is_within_age_window as within_window
+
+    def cluster_updated_at(cluster_id, stamp):
+        # Minimal cluster fixture carrying just the fields the gate reads.
+        return {
+            "cluster_id": cluster_id,
+            "topic": "crypto",
+            "last_updated": stamp,
+            "articles": [],
+        }
+
+    stale = cluster_updated_at("old", "2025-01-01T00:00:00+00:00")
+    fresh = cluster_updated_at("recent", datetime.now(timezone.utc).isoformat())
+
+    assert not within_window(stale, max_age_hours=4)
+    assert within_window(fresh, max_age_hours=4)
+
+    # A zero window switches the gate off, so even the stale cluster passes.
+    assert within_window(stale, max_age_hours=0)
+
+
+def test_preseed_merge_into_existing_cluster():
+    """A new article that matches a pre-seeded cluster must be folded into
+    that cluster rather than opening a second one."""
+    from news_mcp.dedup import cluster as dc
+
+    headline = "Trump warns Iran war could spread across Middle East"
+    summary = "Trump warns Iran war could spread across the Middle East amid rising tensions."
+    noon = "Mon, 30 Mar 2026 12:00:00 GMT"
+
+    seeded_article = {
+        "title": headline,
+        "url": "https://example.com/trump-iran",
+        "source": "Reuters",
+        "timestamp": noon,
+        "summary": summary,
+    }
+    seeded_cluster = {
+        "cluster_id": "existing-1",
+        "topic": "other",
+        "headline": headline,
+        "summary": summary,
+        "sources": ["Reuters"],
+        "timestamp": noon,
+        # Updated "now" so the 4 h age gate below lets the cluster through.
+        "last_updated": datetime.now(timezone.utc).isoformat(),
+        "first_seen": noon,
+        "articles": [seeded_article],
+        "entities": [],
+        "sentiment": "neutral",
+        "importance": 0.0,
+    }
+    incoming = {
+        "title": "Trump warns Iran conflict could spread across Middle East",
+        "url": "https://example.com/trump-iran-2",
+        "source": "Bloomberg",
+        "timestamp": "Mon, 30 Mar 2026 13:00:00 GMT",
+        "summary": summary,
+    }
+
+    # Lowered title threshold so the near-identical headline can match.
+    clustered = dc.dedup_and_cluster_articles(
+        [incoming],
+        similarity_threshold=0.75,
+        existing_clusters=[seeded_cluster],
+        max_age_hours=4,
+    )
+
+    all_clusters = []
+    for topic_clusters in clustered.values():
+        all_clusters.extend(topic_clusters)
+    # Exactly one cluster is expected: the seeded one, now holding 2 articles.
+    assert len(all_clusters) == 1, f"Expected 1 cluster, got {len(all_clusters)}: {[c['headline'] for c in all_clusters]}"
+    assert len(all_clusters[0]["articles"]) == 2