1 週間前 · 2670ed9d44
--- a/news_mcp/jobs/poller.py
+++ b/news_mcp/jobs/poller.py
@@ -5,15 +5,14 @@ import hashlib
 
															 import logging
														
 
															 import sys
														
 
															 from collections import defaultdict
														
 
															+from dataclasses import dataclass, field
														
 
															 from datetime import datetime, timezone, timedelta
														
 
															 from typing import Any, Dict
														
 
															 from news_mcp.config import (
														
 
															-    DEFAULT_LOOKBACK_HOURS,
														
 
															     DEFAULT_TOPICS,
														
 
															     DB_PATH,
														
 
															     ENRICH_OTHER_TOPICS_ONLY,
														
 
															-    ENRICHMENT_MAX_PER_REFRESH,
														
 
															     NEWS_EXTRACT_PROVIDER,
														
 
															     NEWS_FEED_URL,
														
 
															     NEWS_FEED_URLS,
														
@@ -22,6 +21,7 @@ from news_mcp.config import (
 
															     NEWS_RETENTION_DAYS,
														
 
															     NEWS_CLUSTER_MAX_AGE_HOURS,
														
 
															     llm_concurrency,
														
 
															+    llm_rate_limit,
														
 
															 )
														
 
															 from news_mcp.dedup.cluster import dedup_and_cluster_articles, _cluster_is_within_age_window, _parse_ts
														
 
															 from news_mcp.enrichment.enrich import enrich_cluster
														
@@ -30,326 +30,469 @@ from news_mcp.sources.news_feeds import fetch_news_articles
 
															 from news_mcp.storage.sqlite_store import SQLiteClusterStore
														
 
															-def _load_feed_urls() -> list[str]:
														
 
															-    """Return the configured feed URLs from environment (unsorted)."""
														
 
															-    urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()]
														
 
															-    if not urls:
														
 
															-        urls = [NEWS_FEED_URL]
														
 
															-    return urls
														
 
															-
														
 
															-
														
 
															-MAX_ENRICHMENT_RETRIES = 3  # per-cluster retries before giving up for this cycle
														
 
															-
														
 
															-async def _enrich_single_cluster(
														
 
															-    c: dict,
														
 
															-    topic: str,
														
 
															-    llm_enabled: bool,
														
 
															-    semaphore: asyncio.Semaphore,
														
 
															-    store: SQLiteClusterStore,
														
 
															-    logger: logging.Logger,
														
 
															-) -> dict:
														
 
															-    """Enrich one cluster: heuristic + optional LLM extraction, concurrency-limited.
														
 
															-
														
 
															-    Rule: if the cluster already has entities AND keywords (from a previous
														
 
															-    enrichment), skip the LLM call entirely.  The data on the dict IS the
														
 
															-    cache — no need to look up enriched_at timestamps or query the DB by
														
 
															-    cluster_id.  This works regardless of whether cluster_id changed due to
														
 
															-    article merging across polling cycles.
														
 
															-
														
 
															-    On LLM failure the cluster is retried up to MAX_ENRICHMENT_RETRIES times
														
 
															-    with exponential backoff.  If all retries are exhausted the cluster is
														
 
															-    marked with enrichment_failed_at and enrichment_retry_count so the next
														
 
															-    polling cycle can re-attempt it.
														
 
															-    """
														
 
															-    c2 = enrich_cluster(c)
														
 
															-    c2.setdefault("topic", topic)
														
 
															-
														
 
															-    cluster_id = c2.get("cluster_id")
														
 
															-    if llm_enabled and cluster_id:
														
 
															-        # --- Cache check: if the cluster already has entities AND keywords,
														
 
															-        # it was enriched in a previous cycle.  Skip LLM entirely.
														
 
															-        _existing_entities = c2.get("entities") or []
														
 
															-        _existing_keywords = c2.get("keywords") or []
														
 
															-        if _existing_entities and _existing_keywords:
														
 
															-            logger.debug("enrich skip (already enriched) cluster=%s topic=%s", cluster_id, topic)
														
 
															-            return c2
														
 
															-
														
 
															-        # --- Actually call the LLM ---
														
 
															-        last_err = ""
														
 
															-        for attempt in range(1 + MAX_ENRICHMENT_RETRIES):
														
 
															-            if attempt > 0:
														
 
															-                backoff = 2 ** attempt
														
 
															-                logger.info(
														
 
															-                    "retry cluster=%s topic=%s attempt=%d/%d backoff=%.0fs",
														
 
															-                    cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, backoff,
														
 
															-                )
														
 
															-                await asyncio.sleep(backoff)
														
 
															-            try:
														
 
															-                async with semaphore:
														
 
															-                    c2 = await classify_cluster_llm(dict(c2))
														
 
															-                c2["enriched_at"] = datetime.now(timezone.utc).isoformat()
														
 
															-                break  # success
														
 
															-            except Exception:
														
 
															-                last_err = str(sys.exc_info()[1])[:200] if sys.exc_info()[1] else "unknown"
														
 
															-                logger.warning(
														
 
															-                    "LLM enrichment failed cluster=%s topic=%s attempt=%d/%d err=%s",
														
 
															-                    cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, last_err,
														
 
															-                )
														
 
															-        else:
														
 
															-            # Loop completed without break = all retries exhausted
														
 
															-            prev_count = c2.get("enrichment_retry_count", 0)
														
 
															-            c2["enrichment_failed_at"] = datetime.now(timezone.utc).isoformat()
														
 
															-            c2["enrichment_retry_count"] = prev_count + 1
														
 
															-            logger.error(
														
 
															-                "LLM enrichment exhausted cluster=%s topic=%s after %d retries",
														
 
															-                cluster_id, topic, MAX_ENRICHMENT_RETRIES,
														
 
															-            )
														
 
															-
														
 
															-    return c2
														
 
															-
														
 
															-
														
 
															-async def _enrich_topic_clusters(
														
 
															-    clusters: list[dict],
														
 
															-    topic: str,
														
 
															-    semaphore: asyncio.Semaphore,
														
 
															-    store: SQLiteClusterStore,
														
 
															-    logger: logging.Logger,
														
 
															-    enrich_limit: int,
														
 
															-) -> list[dict]:
														
 
															-    """Enrich all clusters for a single topic concurrently."""
														
 
															-    llm_enabled = (not ENRICH_OTHER_TOPICS_ONLY) or (topic == "other")
														
 
															-
														
 
															-    # Persist the raw clusters first so a slow enrichment pass does not
														
 
															-    # leave the first bootstrap run with nothing stored.
														
 
															-    store.upsert_clusters(clusters, topic=topic)
														
 
															-    logger.info("refresh stored raw topic=%s clusters=%s", topic, len(clusters))
														
 
															-
														
 
															-    targets = clusters[:enrich_limit]
														
 
															-    tasks = [
														
 
															-        _enrich_single_cluster(c, topic, llm_enabled, semaphore, store, logger)
														
 
															-        for c in targets
														
 
															-    ]
														
 
															-    enriched = await asyncio.gather(*tasks, return_exceptions=False)
														
 
															-
														
 
															-    # Any clusters beyond enrich_limit still need importance enrichment
														
 
															-    for c in clusters[enrich_limit:]:
														
 
															-        c2 = enrich_cluster(c)
														
 
															-        c2.setdefault("topic", topic)
														
 
															-        enriched.append(c2)
														
 
															-
														
 
															-    logger.info("refresh enriched topic=%s clusters=%s", topic, len(enriched))
														
 
															-    return enriched
														
 
															-
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+#  Per-feed + per-cycle statistics
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+
														
 
															+@dataclass
														
 
															+class FeedStats:
														
 
															+    """Per-feed statistics for one polling cycle."""
														
 
															+    feed_url: str
														
 
															+    fetched: int = 0         # total items fetched from the feed
														
 
															+    duplicate: int = 0       # unchanged hash → skipped entirely
														
 
															+    stale: int = 0           # older than retention window (dropped)
														
 
															+    ingested: int = 0        # passed dedup + retention, entered clustering
														
 
															+    enriched: int = 0        # newly LLM-enriched this cycle
														
 
															+    already_enriched: int = 0  # cache hit — already had entities+keywords
														
 
															+    failed: int = 0          # LLM enrichment failed after retries
														
 
															+
														
 
															+
														
 
															+@dataclass
														
 
															+class PollStats:
														
 
															+    """Aggregated statistics for one polling cycle."""
														
 
															+    started_at: str = ""
														
 
															+    feeds: list[FeedStats] = field(default_factory=list)
														
 
															+    total_clusters: int = 0
														
 
															+    total_newly_enriched: int = 0
														
 
															+    total_already_enriched: int = 0
														
 
															+    total_failed: int = 0
														
 
															+
														
 
															+    def summary(self) -> dict:
														
 
															+        return {
														
 
															+            "started_at": self.started_at,
														
 
															+            "feeds": [
														
 
															+                {
														
 
															+                    "feed_url": f.feed_url,
														
 
															+                    "fetched": f.fetched,
														
 
															+                    "duplicate": f.duplicate,
														
 
															+                    "stale": f.stale,
														
 
															+                    "ingested": f.ingested,
														
 
															+                }
														
 
															+                for f in self.feeds
														
 
															+            ],
														
 
															+            "total_clusters": self.total_clusters,
														
 
															+            "total_newly_enriched": self.total_newly_enriched,
														
 
															+            "total_already_enriched": self.total_already_enriched,
														
 
															+            "total_failed": self.total_failed,
														
 
															+        }
														
 
															+
														
 
															+
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+#  Poller
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+
														
 
															+class ClusterPoller:
														
 
															+    """One polling cycle: fetch → dedup → cluster → enrich-once → store."""
														
 
															+
														
 
															+    MAX_ENRICHMENT_RETRIES = 3
														
 
															+
														
 
															+    def __init__(
														
 
															+        self,
														
 
															+        store: SQLiteClusterStore,
														
 
															+        logger: logging.Logger | None = None,
														
 
															+    ):
														
 
															+        self.store = store
														
 
															+        self.logger = logger or logging.getLogger("news_mcp.refresh")
														
 
															+        self.stats = PollStats()
														
 
															+
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+    #  Public entry point
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+
														
 
															+    async def poll(self, topic_filter: str | None = None) -> PollStats:
														
 
															+        """Run one full polling cycle. Returns statistics."""
														
 
															+        self.stats = PollStats(started_at=datetime.now(timezone.utc).isoformat())
														
 
															+
														
 
															+        # 1. Load enabled feed URLs
														
 
															+        configured_urls = self._load_feed_urls()
														
 
															+        enabled_urls = self.store.get_enabled_feed_urls(configured_urls)
														
 
															+        self.logger.info("poll start: enabled_feeds=%d configured=%d", len(enabled_urls), len(configured_urls))
														
 
															+
														
 
															+        # 2. Fetch articles from all enabled feeds, per-feed dedup
														
 
															+        feed_map, feed_stats = await self._fetch_feeds(enabled_urls)
														
 
															+
														
 
															+        # Flatten all fresh articles (stats already tracked per-feed in feed_stats)
														
 
															+        all_fresh = [a for articles in feed_map.values() for a in articles]
														
 
															+
														
 
															+        if not all_fresh:
														
 
															+            self.logger.info("poll: no fresh articles from any feed")
														
 
															+            self.stats.feeds = feed_stats
														
 
															+            self._save_feed_stats(feed_stats)
														
 
															+            self._prune_and_finalize(enabled_urls, feed_map)
														
 
															+            return self.stats
														
 
															+
														
 
															+        # 3. Retention filter
														
 
															+        articles = self._apply_retention(all_fresh, feed_map)
														
 
															+
														
 
															+        if not articles:
														
 
															+            self.logger.info("poll: all %d fresh articles dropped by retention", len(all_fresh))
														
 
															+            self.stats.feeds = feed_stats
														
 
															+            self._save_feed_stats(feed_stats)
														
 
															+            self._prune_and_finalize(enabled_urls, feed_map)
														
 
															+            return self.stats
														
 
															+
														
 
															+        # 4. Pre-seed existing clusters for cross-cycle merging
														
 
															+        existing_clusters = self._preseed_clusters()
														
 
															+
														
 
															+        # 5. Cluster (sync, may do concurrent embeddings internally)
														
 
															+        clustered_by_topic = await self._cluster(articles, existing_clusters)
														
 
															+
														
 
															+        # 6. Enrich every cluster that needs it, store immediately
														
 
															+        await self._enrich_all(clustered_by_topic)
														
 
															+
														
 
															+        # 7. Retry previously failed enrichments
														
 
															+        await self._retry_failed()
														
 
															+
														
 
															+        # 8. Persist feed stats + prune
														
 
															+        self.stats.feeds = feed_stats
														
 
															+        self._save_feed_stats(feed_stats)
														
 
															+        self._prune_and_finalize(enabled_urls, feed_map)
														
 
															+
														
 
															+        self.logger.info(
														
 
															+            "poll complete: clusters=%d newly_enriched=%d already_enriched=%d failed=%d",
														
 
															+            self.stats.total_clusters,
														
 
															+            self.stats.total_newly_enriched,
														
 
															+            self.stats.total_already_enriched,
														
 
															+            self.stats.total_failed,
														
 
															+        )
														
 
															+        return self.stats
														
 
															+
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+    #  Phase 1: Load feed URLs
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+
														
 
															+    @staticmethod
														
 
															+    def _load_feed_urls() -> list[str]:
														
 
															+        urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()]
														
 
															+        if not urls:
														
 
															+            urls = [NEWS_FEED_URL]
														
 
															+        return urls
														
 
															+
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+    #  Phase 2: Fetch + per-feed dedup
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+
														
 
															+    async def _fetch_feeds(
														
 
															+        self, feed_urls: list[str],
														
 
															+    ) -> tuple[dict[str, list[dict]], list[FeedStats]]:
														
 
															+        """Fetch all feeds concurrently. Returns {feed_url: fresh_articles}
														
 
															+        and per-feed stats. Unchanged feeds (same content hash) are dropped."""
														
 
															+        articles = await fetch_news_articles(limit=9999, url_list=feed_urls)
														
 
															+        # limit=9999 effectively means no per-feed cap — fetches everything
														
 
															+        # the feed gives us.  fetch_news_articles applies max(1, limit).
														
 
															+
														
 
															+        # Group by feed URL
														
 
															+        per_feed: dict[str, list[dict]] = defaultdict(list)
														
 
															+        for a in articles:
														
 
															+            fu = str(a.get("feed_url") or NEWS_FEED_URL).strip() or NEWS_FEED_URL
														
 
															+            per_feed[fu].append(a)
														
 
															-def _cluster_age_ok(cluster: dict, max_age_hours: float) -> bool:
														
 
															-    """Deprecated alias — use _cluster_is_within_age_window from cluster.py."""
														
 
															-    return _cluster_is_within_age_window(cluster, max_age_hours=max_age_hours)
														
 
															+        # Per-feed content hash dedup
														
 
															+        feed_map: dict[str, list[dict]] = {}
														
 
															+        feed_stats_list: list[FeedStats] = []
														
 
															+        for feed_url in feed_urls:
														
 
															+            feed_articles = per_feed.get(feed_url, [])
														
 
															+            stats = FeedStats(feed_url=feed_url, fetched=len(feed_articles))
														
 
															-async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
														
 
															-    logger = logging.getLogger("news_mcp.refresh")
														
 
															-    store = SQLiteClusterStore(DB_PATH)
														
 
															+            if not feed_articles:
														
 
															+                self.logger.info("feed empty: feed_url=%s", feed_url)
														
 
															+                feed_stats_list.append(stats)
														
 
															+                continue
														
 
															-    logger.info("refresh start topic=%s limit=%s", topic, limit)
														
 
															-
														
 
															-    # Get enabled feed URLs from store (seeds new ones as enabled by default).
														
 
															-    configured_urls = _load_feed_urls()
														
 
															-    enabled_urls = store.get_enabled_feed_urls(configured_urls)
														
 
															-    logger.info("refresh enabled feeds=%d / configured=%d", len(enabled_urls), len(configured_urls))
														
 
															-
														
 
															-    # fetch_news_articles is now fully async (concurrent RSS fetching)
														
 
															-    articles = await fetch_news_articles(limit, url_list=enabled_urls)
														
 
															-    logger.info("refresh fetched articles=%s", len(articles))
														
 
															-
														
 
															-    # Drop legacy aggregate feed-state rows so the dashboard only reflects
														
 
															-    # real per-feed poll status from this point forward.
														
 
															-    with store._conn() as conn:
														
 
															-        conn.execute("DELETE FROM feed_state WHERE feed_key LIKE 'newsfeeds:%'")
														
 
															-
														
 
															-    # Track feed freshness per RSS URL so unchanged feeds can be skipped.
														
 
															-    per_feed: dict[str, list[dict[str, Any]]] = defaultdict(list)
														
 
															-    for article in articles:
														
 
															-        feed_url = str(article.get("feed_url") or NEWS_FEED_URL).strip() or NEWS_FEED_URL
														
 
															-        per_feed[feed_url].append(article)
														
 
															-
														
 
															-    changed_articles: list[dict[str, Any]] = []
														
 
															-    changed_feed_urls: list[str] = []
														
 
															-    for feed_url, feed_articles in per_feed.items():
														
 
															-        logger.info("refresh feed batch start feed_url=%s count=%s", feed_url, len(feed_articles))
														
 
															-        material = "\n".join(
														
 
															-            f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
														
 
															-            for a in feed_articles
														
 
															-        )
														
 
															-        last_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
														
 
															-        feed_key = feed_url
														
 
															-        prev_hash = store.get_feed_hash(feed_key)
														
 
															-        if prev_hash == last_hash:
														
 
															-            logger.info("refresh unchanged feed_url=%s count=%s topic=%s", feed_url, len(feed_articles), topic)
														
 
															-        else:
														
 
															-            logger.info("refresh changed feed_url=%s count=%s topic=%s", feed_url, len(feed_articles), topic)
														
 
															-            changed_feed_urls.append(feed_url)
														
 
															-            changed_articles.extend(feed_articles)
														
 
															-        logger.info("refresh feed batch complete feed_url=%s changed_total=%s", feed_url, len(changed_articles))
														
 
															-
														
 
															-    if not changed_articles:
														
 
															-        logger.info("refresh unchanged all feeds topic=%s", topic)
														
 
															-        store.set_meta("last_refresh_at", datetime.now(timezone.utc).isoformat())
														
 
															-        prune_result = store.prune_if_due(
														
 
															-            pruning_enabled=NEWS_PRUNING_ENABLED,
														
 
															-            retention_days=NEWS_RETENTION_DAYS,
														
 
															-            interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
														
 
															-        )
														
 
															-        logger.info("refresh prune_result=%s", prune_result)
														
 
															-        return
														
 
															+            material = "\n".join(
														
 
															+                f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
														
 
															+                for a in feed_articles
														
 
															+            )
														
 
															+            content_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
														
 
															+            prev_hash = self.store.get_feed_hash(feed_url)
														
 
															-    articles = changed_articles
														
 
															+            if prev_hash == content_hash:
														
 
															+                stats.duplicate = len(feed_articles)
														
 
															+                self.logger.info("feed unchanged: feed_url=%s items=%d", feed_url, len(feed_articles))
														
 
															+                feed_stats_list.append(stats)
														
 
															+                continue
														
 
															-    # Pre-filter: drop articles whose RSS timestamp is older than retention.
														
 
															-    # This prevents stale feed items from being re-ingested after pruning.
														
 
															-    if NEWS_RETENTION_DAYS > 0:
														
 
															-        retention_cutoff = datetime.now(timezone.utc) - timedelta(days=NEWS_RETENTION_DAYS)
														
 
															-        fresh_articles = []
														
 
															+            feed_map[feed_url] = feed_articles
														
 
															+            self.logger.info(
														
 
															+                "feed changed: feed_url=%s items=%d hash_prev=%s hash_now=%s",
														
 
															+                feed_url, len(feed_articles),
														
 
															+                (prev_hash or "-")[:12], content_hash[:12],
														
 
															+            )
														
 
															+            feed_stats_list.append(stats)
														
 
															+
														
 
															+        return feed_map, feed_stats_list
														
 
															+
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+    #  Phase 3: Retention filter
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+
														
 
															+    def _apply_retention(
														
 
															+        self, articles: list[dict], feed_map: dict[str, list[dict]],
														
 
															+    ) -> list[dict]:
														
 
															+        """Drop articles older than NEWS_RETENTION_DAYS. Updates FeedStats."""
														
 
															+        if NEWS_RETENTION_DAYS <= 0:
														
 
															+            return articles
														
 
															+        cutoff = datetime.now(timezone.utc) - timedelta(days=NEWS_RETENTION_DAYS)
														
 
															+
														
 
															+        # Build a lookup: article_url → feed_url for stats
														
 
															+        article_feed: dict[str, str] = {}
														
 
															+        for fu, arts in feed_map.items():
														
 
															+            for a in arts:
														
 
															+                article_feed[a.get("url", "")] = fu
														
 
															+
														
 
															+        fresh = []
														
 
															+        dropped = 0
														
 
															         for a in articles:
														
 
															             ts_str = a.get("timestamp", "")
														
 
															             if not ts_str:
														
 
															-                fresh_articles.append(a)
														
 
															+                fresh.append(a)
														
 
															                 continue
														
 
															             dt = _parse_ts(ts_str)
														
 
															-            if dt is None:
														
 
															-                fresh_articles.append(a)
														
 
															-                continue
														
 
															-            if dt >= retention_cutoff:
														
 
															-                fresh_articles.append(a)
														
 
															+            if dt is None or dt >= cutoff:
														
 
															+                fresh.append(a)
														
 
															             else:
														
 
															-                logger.debug("drop stale article title=%s ts=%s", a.get("title", "")[:60], ts_str)
														
 
															-        dropped = len(articles) - len(fresh_articles)
														
 
															+                dropped += 1
														
 
															+                fu = article_feed.get(a.get("url", ""), "")
														
 
															+                if fu:
														
 
															+                    # Find matching FeedStats and increment stale
														
 
															+                    for fs in self.stats.feeds:
														
 
															+                        if fs.feed_url == fu:
														
 
															+                            fs.stale += 1
														
 
															+                            break
														
 
															         if dropped:
														
 
															-            logger.info("refresh retention-filter dropped=%d remaining=%d retention_days=%.0f", dropped, len(fresh_articles), NEWS_RETENTION_DAYS)
														
 
															-        articles = fresh_articles
														
 
															-
														
 
															-    if not articles:
														
 
															-        logger.info("refresh no articles after retention filter topic=%s", topic)
														
 
															-        store.set_meta("last_refresh_at", datetime.now(timezone.utc).isoformat())
														
 
															-        prune_result = store.prune_if_due(
														
 
															-            pruning_enabled=NEWS_PRUNING_ENABLED,
														
 
															-            retention_days=NEWS_RETENTION_DAYS,
														
 
															-            interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
														
 
															+            self.logger.info("retention: dropped=%d remaining=%d retention_days=%.0f", dropped, len(fresh), NEWS_RETENTION_DAYS)
														
 
															+        return fresh
														
 
															+
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+    #  Phase 4: Pre-seed existing clusters
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+
														
 
															+    def _preseed_clusters(self) -> list[dict]:
														
 
															+        """Load recent clusters from DB for cross-cycle article merging."""
														
 
															+        max_age = NEWS_CLUSTER_MAX_AGE_HOURS
														
 
															+        if max_age == 0:
														
 
															+            return []
														
 
															+        lookback = max_age if max_age > 0 else 72
														
 
															+        all_recent = self.store.get_latest_clusters_all_topics(ttl_hours=lookback, limit=500)
														
 
															+        recent = [c for c in all_recent if _cluster_is_within_age_window(c, max_age)]
														
 
															+        self.logger.info("pre-seeded: existing_clusters=%d max_age_h=%.1f", len(recent), max_age)
														
 
															+        return recent
														
 
															+
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+    #  Phase 5: Clustering
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+
														
 
															+    async def _cluster(
														
 
															+        self, articles: list[dict], existing_clusters: list[dict],
														
 
															+    ) -> dict[str, list[dict]]:
														
 
															+        """Run dedup_and_cluster_articles. Returns {topic: [clusters]}."""
														
 
															+        self.logger.info("clustering: articles=%d existing_clusters=%d", len(articles), len(existing_clusters))
														
 
															+        clustered = await asyncio.to_thread(
														
 
															+            dedup_and_cluster_articles,
														
 
															+            articles,
														
 
															+            None,  # default similarity_threshold
														
 
															+            existing_clusters=existing_clusters if existing_clusters else None,
														
 
															+            max_age_hours=NEWS_CLUSTER_MAX_AGE_HOURS,
														
 
															         )
														
 
															-        logger.info("refresh prune_result=%s", prune_result)
														
 
															-        return
														
 
															+        self.logger.info("clustered: topics=%s", list(clustered.keys()))
														
 
															+        return clustered
														
 
															-    logger.info("refresh clustering start articles=%s topic=%s", len(articles), topic)
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+    #  Phase 6: Enrich + store
														
 
															+    # ------------------------------------------------------------------ #
														
 
															-    # Pre-seed with recent clusters from the DB so new articles can merge
														
 
															-    # into existing clusters across polling cycles.
														
 
															-    max_age = NEWS_CLUSTER_MAX_AGE_HOURS
														
 
															-    recent_clusters: list[dict] = []
														
 
															-    if max_age != 0:
														
 
															-        lookback = max_age if max_age > 0 else 72
														
 
															-        all_recent = store.get_latest_clusters_all_topics(
														
 
															-            ttl_hours=lookback,
														
 
															-            limit=500,
														
 
															-        )
														
 
															-        recent_clusters = [c for c in all_recent if _cluster_age_ok(c, max_age)]
														
 
															-        logger.info(
														
 
															-            "refresh pre-seeded existing_clusters=%s max_age_h=%s",
														
 
															-            len(recent_clusters), max_age,
														
 
															-        )
														
 
															+    async def _enrich_all(self, clustered_by_topic: dict[str, list[dict]]) -> None:
														
 
															+        """Enrich every cluster that needs it and store immediately."""
														
 
															+        semaphore = asyncio.Semaphore(llm_concurrency(NEWS_EXTRACT_PROVIDER))
														
 
															+        rate = llm_rate_limit(NEWS_EXTRACT_PROVIDER)
														
 
															-    # Clustering is sync but may do concurrent embedding fetches internally.
														
 
															-    # Run off-thread so the event loop stays responsive for MCP tool calls.
														
 
															-    clustered_by_topic = await asyncio.to_thread(
														
 
															-        dedup_and_cluster_articles,
														
 
															-        articles,
														
 
															-        None,  # use default similarity_threshold
														
 
															-        existing_clusters=recent_clusters if recent_clusters else None,
														
 
															-        max_age_hours=max_age,
														
 
															-    )
														
 
															-    logger.info("refresh clustered topics=%s", list(clustered_by_topic.keys()))
														
 
															-
														
 
															-    # Build LLM concurrency semaphore from the extract provider's config.
														
 
															-    max_llm_concurrent = llm_concurrency(NEWS_EXTRACT_PROVIDER)
														
 
															-    llm_semaphore = asyncio.Semaphore(max_llm_concurrent)
														
 
															-    logger.info("refresh llm semaphore limit=%s provider=%s", max_llm_concurrent, NEWS_EXTRACT_PROVIDER)
														
 
															-
														
 
															-    from news_mcp.config import llm_rate_limit as _rl
														
 
															-    _rate = _rl(NEWS_EXTRACT_PROVIDER)
														
 
															-    logger.info("refresh llm rate-limit=%s/s provider=%s", _rate, NEWS_EXTRACT_PROVIDER)
														
 
															-
														
 
															-    # Enrich each topic's clusters concurrently.
														
 
															-    topic_tasks = []
														
 
															-    for t, clusters in clustered_by_topic.items():
														
 
															-        if topic and t != topic:
														
 
															-            continue
														
 
															-
														
 
															-        # Determine how many clusters to LLM-enrich.
														
 
															-        # ENRICHMENT_MAX_PER_REFRESH=0 means enrich every cluster (no cap).
														
 
															-        enrich_limit = ENRICHMENT_MAX_PER_REFRESH or len(clusters)
														
 
															-
														
 
															-        topic_tasks.append(
														
 
															-            _enrich_topic_clusters(
														
 
															-                clusters=clusters,
														
 
															-                topic=t,
														
 
															-                semaphore=llm_semaphore,
														
 
															-                store=store,
														
 
															-                logger=logger,
														
 
															-                enrich_limit=enrich_limit,
														
 
															-            )
														
 
															+        self.logger.info(
														
 
															+            "enrich: semaphore_limit=%d rate_limit=%s/s provider=%s",
														
 
															+            llm_concurrency(NEWS_EXTRACT_PROVIDER), rate, NEWS_EXTRACT_PROVIDER,
														
 
															         )
														
 
															-    # Run all topic enrichment phases concurrently
														
 
															-    topic_results = await asyncio.gather(*topic_tasks, return_exceptions=False)
														
 
															+        # Flatten all clusters into one list with their topics
														
 
															+        all_targets: list[tuple[str, dict]] = []
														
 
															+        for topic, clusters in clustered_by_topic.items():
														
 
															+            for c in clusters:
														
 
															+                all_targets.append((topic, c))
														
 
															-    # Persist enriched clusters grouped by their final topic
														
 
															-    for enriched in topic_results:
														
 
															-        by_final_topic: Dict[str, list] = {}
														
 
															-        for c2 in enriched:
														
 
															+        if not all_targets:
														
 
															+            return
														
 
															+
														
 
															+        # Enrich concurrently
														
 
															+        tasks = [
														
 
															+            self._enrich_one(c, topic, semaphore, rate)
														
 
															+            for topic, c in all_targets
														
 
															+        ]
														
 
															+        results = await asyncio.gather(*tasks, return_exceptions=False)
														
 
															+
														
 
															+        # Store each cluster individually, grouped by final topic
														
 
															+        by_final_topic: dict[str, list[dict]] = defaultdict(list)
														
 
															+        for c2, was_new in results:
														
 
															             final_topic = str(c2.get("topic") or "other").strip().lower()
														
 
															-            if final_topic not in {x.lower() for x in DEFAULT_TOPICS}:
														
 
															+            if final_topic not in {t.lower() for t in DEFAULT_TOPICS}:
														
 
															                 final_topic = "other"
														
 
															-            by_final_topic.setdefault(final_topic, []).append(c2)
														
 
															+            by_final_topic[final_topic].append(c2)
														
 
															+            self.stats.total_clusters += 1
														
 
															+            if was_new:
														
 
															+                self.stats.total_newly_enriched += 1
														
 
															+            else:
														
 
															+                self.stats.total_already_enriched += 1
														
 
															+
														
 
															         for final_topic, group in by_final_topic.items():
														
 
															-            store.upsert_clusters(group, topic=final_topic)
														
 
															-            logger.info("refresh stored topic=%s clusters=%s", final_topic, len(group))
														
 
															-
														
 
															-    # Retry previously failed enrichments
														
 
															-    failed_clusters = store.get_failed_enrichment_clusters(max_retries=3)
														
 
															-    if failed_clusters:
														
 
															-        logger.info("retry enrich failed clusters count=%d", len(failed_clusters))
														
 
															-        retry_tasks = [
														
 
															-            _enrich_single_cluster(
														
 
															-                c, str(c.get("topic") or "other"), True, llm_semaphore, store, logger,
														
 
															-            )
														
 
															-            for c in failed_clusters
														
 
															+            self.store.upsert_clusters(group, topic=final_topic)
														
 
															+            self.logger.info("stored: topic=%s clusters=%d", final_topic, len(group))
														
 
															+
														
 
															+    async def _enrich_one(
														
 
															+        self,
														
 
															+        cluster: dict,
														
 
															+        topic: str,
														
 
															+        semaphore: asyncio.Semaphore,
														
 
															+        rate: float,
														
 
															+    ) -> tuple[dict, bool]:
														
 
															+        """Enrich a single cluster. Returns (cluster, was_newly_enriched).
														
 
															+
														
 
															+        If the cluster already has entities AND keywords, skip LLM entirely.
														
 
															+        The data on the dict IS the cache — no timestamp or DB lookup needed.
														
 
															+        """
														
 
															+        c2 = enrich_cluster(cluster)
														
 
															+        c2.setdefault("topic", topic)
														
 
															+
														
 
															+        llm_enabled = (not ENRICH_OTHER_TOPICS_ONLY) or (topic == "other")
														
 
															+        cluster_id = c2.get("cluster_id")
														
 
															+
														
 
															+        if not llm_enabled or not cluster_id:
														
 
															+            return c2, False
														
 
															+
														
 
															+        # Cache check: entities + keywords already present → skip
														
 
															+        if (c2.get("entities") or []) and (c2.get("keywords") or []):
														
 
															+            self.logger.debug("enrich skip (cached): cluster=%s topic=%s", cluster_id, topic)
														
 
															+            return c2, False
														
 
															+
														
 
															+        # Actually call the LLM
														
 
															+        last_err = ""
														
 
															+        for attempt in range(1 + self.MAX_ENRICHMENT_RETRIES):
														
 
															+            if attempt > 0:
														
 
															+                backoff = 2 ** attempt
														
 
															+                self.logger.info("retry: cluster=%s attempt=%d backoff=%.0fs", cluster_id, attempt, backoff)
														
 
															+                await asyncio.sleep(backoff)
														
 
															+            try:
														
 
															+                async with semaphore:
														
 
															+                    c2 = await classify_cluster_llm(dict(c2))
														
 
															+                c2["enriched_at"] = datetime.now(timezone.utc).isoformat()
														
 
															+                return c2, True
														
 
															+            except Exception:
														
 
															+                last_err = str(sys.exc_info()[1])[:200] if sys.exc_info()[1] else "unknown"
														
 
															+                self.logger.warning(
														
 
															+                    "enrich failed: cluster=%s attempt=%d err=%s",
														
 
															+                    cluster_id, attempt, last_err,
														
 
															+                )
														
 
															+
														
 
															+        # All retries exhausted
														
 
															+        prev_count = c2.get("enrichment_retry_count", 0)
														
 
															+        c2["enrichment_failed_at"] = datetime.now(timezone.utc).isoformat()
														
 
															+        c2["enrichment_retry_count"] = prev_count + 1
														
 
															+        self.logger.error("enrich exhausted: cluster=%s after %d retries", cluster_id, self.MAX_ENRICHMENT_RETRIES)
														
 
															+        self.stats.total_failed += 1
														
 
															+        return c2, True  # was "newly" enriched (attempted), but failed
														
 
															+
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+    #  Phase 7: Retry failed enrichments
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+
														
 
															+    async def _retry_failed(self) -> None:
														
 
															+        """Retry clusters whose previous enrichment failed."""
														
 
															+        failed = self.store.get_failed_enrichment_clusters(max_retries=3)
														
 
															+        if not failed:
														
 
															+            return
														
 
															+
														
 
															+        self.logger.info("retry: failed_clusters=%d", len(failed))
														
 
															+        semaphore = asyncio.Semaphore(llm_concurrency(NEWS_EXTRACT_PROVIDER))
														
 
															+        rate = llm_rate_limit(NEWS_EXTRACT_PROVIDER)
														
 
															+
														
 
															+        tasks = [
														
 
															+            self._enrich_one(c, str(c.get("topic") or "other"), semaphore, rate)
														
 
															+            for c in failed
														
 
															         ]
														
 
															-        retry_results = await asyncio.gather(*retry_tasks, return_exceptions=False)
														
 
															-        # Persist retried results
														
 
															-        by_topic_retry: Dict[str, list] = {}
														
 
															-        for c2 in retry_results:
														
 
															-            # Clear stale failure marker on success
														
 
															-            if not c2.get("enrichment_failed_at") or c2.get("entities"):
														
 
															+        results = await asyncio.gather(*tasks, return_exceptions=False)
														
 
															+
														
 
															+        by_topic: dict[str, list[dict]] = defaultdict(list)
														
 
															+        attempted = 0
														
 
															+        now_success = 0
														
 
															+        still_failed = 0
														
 
															+        for c2, was_new in results:
														
 
															+            if not was_new:
														
 
															+                continue
														
 
															+            attempted += 1
														
 
															+            # Clear failure marker on success
														
 
															+            if c2.get("enriched_at") and not c2.get("enrichment_failed_at"):
														
 
															                 c2.pop("enrichment_failed_at", None)
														
 
															                 c2.pop("enrichment_retry_count", None)
														
 
															+                now_success += 1
														
 
															+            else:
														
 
															+                still_failed += 1
														
 
															             t = str(c2.get("topic") or "other").strip().lower()
														
 
															             if t not in {x.lower() for x in DEFAULT_TOPICS}:
														
 
															                 t = "other"
														
 
															-            by_topic_retry.setdefault(t, []).append(c2)
														
 
															-        for t, group in by_topic_retry.items():
														
 
															-            store.upsert_clusters(group, topic=t)
														
 
															-            logger.info("retry stored topic=%s clusters=%s", t, len(group))
														
 
															-
														
 
															-    prune_result = store.prune_if_due(
														
 
															-        pruning_enabled=NEWS_PRUNING_ENABLED,
														
 
															-        retention_days=NEWS_RETENTION_DAYS,
														
 
															-        interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
														
 
															-    )
														
 
															-    for feed_url in changed_feed_urls:
														
 
															-        feed_articles = per_feed[feed_url]
														
 
															-        material = "\n".join(
														
 
															-            f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
														
 
															-            for a in feed_articles
														
 
															+            by_topic[t].append(c2)
														
 
															+
														
 
															+        for t, group in by_topic.items():
														
 
															+            self.store.upsert_clusters(group, topic=t)
														
 
															+            self.logger.info("retry stored: topic=%s clusters=%d", t, len(group))
														
 
															+
														
 
															+        if attempted:
														
 
															+            self.logger.info("retry done: attempted=%d recovered=%d still_failed=%d", attempted, now_success, still_failed)
														
 
															+
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+    #  Phase 8: Feed stats + prune
														
 
															+    # ------------------------------------------------------------------ #
														
 
															+
														
 
															+    def _save_feed_stats(self, feed_stats: list[FeedStats]) -> None:
														
 
															+        """Log per-feed statistics. ingested = fetched - duplicate - stale."""
														
 
															+        for fs in feed_stats:
														
 
															+            fs.ingested = max(0, fs.fetched - fs.duplicate - fs.stale)
														
 
															+            self.logger.info(
														
 
															+                "feed stats: feed_url=%s fetched=%d duplicate=%d stale=%d ingested=%d",
														
 
															+                fs.feed_url, fs.fetched, fs.duplicate, fs.stale, fs.ingested,
														
 
															+            )
														
 
															+
														
 
															+    def _prune_and_finalize(
														
 
															+        self,
														
 
															+        enabled_urls: list[str],
														
 
															+        feed_map: dict[str, list[dict]],
														
 
															+    ) -> None:
														
 
															+        """Run pruning and update feed_state hashes + timestamps."""
														
 
															+        prune_result = self.store.prune_if_due(
														
 
															+            pruning_enabled=NEWS_PRUNING_ENABLED,
														
 
															+            retention_days=NEWS_RETENTION_DAYS,
														
 
															+            interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
														
 
															         )
														
 
															-        last_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
														
 
															-        store.set_feed_state(feed_url, last_hash, len(feed_articles))
														
 
															-    store.set_meta("last_refresh_at", datetime.now(timezone.utc).isoformat())
														
 
															-    logger.info("refresh prune_result=%s", prune_result)
														
 
															+
														
 
															+        # Update feed_state: hash + item_count for feeds that had changes
														
 
															+        for feed_url, feed_articles in feed_map.items():
														
 
															+            material = "\n".join(
														
 
															+                f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
														
 
															+                for a in feed_articles
														
 
															+            )
														
 
															+            content_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
														
 
															+            self.store.set_feed_state(feed_url, content_hash, len(feed_articles))
														
 
															+
														
 
															+        # Drop legacy aggregate feed-state rows
														
 
															+        with self.store._conn() as conn:
														
 
															+            conn.execute("DELETE FROM feed_state WHERE feed_key LIKE 'newsfeeds:%'")
														
 
															+
														
 
															+        self.store.set_meta("last_refresh_at", datetime.now(timezone.utc).isoformat())
														
 
															+        self.logger.info("prune: %s", prune_result)
														
 
															+
														
 
															+
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+#  Compatibility wrapper (used by background loop + tests)
														
 
															+# --------------------------------------------------------------------------- #
														
 
															+
														
 
															+async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
														
 
															+    """Backward-compatible entry point. Delegates to ClusterPoller."""
														
 
															+    store = SQLiteClusterStore(DB_PATH)
														
 
															+    poller = ClusterPoller(store)
														
 
															+    await poller.poll(topic_filter=topic)