1 nedēļu atpakaļ · 2db56b7dc0
--- a/news_mcp/jobs/poller.py
+++ b/news_mcp/jobs/poller.py
@@ -5,7 +5,7 @@ import hashlib
 
				 import logging
			
 
				 import sys
			
 
				 from collections import defaultdict
			
 
				-from datetime import datetime, timezone, timedelta
			
 
				+from datetime import datetime, timezone
			
 
				 from typing import Any, Dict
			
 
				 
			
 
				 from news_mcp.config import (
			
@@ -51,6 +51,13 @@ async def _enrich_single_cluster(
 
				 ) -> dict:
			
 
				     """Enrich one cluster: heuristic + optional LLM extraction, concurrency-limited.
			
 
				 
			
 
				+    Cache strategy: if the cluster already has entities and an enriched_at
			
 
				+    timestamp (from a previous successful enrichment), skip the LLM call.
			
 
				+    Clusters only need to be enriched once — the enrichment output
			
 
				+    (entities, keywords, sentiment, topic) is derived from the cluster's
			
 
				+    article content, which doesn't change meaningfully between polls unless
			
 
				+    enough new articles arrive to form a different cluster.
			
 
				+
			
 
				     On LLM failure the cluster is retried up to MAX_ENRICHMENT_RETRIES times
			
 
				     with exponential backoff.  If all retries are exhausted the cluster is
			
 
				     marked with enrichment_failed_at and enrichment_retry_count so the next
			
@@ -61,11 +68,24 @@ async def _enrich_single_cluster(
 
				 
			
 
				     cluster_id = c2.get("cluster_id")
			
 
				     if llm_enabled and cluster_id:
			
 
				-        # Cache: if we already have entities/sentiment for this cluster, skip LLM call.
			
 
				+        # --- Cache check: skip LLM if already enriched ---
			
 
				+        # The cluster payload may already carry enriched_at if it was loaded
			
 
				+        # from an existing DB cluster during cross-cycle seeding.
			
 
				+        enriched_at_str = c2.get("enriched_at")
			
 
				+        if enriched_at_str and c2.get("entities"):
			
 
				+            logger.debug("enrich skip (cached) cluster=%s topic=%s", cluster_id, topic)
			
 
				+            return c2
			
 
				+
			
 
				+        # --- Check DB: cluster may have been enriched in a previous cycle ---
			
 
				+        # Note: cluster_id is derived from the article set, so this lookup
			
 
				+        # only matches when the same exact article set was clustered before.
			
 
				+        # That's fine — it means the enrichment is still valid.
			
 
				         existing = store.get_cluster_by_id(cluster_id)
			
 
				-        if existing and existing.get("entities"):
			
 
				+        if existing and existing.get("entities") and existing.get("enriched_at"):
			
 
				+            logger.info("enrich cache-hit cluster=%s topic=%s", cluster_id, topic)
			
 
				             c2 = dict(c2)
			
 
				             c2["entities"] = existing.get("entities", [])
			
 
				+            c2["enriched_at"] = existing.get("enriched_at")
			
 
				 
			
 
				             existing_resolutions = existing.get("entityResolutions", None)
			
 
				             if isinstance(existing_resolutions, list) and existing_resolutions:
			
@@ -81,36 +101,38 @@ async def _enrich_single_cluster(
 
				                 c2["keywords"] = existing.get("keywords")
			
 
				             if existing.get("topic"):
			
 
				                 c2["topic"] = existing.get("topic")
			
 
				-        else:
			
 
				-            # Retry loop with exponential backoff | semaphore held per-attempt
			
 
				-            last_err = ""
			
 
				-            for attempt in range(1 + MAX_ENRICHMENT_RETRIES):
			
 
				-                if attempt > 0:
			
 
				-                    backoff = 2 ** attempt
			
 
				-                    logger.info(
			
 
				-                        "retry cluster=%s topic=%s attempt=%d/%d backoff=%.0fs",
			
 
				-                        cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, backoff,
			
 
				-                    )
			
 
				-                    await asyncio.sleep(backoff)
			
 
				-                try:
			
 
				-                    async with semaphore:
			
 
				-                        c2 = await classify_cluster_llm(dict(c2))
			
 
				-                    break  # success
			
 
				-                except Exception:
			
 
				-                    last_err = str(sys.exc_info()[1])[:200] if sys.exc_info()[1] else "unknown"
			
 
				-                    logger.warning(
			
 
				-                        "LLM enrichment failed cluster=%s topic=%s attempt=%d/%d err=%s",
			
 
				-                        cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, last_err,
			
 
				-                    )
			
 
				-            else:
			
 
				-                # Loop completed without break = all retries exhausted
			
 
				-                prev_count = c2.get("enrichment_retry_count", 0)
			
 
				-                c2["enrichment_failed_at"] = datetime.now(timezone.utc).isoformat()
			
 
				-                c2["enrichment_retry_count"] = prev_count + 1
			
 
				-                logger.error(
			
 
				-                    "LLM enrichment exhausted cluster=%s topic=%s after %d retries",
			
 
				-                    cluster_id, topic, MAX_ENRICHMENT_RETRIES,
			
 
				+            return c2
			
 
				+
			
 
				+        # --- Actually call the LLM ---
			
 
				+        last_err = ""
			
 
				+        for attempt in range(1 + MAX_ENRICHMENT_RETRIES):
			
 
				+            if attempt > 0:
			
 
				+                backoff = 2 ** attempt
			
 
				+                logger.info(
			
 
				+                    "retry cluster=%s topic=%s attempt=%d/%d backoff=%.0fs",
			
 
				+                    cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, backoff,
			
 
				                 )
			
 
				+                await asyncio.sleep(backoff)
			
 
				+            try:
			
 
				+                async with semaphore:
			
 
				+                    c2 = await classify_cluster_llm(dict(c2))
			
 
				+                c2["enriched_at"] = datetime.now(timezone.utc).isoformat()
			
 
				+                break  # success
			
 
				+            except Exception:
			
 
				+                last_err = str(sys.exc_info()[1])[:200] if sys.exc_info()[1] else "unknown"
			
 
				+                logger.warning(
			
 
				+                    "LLM enrichment failed cluster=%s topic=%s attempt=%d/%d err=%s",
			
 
				+                    cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, last_err,
			
 
				+                )
			
 
				+        else:
			
 
				+            # Loop completed without break = all retries exhausted
			
 
				+            prev_count = c2.get("enrichment_retry_count", 0)
			
 
				+            c2["enrichment_failed_at"] = datetime.now(timezone.utc).isoformat()
			
 
				+            c2["enrichment_retry_count"] = prev_count + 1
			
 
				+            logger.error(
			
 
				+                "LLM enrichment exhausted cluster=%s topic=%s after %d retries",
			
 
				+                cluster_id, topic, MAX_ENRICHMENT_RETRIES,
			
 
				+            )
			
 
				 
			
 
				     return c2
			
 
				 
			
@@ -286,7 +308,7 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
 
				         logger.info("retry enrich failed clusters count=%d", len(failed_clusters))
			
 
				         retry_tasks = [
			
 
				             _enrich_single_cluster(
			
 
				-                c, str(c.get("topic") or "other"), True, llm_semaphore, store, logger
			
 
				+                c, str(c.get("topic") or "other"), True, llm_semaphore, store, logger,
			
 
				             )
			
 
				             for c in failed_clusters
			
 
				         ]