пре 1 недеља · 7c83c529a8
--- a/news_mcp/jobs/poller.py
+++ b/news_mcp/jobs/poller.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 
															 import asyncio
														
 
															 import hashlib
														
 
															 import logging
														
 
															+import sys
														
 
															 from collections import defaultdict
														
 
															 from datetime import datetime, timezone
														
 
															 from typing import Any, Dict
														
@@ -29,6 +30,8 @@ from news_mcp.storage.sqlite_store import SQLiteClusterStore
 
															 from news_mcp.trends_resolution import resolve_entity_via_trends
														
 
															+MAX_ENRICHMENT_RETRIES = 3  # per-cluster retries before giving up for this cycle
														
 
															+
														
 
															 async def _enrich_single_cluster(
														
 
															     c: dict,
														
 
															     topic: str,
														
@@ -37,7 +40,13 @@ async def _enrich_single_cluster(
 
															     store: SQLiteClusterStore,
														
 
															     logger: logging.Logger,
														
 
															 ) -> dict:
														
 
															-    """Enrich one cluster: heuristic + optional LLM extraction, concurrency-limited."""
														
 
															+    """Enrich one cluster: heuristic + optional LLM extraction, concurrency-limited.
														
 
															+
														
 
															+    On LLM failure the cluster is retried up to MAX_ENRICHMENT_RETRIES times
														
 
															+    with exponential backoff.  If all retries are exhausted the cluster is
														
 
															+    marked with enrichment_failed_at and enrichment_retry_count so the next
														
 
															+    polling cycle can re-attempt it.
														
 
															+    """
														
 
															     c2 = enrich_cluster(c)
														
 
															     c2.setdefault("topic", topic)
														
@@ -64,16 +73,35 @@ async def _enrich_single_cluster(
 
															             if existing.get("topic"):
														
 
															                 c2["topic"] = existing.get("topic")
														
 
															         else:
														
 
															-            # Acquire semaphore before making outbound LLM call
														
 
															-            async with semaphore:
														
 
															+            # Retry loop with exponential backoff | semaphore held per-attempt
														
 
															+            last_err = ""
														
 
															+            for attempt in range(1 + MAX_ENRICHMENT_RETRIES):
														
 
															+                if attempt > 0:
														
 
															+                    backoff = 2 ** attempt
														
 
															+                    logger.info(
														
 
															+                        "retry cluster=%s topic=%s attempt=%d/%d backoff=%.0fs",
														
 
															+                        cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, backoff,
														
 
															+                    )
														
 
															+                    await asyncio.sleep(backoff)
														
 
															                 try:
														
 
															-                    c2 = await classify_cluster_llm(c2)
														
 
															+                    async with semaphore:
														
 
															+                        c2 = await classify_cluster_llm(dict(c2))
														
 
															+                    break  # success
														
 
															                 except Exception:
														
 
															-                    logger.exception(
														
 
															-                        "LLM enrichment failed for cluster %s (topic %s)",
														
 
															-                        c2.get("cluster_id"), topic,
														
 
															+                    last_err = str(sys.exc_info()[1])[:200] if sys.exc_info()[1] else "unknown"
														
 
															+                    logger.warning(
														
 
															+                        "LLM enrichment failed cluster=%s topic=%s attempt=%d/%d err=%s",
														
 
															+                        cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, last_err,
														
 
															                     )
														
 
															-                    c2["enrichment_failed_at"] = datetime.now(timezone.utc).isoformat()
														
 
															+            else:
														
 
															+                # Loop completed without break = all retries exhausted
														
 
															+                prev_count = c2.get("enrichment_retry_count", 0)
														
 
															+                c2["enrichment_failed_at"] = datetime.now(timezone.utc).isoformat()
														
 
															+                c2["enrichment_retry_count"] = prev_count + 1
														
 
															+                logger.error(
														
 
															+                    "LLM enrichment exhausted cluster=%s topic=%s after %d retries",
														
 
															+                    cluster_id, topic, MAX_ENRICHMENT_RETRIES,
														
 
															+                )
														
 
															     return c2
														
@@ -210,6 +238,32 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
 
															             store.upsert_clusters(group, topic=final_topic)
														
 
															             logger.info("refresh stored topic=%s clusters=%s", final_topic, len(group))
														
 
															+    # Retry previously failed enrichments
														
 
															+    failed_clusters = store.get_failed_enrichment_clusters(max_retries=3)
														
 
															+    if failed_clusters:
														
 
															+        logger.info("retry enrich failed clusters count=%d", len(failed_clusters))
														
 
															+        retry_tasks = [
														
 
															+            _enrich_single_cluster(
														
 
															+                c, str(c.get("topic") or "other"), True, llm_semaphore, store, logger
														
 
															+            )
														
 
															+            for c in failed_clusters
														
 
															+        ]
														
 
															+        retry_results = await asyncio.gather(*retry_tasks, return_exceptions=False)
														
 
															+        # Persist retried results
														
 
															+        by_topic_retry: Dict[str, list] = {}
														
 
															+        for c2 in retry_results:
														
 
															+            # Clear stale failure marker on success
														
 
															+            if not c2.get("enrichment_failed_at") or c2.get("entities"):
														
 
															+                c2.pop("enrichment_failed_at", None)
														
 
															+                c2.pop("enrichment_retry_count", None)
														
 
															+            t = str(c2.get("topic") or "other").strip().lower()
														
 
															+            if t not in {x.lower() for x in DEFAULT_TOPICS}:
														
 
															+                t = "other"
														
 
															+            by_topic_retry.setdefault(t, []).append(c2)
														
 
															+        for t, group in by_topic_retry.items():
														
 
															+            store.upsert_clusters(group, topic=t)
														
 
															+            logger.info("retry stored topic=%s clusters=%s", t, len(group))
														
 
															+
														
 
															     prune_result = store.prune_if_due(
														
 
															         pruning_enabled=NEWS_PRUNING_ENABLED,
														
 
															         retention_days=NEWS_RETENTION_DAYS,
														
--- a/news_mcp/llm.py
+++ b/news_mcp/llm.py
@@ -48,39 +48,88 @@ def active_llm_config() -> dict[str, str]:
 
															     }
														
 
															-async def _call_groq(model: str, messages: List[Dict[str, str]], response_json: bool = True) -> str:
														
 
															+async def _call_groq(model: str, messages: List[Dict[str, str]], response_json: bool = True, retries: int = 2) -> str:
														
 
															     if not GROQ_API_KEY:
														
 
															         raise LLMError("GROQ_API_KEY is not configured")
														
 
															     req = {"model": model, "messages": messages, "temperature": 0.2}
														
 
															     if response_json:
														
 
															         req["response_format"] = {"type": "json_object"}
														
 
															+    last_err = ""
														
 
															     async with httpx.AsyncClient(timeout=45.0) as client:
														
 
															-        resp = await client.post(
														
 
															-            "https://api.groq.com/openai/v1/chat/completions",
														
 
															-            headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
														
 
															-            json=req,
														
 
															-        )
														
 
															-        resp.raise_for_status()
														
 
															-        data = resp.json()
														
 
															-    return data["choices"][0]["message"]["content"]
														
 
															-
														
 
															-
														
 
															-async def _call_openai(model: str, messages: List[Dict[str, str]], response_json: bool = True) -> str:
														
 
															-    # OpenAI-compatible chat endpoint; uses NEWS_OPENAI_API_KEY.
														
 
															+        for attempt in range(1 + retries):
														
 
															+            resp = await client.post(
														
 
															+                "https://api.groq.com/openai/v1/chat/completions",
														
 
															+                headers={"Authorization": f"Bearer {GROQ_API_KEY}"},
														
 
															+                json=req,
														
 
															+            )
														
 
															+            if resp.status_code != 200:
														
 
															+                last_err = f"HTTP {resp.status_code}: {resp.text[:300]}"
														
 
															+                if resp.status_code in (429, 500, 502, 503):
														
 
															+                    await asyncio.sleep(2 ** attempt)
														
 
															+                    continue
														
 
															+                resp.raise_for_status()
														
 
															+            data = resp.json()
														
 
															+            if "error" in data:
														
 
															+                last_err = f"API error: {data['error']}"
														
 
															+                break
														
 
															+            choices = data.get("choices", [])
														
 
															+            if not choices:
														
 
															+                last_err = f"No choices in response: {str(data)[:300]}"
														
 
															+                if attempt < retries:
														
 
															+                    await asyncio.sleep(2 ** attempt)
														
 
															+                    continue
														
 
															+                break
														
 
															+            content = choices[0].get("message", {}).get("content")
														
 
															+            if content:
														
 
															+                return content
														
 
															+            last_err = f"Empty content in choice: {str(choices[0])[:200]}"
														
 
															+            if attempt < retries:
														
 
															+                await asyncio.sleep(2 ** attempt)
														
 
															+                continue
														
 
															+            break
														
 
															+    raise LLMError(f"Groq failed after {1+retries} attempts: {last_err}")
														
 
															+
														
 
															+
														
 
															+async def _call_openai(model: str, messages: List[Dict[str, str]], response_json: bool = True, retries: int = 2) -> str:
														
 
															     if not OPENAI_API_KEY:
														
 
															         raise LLMError("OPENAI_API_KEY is not configured")
														
 
															     req = {"model": model, "messages": messages}
														
 
															     if response_json:
														
 
															         req["response_format"] = {"type": "json_object"}
														
 
															+    last_err = ""
														
 
															     async with httpx.AsyncClient(timeout=45.0) as client:
														
 
															-        resp = await client.post(
														
 
															-            "https://api.openai.com/v1/chat/completions",
														
 
															-            headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
														
 
															-            json=req,
														
 
															-        )
														
 
															-        resp.raise_for_status()
														
 
															-        data = resp.json()
														
 
															-    return data["choices"][0]["message"]["content"]
														
 
															+        for attempt in range(1 + retries):
														
 
															+            resp = await client.post(
														
 
															+                "https://api.openai.com/v1/chat/completions",
														
 
															+                headers={"Authorization": f"Bearer {OPENAI_API_KEY}"},
														
 
															+                json=req,
														
 
															+            )
														
 
															+            if resp.status_code != 200:
														
 
															+                last_err = f"HTTP {resp.status_code}: {resp.text[:300]}"
														
 
															+                if resp.status_code in (429, 500, 502, 503):
														
 
															+                    await asyncio.sleep(2 ** attempt)
														
 
															+                    continue
														
 
															+                resp.raise_for_status()
														
 
															+            data = resp.json()
														
 
															+            if "error" in data:
														
 
															+                last_err = f"API error: {data['error']}"
														
 
															+                break
														
 
															+            choices = data.get("choices", [])
														
 
															+            if not choices:
														
 
															+                last_err = f"No choices in response: {str(data)[:300]}"
														
 
															+                if attempt < retries:
														
 
															+                    await asyncio.sleep(2 ** attempt)
														
 
															+                    continue
														
 
															+                break
														
 
															+            content = choices[0].get("message", {}).get("content")
														
 
															+            if content:
														
 
															+                return content
														
 
															+            last_err = f"Empty content in choice: {str(choices[0])[:200]}"
														
 
															+            if attempt < retries:
														
 
															+                await asyncio.sleep(2 ** attempt)
														
 
															+                continue
														
 
															+            break
														
 
															+    raise LLMError(f"OpenAI failed after {1+retries} attempts: {last_err}")
														
 
															 OR_OPENROUTER_URL = "https://openrouter.ai/api/v1/chat/completions"
														
--- a/news_mcp/storage/sqlite_store.py
+++ b/news_mcp/storage/sqlite_store.py
@@ -468,6 +468,23 @@ class SQLiteClusterStore:
 
															                 (entity_id, normalized_label, None, mid, json.dumps([], ensure_ascii=False), now, now),
														
 
															             )
														
 
															+    def get_failed_enrichment_clusters(self, max_retries: int = 3) -> list[dict]:
														
 
															+        """Return clusters whose last enrichment failed and haven't exceeded max_retries.
														
 
															+
														
 
															+        These are candidates for re-enrichment on the next polling cycle.
														
 
															+        """
														
 
															+        with self._conn() as conn:
														
 
															+            cur = conn.execute(
														
 
															+                "SELECT payload FROM clusters "
														
 
															+                "WHERE json_extract(payload, '$.enrichment_failed_at') IS NOT NULL "
														
 
															+                "AND (json_extract(payload, '$.enrichment_retry_count') IS NULL "
														
 
															+                "     OR json_extract(payload, '$.enrichment_retry_count') < ?) "
														
 
															+                "ORDER BY updated_at DESC LIMIT 500",
														
 
															+                (max_retries,),
														
 
															+            )
														
 
															+            rows = cur.fetchall()
														
 
															+        return [json.loads(r[0]) for r in rows]
														
 
															+
														
 
															     def prune_clusters(self, retention_days: float) -> int:
														
 
															         retention_days = float(retention_days)
														
 
															         if retention_days <= 0:
														
--- a/test_news_mcp.py
+++ b/test_news_mcp.py
@@ -628,6 +628,9 @@ def test_poller_persists_clusters_under_post_enrichment_topic(monkeypatch):
 
															         def prune_if_due(self, **kwargs):
														
 
															             return {"deleted": 0}
														
 
															+        def get_failed_enrichment_clusters(self, max_retries=3):
														
 
															+            return []
														
 
															+
														
 
															         def set_meta(self, key, value):
														
 
															             pass