Pārlūkot izejas kodu

poller llm calls reduced

Lukas Goldschmidt 1 nedēļu atpakaļ
vecāks
revīzija
2db56b7dc0
1 mainītis faili ar 55 papildinājumiem un 33 dzēšanām
  1. 55 33
      news_mcp/jobs/poller.py

+ 55 - 33
news_mcp/jobs/poller.py

@@ -5,7 +5,7 @@ import hashlib
 import logging
 import sys
 from collections import defaultdict
-from datetime import datetime, timezone, timedelta
+from datetime import datetime, timezone
 from typing import Any, Dict
 
 from news_mcp.config import (
@@ -51,6 +51,13 @@ async def _enrich_single_cluster(
 ) -> dict:
     """Enrich one cluster: heuristic + optional LLM extraction, concurrency-limited.
 
+    Cache strategy: if the cluster already has entities and an enriched_at
+    timestamp (from a previous successful enrichment), skip the LLM call.
+    Clusters only need to be enriched once — the enrichment output
+    (entities, keywords, sentiment, topic) is derived from the cluster's
+    article content, which doesn't change meaningfully between polls unless
+    enough new articles arrive to form a different cluster.
+
     On LLM failure the cluster is retried up to MAX_ENRICHMENT_RETRIES times
     with exponential backoff.  If all retries are exhausted the cluster is
     marked with enrichment_failed_at and enrichment_retry_count so the next
@@ -61,11 +68,24 @@ async def _enrich_single_cluster(
 
     cluster_id = c2.get("cluster_id")
     if llm_enabled and cluster_id:
-        # Cache: if we already have entities/sentiment for this cluster, skip LLM call.
+        # --- Cache check: skip LLM if already enriched ---
+        # The cluster payload may already carry enriched_at if it was loaded
+        # from an existing DB cluster during cross-cycle seeding.
+        enriched_at_str = c2.get("enriched_at")
+        if enriched_at_str and c2.get("entities"):
+            logger.debug("enrich skip (cached) cluster=%s topic=%s", cluster_id, topic)
+            return c2
+
+        # --- Check DB: cluster may have been enriched in a previous cycle ---
+        # Note: cluster_id is derived from the article set, so this lookup
+        # only matches when the same exact article set was clustered before.
+        # That's fine — it means the enrichment is still valid.
         existing = store.get_cluster_by_id(cluster_id)
-        if existing and existing.get("entities"):
+        if existing and existing.get("entities") and existing.get("enriched_at"):
+            logger.info("enrich cache-hit cluster=%s topic=%s", cluster_id, topic)
             c2 = dict(c2)
             c2["entities"] = existing.get("entities", [])
+            c2["enriched_at"] = existing.get("enriched_at")
 
             existing_resolutions = existing.get("entityResolutions", None)
             if isinstance(existing_resolutions, list) and existing_resolutions:
@@ -81,36 +101,38 @@ async def _enrich_single_cluster(
                 c2["keywords"] = existing.get("keywords")
             if existing.get("topic"):
                 c2["topic"] = existing.get("topic")
-        else:
-            # Retry loop with exponential backoff | semaphore held per-attempt
-            last_err = ""
-            for attempt in range(1 + MAX_ENRICHMENT_RETRIES):
-                if attempt > 0:
-                    backoff = 2 ** attempt
-                    logger.info(
-                        "retry cluster=%s topic=%s attempt=%d/%d backoff=%.0fs",
-                        cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, backoff,
-                    )
-                    await asyncio.sleep(backoff)
-                try:
-                    async with semaphore:
-                        c2 = await classify_cluster_llm(dict(c2))
-                    break  # success
-                except Exception:
-                    last_err = str(sys.exc_info()[1])[:200] if sys.exc_info()[1] else "unknown"
-                    logger.warning(
-                        "LLM enrichment failed cluster=%s topic=%s attempt=%d/%d err=%s",
-                        cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, last_err,
-                    )
-            else:
-                # Loop completed without break = all retries exhausted
-                prev_count = c2.get("enrichment_retry_count", 0)
-                c2["enrichment_failed_at"] = datetime.now(timezone.utc).isoformat()
-                c2["enrichment_retry_count"] = prev_count + 1
-                logger.error(
-                    "LLM enrichment exhausted cluster=%s topic=%s after %d retries",
-                    cluster_id, topic, MAX_ENRICHMENT_RETRIES,
+            return c2
+
+        # --- Actually call the LLM ---
+        last_err = ""
+        for attempt in range(1 + MAX_ENRICHMENT_RETRIES):
+            if attempt > 0:
+                backoff = 2 ** attempt
+                logger.info(
+                    "retry cluster=%s topic=%s attempt=%d/%d backoff=%.0fs",
+                    cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, backoff,
                 )
+                await asyncio.sleep(backoff)
+            try:
+                async with semaphore:
+                    c2 = await classify_cluster_llm(dict(c2))
+                c2["enriched_at"] = datetime.now(timezone.utc).isoformat()
+                break  # success
+            except Exception:
+                last_err = str(sys.exc_info()[1])[:200] if sys.exc_info()[1] else "unknown"
+                logger.warning(
+                    "LLM enrichment failed cluster=%s topic=%s attempt=%d/%d err=%s",
+                    cluster_id, topic, attempt, MAX_ENRICHMENT_RETRIES, last_err,
+                )
+        else:
+            # Loop completed without break = all retries exhausted
+            prev_count = c2.get("enrichment_retry_count", 0)
+            c2["enrichment_failed_at"] = datetime.now(timezone.utc).isoformat()
+            c2["enrichment_retry_count"] = prev_count + 1
+            logger.error(
+                "LLM enrichment exhausted cluster=%s topic=%s after %d retries",
+                cluster_id, topic, MAX_ENRICHMENT_RETRIES,
+            )
 
     return c2
 
@@ -286,7 +308,7 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
         logger.info("retry enrich failed clusters count=%d", len(failed_clusters))
         retry_tasks = [
             _enrich_single_cluster(
-                c, str(c.get("topic") or "other"), True, llm_semaphore, store, logger
+                c, str(c.get("topic") or "other"), True, llm_semaphore, store, logger,
             )
             for c in failed_clusters
         ]