Explorar el Código

fix: skip LLM enrichment for clusters that already have entities and keywords

Lukas Goldschmidt hace 1 semana
padre
commit
c3cc8103fe
Se han modificado 1 ficheros con 11 adiciones y 40 borrados
  1. 11 40
      news_mcp/jobs/poller.py

+ 11 - 40
news_mcp/jobs/poller.py

@@ -28,7 +28,6 @@ from news_mcp.enrichment.enrich import enrich_cluster
 from news_mcp.enrichment.llm_enrich import classify_cluster_llm
 from news_mcp.sources.news_feeds import fetch_news_articles
 from news_mcp.storage.sqlite_store import SQLiteClusterStore
-from news_mcp.trends_resolution import resolve_entity_via_trends
 
 
 def _load_feed_urls() -> list[str]:
@@ -51,12 +50,11 @@ async def _enrich_single_cluster(
 ) -> dict:
     """Enrich one cluster: heuristic + optional LLM extraction, concurrency-limited.
 
-    Cache strategy: if the cluster already has entities and an enriched_at
-    timestamp (from a previous successful enrichment), skip the LLM call.
-    Clusters only need to be enriched once — the enrichment output
-    (entities, keywords, sentiment, topic) is derived from the cluster's
-    article content, which doesn't change meaningfully between polls unless
-    enough new articles arrive to form a different cluster.
+    Rule: if the cluster already has entities AND keywords (from a previous
+    enrichment), skip the LLM call entirely.  The data on the dict IS the
+    cache — no need to look up enriched_at timestamps or query the DB by
+    cluster_id.  This works regardless of whether cluster_id changed due to
+    article merging across polling cycles.
 
     On LLM failure the cluster is retried up to MAX_ENRICHMENT_RETRIES times
     with exponential backoff.  If all retries are exhausted the cluster is
@@ -68,39 +66,12 @@ async def _enrich_single_cluster(
 
     cluster_id = c2.get("cluster_id")
     if llm_enabled and cluster_id:
-        # --- Cache check: skip LLM if already enriched ---
-        # The cluster payload may already carry enriched_at if it was loaded
-        # from an existing DB cluster during cross-cycle seeding.
-        enriched_at_str = c2.get("enriched_at")
-        if enriched_at_str and c2.get("entities"):
-            logger.debug("enrich skip (cached) cluster=%s topic=%s", cluster_id, topic)
-            return c2
-
-        # --- Check DB: cluster may have been enriched in a previous cycle ---
-        # Note: cluster_id is derived from the article set, so this lookup
-        # only matches when the same exact article set was clustered before.
-        # That's fine — it means the enrichment is still valid.
-        existing = store.get_cluster_by_id(cluster_id)
-        if existing and existing.get("entities") and existing.get("enriched_at"):
-            logger.info("enrich cache-hit cluster=%s topic=%s", cluster_id, topic)
-            c2 = dict(c2)
-            c2["entities"] = existing.get("entities", [])
-            c2["enriched_at"] = existing.get("enriched_at")
-
-            existing_resolutions = existing.get("entityResolutions", None)
-            if isinstance(existing_resolutions, list) and existing_resolutions:
-                c2["entityResolutions"] = existing_resolutions
-            else:
-                c2["entityResolutions"] = [resolve_entity_via_trends(e) for e in c2["entities"]]
-
-            if existing.get("sentiment"):
-                c2["sentiment"] = existing.get("sentiment")
-            if existing.get("sentimentScore") is not None:
-                c2["sentimentScore"] = existing.get("sentimentScore")
-            if existing.get("keywords"):
-                c2["keywords"] = existing.get("keywords")
-            if existing.get("topic"):
-                c2["topic"] = existing.get("topic")
+        # --- Cache check: if the cluster already has entities AND keywords,
+        # it was enriched in a previous cycle.  Skip LLM entirely.
+        _existing_entities = c2.get("entities") or []
+        _existing_keywords = c2.get("keywords") or []
+        if _existing_entities and _existing_keywords:
+            logger.debug("enrich skip (already enriched) cluster=%s topic=%s", cluster_id, topic)
             return c2
 
         # --- Actually call the LLM ---