hace 1 semana · c3cc8103fe
--- a/news_mcp/jobs/poller.py
+++ b/news_mcp/jobs/poller.py
@@ -28,7 +28,6 @@ from news_mcp.enrichment.enrich import enrich_cluster
 
				 from news_mcp.enrichment.llm_enrich import classify_cluster_llm
			
 
				 from news_mcp.sources.news_feeds import fetch_news_articles
			
 
				 from news_mcp.storage.sqlite_store import SQLiteClusterStore
			
 
				-from news_mcp.trends_resolution import resolve_entity_via_trends
			
 
				 
			
 
				 
			
 
				 def _load_feed_urls() -> list[str]:
			
@@ -51,12 +50,11 @@ async def _enrich_single_cluster(
 
				 ) -> dict:
			
 
				     """Enrich one cluster: heuristic + optional LLM extraction, concurrency-limited.
			
 
				 
			
 
				-    Cache strategy: if the cluster already has entities and an enriched_at
			
 
				-    timestamp (from a previous successful enrichment), skip the LLM call.
			
 
				-    Clusters only need to be enriched once — the enrichment output
			
 
				-    (entities, keywords, sentiment, topic) is derived from the cluster's
			
 
				-    article content, which doesn't change meaningfully between polls unless
			
 
				-    enough new articles arrive to form a different cluster.
			
 
				+    Rule: if the cluster already has entities AND keywords (from a previous
			
 
				+    enrichment), skip the LLM call entirely.  The data on the dict IS the
			
 
				+    cache — no need to look up enriched_at timestamps or query the DB by
			
 
				+    cluster_id.  This works regardless of whether cluster_id changed due to
			
 
				+    article merging across polling cycles.
			
 
				 
			
 
				     On LLM failure the cluster is retried up to MAX_ENRICHMENT_RETRIES times
			
 
				     with exponential backoff.  If all retries are exhausted the cluster is
			
@@ -68,39 +66,12 @@ async def _enrich_single_cluster(
 
				 
			
 
				     cluster_id = c2.get("cluster_id")
			
 
				     if llm_enabled and cluster_id:
			
 
				-        # --- Cache check: skip LLM if already enriched ---
			
 
				-        # The cluster payload may already carry enriched_at if it was loaded
			
 
				-        # from an existing DB cluster during cross-cycle seeding.
			
 
				-        enriched_at_str = c2.get("enriched_at")
			
 
				-        if enriched_at_str and c2.get("entities"):
			
 
				-            logger.debug("enrich skip (cached) cluster=%s topic=%s", cluster_id, topic)
			
 
				-            return c2
			
 
				-
			
 
				-        # --- Check DB: cluster may have been enriched in a previous cycle ---
			
 
				-        # Note: cluster_id is derived from the article set, so this lookup
			
 
				-        # only matches when the same exact article set was clustered before.
			
 
				-        # That's fine — it means the enrichment is still valid.
			
 
				-        existing = store.get_cluster_by_id(cluster_id)
			
 
				-        if existing and existing.get("entities") and existing.get("enriched_at"):
			
 
				-            logger.info("enrich cache-hit cluster=%s topic=%s", cluster_id, topic)
			
 
				-            c2 = dict(c2)
			
 
				-            c2["entities"] = existing.get("entities", [])
			
 
				-            c2["enriched_at"] = existing.get("enriched_at")
			
 
				-
			
 
				-            existing_resolutions = existing.get("entityResolutions", None)
			
 
				-            if isinstance(existing_resolutions, list) and existing_resolutions:
			
 
				-                c2["entityResolutions"] = existing_resolutions
			
 
				-            else:
			
 
				-                c2["entityResolutions"] = [resolve_entity_via_trends(e) for e in c2["entities"]]
			
 
				-
			
 
				-            if existing.get("sentiment"):
			
 
				-                c2["sentiment"] = existing.get("sentiment")
			
 
				-            if existing.get("sentimentScore") is not None:
			
 
				-                c2["sentimentScore"] = existing.get("sentimentScore")
			
 
				-            if existing.get("keywords"):
			
 
				-                c2["keywords"] = existing.get("keywords")
			
 
				-            if existing.get("topic"):
			
 
				-                c2["topic"] = existing.get("topic")
			
 
				+        # --- Cache check: if the cluster already has entities AND keywords,
			
 
				+        # it was enriched in a previous cycle.  Skip LLM entirely.
			
 
				+        _existing_entities = c2.get("entities") or []
			
 
				+        _existing_keywords = c2.get("keywords") or []
			
 
				+        if _existing_entities and _existing_keywords:
			
 
				+            logger.debug("enrich skip (already enriched) cluster=%s topic=%s", cluster_id, topic)
			
 
				             return c2
			
 
				 
			
 
				         # --- Actually call the LLM ---