1 week ago · c3cc8103fe
--- a/news_mcp/jobs/poller.py
+++ b/news_mcp/jobs/poller.py
@@ -28,7 +28,6 @@ from news_mcp.enrichment.enrich import enrich_cluster
 
															 from news_mcp.enrichment.llm_enrich import classify_cluster_llm
														
 
															 from news_mcp.sources.news_feeds import fetch_news_articles
														
 
															 from news_mcp.storage.sqlite_store import SQLiteClusterStore
														
 
															-from news_mcp.trends_resolution import resolve_entity_via_trends
														
 
															 def _load_feed_urls() -> list[str]:
														
@@ -51,12 +50,11 @@ async def _enrich_single_cluster(
 
															 ) -> dict:
														
 
															     """Enrich one cluster: heuristic + optional LLM extraction, concurrency-limited.
														
 
															-    Cache strategy: if the cluster already has entities and an enriched_at
														
 
															-    timestamp (from a previous successful enrichment), skip the LLM call.
														
 
															-    Clusters only need to be enriched once — the enrichment output
														
 
															-    (entities, keywords, sentiment, topic) is derived from the cluster's
														
 
															-    article content, which doesn't change meaningfully between polls unless
														
 
															-    enough new articles arrive to form a different cluster.
														
 
															+    Rule: if the cluster already has entities AND keywords (from a previous
														
 
															+    enrichment), skip the LLM call entirely.  The data on the dict IS the
														
 
															+    cache — no need to look up enriched_at timestamps or query the DB by
														
 
															+    cluster_id.  This works regardless of whether cluster_id changed due to
														
 
															+    article merging across polling cycles.
														
 
															     On LLM failure the cluster is retried up to MAX_ENRICHMENT_RETRIES times
														
 
															     with exponential backoff.  If all retries are exhausted the cluster is
														
@@ -68,39 +66,12 @@ async def _enrich_single_cluster(
 
															     cluster_id = c2.get("cluster_id")
														
 
															     if llm_enabled and cluster_id:
														
 
															-        # --- Cache check: skip LLM if already enriched ---
														
 
															-        # The cluster payload may already carry enriched_at if it was loaded
														
 
															-        # from an existing DB cluster during cross-cycle seeding.
														
 
															-        enriched_at_str = c2.get("enriched_at")
														
 
															-        if enriched_at_str and c2.get("entities"):
														
 
															-            logger.debug("enrich skip (cached) cluster=%s topic=%s", cluster_id, topic)
														
 
															-            return c2
														
 
															-
														
 
															-        # --- Check DB: cluster may have been enriched in a previous cycle ---
														
 
															-        # Note: cluster_id is derived from the article set, so this lookup
														
 
															-        # only matches when the same exact article set was clustered before.
														
 
															-        # That's fine — it means the enrichment is still valid.
														
 
															-        existing = store.get_cluster_by_id(cluster_id)
														
 
															-        if existing and existing.get("entities") and existing.get("enriched_at"):
														
 
															-            logger.info("enrich cache-hit cluster=%s topic=%s", cluster_id, topic)
														
 
															-            c2 = dict(c2)
														
 
															-            c2["entities"] = existing.get("entities", [])
														
 
															-            c2["enriched_at"] = existing.get("enriched_at")
														
 
															-
														
 
															-            existing_resolutions = existing.get("entityResolutions", None)
														
 
															-            if isinstance(existing_resolutions, list) and existing_resolutions:
														
 
															-                c2["entityResolutions"] = existing_resolutions
														
 
															-            else:
														
 
															-                c2["entityResolutions"] = [resolve_entity_via_trends(e) for e in c2["entities"]]
														
 
															-
														
 
															-            if existing.get("sentiment"):
														
 
															-                c2["sentiment"] = existing.get("sentiment")
														
 
															-            if existing.get("sentimentScore") is not None:
														
 
															-                c2["sentimentScore"] = existing.get("sentimentScore")
														
 
															-            if existing.get("keywords"):
														
 
															-                c2["keywords"] = existing.get("keywords")
														
 
															-            if existing.get("topic"):
														
 
															-                c2["topic"] = existing.get("topic")
														
 
															+        # --- Cache check: if the cluster already has entities AND keywords,
														
 
															+        # it was enriched in a previous cycle.  Skip LLM entirely.
														
 
															+        _existing_entities = c2.get("entities") or []
														
 
															+        _existing_keywords = c2.get("keywords") or []
														
 
															+        if _existing_entities and _existing_keywords:
														
 
															+            logger.debug("enrich skip (already enriched) cluster=%s topic=%s", cluster_id, topic)
														
 
															             return c2
														
 
															         # --- Actually call the LLM ---