|
@@ -28,7 +28,6 @@ from news_mcp.enrichment.enrich import enrich_cluster
|
|
|
from news_mcp.enrichment.llm_enrich import classify_cluster_llm
|
|
from news_mcp.enrichment.llm_enrich import classify_cluster_llm
|
|
|
from news_mcp.sources.news_feeds import fetch_news_articles
|
|
from news_mcp.sources.news_feeds import fetch_news_articles
|
|
|
from news_mcp.storage.sqlite_store import SQLiteClusterStore
|
|
from news_mcp.storage.sqlite_store import SQLiteClusterStore
|
|
|
-from news_mcp.trends_resolution import resolve_entity_via_trends
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _load_feed_urls() -> list[str]:
|
|
def _load_feed_urls() -> list[str]:
|
|
@@ -51,12 +50,11 @@ async def _enrich_single_cluster(
|
|
|
) -> dict:
|
|
) -> dict:
|
|
|
"""Enrich one cluster: heuristic + optional LLM extraction, concurrency-limited.
|
|
"""Enrich one cluster: heuristic + optional LLM extraction, concurrency-limited.
|
|
|
|
|
|
|
|
- Cache strategy: if the cluster already has entities and an enriched_at
|
|
|
|
|
- timestamp (from a previous successful enrichment), skip the LLM call.
|
|
|
|
|
- Clusters only need to be enriched once — the enrichment output
|
|
|
|
|
- (entities, keywords, sentiment, topic) is derived from the cluster's
|
|
|
|
|
- article content, which doesn't change meaningfully between polls unless
|
|
|
|
|
- enough new articles arrive to form a different cluster.
|
|
|
|
|
|
|
+ Rule: if the cluster already has entities AND keywords (from a previous
|
|
|
|
|
+ enrichment), skip the LLM call entirely. The data on the dict IS the
|
|
|
|
|
+ cache — no need to look up enriched_at timestamps or query the DB by
|
|
|
|
|
+ cluster_id. This works regardless of whether cluster_id changed due to
|
|
|
|
|
+ article merging across polling cycles.
|
|
|
|
|
|
|
|
On LLM failure the cluster is retried up to MAX_ENRICHMENT_RETRIES times
|
|
On LLM failure the cluster is retried up to MAX_ENRICHMENT_RETRIES times
|
|
|
with exponential backoff. If all retries are exhausted the cluster is
|
|
with exponential backoff. If all retries are exhausted the cluster is
|
|
@@ -68,39 +66,12 @@ async def _enrich_single_cluster(
|
|
|
|
|
|
|
|
cluster_id = c2.get("cluster_id")
|
|
cluster_id = c2.get("cluster_id")
|
|
|
if llm_enabled and cluster_id:
|
|
if llm_enabled and cluster_id:
|
|
|
- # --- Cache check: skip LLM if already enriched ---
|
|
|
|
|
- # The cluster payload may already carry enriched_at if it was loaded
|
|
|
|
|
- # from an existing DB cluster during cross-cycle seeding.
|
|
|
|
|
- enriched_at_str = c2.get("enriched_at")
|
|
|
|
|
- if enriched_at_str and c2.get("entities"):
|
|
|
|
|
- logger.debug("enrich skip (cached) cluster=%s topic=%s", cluster_id, topic)
|
|
|
|
|
- return c2
|
|
|
|
|
-
|
|
|
|
|
- # --- Check DB: cluster may have been enriched in a previous cycle ---
|
|
|
|
|
- # Note: cluster_id is derived from the article set, so this lookup
|
|
|
|
|
- # only matches when the same exact article set was clustered before.
|
|
|
|
|
- # That's fine — it means the enrichment is still valid.
|
|
|
|
|
- existing = store.get_cluster_by_id(cluster_id)
|
|
|
|
|
- if existing and existing.get("entities") and existing.get("enriched_at"):
|
|
|
|
|
- logger.info("enrich cache-hit cluster=%s topic=%s", cluster_id, topic)
|
|
|
|
|
- c2 = dict(c2)
|
|
|
|
|
- c2["entities"] = existing.get("entities", [])
|
|
|
|
|
- c2["enriched_at"] = existing.get("enriched_at")
|
|
|
|
|
-
|
|
|
|
|
- existing_resolutions = existing.get("entityResolutions", None)
|
|
|
|
|
- if isinstance(existing_resolutions, list) and existing_resolutions:
|
|
|
|
|
- c2["entityResolutions"] = existing_resolutions
|
|
|
|
|
- else:
|
|
|
|
|
- c2["entityResolutions"] = [resolve_entity_via_trends(e) for e in c2["entities"]]
|
|
|
|
|
-
|
|
|
|
|
- if existing.get("sentiment"):
|
|
|
|
|
- c2["sentiment"] = existing.get("sentiment")
|
|
|
|
|
- if existing.get("sentimentScore") is not None:
|
|
|
|
|
- c2["sentimentScore"] = existing.get("sentimentScore")
|
|
|
|
|
- if existing.get("keywords"):
|
|
|
|
|
- c2["keywords"] = existing.get("keywords")
|
|
|
|
|
- if existing.get("topic"):
|
|
|
|
|
- c2["topic"] = existing.get("topic")
|
|
|
|
|
|
|
+ # --- Cache check: if the cluster already has entities AND keywords,
|
|
|
|
|
+ # it was enriched in a previous cycle. Skip LLM entirely.
|
|
|
|
|
+ _existing_entities = c2.get("entities") or []
|
|
|
|
|
+ _existing_keywords = c2.get("keywords") or []
|
|
|
|
|
+ if _existing_entities and _existing_keywords:
|
|
|
|
|
+ logger.debug("enrich skip (already enriched) cluster=%s topic=%s", cluster_id, topic)
|
|
|
return c2
|
|
return c2
|
|
|
|
|
|
|
|
# --- Actually call the LLM ---
|
|
# --- Actually call the LLM ---
|