Lukas Goldschmidt 2 недели назад
Родитель
Commit
8a2a0c6279
1 изменённый файл: 10 добавлено и 0 удалено
  1. 10 0
      news_mcp/jobs/poller.py

+ 10 - 0
news_mcp/jobs/poller.py

@@ -46,6 +46,7 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
     changed_articles: list[dict[str, Any]] = []
     changed_feed_urls: list[str] = []
     for feed_url, feed_articles in per_feed.items():
+        logger.info("refresh feed batch start feed_url=%s count=%s", feed_url, len(feed_articles))
         material = "\n".join(
             f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
             for a in feed_articles
@@ -59,6 +60,7 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
             logger.info("refresh changed feed_url=%s count=%s topic=%s", feed_url, len(feed_articles), topic)
             changed_feed_urls.append(feed_url)
             changed_articles.extend(feed_articles)
+        logger.info("refresh feed batch complete feed_url=%s changed_total=%s", feed_url, len(changed_articles))
 
     if not changed_articles:
         logger.info("refresh unchanged all feeds topic=%s", topic)
@@ -72,12 +74,14 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
         return
 
     articles = changed_articles
+    logger.info("refresh clustering start articles=%s topic=%s", len(articles), topic)
     clustered_by_topic = dedup_and_cluster_articles(articles)
     logger.info("refresh clustered topics=%s", list(clustered_by_topic.keys()))
 
     for t, clusters in clustered_by_topic.items():
         if topic and t != topic:
             continue
+        logger.info("refresh topic phase start topic=%s clusters=%s", t, len(clusters))
         enriched = []
 
         # Determine how many clusters to LLM-enrich.
@@ -89,11 +93,17 @@ async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
             (not ENRICH_OTHER_TOPICS_ONLY) or (t == "other")
         )
 
+        # Persist the raw clusters first so a slow enrichment pass does not
+        # leave the first bootstrap run with nothing stored.
+        store.upsert_clusters(clusters, topic=t)
+        logger.info("refresh stored raw topic=%s clusters=%s", t, len(clusters))
+
         for idx, c in enumerate(clusters[:enrich_limit]):
             c2 = enrich_cluster(c)
             # Seed the heuristic topic on the payload so classify_cluster_llm
             # has a sane fallback if the LLM omits or hallucinates one.
             c2.setdefault("topic", t)
+            logger.info("refresh enrich cluster=%s topic=%s idx=%s/%s", c2.get("cluster_id"), t, idx + 1, enrich_limit)
 
             if _llm_enabled_for_topic:
                 # Cache: if we already have entities/sentiment for this cluster, skip LLM call.