poller.py

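"""Feed poller for news_mcp.

Fetches articles from the configured RSS feeds, short-circuits when the feed
content is unchanged (SHA-1 over title|url|timestamp lines), dedups and
clusters the remaining articles, enriches clusters (heuristics plus an
optional LLM pass), and persists them to SQLite under their post-enrichment
topic.
"""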
from __future__ import annotations

import asyncio
import hashlib
import logging
from datetime import datetime, timezone
from typing import Any, Dict

from news_mcp.config import (
    DB_PATH,
    DEFAULT_LOOKBACK_HOURS,
    DEFAULT_TOPICS,
    ENRICH_OTHER_TOPICS_ONLY,
    ENRICHMENT_MAX_PER_REFRESH,
    NEWS_FEED_URL,
    NEWS_FEED_URLS,
    NEWS_PRUNE_INTERVAL_HOURS,
    NEWS_PRUNING_ENABLED,
    NEWS_RETENTION_DAYS,
)
from news_mcp.dedup.cluster import dedup_and_cluster_articles
from news_mcp.enrichment.enrich import enrich_cluster
from news_mcp.enrichment.llm_enrich import classify_cluster_llm
from news_mcp.sources.news_feeds import fetch_news_articles
from news_mcp.storage.sqlite_store import SQLiteClusterStore
from news_mcp.trends_resolution import resolve_entity_via_trends


async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
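    """Fetch feeds, cluster and enrich articles, and persist the results.

    If `topic` is given, only that heuristic topic bucket is processed.
    `limit` caps the number of articles fetched from the feeds.
    """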
    logger = logging.getLogger("news_mcp.refresh")
    store = SQLiteClusterStore(DB_PATH)
    logger.info("refresh start topic=%s limit=%s", topic, limit)
    articles = await asyncio.to_thread(fetch_news_articles, limit)
    logger.info("refresh fetched articles=%s", len(articles))

    # Skip expensive work if the feed content (titles/urls/timestamps) didn't change.
    rss_urls = [u.strip() for u in NEWS_FEED_URLS.split(",") if u.strip()]
    if not rss_urls:
        rss_urls = [NEWS_FEED_URL]
    feed_key = "newsfeeds:" + hashlib.sha1(",".join(rss_urls).encode("utf-8")).hexdigest()
    material = "\n".join(
        f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
        for a in articles
    )
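    # One `material` line per article, e.g. (hypothetical values):
    #   Fed holds rates steady|https://example.com/a1|2025-01-30T14:00:00Z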
    last_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
    prev_hash = store.get_feed_hash(feed_key)
    if prev_hash == last_hash:
        logger.info("refresh unchanged feed_key=%s topic=%s", feed_key, topic)
        store.set_meta("last_refresh_at", datetime.now(timezone.utc).isoformat())
        prune_result = store.prune_if_due(
            pruning_enabled=NEWS_PRUNING_ENABLED,
            retention_days=NEWS_RETENTION_DAYS,
            interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
        )
        logger.info("refresh prune_result=%s", prune_result)
        return
    logger.info("refresh changed feed_key=%s topic=%s", feed_key, topic)
    store.set_feed_hash(feed_key, last_hash)

    clustered_by_topic = dedup_and_cluster_articles(articles)
    logger.info("refresh clustered topics=%s", list(clustered_by_topic.keys()))
    for t, clusters in clustered_by_topic.items():
        if topic and t != topic:
            continue
        enriched = []
        # Determine how many clusters to LLM-enrich.
        # ENRICHMENT_MAX_PER_REFRESH=0 means enrich every cluster (no cap).
        enrich_limit = ENRICHMENT_MAX_PER_REFRESH or len(clusters)
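        # (`or` falls back when the cap is falsy: 0 or 12 -> 12, while a
        # positive cap wins: 5 or 12 -> 5.)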
        # LLM enrichment runs for this topic unless ENRICH_OTHER_TOPICS_ONLY
        # restricts it to the "other" bucket.
        _llm_enabled_for_topic = (not ENRICH_OTHER_TOPICS_ONLY) or (t == "other")
        for c in clusters[:enrich_limit]:
            c2 = enrich_cluster(c)
            # Seed the heuristic topic on the payload so classify_cluster_llm
            # has a sane fallback if the LLM omits or hallucinates one.
            c2.setdefault("topic", t)
            if _llm_enabled_for_topic:
                # Cache: if we already have entities/sentiment for this cluster,
                # skip the LLM call.
                existing = store.get_cluster_by_id(c2.get("cluster_id"))
                if existing and existing.get("entities"):
                    c2 = dict(c2)
                    # Keep existing enriched fields.
                    c2["entities"] = existing.get("entities", [])
                    # IMPORTANT: entityResolutions must stay consistent with
                    # entities. Older rows may have entities but missing or
                    # malformed resolutions.
                    existing_resolutions = existing.get("entityResolutions")
                    if isinstance(existing_resolutions, list) and existing_resolutions:
                        c2["entityResolutions"] = existing_resolutions
                    else:
                        # Recompute resolutions deterministically from the
                        # stored entities.
                        c2["entityResolutions"] = [
                            resolve_entity_via_trends(e) for e in c2["entities"]
                        ]
                    if existing.get("sentiment"):
                        c2["sentiment"] = existing.get("sentiment")
                    if existing.get("sentimentScore") is not None:
                        c2["sentimentScore"] = existing.get("sentimentScore")
                    if existing.get("keywords"):
                        c2["keywords"] = existing.get("keywords")
                    # Preserve a previously-classified topic so we don't drift
                    # back to the heuristic on cache hits.
                    if existing.get("topic"):
                        c2["topic"] = existing.get("topic")
                else:
                    try:
                        c2 = await classify_cluster_llm(c2)
                    except Exception:
                        logger.exception(
                            "LLM enrichment failed for cluster %s (topic %s)",
                            c2.get("cluster_id"),
                            t,
                        )
                        # Mark so we can retry on next refresh.
                        c2["enrichment_failed_at"] = datetime.now(timezone.utc).isoformat()
            enriched.append(c2)

        # Persist clusters under their *post-enrichment* topic so the SQL row
        # column matches what the LLM (or the validated heuristic fallback)
        # actually decided. Previously, every cluster from this bucket was
        # forced into the heuristic topic `t`, which caused a ~97% mismatch
        # between row-column topic and payload topic.
        by_final_topic: Dict[str, list] = {}
        for c2 in enriched:
            final_topic = str(c2.get("topic") or t or "other").strip().lower()
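            # e.g. (hypothetical values) "Tech " -> "tech"; an off-list topic
            # such as "celebrity" gets coerced to "other" below.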
            if final_topic not in {x.lower() for x in DEFAULT_TOPICS}:
                final_topic = "other"
            by_final_topic.setdefault(final_topic, []).append(c2)
        for final_topic, group in by_final_topic.items():
            store.upsert_clusters(group, topic=final_topic)
            logger.info(
                "refresh stored topic=%s clusters=%s (heuristic_topic=%s)",
                final_topic,
                len(group),
                t,
            )

    prune_result = store.prune_if_due(
        pruning_enabled=NEWS_PRUNING_ENABLED,
        retention_days=NEWS_RETENTION_DAYS,
        interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
    )
    store.set_meta("last_refresh_at", datetime.now(timezone.utc).isoformat())
    logger.info("refresh prune_result=%s", prune_result)
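

# Hypothetical usage sketch, not part of the original module (a real
# scheduler may exist elsewhere in the package): run refresh_clusters on a
# fixed interval. The name `_poll_forever_example` and the 300 s default are
# assumptions for illustration only.
async def _poll_forever_example(interval_s: float = 300.0) -> None:
    while True:
        await refresh_clusters()
        await asyncio.sleep(interval_s)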