# poller.py
from __future__ import annotations

import hashlib
from typing import Any, Dict

from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH
from news_mcp.config import GROQ_ENRICH_OTHER_ONLY, GROQ_MAX_CLUSTERS_PER_REFRESH
from news_mcp.dedup.cluster import dedup_and_cluster_articles
from news_mcp.enrichment.enrich import enrich_cluster
from news_mcp.enrichment.groq_enrich import classify_cluster_groq
from news_mcp.sources.rss_breakingthenews import fetch_breakingthenews_articles
from news_mcp.storage.sqlite_store import SQLiteClusterStore


  10. async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
  11. store = SQLiteClusterStore(DB_PATH)
  12. articles = fetch_breakingthenews_articles(limit=limit)
  13. # Skip expensive work if the feed content (titles/urls/timestamps) didn't change.
  14. import hashlib
  15. feed_key = "breakingthenews" # v1: single feed
  16. material = "\n".join(
  17. f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
  18. for a in articles
  19. )
  20. last_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
  21. prev_hash = store.get_feed_hash(feed_key)
  22. if prev_hash == last_hash:
  23. return
  24. store.set_feed_hash(feed_key, last_hash)
  25. clustered_by_topic = dedup_and_cluster_articles(articles)
  26. for t, clusters in clustered_by_topic.items():
  27. if topic and t != topic:
  28. continue
  29. enriched = []
  30. # Always compute cheap enrichment first.
  31. for idx, c in enumerate(clusters[:GROQ_MAX_CLUSTERS_PER_REFRESH]):
  32. c2 = enrich_cluster(c)
  33. # Groq enrichment only when configured.
  34. if (not GROQ_ENRICH_OTHER_ONLY) or (t == "other"):
  35. # Cache Groq: if we already have entities/sentiment for this cluster, skip.
  36. existing = store.get_cluster_by_id(c2.get("cluster_id"))
  37. if existing and existing.get("entities"):
  38. c2 = dict(c2)
  39. # Keep existing enriched fields.
  40. c2["entities"] = existing.get("entities", [])
  41. if existing.get("sentiment"):
  42. c2["sentiment"] = existing.get("sentiment")
  43. if existing.get("sentimentScore") is not None:
  44. c2["sentimentScore"] = existing.get("sentimentScore")
  45. if existing.get("keywords"):
  46. c2["keywords"] = existing.get("keywords")
  47. else:
  48. c2 = await classify_cluster_groq(c2)
  49. enriched.append(c2)
  50. store.upsert_clusters(enriched, topic=t)