from __future__ import annotations from typing import Any, Dict from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH, RSS_FEED_URL, RSS_FEED_URLS from news_mcp.dedup.cluster import dedup_and_cluster_articles from news_mcp.enrichment.enrich import enrich_cluster from news_mcp.enrichment.groq_enrich import classify_cluster_groq from news_mcp.sources.rss_breakingthenews import fetch_breakingthenews_articles from news_mcp.storage.sqlite_store import SQLiteClusterStore from news_mcp.config import GROQ_ENRICH_OTHER_ONLY, GROQ_MAX_CLUSTERS_PER_REFRESH async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None: store = SQLiteClusterStore(DB_PATH) articles = fetch_breakingthenews_articles(limit=limit) # Skip expensive work if the feed content (titles/urls/timestamps) didn't change. import hashlib rss_urls = [u.strip() for u in RSS_FEED_URLS.split(",") if u.strip()] if not rss_urls: rss_urls = [RSS_FEED_URL] feed_key = "breakingthenews:" + hashlib.sha1(",".join(rss_urls).encode("utf-8")).hexdigest() material = "\n".join( f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}" for a in articles ) last_hash = hashlib.sha1(material.encode("utf-8")).hexdigest() prev_hash = store.get_feed_hash(feed_key) if prev_hash == last_hash: return store.set_feed_hash(feed_key, last_hash) clustered_by_topic = dedup_and_cluster_articles(articles) for t, clusters in clustered_by_topic.items(): if topic and t != topic: continue enriched = [] # Always compute cheap enrichment first. for idx, c in enumerate(clusters[:GROQ_MAX_CLUSTERS_PER_REFRESH]): c2 = enrich_cluster(c) # Groq enrichment only when configured. if (not GROQ_ENRICH_OTHER_ONLY) or (t == "other"): # Cache Groq: if we already have entities/sentiment for this cluster, skip. existing = store.get_cluster_by_id(c2.get("cluster_id")) if existing and existing.get("entities"): c2 = dict(c2) # Keep existing enriched fields. c2["entities"] = existing.get("entities", []) if existing.get("sentiment"): c2["sentiment"] = existing.get("sentiment") if existing.get("sentimentScore") is not None: c2["sentimentScore"] = existing.get("sentimentScore") if existing.get("keywords"): c2["keywords"] = existing.get("keywords") else: c2 = await classify_cluster_groq(c2) enriched.append(c2) store.upsert_clusters(enriched, topic=t)