lucky
/
news-mcp


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
							from __future__ import annotations

from typing import Any, Dict

from news_mcp.config import CLUSTERS_TTL_HOURS, DB_PATH, RSS_FEED_URL, RSS_FEED_URLS
from news_mcp.dedup.cluster import dedup_and_cluster_articles
from news_mcp.enrichment.enrich import enrich_cluster
from news_mcp.enrichment.groq_enrich import classify_cluster_groq
from news_mcp.sources.rss_breakingthenews import fetch_breakingthenews_articles
from news_mcp.storage.sqlite_store import SQLiteClusterStore

from news_mcp.config import GROQ_ENRICH_OTHER_ONLY, GROQ_MAX_CLUSTERS_PER_REFRESH


async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
    store = SQLiteClusterStore(DB_PATH)

    articles = fetch_breakingthenews_articles(limit=limit)

    # Skip expensive work if the feed content (titles/urls/timestamps) didn't change.
    import hashlib
    rss_urls = [u.strip() for u in RSS_FEED_URLS.split(",") if u.strip()]
    if not rss_urls:
        rss_urls = [RSS_FEED_URL]
    feed_key = "breakingthenews:" + hashlib.sha1(",".join(rss_urls).encode("utf-8")).hexdigest()
    material = "\n".join(
        f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
        for a in articles
    )
    last_hash = hashlib.sha1(material.encode("utf-8")).hexdigest()
    prev_hash = store.get_feed_hash(feed_key)
    if prev_hash == last_hash:
        return
    store.set_feed_hash(feed_key, last_hash)
    clustered_by_topic = dedup_and_cluster_articles(articles)

    for t, clusters in clustered_by_topic.items():
        if topic and t != topic:
            continue
        enriched = []
        # Always compute cheap enrichment first.
        for idx, c in enumerate(clusters[:GROQ_MAX_CLUSTERS_PER_REFRESH]):
            c2 = enrich_cluster(c)

            # Groq enrichment only when configured.
            if (not GROQ_ENRICH_OTHER_ONLY) or (t == "other"):
                # Cache Groq: if we already have entities/sentiment for this cluster, skip.
                existing = store.get_cluster_by_id(c2.get("cluster_id"))
                if existing and existing.get("entities"):
                    c2 = dict(c2)
                    # Keep existing enriched fields.
                    c2["entities"] = existing.get("entities", [])
                    if existing.get("sentiment"):
                        c2["sentiment"] = existing.get("sentiment")
                    if existing.get("sentimentScore") is not None:
                        c2["sentimentScore"] = existing.get("sentimentScore")
                    if existing.get("keywords"):
                        c2["keywords"] = existing.get("keywords")
                else:
                    c2 = await classify_cluster_groq(c2)

            enriched.append(c2)

        store.upsert_clusters(enriched, topic=t)