# lucky/news-mcp
							from __future__ import annotations

import asyncio
import logging
from collections import defaultdict
from datetime import datetime, timezone
from typing import Any, Dict

from news_mcp.config import DEFAULT_LOOKBACK_HOURS, DEFAULT_TOPICS, DB_PATH, NEWS_FEED_URL, NEWS_FEED_URLS
from news_mcp.dedup.cluster import dedup_and_cluster_articles
from news_mcp.enrichment.enrich import enrich_cluster
from news_mcp.enrichment.llm_enrich import classify_cluster_llm
from news_mcp.trends_resolution import resolve_entity_via_trends
from news_mcp.sources.news_feeds import fetch_news_articles
from news_mcp.storage.sqlite_store import SQLiteClusterStore

from news_mcp.config import (
    ENRICH_OTHER_TOPICS_ONLY,
    ENRICHMENT_MAX_PER_REFRESH,
    NEWS_PRUNE_INTERVAL_HOURS,
    NEWS_PRUNING_ENABLED,
    NEWS_RETENTION_DAYS,
)


def _feed_fingerprint(feed_articles: list[dict[str, Any]]) -> str:
    """Return the SHA-1 hex digest identifying a feed's current article set.

    The digest covers one ``title|url|timestamp`` line per article, in the
    order given; a missing field contributes an empty string.
    """
    # Imported locally: this module's top-level imports do not include hashlib.
    import hashlib

    material = "\n".join(
        f"{a.get('title','')}|{a.get('url','')}|{a.get('timestamp','')}"
        for a in feed_articles
    )
    return hashlib.sha1(material.encode("utf-8")).hexdigest()


def _apply_cached_enrichment(cluster: dict[str, Any], existing: dict[str, Any]) -> dict[str, Any]:
    """Return a copy of *cluster* carrying the enrichment fields stored in *existing*.

    ``entities`` is always taken from the stored row. ``sentiment``,
    ``sentimentScore``, ``keywords`` and ``topic`` are copied only when the
    stored row has a usable value, so freshly computed values survive otherwise.
    """
    merged = dict(cluster)
    merged["entities"] = existing.get("entities", [])

    # entityResolutions must stay consistent with entities. Older rows may
    # have entities but missing/malformed resolutions; in that case rebuild
    # them from the stored entities instead of trusting the row.
    stored_resolutions = existing.get("entityResolutions", None)
    if isinstance(stored_resolutions, list) and stored_resolutions:
        merged["entityResolutions"] = stored_resolutions
    else:
        merged["entityResolutions"] = [resolve_entity_via_trends(e) for e in merged["entities"]]

    if existing.get("sentiment"):
        merged["sentiment"] = existing.get("sentiment")
    # A score of 0 is a legitimate stored value, hence the explicit None check.
    if existing.get("sentimentScore") is not None:
        merged["sentimentScore"] = existing.get("sentimentScore")
    if existing.get("keywords"):
        merged["keywords"] = existing.get("keywords")
    # Preserve a previously-classified topic so cache hits do not drift back
    # to the heuristic topic.
    if existing.get("topic"):
        merged["topic"] = existing.get("topic")
    return merged


def _prune_if_due(store: Any) -> Any:
    """Call ``store.prune_if_due`` with the configured pruning settings and return its result."""
    return store.prune_if_due(
        pruning_enabled=NEWS_PRUNING_ENABLED,
        retention_days=NEWS_RETENTION_DAYS,
        interval_hours=NEWS_PRUNE_INTERVAL_HOURS,
    )


async def refresh_clusters(topic: str | None = None, limit: int = 80) -> None:
    """Fetch news articles, cluster the ones from changed feeds, enrich and store them.

    Steps, in order:

    1. Fetch articles (in a worker thread) and group them by feed URL.
    2. Fingerprint each feed and compare with the hash held by the store;
       only articles from feeds whose fingerprint differs are processed.
    3. Cluster those articles by heuristic topic, store the raw clusters,
       then enrich up to ``ENRICHMENT_MAX_PER_REFRESH`` clusters per topic
       (reusing stored enrichment when present, otherwise calling the LLM
       classifier) and store them again under their final topic.
    4. Run the prune check, record the new feed fingerprints and stamp
       ``last_refresh_at``.

    Args:
        topic: When given, only the heuristic topic bucket equal to this
            value is stored/enriched; other buckets are skipped.
        limit: Passed through to ``fetch_news_articles``.
    """
    logger = logging.getLogger("news_mcp.refresh")
    store = SQLiteClusterStore(DB_PATH)

    logger.info("refresh start topic=%s limit=%s", topic, limit)
    articles = await asyncio.to_thread(fetch_news_articles, limit)
    logger.info("refresh fetched articles=%s", len(articles))

    # Drop legacy aggregate feed-state rows so the dashboard only reflects
    # real per-feed poll status from this point forward.
    with store._conn() as conn:
        conn.execute("DELETE FROM feed_state WHERE feed_key LIKE 'newsfeeds:%'")

    # Group articles by RSS URL so feed freshness is tracked per feed and
    # unchanged feeds can be skipped. Articles without a usable feed_url are
    # attributed to NEWS_FEED_URL.
    per_feed: dict[str, list[dict[str, Any]]] = defaultdict(list)
    for article in articles:
        feed_url = str(article.get("feed_url") or NEWS_FEED_URL).strip() or NEWS_FEED_URL
        per_feed[feed_url].append(article)

    changed_articles: list[dict[str, Any]] = []
    # feed_url -> fingerprint computed at comparison time. The digest is kept
    # so the value written to the store at the end is exactly the one that was
    # compared here, regardless of what later stages do to the article dicts.
    changed_feed_hashes: dict[str, str] = {}
    for feed_url, feed_articles in per_feed.items():
        logger.info("refresh feed batch start feed_url=%s count=%s", feed_url, len(feed_articles))
        current_hash = _feed_fingerprint(feed_articles)
        if store.get_feed_hash(feed_url) == current_hash:
            logger.info("refresh unchanged feed_url=%s count=%s topic=%s", feed_url, len(feed_articles), topic)
        else:
            logger.info("refresh changed feed_url=%s count=%s topic=%s", feed_url, len(feed_articles), topic)
            changed_feed_hashes[feed_url] = current_hash
            changed_articles.extend(feed_articles)
        logger.info("refresh feed batch complete feed_url=%s changed_total=%s", feed_url, len(changed_articles))

    if not changed_articles:
        # Nothing new anywhere: still stamp the refresh time and give the
        # store a chance to prune.
        logger.info("refresh unchanged all feeds topic=%s", topic)
        store.set_meta("last_refresh_at", datetime.now(timezone.utc).isoformat())
        prune_result = _prune_if_due(store)
        logger.info("refresh prune_result=%s", prune_result)
        return

    articles = changed_articles
    logger.info("refresh clustering start articles=%s topic=%s", len(articles), topic)
    clustered_by_topic = dedup_and_cluster_articles(articles)
    logger.info("refresh clustered topics=%s", list(clustered_by_topic.keys()))

    # Loop-invariant: the set of topics a cluster may finally be stored under.
    valid_topics = {x.lower() for x in DEFAULT_TOPICS}

    for t, clusters in clustered_by_topic.items():
        if topic and t != topic:
            continue
        logger.info("refresh topic phase start topic=%s clusters=%s", t, len(clusters))

        # ENRICHMENT_MAX_PER_REFRESH=0 means enrich every cluster (no cap).
        enrich_limit = ENRICHMENT_MAX_PER_REFRESH or len(clusters)
        to_enrich = clusters[:enrich_limit]

        # The LLM pass runs for every topic, or only for the "other" bucket
        # when ENRICH_OTHER_TOPICS_ONLY is set.
        llm_enabled_for_topic = (not ENRICH_OTHER_TOPICS_ONLY) or (t == "other")

        # Persist the raw clusters first so a slow enrichment pass does not
        # leave the first bootstrap run with nothing stored.
        # NOTE(review): the cache lookup below runs after this raw upsert;
        # confirm upsert_clusters does not clobber previously stored
        # enrichment fields, otherwise the cache can never hit.
        store.upsert_clusters(clusters, topic=t)
        logger.info("refresh stored raw topic=%s clusters=%s", t, len(clusters))

        enriched: list[dict[str, Any]] = []
        for position, raw_cluster in enumerate(to_enrich, start=1):
            cluster = enrich_cluster(raw_cluster)
            # Seed the heuristic topic on the payload so classify_cluster_llm
            # has a sane fallback if the LLM omits or hallucinates one.
            cluster.setdefault("topic", t)
            # The denominator is the number of clusters actually being
            # enriched, which can be smaller than the configured cap.
            logger.info(
                "refresh enrich cluster=%s topic=%s idx=%s/%s",
                cluster.get("cluster_id"), t, position, len(to_enrich),
            )

            if llm_enabled_for_topic:
                # Cache: if entities are already stored for this cluster,
                # reuse the stored enrichment and skip the LLM call.
                existing = store.get_cluster_by_id(cluster.get("cluster_id"))
                if existing and existing.get("entities"):
                    cluster = _apply_cached_enrichment(cluster, existing)
                else:
                    try:
                        cluster = await classify_cluster_llm(cluster)
                    except Exception:
                        # Best-effort: one failed classification must not
                        # abort the whole refresh.
                        logger.exception(
                            "LLM enrichment failed for cluster %s (topic %s)",
                            cluster.get("cluster_id"), t,
                        )
                        # Mark so we can retry on next refresh.
                        cluster["enrichment_failed_at"] = datetime.now(timezone.utc).isoformat()

            enriched.append(cluster)

        # Persist clusters under their *post-enrichment* topic so the SQL row
        # column matches what the LLM (or the validated heuristic fallback)
        # actually decided, rather than forcing every cluster from this
        # bucket into the heuristic topic `t`. Unknown topics fall back to
        # "other".
        by_final_topic: dict[str, list[dict[str, Any]]] = defaultdict(list)
        for cluster in enriched:
            final_topic = str(cluster.get("topic") or t or "other").strip().lower()
            if final_topic not in valid_topics:
                final_topic = "other"
            by_final_topic[final_topic].append(cluster)
        for final_topic, group in by_final_topic.items():
            store.upsert_clusters(group, topic=final_topic)
            logger.info("refresh stored topic=%s clusters=%s (heuristic_topic=%s)", final_topic, len(group), t)

    prune_result = _prune_if_due(store)
    # NOTE(review): feed state is recorded for every changed feed even when
    # `topic` caused some of its clusters to be skipped above -- confirm that
    # is intended.
    for feed_url, feed_hash in changed_feed_hashes.items():
        store.set_feed_state(feed_url, feed_hash, len(per_feed[feed_url]))
    store.set_meta("last_refresh_at", datetime.now(timezone.utc).isoformat())
    logger.info("refresh prune_result=%s", prune_result)