# lucky / news-mcp

from __future__ import annotations

from fnmatch import fnmatchcase
from typing import Any, Dict

from news_mcp.config import NEWS_ENTITY_BLACKLIST, DEFAULT_TOPICS
from news_mcp.entity_normalize import normalize_entities
from news_mcp.llm import call_extraction, call_summary
from news_mcp.trends_resolution import resolve_entity_via_trends


def _matches_blacklist(value: str, blacklist=None) -> bool:
    patterns = [x.strip().lower() for x in (blacklist if blacklist is not None else NEWS_ENTITY_BLACKLIST) if x and x.strip()]
    key = str(value).strip().lower()
    if not key:
        return True
    return any(fnmatchcase(key, pattern) for pattern in patterns)


def _filter_entities(entities, blacklist=None):
    out = []
    for ent in entities or []:
        if _matches_blacklist(ent, blacklist=blacklist):
            continue
        out.append(ent)
    return out


async def classify_cluster_llm(cluster: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``cluster`` enriched with LLM classification fields.

    Awaits ``call_extraction`` for the cluster, then sets ``topic``,
    ``entities``, ``entityResolutions``, ``sentiment``, ``sentimentScore`` and
    ``keywords`` on a shallow copy of the input. This function itself only
    reads from ``cluster``; the returned dict is a new object.

    Args:
        cluster: Cluster record; its ``topic`` key, if present, is used as the
            fallback when the LLM topic is missing or not allowed.

    Returns:
        The shallow copy of ``cluster`` with the fields above overwritten.
    """
    parsed = await call_extraction(cluster)
    enriched = dict(cluster)

    # Topic: the LLM's answer wins when present, otherwise the topic already
    # on the cluster is used. The result is checked against the allowed set so
    # that a free-form label is never stored as the topic.
    raw_topic = parsed.get("topic", cluster.get("topic"))
    topic = None
    if raw_topic:
        topic = str(raw_topic).strip().lower()
    if topic and _matches_blacklist(topic):
        topic = "other"

    allowed_topics = {label.lower() for label in DEFAULT_TOPICS}
    if topic not in allowed_topics:
        # Unknown label: reuse the cluster's own topic if that one is
        # allowed, else settle on "other".
        heuristic = str(cluster.get("topic") or "").strip().lower()
        topic = heuristic if heuristic in allowed_topics else "other"

    # Alias normalisation must run BEFORE the blacklist; otherwise a
    # blacklisted "bitcoin" would not catch an entry returned as "btc".
    entities = _filter_entities(normalize_entities(parsed.get("entities", [])))
    candidates = _filter_entities(normalize_entities(parsed.get("keywords", [])))

    entity_keys = {name.strip().lower() for name in entities}
    keywords = []
    for word in candidates:
        # Topic labels ("crypto", "macro", ...) are already carried by the
        # topic field and must not show up again as keywords.
        if word.lower() in allowed_topics:
            continue
        # Hard cap of two words per keyword, enforced here because the LLM
        # does not always respect the prompt.
        if len(word.split()) > 2:
            continue
        # Entities are the authoritative list of proper nouns; keywords are
        # meant to complement them, not repeat them.
        if word.strip().lower() in entity_keys:
            continue
        keywords.append(word)

    enriched["topic"] = topic
    enriched["entities"] = entities
    enriched["entityResolutions"] = [resolve_entity_via_trends(name) for name in entities]
    enriched["sentiment"] = parsed.get("sentiment", "neutral")
    enriched["sentimentScore"] = parsed.get("sentimentScore")
    enriched["keywords"] = keywords
    return enriched


async def summarize_cluster_llm(cluster: Dict[str, Any]) -> Dict[str, Any]:
    """Await ``call_summary`` for ``cluster`` and return its result unchanged."""
    return await call_summary(cluster)


# Backward-compatible aliases during the transition away from provider-specific naming.
# Each ``*_groq`` name is bound to the same function object as its ``*_llm``
# counterpart, so calling either name runs the same code.
classify_cluster_groq = classify_cluster_llm
summarize_cluster_groq = summarize_cluster_llm