# lucky/news-mcp
							from __future__ import annotations

from typing import Any, Dict, List

from news_mcp.sources.rss_breakingthenews import cluster_id_for_title, normalize_topic_from_title


def _later_timestamp(current: Any, candidate: Any) -> Any:
    """Return whichever of two timestamps is later, keeping its original type.

    Values are compared natively first. If they cannot be ordered against
    each other (``TypeError``), their ``str()`` forms are compared instead,
    but the original object is still the one returned. On a tie, *current*
    is kept.
    """
    try:
        return candidate if candidate > current else current
    except TypeError:
        # Mixed/unorderable types: decide by string form, but never store
        # the stringified value so the field type stays stable.
        return candidate if str(candidate) > str(current) else current


def dedup_and_cluster_articles(articles: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """v1 dedup: group articles into per-topic clusters keyed by cluster id.

    For each article the topic is derived from its title via
    ``normalize_topic_from_title`` and the cluster id via
    ``cluster_id_for_title(topic, title)``. The first article seen for a
    cluster id seeds the cluster (headline, summary, timestamps); later
    articles with the same id are appended to it.

    Args:
        articles: Article dicts. Each must provide ``"title"``, ``"source"``
            and ``"timestamp"``; ``"summary"`` is optional (defaults to ``""``).

    Returns:
        Mapping of topic -> list of cluster dicts, in order of first
        appearance. Each cluster carries ``cluster_id``, ``headline``,
        ``summary``, ``entities``, ``sentiment``, ``importance``, ``sources``
        (unique, in first-seen order), ``timestamp``, ``articles``,
        ``first_seen`` and ``last_updated``.

    Raises:
        KeyError: If an article lacks ``"title"``, ``"source"`` or
            ``"timestamp"``.
    """
    by_topic: Dict[str, Dict[str, Dict[str, Any]]] = {}

    for article in articles:
        title = article["title"]
        topic = normalize_topic_from_title(title)
        cid = cluster_id_for_title(topic, title)

        cluster_map = by_topic.setdefault(topic, {})
        cluster = cluster_map.get(cid)

        if cluster is None:
            # First article for this cluster id seeds every field; the
            # entities/sentiment/importance values are placeholders here.
            cluster_map[cid] = {
                "cluster_id": cid,
                "headline": title,
                "summary": article.get("summary", ""),
                "entities": [],
                "sentiment": "neutral",
                "importance": 0.0,
                "sources": [article["source"]],
                "timestamp": article["timestamp"],
                "articles": [article],
                "first_seen": article["timestamp"],
                "last_updated": article["timestamp"],
            }
            continue

        cluster["articles"].append(article)
        if article["source"] not in cluster["sources"]:
            cluster["sources"].append(article["source"])

        # Keep latest timestamp as last_updated (v1 heuristic). The value is
        # stored un-stringified so merged clusters expose the same timestamp
        # type as single-article clusters.
        cluster["last_updated"] = _later_timestamp(
            cluster["last_updated"], article["timestamp"]
        )

    return {topic: list(clusters.values()) for topic, clusters in by_topic.items()}