from __future__ import annotations

from typing import Any, Dict, List

from news_mcp.sources.rss_breakingthenews import (
    cluster_id_for_title,
    normalize_topic_from_title,
)


def dedup_and_cluster_articles(articles: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group articles into per-topic clusters of near-duplicate headlines (v1).

    Each article's topic is derived from its title with
    ``normalize_topic_from_title`` and its cluster id with
    ``cluster_id_for_title(topic, title)``. Articles that share a topic and a
    cluster id end up in the same cluster.

    Args:
        articles: Article dicts. Every article must provide ``"title"``,
            ``"source"`` and ``"timestamp"``; ``"summary"`` is optional and
            defaults to ``""``.

    Returns:
        A mapping ``topic -> [cluster, ...]``. Topics and clusters appear in
        the order they were first encountered. A cluster takes its
        ``headline``, ``summary``, ``timestamp`` and ``first_seen`` from the
        first article seen for it; ``entities``, ``sentiment`` and
        ``importance`` are filled with fixed placeholder values.

    Raises:
        KeyError: If an article lacks ``"title"``, ``"source"`` or
            ``"timestamp"``.
    """
    topics: Dict[str, Dict[str, Dict[str, Any]]] = {}

    for article in articles:
        headline = article["title"]
        topic_key = normalize_topic_from_title(headline)
        cluster_key = cluster_id_for_title(topic_key, headline)

        topic_clusters = topics.setdefault(topic_key, {})
        cluster = topic_clusters.get(cluster_key)

        if cluster is None:
            # First article for this cluster: it seeds every field.
            summary = article.get("summary", "")
            origin = article["source"]
            seen_at = article["timestamp"]
            topic_clusters[cluster_key] = {
                "cluster_id": cluster_key,
                "headline": headline,
                "summary": summary,
                "entities": [],
                "sentiment": "neutral",
                "importance": 0.0,
                "sources": [origin],
                "timestamp": seen_at,
                "articles": [article],
                "first_seen": seen_at,
                "last_updated": seen_at,
            }
            continue

        # Duplicate of an existing cluster: attach it and record a new source.
        cluster["articles"].append(article)
        origin = article["source"]
        known_sources = cluster["sources"]
        if origin not in known_sources:
            known_sources.append(origin)

        # v1 heuristic: keep the latest timestamp, compared as strings.
        # NOTE(review): this orders chronologically only if timestamps
        # stringify to a sortable form (e.g. ISO 8601) -- confirm upstream.
        previous_stamp = str(cluster["last_updated"])
        incoming_stamp = str(article["timestamp"])
        cluster["last_updated"] = max(previous_stamp, incoming_stamp)

    return {
        topic_key: list(topic_clusters.values())
        for topic_key, topic_clusters in topics.items()
    }