| 123456789101112131415161718192021222324252627282930313233343536373839404142434445 |
- from __future__ import annotations
- from typing import Any, Dict, List
- from news_mcp.sources.rss_breakingthenews import cluster_id_for_title, normalize_topic_from_title
def _later_timestamp(current: Any, candidate: Any) -> Any:
    """Return the later of two timestamps, preserving their original type."""
    try:
        # Native comparison orders numbers/datetimes correctly and keeps the
        # stored value's type consistent with ``first_seen``/``timestamp``.
        return candidate if candidate > current else current
    except TypeError:
        # Values are not mutually orderable (e.g. mixed types): fall back to
        # the original v1 heuristic of comparing string representations.
        return max(str(current), str(candidate))


def dedup_and_cluster_articles(articles: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """v1 dedup: cluster articles by normalized title hash per topic.

    Each article's topic comes from ``normalize_topic_from_title(title)`` and
    its cluster id from ``cluster_id_for_title(topic, title)``. Articles that
    share a topic and cluster id are merged into one cluster.

    Args:
        articles: Article dicts. Each must provide ``"title"``, ``"source"``
            and ``"timestamp"``; ``"summary"`` is optional and defaults to
            ``""``.

    Returns:
        Mapping of topic -> list of cluster dicts. A cluster takes its
        ``headline``, ``summary``, ``timestamp`` and ``first_seen`` from the
        first article placed in it; ``sources`` holds the distinct sources in
        first-seen order, ``articles`` holds every member article, and
        ``last_updated`` is the latest timestamp among the members.
        ``entities``, ``sentiment`` and ``importance`` are filled with
        placeholder values (``[]``, ``"neutral"``, ``0.0``).

    Raises:
        KeyError: If an article lacks ``"title"``, ``"source"`` or
            ``"timestamp"``.
    """
    by_topic: Dict[str, Dict[str, Dict[str, Any]]] = {}

    for article in articles:
        title = article["title"]
        topic = normalize_topic_from_title(title)
        cid = cluster_id_for_title(topic, title)

        cluster_map = by_topic.setdefault(topic, {})
        cluster = cluster_map.get(cid)

        if cluster is None:
            # First article for this cluster id: it seeds the cluster fields.
            cluster_map[cid] = {
                "cluster_id": cid,
                "headline": title,
                "summary": article.get("summary", ""),
                "entities": [],
                "sentiment": "neutral",
                "importance": 0.0,
                "sources": [article["source"]],
                "timestamp": article["timestamp"],
                "articles": [article],
                "first_seen": article["timestamp"],
                "last_updated": article["timestamp"],
            }
            continue

        cluster["articles"].append(article)
        source = article["source"]
        if source not in cluster["sources"]:
            cluster["sources"].append(source)
        # Keep latest timestamp as last_updated (v1 heuristic).
        cluster["last_updated"] = _later_timestamp(
            cluster["last_updated"], article["timestamp"]
        )

    return {topic: list(clusters.values()) for topic, clusters in by_topic.items()}
|