cluster.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. from __future__ import annotations
  2. from typing import Any, Dict, List
  3. from news_mcp.sources.rss_breakingthenews import cluster_id_for_title, normalize_topic_from_title
  4. def dedup_and_cluster_articles(articles: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
  5. """v1 dedup: cluster by normalized title hash per topic.
  6. Returns topic -> clusters[]
  7. """
  8. by_topic: Dict[str, Dict[str, Dict[str, Any]]] = {}
  9. for a in articles:
  10. title = a["title"]
  11. topic = normalize_topic_from_title(title)
  12. cid = cluster_id_for_title(topic, title)
  13. by_topic.setdefault(topic, {})
  14. cluster_map = by_topic[topic]
  15. if cid not in cluster_map:
  16. cluster_map[cid] = {
  17. "cluster_id": cid,
  18. "headline": title,
  19. "summary": a.get("summary", ""),
  20. "entities": [],
  21. "sentiment": "neutral",
  22. "importance": 0.0,
  23. "sources": [a["source"]],
  24. "timestamp": a["timestamp"],
  25. "articles": [a],
  26. "first_seen": a["timestamp"],
  27. "last_updated": a["timestamp"],
  28. }
  29. else:
  30. c = cluster_map[cid]
  31. c["articles"].append(a)
  32. if a["source"] not in c["sources"]:
  33. c["sources"].append(a["source"])
  34. # Keep latest timestamp as last_updated (v1 heuristic)
  35. c["last_updated"] = max(str(c["last_updated"]), str(a["timestamp"]))
  36. return {topic: list(clusters.values()) for topic, clusters in by_topic.items()}