# cluster.py — per-topic article dedup & clustering helpers
from __future__ import annotations

import hashlib
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List, Tuple

from news_mcp.sources.rss_breakingthenews import normalize_topic_from_title
  6. def _normalize_title(title: str) -> str:
  7. t = title.lower().strip()
  8. # Remove punctuation-ish characters for similarity scoring.
  9. t = re.sub(r"[^a-z0-9\s]", " ", t)
  10. t = re.sub(r"\s+", " ", t).strip()
  11. return t
  12. def _title_similarity(a: str, b: str) -> float:
  13. return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
  14. def dedup_and_cluster_articles(
  15. articles: List[Dict[str, Any]],
  16. similarity_threshold: float = 0.87,
  17. ) -> Dict[str, List[Dict[str, Any]]]:
  18. """v1 dedup: fuzzy title similarity per topic.
  19. Instead of strict hashing, we merge clusters whose normalized titles are
  20. similar enough. This helps create richer clusters (multiple sources/articles)
  21. and therefore better importance.
  22. """
  23. by_topic: Dict[str, List[Dict[str, Any]]] = {}
  24. for a in articles:
  25. title = a["title"]
  26. topic = normalize_topic_from_title(title)
  27. by_topic.setdefault(topic, [])
  28. clusters = by_topic[topic]
  29. best_idx: int | None = None
  30. best_sim = 0.0
  31. for idx, c in enumerate(clusters):
  32. sim = _title_similarity(title, c.get("headline", ""))
  33. if sim > best_sim:
  34. best_sim = sim
  35. best_idx = idx
  36. if best_idx is not None and best_sim >= similarity_threshold:
  37. c = clusters[best_idx]
  38. c["articles"].append(a)
  39. if a["source"] not in c["sources"]:
  40. c["sources"].append(a["source"])
  41. c["last_updated"] = max(str(c["last_updated"]), str(a["timestamp"]))
  42. else:
  43. # Stable-ish cluster id: based on topic + normalized canonical title.
  44. import hashlib
  45. key = f"{topic}|{_normalize_title(title)}"
  46. cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
  47. clusters.append(
  48. {
  49. "cluster_id": cid,
  50. "headline": title,
  51. "summary": a.get("summary", ""),
  52. "entities": [],
  53. "sentiment": "neutral",
  54. "importance": 0.0,
  55. "sources": [a["source"]],
  56. "timestamp": a["timestamp"],
  57. "articles": [a],
  58. "first_seen": a["timestamp"],
  59. "last_updated": a["timestamp"],
  60. }
  61. )
  62. return {topic: clusters for topic, clusters in by_topic.items()}