# cluster.py
from __future__ import annotations

import hashlib
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List, Tuple
from urllib.parse import urlparse

from news_mcp.config import NEWS_EMBEDDINGS_ENABLED, NEWS_EMBEDDING_SIMILARITY_THRESHOLD
from news_mcp.dedup.embedding_support import CandidateRules, cluster_is_candidate, cosine_similarity, ollama_embed
from news_mcp.sources.news_feeds import normalize_topic_from_title
  9. def _normalize_title(title: str) -> str:
  10. t = title.lower().strip()
  11. # Remove punctuation-ish characters for similarity scoring.
  12. t = re.sub(r"[^a-z0-9\s]", " ", t)
  13. t = re.sub(r"\s+", " ", t).strip()
  14. return t
  15. def _title_similarity(a: str, b: str) -> float:
  16. return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
  17. def _article_key(article: Dict[str, Any]) -> str:
  18. url = str(article.get("url") or "").strip()
  19. if not url:
  20. return str(article.get("title") or "")
  21. try:
  22. parsed = urlparse(url)
  23. parts = [p for p in parsed.path.split("/") if p]
  24. if parts:
  25. return parts[-1]
  26. except Exception:
  27. pass
  28. return url
  29. def _cluster_text(a: Dict[str, Any]) -> str:
  30. parts = [a.get("title", ""), a.get("summary", "") or ""]
  31. return "\n".join(p for p in parts if p).strip()
  32. def dedup_and_cluster_articles(
  33. articles: List[Dict[str, Any]],
  34. similarity_threshold: float = 0.87,
  35. ) -> Dict[str, List[Dict[str, Any]]]:
  36. """v1 dedup: fuzzy title similarity per topic.
  37. Instead of strict hashing, we merge clusters whose normalized titles are
  38. similar enough. This helps create richer clusters (multiple sources/articles)
  39. and therefore better importance.
  40. """
  41. by_topic: Dict[str, List[Dict[str, Any]]] = {}
  42. embedding_cache: Dict[str, list[float]] = {}
  43. def _embedding_for_text(text: str) -> list[float] | None:
  44. if not NEWS_EMBEDDINGS_ENABLED:
  45. return None
  46. if text in embedding_cache:
  47. return embedding_cache[text]
  48. emb = ollama_embed(text)
  49. if emb:
  50. embedding_cache[text] = emb
  51. return emb
  52. for a in articles:
  53. title = a["title"]
  54. topic = normalize_topic_from_title(title)
  55. article_text = _cluster_text(a)
  56. article_embedding = _embedding_for_text(article_text)
  57. by_topic.setdefault(topic, [])
  58. clusters = by_topic[topic]
  59. best_idx: int | None = None
  60. best_sim = 0.0
  61. for idx, c in enumerate(clusters):
  62. if NEWS_EMBEDDINGS_ENABLED:
  63. if not cluster_is_candidate(a, c, rules=CandidateRules(require_topic_match=False), article_topic=topic):
  64. continue
  65. cluster_text = _cluster_text(c.get("articles", [{}])[0]) if c.get("articles") else c.get("headline", "")
  66. cluster_embedding = _embedding_for_text(cluster_text)
  67. if article_embedding and cluster_embedding:
  68. sim = cosine_similarity(article_embedding, cluster_embedding)
  69. else:
  70. sim = _title_similarity(title, c.get("headline", ""))
  71. else:
  72. sim = _title_similarity(title, c.get("headline", ""))
  73. if sim > best_sim:
  74. best_sim = sim
  75. best_idx = idx
  76. threshold = similarity_threshold
  77. if NEWS_EMBEDDINGS_ENABLED:
  78. threshold = max(similarity_threshold, NEWS_EMBEDDING_SIMILARITY_THRESHOLD)
  79. if best_idx is not None and best_sim >= threshold:
  80. c = clusters[best_idx]
  81. existing_keys = {_article_key(x) for x in c.get("articles", []) or []}
  82. if _article_key(a) not in existing_keys:
  83. c["articles"].append(a)
  84. if a["source"] not in c["sources"]:
  85. c["sources"].append(a["source"])
  86. c["last_updated"] = max(str(c["last_updated"]), str(a["timestamp"]))
  87. else:
  88. # Stable-ish cluster id: based on topic + normalized canonical title.
  89. import hashlib
  90. key = f"{topic}|{_normalize_title(title)}"
  91. cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
  92. cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
  93. clusters.append(
  94. {
  95. "cluster_id": cid,
  96. "headline": title,
  97. "summary": a.get("summary", ""),
  98. "entities": [],
  99. "sentiment": "neutral",
  100. "importance": 0.0,
  101. "sources": [a["source"]],
  102. "timestamp": a["timestamp"],
  103. "articles": [a],
  104. "first_seen": a["timestamp"],
  105. "last_updated": a["timestamp"],
  106. "embedding": cluster_embedding,
  107. "embedding_model": "ollama:nomic-embed-text" if cluster_embedding else None,
  108. }
  109. )
  110. return {topic: clusters for topic, clusters in by_topic.items()}