# cluster.py
from __future__ import annotations

import hashlib
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List, Tuple

from news_mcp.config import NEWS_EMBEDDINGS_ENABLED
from news_mcp.dedup.embedding_support import CandidateRules, cluster_is_candidate, cosine_similarity, ollama_embed
from news_mcp.sources.news_feeds import normalize_topic_from_title
  8. def _normalize_title(title: str) -> str:
  9. t = title.lower().strip()
  10. # Remove punctuation-ish characters for similarity scoring.
  11. t = re.sub(r"[^a-z0-9\s]", " ", t)
  12. t = re.sub(r"\s+", " ", t).strip()
  13. return t
  14. def _title_similarity(a: str, b: str) -> float:
  15. return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
  16. def _cluster_text(a: Dict[str, Any]) -> str:
  17. parts = [a.get("title", ""), a.get("summary", "") or ""]
  18. return "\n".join(p for p in parts if p).strip()
  19. def dedup_and_cluster_articles(
  20. articles: List[Dict[str, Any]],
  21. similarity_threshold: float = 0.87,
  22. ) -> Dict[str, List[Dict[str, Any]]]:
  23. """v1 dedup: fuzzy title similarity per topic.
  24. Instead of strict hashing, we merge clusters whose normalized titles are
  25. similar enough. This helps create richer clusters (multiple sources/articles)
  26. and therefore better importance.
  27. """
  28. by_topic: Dict[str, List[Dict[str, Any]]] = {}
  29. embedding_cache: Dict[str, list[float]] = {}
  30. def _embedding_for_text(text: str) -> list[float] | None:
  31. if not NEWS_EMBEDDINGS_ENABLED:
  32. return None
  33. if text in embedding_cache:
  34. return embedding_cache[text]
  35. emb = ollama_embed(text)
  36. if emb:
  37. embedding_cache[text] = emb
  38. return emb
  39. for a in articles:
  40. title = a["title"]
  41. topic = normalize_topic_from_title(title)
  42. article_text = _cluster_text(a)
  43. article_embedding = _embedding_for_text(article_text)
  44. by_topic.setdefault(topic, [])
  45. clusters = by_topic[topic]
  46. best_idx: int | None = None
  47. best_sim = 0.0
  48. for idx, c in enumerate(clusters):
  49. if NEWS_EMBEDDINGS_ENABLED:
  50. if not cluster_is_candidate(a, c, rules=CandidateRules(require_topic_match=False), article_topic=topic):
  51. continue
  52. cluster_text = _cluster_text(c.get("articles", [{}])[0]) if c.get("articles") else c.get("headline", "")
  53. cluster_embedding = _embedding_for_text(cluster_text)
  54. if article_embedding and cluster_embedding:
  55. sim = cosine_similarity(article_embedding, cluster_embedding)
  56. else:
  57. sim = _title_similarity(title, c.get("headline", ""))
  58. else:
  59. sim = _title_similarity(title, c.get("headline", ""))
  60. if sim > best_sim:
  61. best_sim = sim
  62. best_idx = idx
  63. threshold = similarity_threshold
  64. if NEWS_EMBEDDINGS_ENABLED:
  65. threshold = max(similarity_threshold, 0.82)
  66. if best_idx is not None and best_sim >= threshold:
  67. c = clusters[best_idx]
  68. c["articles"].append(a)
  69. if a["source"] not in c["sources"]:
  70. c["sources"].append(a["source"])
  71. c["last_updated"] = max(str(c["last_updated"]), str(a["timestamp"]))
  72. else:
  73. # Stable-ish cluster id: based on topic + normalized canonical title.
  74. import hashlib
  75. key = f"{topic}|{_normalize_title(title)}"
  76. cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
  77. cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
  78. clusters.append(
  79. {
  80. "cluster_id": cid,
  81. "headline": title,
  82. "summary": a.get("summary", ""),
  83. "entities": [],
  84. "sentiment": "neutral",
  85. "importance": 0.0,
  86. "sources": [a["source"]],
  87. "timestamp": a["timestamp"],
  88. "articles": [a],
  89. "first_seen": a["timestamp"],
  90. "last_updated": a["timestamp"],
  91. "embedding": cluster_embedding,
  92. "embedding_model": "ollama:nomic-embed-text" if cluster_embedding else None,
  93. }
  94. )
  95. return {topic: clusters for topic, clusters in by_topic.items()}