from __future__ import annotations

import hashlib
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List

from news_mcp.sources.news_feeds import normalize_topic_from_title


def _normalize_title(title: str) -> str:
    t = title.lower().strip()
    # Remove punctuation-ish characters for similarity scoring.
    t = re.sub(r"[^a-z0-9\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t


def _title_similarity(a: str, b: str) -> float:
    return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()


def dedup_and_cluster_articles(
    articles: List[Dict[str, Any]],
    similarity_threshold: float = 0.87,
) -> Dict[str, List[Dict[str, Any]]]:
    """v1 dedup: fuzzy title similarity per topic.

    Instead of strict hashing, we merge an article into an existing cluster
    whenever their normalized titles are similar enough. This produces richer
    clusters (multiple sources/articles per story) and therefore better
    importance scoring downstream.
    """
    by_topic: Dict[str, List[Dict[str, Any]]] = {}
    for a in articles:
        title = a["title"]
        topic = normalize_topic_from_title(title)
        clusters = by_topic.setdefault(topic, [])

        # Greedy match: find the existing cluster in this topic whose headline
        # is most similar to the incoming article's title.
        best_idx: int | None = None
        best_sim = 0.0
        for idx, c in enumerate(clusters):
            sim = _title_similarity(title, c.get("headline", ""))
            if sim > best_sim:
                best_sim = sim
                best_idx = idx

        if best_idx is not None and best_sim >= similarity_threshold:
            c = clusters[best_idx]
            c["articles"].append(a)
            if a["source"] not in c["sources"]:
                c["sources"].append(a["source"])
            # String comparison here assumes ISO-8601-style timestamps,
            # which order correctly when compared lexicographically.
            c["last_updated"] = max(str(c["last_updated"]), str(a["timestamp"]))
        else:
            # Stable-ish cluster id: based on topic + normalized canonical title.
            key = f"{topic}|{_normalize_title(title)}"
            cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
            clusters.append(
                {
                    "cluster_id": cid,
                    "headline": title,
                    "summary": a.get("summary", ""),
                    "entities": [],
                    "sentiment": "neutral",
                    "importance": 0.0,
                    "sources": [a["source"]],
                    "timestamp": a["timestamp"],
                    "articles": [a],
                    "first_seen": a["timestamp"],
                    "last_updated": a["timestamp"],
                }
            )
    return by_topic
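
# --- Usage sketch (illustrative, not part of the module) ---
# A minimal example of feeding article dicts into dedup_and_cluster_articles.
# The field names ("title", "source", "timestamp", "summary") are the ones this
# module reads; the concrete values below are made up. Running this assumes
# news_mcp.sources.news_feeds is importable, and whether the two near-duplicate
# titles merge into one cluster depends on normalize_topic_from_title assigning
# them the same topic.
if __name__ == "__main__":
    sample = [
        {
            "title": "Fed raises interest rates by 25 basis points",
            "source": "reuters",
            "timestamp": "2024-05-01T12:00:00Z",
            "summary": "The Federal Reserve raised its benchmark rate.",
        },
        {
            "title": "Fed raises interest rates by 25 basis points - Bloomberg",
            "source": "bloomberg",
            "timestamp": "2024-05-01T12:30:00Z",
            "summary": "Rate hike announced.",
        },
    ]
    for topic, clusters in dedup_and_cluster_articles(sample).items():
        for c in clusters:
            print(topic, c["cluster_id"][:8], c["sources"], len(c["articles"]))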