from __future__ import annotations

import hashlib
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List, Tuple

from news_mcp.sources.news_feeds import normalize_topic_from_title
from news_mcp.dedup.embedding_support import (
    CandidateRules,
    cluster_is_candidate,
    cosine_similarity,
    ollama_embed,
)
from news_mcp.config import NEWS_EMBEDDINGS_ENABLED, NEWS_EMBEDDING_SIMILARITY_THRESHOLD

# Anything that is neither a (Unicode) letter/digit nor whitespace. ``\w`` also
# matches "_", so it is listed explicitly to keep treating it as punctuation.
_PUNCTUATION_RE = re.compile(r"[^\w\s]|_")
_WHITESPACE_RE = re.compile(r"\s+")


def _normalize_title(title: str) -> str:
    """Return *title* lower-cased, with punctuation removed and whitespace collapsed.

    Letters and digits of any script are kept. An ASCII-only character class
    would reduce non-Latin titles to an empty string, which makes unrelated
    titles compare as identical and hash to the same cluster id.
    """
    lowered = title.lower().strip()
    without_punctuation = _PUNCTUATION_RE.sub(" ", lowered)
    return _WHITESPACE_RE.sub(" ", without_punctuation).strip()


def _title_similarity(a: str, b: str) -> float:
    """Return the ``SequenceMatcher`` ratio (0.0-1.0) of the two normalized titles."""
    return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()


def _cluster_text(a: Dict[str, Any]) -> str:
    """Join the article's non-empty ``title`` and ``summary`` with a newline."""
    parts = [a.get("title", ""), a.get("summary", "") or ""]
    return "\n".join(p for p in parts if p).strip()


def dedup_and_cluster_articles(
    articles: List[Dict[str, Any]],
    similarity_threshold: float = 0.87,
) -> Dict[str, List[Dict[str, Any]]]:
    """Group near-duplicate articles into clusters, per topic.

    Each article is assigned a topic derived from its title and compared with
    the clusters already built for that topic. It joins the most similar
    cluster when the similarity reaches the threshold; otherwise it starts a
    new cluster.

    Similarity is the fuzzy ratio between normalized titles. When
    ``NEWS_EMBEDDINGS_ENABLED`` is set, clusters are first filtered with
    ``cluster_is_candidate`` and the cosine similarity between the article's
    embedding and the embedding of the cluster's first article is used
    instead; the title ratio remains the fallback when either embedding is
    unavailable.

    Args:
        articles: Article dicts. ``title``, ``source`` and ``timestamp`` are
            required keys; ``summary`` is optional.
        similarity_threshold: Minimum similarity for joining a cluster. With
            embeddings enabled the effective threshold is the larger of this
            value and ``NEWS_EMBEDDING_SIMILARITY_THRESHOLD``.

    Returns:
        A mapping from topic to its list of cluster dicts. Each cluster has
        the keys ``cluster_id``, ``headline``, ``summary``, ``entities``,
        ``sentiment``, ``importance``, ``sources``, ``timestamp``,
        ``articles``, ``first_seen``, ``last_updated``, ``embedding`` and
        ``embedding_model``.

    Raises:
        KeyError: If an article lacks ``title``, ``source`` or ``timestamp``.
    """
    by_topic: Dict[str, List[Dict[str, Any]]] = {}
    embedding_cache: Dict[str, list[float]] = {}

    def _embedding_for_text(text: str) -> list[float] | None:
        if not NEWS_EMBEDDINGS_ENABLED:
            return None
        if text in embedding_cache:
            return embedding_cache[text]
        emb = ollama_embed(text)
        # Only successful results are cached, so a failed lookup is retried
        # the next time the same text is requested.
        if emb:
            embedding_cache[text] = emb
        return emb

    # Loop invariants: neither the threshold nor the candidate rules depend on
    # the article being processed.
    threshold = similarity_threshold
    candidate_rules = None
    if NEWS_EMBEDDINGS_ENABLED:
        threshold = max(similarity_threshold, NEWS_EMBEDDING_SIMILARITY_THRESHOLD)
        candidate_rules = CandidateRules(require_topic_match=False)

    for a in articles:
        title = a["title"]
        topic = normalize_topic_from_title(title)
        article_embedding = _embedding_for_text(_cluster_text(a))

        clusters = by_topic.setdefault(topic, [])

        best_idx: int | None = None
        best_sim = 0.0
        for idx, c in enumerate(clusters):
            headline = c.get("headline", "")
            sim: float | None = None

            if NEWS_EMBEDDINGS_ENABLED:
                if not cluster_is_candidate(a, c, rules=candidate_rules, article_topic=topic):
                    continue
                # Without an article embedding the cosine path cannot be
                # taken, so do not spend an embedding request on the cluster.
                if article_embedding:
                    members = c.get("articles")
                    cluster_text = _cluster_text(members[0]) if members else headline
                    cluster_embedding = _embedding_for_text(cluster_text)
                    if cluster_embedding:
                        sim = cosine_similarity(article_embedding, cluster_embedding)

            if sim is None:
                sim = _title_similarity(title, headline)

            if sim > best_sim:
                best_sim = sim
                best_idx = idx

        if best_idx is not None and best_sim >= threshold:
            c = clusters[best_idx]
            c["articles"].append(a)
            if a["source"] not in c["sources"]:
                c["sources"].append(a["source"])
            # NOTE(review): timestamps are compared as strings, which orders
            # correctly only for lexicographically sortable values such as
            # ISO-8601 — confirm against the feed format.
            c["last_updated"] = max(str(c["last_updated"]), str(a["timestamp"]))
        else:
            # Stable-ish cluster id: based on topic + normalized canonical title.
            key = f"{topic}|{_normalize_title(title)}"
            cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
            cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
            clusters.append(
                {
                    "cluster_id": cid,
                    "headline": title,
                    "summary": a.get("summary", ""),
                    "entities": [],
                    "sentiment": "neutral",
                    "importance": 0.0,
                    "sources": [a["source"]],
                    "timestamp": a["timestamp"],
                    "articles": [a],
                    "first_seen": a["timestamp"],
                    "last_updated": a["timestamp"],
                    "embedding": cluster_embedding,
                    "embedding_model": "ollama:nomic-embed-text" if cluster_embedding else None,
                }
            )

    return dict(by_topic)