from __future__ import annotations

import hashlib
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List

from news_mcp.config import NEWS_EMBEDDINGS_ENABLED, NEWS_EMBEDDING_SIMILARITY_THRESHOLD
from news_mcp.dedup.embedding_support import (
    CandidateRules,
    cluster_is_candidate,
    cosine_similarity,
    ollama_embed,
)
from news_mcp.sources.news_feeds import normalize_topic_from_title


def _normalize_title(title: str) -> str:
    t = title.lower().strip()
    # Replace punctuation-like characters with spaces (rather than removing
    # them) so similarity scoring compares only alphanumeric content.
    t = re.sub(r"[^a-z0-9\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t


def _title_similarity(a: str, b: str) -> float:
    return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
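
# Illustrative sketch (not from the original module; the headline strings are
# made up) of what the two helpers above do with near-duplicate titles:
#
#   _normalize_title("Fed hikes rates by 0.25%!")  -> "fed hikes rates by 0 25"
#   _normalize_title("Fed Hikes Rates By 0.25%")   -> "fed hikes rates by 0 25"
#
# Identical normalized titles give _title_similarity(...) == 1.0, so these two
# headlines would merge under the default 0.87 threshold.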


def _cluster_text(a: Dict[str, Any]) -> str:
    parts = [a.get("title", ""), a.get("summary", "") or ""]
    return "\n".join(p for p in parts if p).strip()


def dedup_and_cluster_articles(
    articles: List[Dict[str, Any]],
    similarity_threshold: float = 0.87,
) -> Dict[str, List[Dict[str, Any]]]:
    """v1 dedup: fuzzy title similarity per topic.

    Instead of strict hashing, we merge an article into an existing cluster
    whose normalized title is similar enough. This creates richer clusters
    (multiple sources/articles) and therefore better importance scoring.
    """
    by_topic: Dict[str, List[Dict[str, Any]]] = {}
    # Cache embeddings per unique text so repeated comparisons against the same
    # cluster do not re-hit the embedding backend.
    embedding_cache: Dict[str, List[float]] = {}

    def _embedding_for_text(text: str) -> List[float] | None:
        if not NEWS_EMBEDDINGS_ENABLED:
            return None
        if text in embedding_cache:
            return embedding_cache[text]
        emb = ollama_embed(text)
        if emb:
            embedding_cache[text] = emb
        return emb

    for a in articles:
        title = a["title"]
        topic = normalize_topic_from_title(title)
        article_text = _cluster_text(a)
        article_embedding = _embedding_for_text(article_text)

        clusters = by_topic.setdefault(topic, [])

        # Find the existing cluster in this topic most similar to the article.
        best_idx: int | None = None
        best_sim = 0.0
        for idx, c in enumerate(clusters):
            if NEWS_EMBEDDINGS_ENABLED:
                # Cheap pre-filter: skip clusters that are not plausible
                # candidates before paying for an embedding comparison.
                if not cluster_is_candidate(
                    a,
                    c,
                    rules=CandidateRules(require_topic_match=False),
                    article_topic=topic,
                ):
                    continue
                cluster_text = (
                    _cluster_text(c["articles"][0])
                    if c.get("articles")
                    else c.get("headline", "")
                )
                cluster_embedding = _embedding_for_text(cluster_text)
                if article_embedding and cluster_embedding:
                    sim = cosine_similarity(article_embedding, cluster_embedding)
                else:
                    # Fall back to fuzzy titles when either embedding is missing.
                    sim = _title_similarity(title, c.get("headline", ""))
            else:
                sim = _title_similarity(title, c.get("headline", ""))
            if sim > best_sim:
                best_sim = sim
                best_idx = idx

        threshold = similarity_threshold
        if NEWS_EMBEDDINGS_ENABLED:
            # Never merge below the embedding threshold, even when the caller
            # passed a looser fuzzy-title threshold.
            threshold = max(similarity_threshold, NEWS_EMBEDDING_SIMILARITY_THRESHOLD)

        if best_idx is not None and best_sim >= threshold:
            c = clusters[best_idx]
            c["articles"].append(a)
            if a["source"] not in c["sources"]:
                c["sources"].append(a["source"])
            # String max is valid here only because timestamps are expected to
            # be ISO-8601, which sorts lexicographically.
            c["last_updated"] = max(str(c["last_updated"]), str(a["timestamp"]))
        else:
            # Stable-ish cluster id: derived from topic + normalized canonical
            # title, so re-runs over the same feed produce the same id.
            key = f"{topic}|{_normalize_title(title)}"
            cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
            cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
            clusters.append(
                {
                    "cluster_id": cid,
                    "headline": title,
                    "summary": a.get("summary", ""),
                    "entities": [],
                    "sentiment": "neutral",
                    "importance": 0.0,
                    "sources": [a["source"]],
                    "timestamp": a["timestamp"],
                    "articles": [a],
                    "first_seen": a["timestamp"],
                    "last_updated": a["timestamp"],
                    "embedding": cluster_embedding,
                    "embedding_model": "ollama:nomic-embed-text" if cluster_embedding else None,
                }
            )

    return {topic: clusters for topic, clusters in by_topic.items()}
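

# Minimal smoke-test sketch, not part of the original module. The article dicts
# below are assumptions about the feed schema, inferred from the keys this
# module actually reads ("title", "summary", "source", "timestamp"). With
# embeddings disabled (and assuming normalize_topic_from_title maps both titles
# to the same topic), the two near-duplicate headlines fall back to fuzzy title
# matching and merge into a single cluster.
if __name__ == "__main__":
    sample_articles = [
        {
            "title": "Fed hikes rates by 0.25%",
            "summary": "Quarter-point move.",
            "source": "reuters",
            "timestamp": "2024-05-01T12:00:00Z",
        },
        {
            "title": "Fed Hikes Rates By 0.25%!",
            "summary": "Central bank raises rates.",
            "source": "ap",
            "timestamp": "2024-05-01T12:30:00Z",
        },
    ]
    for topic, clusters in dedup_and_cluster_articles(sample_articles).items():
        for c in clusters:
            print(topic, c["cluster_id"][:8], c["sources"], len(c["articles"]))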