|
|
@@ -3,6 +3,8 @@ from __future__ import annotations
|
|
|
from typing import Any, Dict, List, Tuple
|
|
|
|
|
|
from news_mcp.sources.news_feeds import normalize_topic_from_title
|
|
|
+from news_mcp.dedup.embedding_support import CandidateRules, cluster_is_candidate, cosine_similarity, ollama_embed
|
|
|
+from news_mcp.config import NEWS_EMBEDDINGS_ENABLED
|
|
|
|
|
|
import re
|
|
|
from difflib import SequenceMatcher
|
|
|
@@ -20,6 +22,11 @@ def _title_similarity(a: str, b: str) -> float:
|
|
|
return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
|
|
|
|
|
|
|
|
|
+def _cluster_text(a: Dict[str, Any]) -> str:
|
|
|
+ parts = [a.get("title", ""), a.get("summary", "") or ""]
|
|
|
+ return "\n".join(p for p in parts if p).strip()
|
|
|
+
|
|
|
+
|
|
|
def dedup_and_cluster_articles(
|
|
|
articles: List[Dict[str, Any]],
|
|
|
similarity_threshold: float = 0.87,
|
|
|
@@ -32,10 +39,23 @@ def dedup_and_cluster_articles(
|
|
|
"""
|
|
|
|
|
|
by_topic: Dict[str, List[Dict[str, Any]]] = {}
|
|
|
+ embedding_cache: Dict[str, list[float]] = {}
|
|
|
+
|
|
|
+ def _embedding_for_text(text: str) -> list[float] | None:
|
|
|
+ if not NEWS_EMBEDDINGS_ENABLED:
|
|
|
+ return None
|
|
|
+ if text in embedding_cache:
|
|
|
+ return embedding_cache[text]
|
|
|
+ emb = ollama_embed(text)
|
|
|
+ if emb:
|
|
|
+ embedding_cache[text] = emb
|
|
|
+ return emb
|
|
|
|
|
|
for a in articles:
|
|
|
title = a["title"]
|
|
|
topic = normalize_topic_from_title(title)
|
|
|
+ article_text = _cluster_text(a)
|
|
|
+ article_embedding = _embedding_for_text(article_text)
|
|
|
|
|
|
by_topic.setdefault(topic, [])
|
|
|
clusters = by_topic[topic]
|
|
|
@@ -43,12 +63,26 @@ def dedup_and_cluster_articles(
|
|
|
best_idx: int | None = None
|
|
|
best_sim = 0.0
|
|
|
for idx, c in enumerate(clusters):
|
|
|
- sim = _title_similarity(title, c.get("headline", ""))
|
|
|
+ if NEWS_EMBEDDINGS_ENABLED:
|
|
|
+ if not cluster_is_candidate(a, c, rules=CandidateRules(require_topic_match=False), article_topic=topic):
|
|
|
+ continue
|
|
|
+ cluster_text = _cluster_text(c.get("articles", [{}])[0]) if c.get("articles") else c.get("headline", "")
|
|
|
+ cluster_embedding = _embedding_for_text(cluster_text)
|
|
|
+ if article_embedding and cluster_embedding:
|
|
|
+ sim = cosine_similarity(article_embedding, cluster_embedding)
|
|
|
+ else:
|
|
|
+ sim = _title_similarity(title, c.get("headline", ""))
|
|
|
+ else:
|
|
|
+ sim = _title_similarity(title, c.get("headline", ""))
|
|
|
if sim > best_sim:
|
|
|
best_sim = sim
|
|
|
best_idx = idx
|
|
|
|
|
|
- if best_idx is not None and best_sim >= similarity_threshold:
|
|
|
+ threshold = similarity_threshold
|
|
|
+ if NEWS_EMBEDDINGS_ENABLED:
|
|
|
+ threshold = max(similarity_threshold, 0.82)
|
|
|
+
|
|
|
+ if best_idx is not None and best_sim >= threshold:
|
|
|
c = clusters[best_idx]
|
|
|
c["articles"].append(a)
|
|
|
if a["source"] not in c["sources"]:
|