from __future__ import annotations

import hashlib
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List

from news_mcp.config import NEWS_EMBEDDINGS_ENABLED, NEWS_EMBEDDING_SIMILARITY_THRESHOLD
from news_mcp.dedup.embedding_support import (
    CandidateRules,
    cluster_is_candidate,
    cosine_similarity,
    ollama_embed,
)
from news_mcp.sources.news_feeds import normalize_topic_from_title


def _normalize_title(title: str) -> str:
    t = title.lower().strip()
    # Replace punctuation-like characters with spaces (rather than removing
    # them) so similarity scoring compares only alphanumeric content.
    t = re.sub(r"[^a-z0-9\s]", " ", t)
    t = re.sub(r"\s+", " ", t).strip()
    return t


def _title_similarity(a: str, b: str) -> float:
    return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
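
# Illustrative sketch (not from the original module; the headline strings are
# made up) of what the two helpers above do with near-duplicate titles:
#
#   _normalize_title("Fed hikes rates by 0.25%!")  -> "fed hikes rates by 0 25"
#   _normalize_title("Fed Hikes Rates By 0.25%")   -> "fed hikes rates by 0 25"
#
# Identical normalized titles give _title_similarity(...) == 1.0, so these two
# headlines would merge under the default 0.87 threshold.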


def _cluster_text(a: Dict[str, Any]) -> str:
    parts = [a.get("title", ""), a.get("summary", "") or ""]
    return "\n".join(p for p in parts if p).strip()


def dedup_and_cluster_articles(
    articles: List[Dict[str, Any]],
    similarity_threshold: float = 0.87,
) -> Dict[str, List[Dict[str, Any]]]:
    """v1 dedup: fuzzy title similarity per topic.

    Instead of strict hashing, we merge an article into an existing cluster
    whose normalized title is similar enough. This creates richer clusters
    (multiple sources/articles) and therefore better importance scoring.
    """
    by_topic: Dict[str, List[Dict[str, Any]]] = {}
    # Cache embeddings per unique text so repeated comparisons against the same
    # cluster do not re-hit the embedding backend.
    embedding_cache: Dict[str, List[float]] = {}

    def _embedding_for_text(text: str) -> List[float] | None:
        if not NEWS_EMBEDDINGS_ENABLED:
            return None
        if text in embedding_cache:
            return embedding_cache[text]
        emb = ollama_embed(text)
        if emb:
            embedding_cache[text] = emb
        return emb

    for a in articles:
        title = a["title"]
        topic = normalize_topic_from_title(title)
        article_text = _cluster_text(a)
        article_embedding = _embedding_for_text(article_text)

        clusters = by_topic.setdefault(topic, [])

        # Find the existing cluster in this topic most similar to the article.
        best_idx: int | None = None
        best_sim = 0.0
        for idx, c in enumerate(clusters):
            if NEWS_EMBEDDINGS_ENABLED:
                # Cheap pre-filter: skip clusters that are not plausible
                # candidates before paying for an embedding comparison.
                if not cluster_is_candidate(
                    a,
                    c,
                    rules=CandidateRules(require_topic_match=False),
                    article_topic=topic,
                ):
                    continue
                cluster_text = (
                    _cluster_text(c["articles"][0])
                    if c.get("articles")
                    else c.get("headline", "")
                )
                cluster_embedding = _embedding_for_text(cluster_text)
                if article_embedding and cluster_embedding:
                    sim = cosine_similarity(article_embedding, cluster_embedding)
                else:
                    # Fall back to fuzzy titles when either embedding is missing.
                    sim = _title_similarity(title, c.get("headline", ""))
            else:
                sim = _title_similarity(title, c.get("headline", ""))
            if sim > best_sim:
                best_sim = sim
                best_idx = idx

        threshold = similarity_threshold
        if NEWS_EMBEDDINGS_ENABLED:
            # Never merge below the embedding threshold, even when the caller
            # passed a looser fuzzy-title threshold.
            threshold = max(similarity_threshold, NEWS_EMBEDDING_SIMILARITY_THRESHOLD)

        if best_idx is not None and best_sim >= threshold:
            c = clusters[best_idx]
            c["articles"].append(a)
            if a["source"] not in c["sources"]:
                c["sources"].append(a["source"])
            # String max is valid here only because timestamps are expected to
            # be ISO-8601, which sorts lexicographically.
            c["last_updated"] = max(str(c["last_updated"]), str(a["timestamp"]))
        else:
            # Stable-ish cluster id: derived from topic + normalized canonical
            # title, so re-runs over the same feed produce the same id.
            key = f"{topic}|{_normalize_title(title)}"
            cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
            cluster_embedding = article_embedding if NEWS_EMBEDDINGS_ENABLED else None
            clusters.append(
                {
                    "cluster_id": cid,
                    "headline": title,
                    "summary": a.get("summary", ""),
                    "entities": [],
                    "sentiment": "neutral",
                    "importance": 0.0,
                    "sources": [a["source"]],
                    "timestamp": a["timestamp"],
                    "articles": [a],
                    "first_seen": a["timestamp"],
                    "last_updated": a["timestamp"],
                    "embedding": cluster_embedding,
                    "embedding_model": "ollama:nomic-embed-text" if cluster_embedding else None,
                }
            )

    return {topic: clusters for topic, clusters in by_topic.items()}
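

# Minimal smoke-test sketch, not part of the original module. The article dicts
# below are assumptions about the feed schema, inferred from the keys this
# module actually reads ("title", "summary", "source", "timestamp"). With
# embeddings disabled (and assuming normalize_topic_from_title maps both titles
# to the same topic), the two near-duplicate headlines fall back to fuzzy title
# matching and merge into a single cluster.
if __name__ == "__main__":
    sample_articles = [
        {
            "title": "Fed hikes rates by 0.25%",
            "summary": "Quarter-point move.",
            "source": "reuters",
            "timestamp": "2024-05-01T12:00:00Z",
        },
        {
            "title": "Fed Hikes Rates By 0.25%!",
            "summary": "Central bank raises rates.",
            "source": "ap",
            "timestamp": "2024-05-01T12:30:00Z",
        },
    ]
    for topic, clusters in dedup_and_cluster_articles(sample_articles).items():
        for c in clusters:
            print(topic, c["cluster_id"][:8], c["sources"], len(c["articles"]))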