# cluster.py — per-topic article dedup & clustering helpers
from __future__ import annotations

import hashlib
import re
from difflib import SequenceMatcher
from typing import Any, Dict, List, Tuple

from news_mcp.sources.rss_breakingthenews import normalize_topic_from_title
  6. def _normalize_title(title: str) -> str:
  7. t = title.lower().strip()
  8. # Remove punctuation-ish characters for similarity scoring.
  9. t = re.sub(r"[^a-z0-9\s]", " ", t)
  10. t = re.sub(r"\s+", " ", t).strip()
  11. return t
  12. def _title_similarity(a: str, b: str) -> float:
  13. return SequenceMatcher(None, _normalize_title(a), _normalize_title(b)).ratio()
  14. def dedup_and_cluster_articles(
  15. articles: List[Dict[str, Any]],
  16. similarity_threshold: float = 0.87,
  17. ) -> Dict[str, List[Dict[str, Any]]]:
  18. """v1 dedup: fuzzy title similarity per topic.
  19. Instead of strict hashing, we merge clusters whose normalized titles are
  20. similar enough. This helps create richer clusters (multiple sources/articles)
  21. and therefore better importance.
  22. """
  23. by_topic: Dict[str, List[Dict[str, Any]]] = {}
  24. for a in articles:
  25. title = a["title"]
  26. topic = normalize_topic_from_title(title)
  27. by_topic.setdefault(topic, [])
  28. clusters = by_topic[topic]
  29. best_idx: int | None = None
  30. best_sim = 0.0
  31. for idx, c in enumerate(clusters):
  32. sim = _title_similarity(title, c.get("headline", ""))
  33. if sim > best_sim:
  34. best_sim = sim
  35. best_idx = idx
  36. if best_idx is not None and best_sim >= similarity_threshold:
  37. c = clusters[best_idx]
  38. c["articles"].append(a)
  39. if a["source"] not in c["sources"]:
  40. c["sources"].append(a["source"])
  41. c["last_updated"] = max(str(c["last_updated"]), str(a["timestamp"]))
  42. else:
  43. # Stable-ish cluster id: based on topic + normalized canonical title.
  44. import hashlib
  45. key = f"{topic}|{_normalize_title(title)}"
  46. cid = hashlib.sha1(key.encode("utf-8")).hexdigest()
  47. clusters.append(
  48. {
  49. "cluster_id": cid,
  50. "headline": title,
  51. "summary": a.get("summary", ""),
  52. "entities": [],
  53. "sentiment": "neutral",
  54. "importance": 0.0,
  55. "sources": [a["source"]],
  56. "timestamp": a["timestamp"],
  57. "articles": [a],
  58. "first_seen": a["timestamp"],
  59. "last_updated": a["timestamp"],
  60. }
  61. )
  62. return {topic: clusters for topic, clusters in by_topic.items()}