# lucky/news-mcp
"""Article identity and content hashing — single source of truth.

Used by:
  - news_mcp.dedup.cluster  (clustering identity, orphan merge, stable cluster IDs)
  - news_mcp.storage.sqlite_store  (seen_articles, dedup, upsert)
  - scripts/backfill_seen_articles.py  (backfill)
"""

from __future__ import annotations

import hashlib
from typing import Any
from urllib.parse import urlparse


def article_key(article: dict[str, Any]) -> str:
    """Return a deterministic identity key derived from an article's URL.

    Resolution order:

    1. No usable URL (missing, ``None``, empty, or whitespace-only): the
       article's title is returned, coerced to ``str`` but not stripped, or
       ``""`` when the title is missing as well.
    2. URL whose path has at least one non-empty segment: the last segment
       (e.g. '/content/uuid' → 'uuid', '/Article/Slug/66427393' → '66427393').
       Only the path component is inspected, so the query string and fragment
       are ignored, as is a trailing slash.
    3. URL with no path segments, or one that ``urlparse`` rejects as
       malformed: the full, whitespace-stripped URL.

    This is the primary dedup identity — two articles with the same key
    are considered the same article regardless of source.

    Args:
        article: Article mapping; only the ``"url"`` and ``"title"`` entries
            are read.

    Returns:
        The identity key as a string (possibly empty).
    """
    url = str(article.get("url") or "").strip()
    if not url:
        return str(article.get("title") or "")

    # Keep the try body to the single call that can fail: urlparse raises
    # ValueError for malformed URLs (e.g. an unbalanced IPv6 bracket).  Falling
    # back to the raw URL keeps key derivation best-effort without hiding
    # unrelated errors behind a blanket ``except Exception``.
    try:
        path = urlparse(url).path
    except ValueError:
        return url

    # Drop empty pieces produced by leading, trailing, or doubled slashes.
    segments = [segment for segment in path.split("/") if segment]
    return segments[-1] if segments else url


def article_content_hash(article: dict[str, Any]) -> str:
    """Fingerprint an article's title and summary as a SHA-1 hex digest.

    The digest changes whenever the title or summary changes, which lets
    callers notice an article that was edited in place (for instance a stub
    that is later fleshed out) while its URL stayed the same.

    Both fields are coerced to ``str`` and stripped of surrounding
    whitespace; a missing or falsy field counts as the empty string.  The
    hashed material is ``"<title>|<summary>"`` encoded as UTF-8.
    """
    fields = [
        str(article.get(name) or "").strip()
        for name in ("title", "summary")
    ]
    digest = hashlib.sha1("|".join(fields).encode("utf-8"))
    return digest.hexdigest()