"""Article identity and content hashing — single source of truth. Used by: - news_mcp.dedup.cluster (clustering identity, orphan merge, stable cluster IDs) - news_mcp.storage.sqlite_store (seen_articles, dedup, upsert) - scripts/backfill_seen_articles.py (backfill) """ from __future__ import annotations import hashlib from typing import Any from urllib.parse import urlparse def article_key(article: dict[str, Any]) -> str: """Deterministic identity key derived from an article's URL. If a URL exists, returns the last path segment (e.g. '/content/uuid' → 'uuid', '/Article/Slug/66427393' → '66427393'). Falls back to the full URL if no path segments, or to the title if no URL at all. This is the primary dedup identity — two articles with the same key are considered the same article regardless of source. """ url = str(article.get("url") or "").strip() if not url: return str(article.get("title") or "") try: parsed = urlparse(url) parts = [p for p in parsed.path.split("/") if p] if parts: return parts[-1] except Exception: pass return url def article_content_hash(article: dict[str, Any]) -> str: """SHA-1 hash of title + summary for detecting content changes. Used to detect in-place article updates (e.g. a stub that gets fleshed out) where the URL stays the same but the content changes. """ title = str(article.get("title") or "").strip() summary = str(article.get("summary") or "").strip() material = f"{title}|{summary}" return hashlib.sha1(material.encode("utf-8")).hexdigest()