| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748 |
- """Article identity and content hashing — single source of truth.
- Used by:
- - news_mcp.dedup.cluster (clustering identity, orphan merge, stable cluster IDs)
- - news_mcp.storage.sqlite_store (seen_articles, dedup, upsert)
- - scripts/backfill_seen_articles.py (backfill)
- """
- from __future__ import annotations
- import hashlib
- from typing import Any
- from urllib.parse import urlparse
def article_key(article: dict[str, Any]) -> str:
    """Return a deterministic identity key derived from an article's URL.

    The key is the last non-empty path segment of the URL
    (e.g. '/content/uuid' → 'uuid', '/Article/Slug/66427393' → '66427393').
    Only the parsed path is inspected, so query strings, fragments and
    trailing slashes do not affect the key.

    Fallbacks:
    - URL present but without path segments, or unparseable → the full
      (whitespace-stripped) URL.
    - No URL at all → the title as given (empty string when missing).

    This is the primary dedup identity — two articles with the same key
    are considered the same article regardless of source.
    """
    url = str(article.get("url") or "").strip()
    if not url:
        return str(article.get("title") or "")

    try:
        path = urlparse(url).path
    except ValueError:
        # urlparse rejects malformed netlocs (e.g. an unclosed IPv6 bracket);
        # the raw URL is still a usable, deterministic identity.
        return url

    segments = [segment for segment in path.split("/") if segment]
    return segments[-1] if segments else url
def article_content_hash(article: dict[str, Any]) -> str:
    """Return the SHA-1 hex digest of the article's title and summary.

    Both fields are coerced to strings (missing or falsy values become
    the empty string), stripped of surrounding whitespace, and joined
    with a '|' separator before being hashed as UTF-8.

    Used to detect in-place article updates (e.g. a stub that gets fleshed
    out) where the URL stays the same but the content changes.
    """
    fields = (
        str(article.get(name) or "").strip()
        for name in ("title", "summary")
    )
    digest = hashlib.sha1("|".join(fields).encode("utf-8"))
    return digest.hexdigest()
|