article_identity.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748
"""Article identity and content hashing — single source of truth.

Used by:
- news_mcp.dedup.cluster (clustering identity, orphan merge, stable cluster IDs)
- news_mcp.storage.sqlite_store (seen_articles, dedup, upsert)
- scripts/backfill_seen_articles.py (backfill)
"""
  7. from __future__ import annotations
  8. import hashlib
  9. from typing import Any
  10. from urllib.parse import urlparse
  11. def article_key(article: dict[str, Any]) -> str:
  12. """Deterministic identity key derived from an article's URL.
  13. If a URL exists, returns the last path segment (e.g. '/content/uuid' → 'uuid',
  14. '/Article/Slug/66427393' → '66427393'). Falls back to the full URL if no
  15. path segments, or to the title if no URL at all.
  16. This is the primary dedup identity — two articles with the same key
  17. are considered the same article regardless of source.
  18. """
  19. url = str(article.get("url") or "").strip()
  20. if not url:
  21. return str(article.get("title") or "")
  22. try:
  23. parsed = urlparse(url)
  24. parts = [p for p in parsed.path.split("/") if p]
  25. if parts:
  26. return parts[-1]
  27. except Exception:
  28. pass
  29. return url
  30. def article_content_hash(article: dict[str, Any]) -> str:
  31. """SHA-1 hash of title + summary for detecting content changes.
  32. Used to detect in-place article updates (e.g. a stub that gets fleshed
  33. out) where the URL stays the same but the content changes.
  34. """
  35. title = str(article.get("title") or "").strip()
  36. summary = str(article.get("summary") or "").strip()
  37. material = f"{title}|{summary}"
  38. return hashlib.sha1(material.encode("utf-8")).hexdigest()