entity_normalize.py 1.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. from __future__ import annotations
  2. import json
  3. from functools import lru_cache
  4. from pathlib import Path
  5. from typing import Iterable
  6. from news_mcp.config import ENTITY_ALIASES_FILE
  7. # Small, explicit canonical alias map.
  8. # Keep this conservative and grow it only when a shorthand is clearly useful.
  9. @lru_cache(maxsize=1)
  10. def _alias_map() -> dict[str, str]:
  11. path = Path(ENTITY_ALIASES_FILE)
  12. if not path.exists():
  13. return {}
  14. try:
  15. raw = json.loads(path.read_text(encoding="utf-8"))
  16. except Exception:
  17. return {}
  18. out: dict[str, str] = {}
  19. if isinstance(raw, dict):
  20. for k, v in raw.items():
  21. if k and v:
  22. out[str(k).strip().lower()] = str(v).strip()
  23. return out
  24. def _lookup_alias(key: str) -> str | None:
  25. return _alias_map().get(key)
  26. def normalize_entity(value: str) -> str:
  27. key = str(value).strip().lower()
  28. if not key:
  29. return ""
  30. return _lookup_alias(key) or str(value).strip()
  31. def normalize_query(value: str) -> str:
  32. return normalize_entity(value)
  33. def normalize_entities(values: Iterable[str]) -> list[str]:
  34. out: list[str] = []
  35. seen: set[str] = set()
  36. for value in values or []:
  37. norm = normalize_entity(value)
  38. key = norm.lower()
  39. if not norm or key in seen:
  40. continue
  41. seen.add(key)
  42. out.append(norm)
  43. return out