wikidata_lookup.py 2.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. """Direct Wikidata lookup helpers."""
  2. from __future__ import annotations
  3. import os
  4. from datetime import datetime, timezone
  5. from typing import Any, Optional
  6. import httpx
  7. WIKIDATA_TIMEOUT = float(os.getenv("ATLAS_WIKIDATA_TIMEOUT", "10"))
  8. WIKIDATA_USER_AGENT = os.getenv(
  9. "ATLAS_WIKIDATA_USER_AGENT",
  10. "Atlas/1.0 (contact: lukas.goldschmidt+atlas@googlemail.com)",
  11. )
  12. async def lookup_wikidata(subject: str) -> Optional[dict[str, Any]]:
  13. term = (subject or "").strip()
  14. if not term:
  15. return None
  16. async with httpx.AsyncClient(timeout=WIKIDATA_TIMEOUT, follow_redirects=True) as client:
  17. search = await client.get(
  18. "https://www.wikidata.org/w/api.php",
  19. params={
  20. "action": "wbsearchentities",
  21. "search": term,
  22. "language": "en",
  23. "format": "json",
  24. "limit": 1,
  25. },
  26. headers={"Accept": "application/json", "User-Agent": WIKIDATA_USER_AGENT},
  27. )
  28. if search.status_code >= 400:
  29. return None
  30. payload = search.json()
  31. results = payload.get("search") or []
  32. if not results:
  33. return None
  34. top = results[0]
  35. qid = top.get("id")
  36. if not qid:
  37. return None
  38. entity = await client.get(
  39. f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json",
  40. params={"flavor": "dump"},
  41. headers={"Accept": "application/json", "User-Agent": WIKIDATA_USER_AGENT},
  42. )
  43. if entity.status_code >= 400:
  44. return None
  45. entity_payload = entity.json()
  46. return {
  47. "qid": qid,
  48. "label": top.get("label") or term,
  49. "description": top.get("description"),
  50. "entity": entity_payload.get("entities", {}).get(qid, {}),
  51. "source": "wikidata",
  52. "wikidata_status": "hit",
  53. "retrieved_at": datetime.now(timezone.utc).isoformat(),
  54. }