# wikidata_lookup.py
"""Direct Wikidata lookup helpers."""
from __future__ import annotations

import os
import re
from datetime import datetime, timezone
from typing import Any, Optional

import httpx
  7. WIKIDATA_TIMEOUT = float(os.getenv("ATLAS_WIKIDATA_TIMEOUT", "10"))
  8. WIKIDATA_USER_AGENT = os.getenv(
  9. "ATLAS_WIKIDATA_USER_AGENT",
  10. "Atlas/1.0 (contact: lukas.goldschmidt+atlas@googlemail.com)",
  11. )
  12. async def lookup_wikidata(subject: str) -> Optional[dict[str, Any]]:
  13. term = (subject or "").strip()
  14. if not term:
  15. return None
  16. async with httpx.AsyncClient(timeout=WIKIDATA_TIMEOUT, follow_redirects=True) as client:
  17. search = await client.get(
  18. "https://www.wikidata.org/w/api.php",
  19. params={
  20. "action": "wbsearchentities",
  21. "search": term,
  22. "language": "en",
  23. "format": "json",
  24. "limit": 1,
  25. },
  26. headers={"Accept": "application/json", "User-Agent": WIKIDATA_USER_AGENT},
  27. )
  28. if search.status_code >= 400:
  29. return None
  30. payload = search.json()
  31. results = payload.get("search") or []
  32. if not results:
  33. return None
  34. top = results[0]
  35. qid = top.get("id")
  36. if not qid:
  37. return None
  38. entity = await client.get(
  39. f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json",
  40. params={"flavor": "dump"},
  41. headers={"Accept": "application/json", "User-Agent": WIKIDATA_USER_AGENT},
  42. )
  43. if entity.status_code >= 400:
  44. return None
  45. entity_payload = entity.json()
  46. return {
  47. "qid": qid,
  48. "label": top.get("label") or term,
  49. "description": top.get("description"),
  50. "entity": entity_payload.get("entities", {}).get(qid, {}),
  51. "source": "wikidata",
  52. "wikidata_status": "hit",
  53. "retrieved_at": datetime.now(timezone.utc).isoformat(),
  54. }
  55. async def fetch_wikidata_entity(qid: str) -> Optional[dict[str, Any]]:
  56. qid = (qid or "").strip()
  57. if not qid:
  58. return None
  59. async with httpx.AsyncClient(timeout=WIKIDATA_TIMEOUT, follow_redirects=True) as client:
  60. entity = await client.get(
  61. f"https://www.wikidata.org/wiki/Special:EntityData/{qid}.json",
  62. params={"flavor": "dump"},
  63. headers={"Accept": "application/json", "User-Agent": WIKIDATA_USER_AGENT},
  64. )
  65. if entity.status_code >= 400:
  66. return None
  67. entity_payload = entity.json()
  68. entity_block = entity_payload.get("entities", {}).get(qid, {})
  69. label = None
  70. description = None
  71. if isinstance(entity_block, dict):
  72. labels = entity_block.get("labels", {})
  73. descriptions = entity_block.get("descriptions", {})
  74. label = (labels.get("en") or {}).get("value") if isinstance(labels, dict) else None
  75. description = (descriptions.get("en") or {}).get("value") if isinstance(descriptions, dict) else None
  76. return {
  77. "qid": qid,
  78. "entity": entity_block,
  79. "label": label,
  80. "description": description,
  81. "source": "wikidata",
  82. "wikidata_status": "enriched",
  83. "retrieved_at": datetime.now(timezone.utc).isoformat(),
  84. }