claims.py 3.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117
  1. """Claim extraction helpers for Atlas layered outputs."""
  2. from __future__ import annotations
  3. from typing import Any
  4. from app.models import AtlasEntity, AtlasProvenance
  5. def _prov_to_dict(p: AtlasProvenance | None) -> dict[str, Any] | None:
  6. if p is None:
  7. return None
  8. return {
  9. "source": p.source,
  10. "method": p.retrieval_method,
  11. "confidence": p.confidence,
  12. "retrieved_at": p.retrieved_at,
  13. }
  14. def _pick_provenance(entity: AtlasEntity, *, source_hint: str | None = None, method_hint: str | None = None) -> AtlasProvenance | None:
  15. if not entity.provenance:
  16. return None
  17. if method_hint:
  18. for p in entity.provenance:
  19. if p.retrieval_method == method_hint:
  20. return p
  21. if source_hint:
  22. for p in entity.provenance:
  23. if p.source == source_hint:
  24. return p
  25. return entity.provenance[0]
  26. def _id_type_resource(identifier_type: str) -> str:
  27. if identifier_type == "mid":
  28. return "atlas:Mid"
  29. if identifier_type == "qid":
  30. return "atlas:WikidataQID"
  31. return f"atlas:{identifier_type}"
  32. def build_claim_sets(entity: AtlasEntity) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
  33. raw_claims: list[dict[str, Any]] = []
  34. derived_claims: list[dict[str, Any]] = []
  35. for ident in entity.identifiers:
  36. prov = _pick_provenance(entity, source_hint=ident.source)
  37. raw_claims.append(
  38. {
  39. "claim_id": f"clm_raw_ident_{ident.identifier_type}_{ident.value}",
  40. "layer": "raw",
  41. "subject": entity.atlas_id,
  42. "predicate": "atlas:hasIdentifier",
  43. "object": {
  44. "kind": "identifier",
  45. "id_type": _id_type_resource(ident.identifier_type),
  46. "value": ident.value,
  47. },
  48. "provenance": _prov_to_dict(prov),
  49. }
  50. )
  51. for alias in entity.aliases:
  52. raw_claims.append(
  53. {
  54. "claim_id": f"clm_raw_alias_{alias.label}",
  55. "layer": "raw",
  56. "subject": entity.atlas_id,
  57. "predicate": "atlas:hasAlias",
  58. "object": {"kind": "alias", "value": alias.label},
  59. "provenance": _prov_to_dict(_pick_provenance(entity, method_hint="trends-resolution")),
  60. }
  61. )
  62. wd = entity.raw_payload.get("wikidata") or {}
  63. if wd.get("status") == "ok":
  64. derived_claims.append(
  65. {
  66. "claim_id": "clm_drv_wikidata_type",
  67. "layer": "derived",
  68. "subject": entity.atlas_id,
  69. "predicate": "atlas:hasExternalType",
  70. "object": {"kind": "external_type", "value": "atlas:WikidataType_Q5", "qid": wd.get("qid")},
  71. "provenance": {
  72. "source": "wikidata",
  73. "method": "wbsearchentities + entitydata",
  74. "confidence": 0.99,
  75. "retrieved_at": wd.get("retrieved_at"),
  76. },
  77. }
  78. )
  79. else:
  80. raw_claims.append(
  81. {
  82. "claim_id": "clm_raw_wikidata_missing",
  83. "layer": "raw",
  84. "subject": entity.atlas_id,
  85. "predicate": "atlas:wikidataLookupStatus",
  86. "object": {"kind": "literal", "value": wd.get("status", "missing")},
  87. "provenance": _prov_to_dict(_pick_provenance(entity, method_hint="trends-resolution")),
  88. }
  89. )
  90. type_prov = _pick_provenance(entity, method_hint="type-classification")
  91. derived_claims.append(
  92. {
  93. "claim_id": "clm_drv_canonical_type",
  94. "layer": "derived",
  95. "subject": entity.atlas_id,
  96. "predicate": "atlas:hasCanonicalType",
  97. "object": {"kind": "type", "value": f"atlas:{entity.entity_type}"},
  98. "provenance": _prov_to_dict(type_prov),
  99. }
  100. )
  101. return raw_claims, derived_claims