atlas_model.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280
  1. from __future__ import annotations
  2. import hashlib
  3. import json
  4. from dataclasses import dataclass, field
  5. from typing import Any, Dict, List, Optional
# ---------------------------------------------------------------------------
# Mirrors atlas:Identifier
#   atlas:scheme — controlled token: "wikidata-qid" | "google-mid" | "atlas-internal"
#   atlas:value  — raw identifier string
# ---------------------------------------------------------------------------
  11. @dataclass
  12. class Identifier:
  13. scheme: str # "wikidata-qid" | "google-mid" | "atlas-internal"
  14. value: str
# ---------------------------------------------------------------------------
# Mirrors atlas:Provenance
# Attached to a Claim to record where a fact came from.
# ---------------------------------------------------------------------------
  19. @dataclass
  20. class Provenance:
  21. source: str # e.g. "wikidata", "google-trends"
  22. method: str # e.g. "wbsearchentities", "trends-resolution"
  23. confidence: float # 0.0 – 1.0
  24. retrieved_at: str # ISO-8601 UTC, e.g. "2026-04-04T23:27:09Z"
# ---------------------------------------------------------------------------
# Mirrors atlas:Claim
# Records provenance for one direct triple on the entity.
# The triple itself must also exist directly on the Entity — Claims are the
# audit layer, not the only place the fact lives.
# ---------------------------------------------------------------------------
  31. @dataclass
  32. class Claim:
  33. predicate: str # ontology property IRI, e.g. "atlas:hasIdentifier"
  34. object_iri: Optional[str] = None # IRI object, e.g. "atlas_data:ident_qid_q6279"
  35. object_literal: Optional[str] = None # literal object, e.g. "true"
  36. layer: str = "raw" # "raw" | "derived" | "curated"
  37. status: str = "active" # "active" | "superseded" | "rejected"
  38. provenance: Optional[Provenance] = None
# ---------------------------------------------------------------------------
# Mirrors atlas:CurateFlag
# ---------------------------------------------------------------------------
  42. @dataclass
  43. class CurateFlag:
  44. reason: str
# ---------------------------------------------------------------------------
# Mirrors atlas:Entity — the central node.
#
# aliases    : flat list of surface forms (atlas:aliasLabel "Biden"@en)
# identifiers: flat Identifier nodes (atlas:scheme / atlas:value)
# attributes : arbitrary key-value facts (atlas:isAlive, atlas:latitude, …)
#              These map to direct datatype triples on the entity.
# raw_json   : opaque source blobs (atlas:rawJson "…"^^xsd:string)
#              One entry per source; source name lives inside the JSON blob.
# claims     : provenance audit trail — one Claim per attributed triple
# curate_flag: set when the entity needs human review
# ---------------------------------------------------------------------------
  57. @dataclass
  58. class Entity:
  59. id: str # "atlas:1b0e7222c7730540"
  60. label: str # canonical label
  61. type: Optional[str] = None # "atlas:Person" | "atlas:Location" | …
  62. description: Optional[str] = None
  63. aliases: List[str] = field(default_factory=list)
  64. identifiers: List[Identifier] = field(default_factory=list)
  65. attributes: Dict[str, Any] = field(default_factory=dict) # extensible facts
  66. raw_json: List[str] = field(default_factory=list) # opaque blobs
  67. claims: List[Claim] = field(default_factory=list)
  68. needs_curation: bool = False
  69. curate_flag: Optional[CurateFlag] = None
  70. # ------------------------------------------------------------------
  71. # Helpers
  72. # ------------------------------------------------------------------
  73. def get_identifier(self, scheme: str) -> Optional[str]:
  74. """Return the value for the first identifier matching scheme, or None."""
  75. for ident in self.identifiers:
  76. if ident.scheme == scheme:
  77. return ident.value
  78. return None
  79. def add_raw_json(self, source: str, data: Dict[str, Any]) -> None:
  80. """Serialise a source payload and append it to raw_json blobs."""
  81. self.raw_json.append(json.dumps({"source": source, **data}))
  82. def _entity_iri(self) -> str:
  83. return f"atlas_data:entity_{self.id.replace('atlas:', '')}"
  84. def _identifier_iri(self, ident: Identifier) -> str:
  85. slug = ident.value.replace("/", "_").replace(":", "_").lower()
  86. return f"atlas_data:ident_{ident.scheme}_{slug}"
  87. def _claim_id(self, predicate: str, obj: str) -> str:
  88. h = hashlib.sha1(f"{self.id}{predicate}{obj}".encode()).hexdigest()[:8]
  89. return f"atlas_data:claim_{h}"
  90. def _prov_id(self, claim_id: str) -> str:
  91. return claim_id.replace("claim_", "prov_")
  92. # ------------------------------------------------------------------
  93. # Serialisation to Turtle
  94. # ------------------------------------------------------------------
  95. def to_turtle(self) -> str:
  96. lines: List[str] = [
  97. "@prefix atlas: <http://world.eu.org/atlas_ontology#> .",
  98. "@prefix atlas_data: <http://world.eu.org/atlas_data#> .",
  99. "@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .",
  100. "",
  101. ]
  102. entity_iri = self._entity_iri()
  103. # --- Entity node ---
  104. lines.append("### Entity")
  105. lines.append(f"{entity_iri}")
  106. lines.append(f' a atlas:Entity ;')
  107. lines.append(f' atlas:atlasId "{self.id}" ;')
  108. lines.append(f' atlas:canonicalLabel "{self.label}"@en ;')
  109. if self.description:
  110. lines.append(f' atlas:canonicalDescription "{self.description}"@en ;')
  111. if self.type:
  112. lines.append(f' atlas:hasCanonicalType {self.type} ;')
  113. for alias in self.aliases:
  114. lines.append(f' atlas:aliasLabel "{alias}"@en ;')
  115. for ident in self.identifiers:
  116. lines.append(f' atlas:hasIdentifier {self._identifier_iri(ident)} ;')
  117. for key, val in self.attributes.items():
  118. if isinstance(val, bool):
  119. lit = str(val).lower()
  120. lines.append(f' atlas:{key:<26} "{lit}"^^xsd:boolean ;')
  121. elif isinstance(val, float):
  122. lines.append(f' atlas:{key:<26} "{val}"^^xsd:decimal ;')
  123. elif isinstance(val, int):
  124. lines.append(f' atlas:{key:<26} "{val}"^^xsd:integer ;')
  125. else:
  126. lines.append(f' atlas:{key:<26} "{val}" ;')
  127. for blob in self.raw_json:
  128. escaped = blob.replace("\\", "\\\\").replace('"', '\\"')
  129. lines.append(f' atlas:rawJson "{escaped}"^^xsd:string ;')
  130. for claim in self.claims:
  131. obj = claim.object_iri or claim.object_literal or ""
  132. cid = self._claim_id(claim.predicate, obj)
  133. lines.append(f' atlas:hasClaim {cid} ;')
  134. lines.append(f' atlas:needsCuration {str(self.needs_curation).lower()}')
  135. if self.curate_flag:
  136. # reopen with semicolon on previous line
  137. lines[-1] += " ;"
  138. lines.append(f' atlas:hasCurateFlag {entity_iri}_curate')
  139. lines[-1] += " ."
  140. lines.append("")
  141. # --- Identifier nodes ---
  142. if self.identifiers:
  143. lines.append("### Identifiers")
  144. for ident in self.identifiers:
  145. iiri = self._identifier_iri(ident)
  146. lines.append(f"{iiri}")
  147. lines.append(f' a atlas:Identifier ;')
  148. lines.append(f' atlas:scheme "{ident.scheme}" ;')
  149. lines.append(f' atlas:value "{ident.value}" .')
  150. lines.append("")
  151. # --- Claim + Provenance nodes ---
  152. if self.claims:
  153. lines.append("### Claims")
  154. for claim in self.claims:
  155. obj = claim.object_iri or claim.object_literal or ""
  156. cid = self._claim_id(claim.predicate, obj)
  157. pid = self._prov_id(cid)
  158. lines.append(f"{cid}")
  159. lines.append(f' a atlas:Claim ;')
  160. lines.append(f' atlas:claimSubjectIri {entity_iri} ;')
  161. lines.append(f' atlas:claimPredicate {claim.predicate} ;')
  162. if claim.object_iri:
  163. lines.append(f' atlas:claimObjectIri {claim.object_iri} ;')
  164. else:
  165. lines.append(f' atlas:claimObjectLiteral "{claim.object_literal}" ;')
  166. lines.append(f' atlas:claimLayer "{claim.layer}" ;')
  167. lines.append(f' atlas:claimStatus "{claim.status}"')
  168. if claim.provenance:
  169. lines[-1] += " ;"
  170. lines.append(f' atlas:hasProvenance {pid}')
  171. lines[-1] += " ."
  172. lines.append("")
  173. if claim.provenance:
  174. p = claim.provenance
  175. lines.append(f"{pid}")
  176. lines.append(f' a atlas:Provenance ;')
  177. lines.append(f' atlas:provenanceSource "{p.source}" ;')
  178. lines.append(f' atlas:retrievalMethod "{p.method}" ;')
  179. lines.append(f' atlas:confidence "{p.confidence}"^^xsd:decimal ;')
  180. lines.append(f' atlas:retrievedAt "{p.retrieved_at}"^^xsd:dateTime .')
  181. lines.append("")
  182. # --- CurateFlag node ---
  183. if self.curate_flag:
  184. lines.append("### Curation flag")
  185. lines.append(f"{entity_iri}_curate")
  186. lines.append(f' a atlas:CurateFlag ;')
  187. lines.append(f' atlas:curationReason "{self.curate_flag.reason}"@en .')
  188. lines.append("")
  189. return "\n".join(lines)
# ---------------------------------------------------------------------------
# Usage example — reproduces joe.ttl
# ---------------------------------------------------------------------------
  193. if __name__ == "__main__":
  194. biden = Entity(
  195. id="atlas:1b0e7222c7730540",
  196. label="Joe Biden",
  197. description="46th President of the United States (2021\u20132025)",
  198. type="atlas:Person",
  199. aliases=["Joe Biden", "Biden", "Joseph Biden"],
  200. identifiers=[
  201. Identifier(scheme="google-mid", value="/m/012gx2"),
  202. Identifier(scheme="wikidata-qid", value="Q6279"),
  203. ],
  204. attributes={
  205. "isAlive": True,
  206. },
  207. needs_curation=True,
  208. curate_flag=CurateFlag(
  209. reason="Fine-grained Trends type '46th U.S. President' not yet adjudicated."
  210. ),
  211. claims=[
  212. Claim(
  213. predicate="atlas:hasIdentifier",
  214. object_iri="atlas_data:ident_google-mid__m_012gx2",
  215. layer="raw",
  216. provenance=Provenance(
  217. source="google-trends",
  218. method="trends-resolution",
  219. confidence=0.9,
  220. retrieved_at="2026-04-04T23:27:06Z",
  221. ),
  222. ),
  223. Claim(
  224. predicate="atlas:hasIdentifier",
  225. object_iri="atlas_data:ident_wikidata-qid_q6279",
  226. layer="raw",
  227. provenance=Provenance(
  228. source="wikidata",
  229. method="wbsearchentities + entitydata",
  230. confidence=0.99,
  231. retrieved_at="2026-04-04T23:27:09Z",
  232. ),
  233. ),
  234. Claim(
  235. predicate="atlas:hasCanonicalType",
  236. object_iri="atlas:Person",
  237. layer="derived",
  238. ),
  239. ],
  240. )
  241. biden.add_raw_json("wikidata", {"qid": "Q6279", "label": "Joe Biden", "retrieved_at": "2026-04-04T23:27:09Z"})
  242. biden.add_raw_json("google-trends", {"mid": "/m/012gx2", "type": "46th U.S. President", "retrieved_at": "2026-04-04T23:27:06Z"})
  243. print(biden.to_turtle())