"""Dataclass mirrors of the atlas ontology nodes, with Turtle serialisation."""
from __future__ import annotations

import hashlib
import json
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

# Characters that may not appear raw inside a double-quoted Turtle literal,
# mapped to their backslash escapes. Used with str.translate so the whole
# string is escaped in a single pass (no double-escaping of backslashes).
_TURTLE_ESCAPES = str.maketrans(
    {
        "\\": "\\\\",
        '"': '\\"',
        "\n": "\\n",
        "\r": "\\r",
        "\t": "\\t",
    }
)


def _escape_literal(value: Any) -> str:
    """Return ``str(value)`` escaped for use inside a ``"..."`` Turtle literal."""
    return str(value).translate(_TURTLE_ESCAPES)


# ---------------------------------------------------------------------------
# Mirrors atlas:Identifier
#   atlas:scheme — controlled token: "wikidata-qid" | "google-mid" | "atlas-internal"
#   atlas:value  — raw identifier string
# ---------------------------------------------------------------------------
@dataclass
class Identifier:
    """An external or internal identifier attached to an entity."""

    scheme: str  # "wikidata-qid" | "google-mid" | "atlas-internal"
    value: str


# ---------------------------------------------------------------------------
# Mirrors atlas:Provenance
# Attached to a Claim to record where a fact came from.
# ---------------------------------------------------------------------------
@dataclass
class Provenance:
    """Where, how and when a claimed fact was obtained."""

    source: str  # e.g. "wikidata", "google-trends"
    method: str  # e.g. "wbsearchentities", "trends-resolution"
    confidence: float  # 0.0 – 1.0
    retrieved_at: str  # ISO-8601 UTC, e.g. "2026-04-04T23:27:09Z"


# ---------------------------------------------------------------------------
# Mirrors atlas:Claim
# Records provenance for one direct triple on the entity.
# The triple itself must also exist directly on the Entity — Claims are the
# audit layer, not the only place the fact lives.
# ---------------------------------------------------------------------------
@dataclass
class Claim:
    """Audit record for one triple; the object is an IRI or a literal."""

    predicate: str  # ontology property IRI, e.g. "atlas:hasIdentifier"
    object_iri: Optional[str] = None  # IRI object, e.g. "atlas_data:ident_qid_q6279"
    object_literal: Optional[str] = None  # literal object, e.g. "true"
    layer: str = "raw"  # "raw" | "derived" | "curated"
    status: str = "active"  # "active" | "superseded" | "rejected"
    provenance: Optional[Provenance] = None


# ---------------------------------------------------------------------------
# Mirrors atlas:CurateFlag
# ---------------------------------------------------------------------------
@dataclass
class CurateFlag:
    """Marks an entity as needing human review, with the reason why."""

    reason: str


# ---------------------------------------------------------------------------
# Mirrors atlas:Entity — the central node.
#
#   aliases    : flat list of surface forms (atlas:aliasLabel "Biden"@en)
#   identifiers: flat Identifier nodes (atlas:scheme / atlas:value)
#   attributes : arbitrary key-value facts (atlas:isAlive, atlas:latitude, …)
#                These map to direct datatype triples on the entity.
#   raw_json   : opaque source blobs (atlas:rawJson "…"^^xsd:string)
#                One entry per source; source name lives inside the JSON blob.
#   claims     : provenance audit trail — one Claim per attributed triple
#   curate_flag: set when the entity needs human review
# ---------------------------------------------------------------------------
@dataclass
class Entity:
    """Central node of the model; serialisable to Turtle via ``to_turtle``."""

    id: str  # "atlas:1b0e7222c7730540"
    label: str  # canonical label
    type: Optional[str] = None  # "atlas:Person" | "atlas:Location" | …
    description: Optional[str] = None
    aliases: List[str] = field(default_factory=list)
    identifiers: List[Identifier] = field(default_factory=list)
    attributes: Dict[str, Any] = field(default_factory=dict)  # extensible facts
    raw_json: List[str] = field(default_factory=list)  # opaque blobs
    claims: List[Claim] = field(default_factory=list)
    needs_curation: bool = False
    curate_flag: Optional[CurateFlag] = None

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    def get_identifier(self, scheme: str) -> Optional[str]:
        """Return the value for the first identifier matching scheme, or None."""
        for ident in self.identifiers:
            if ident.scheme == scheme:
                return ident.value
        return None

    def add_raw_json(self, source: str, data: Dict[str, Any]) -> None:
        """Serialise a source payload and append it to raw_json blobs.

        The blob is ``{"source": source, **data}``; a "source" key inside
        *data* therefore takes precedence over the *source* argument.
        """
        self.raw_json.append(json.dumps({"source": source, **data}))

    def _entity_iri(self) -> str:
        """Return the atlas_data IRI for this entity (id minus "atlas:")."""
        return f"atlas_data:entity_{self.id.replace('atlas:', '')}"

    def _identifier_iri(self, ident: Identifier) -> str:
        """Return the atlas_data IRI for *ident* (value slugged, lower-cased)."""
        slug = ident.value.replace("/", "_").replace(":", "_").lower()
        return f"atlas_data:ident_{ident.scheme}_{slug}"

    def _claim_id(self, predicate: str, obj: str) -> str:
        """Return a claim IRI from the first 8 hex chars of a SHA-1 digest."""
        h = hashlib.sha1(f"{self.id}{predicate}{obj}".encode()).hexdigest()[:8]
        return f"atlas_data:claim_{h}"

    def _prov_id(self, claim_id: str) -> str:
        """Return the provenance IRI paired with *claim_id*."""
        return claim_id.replace("claim_", "prov_")

    def _claim_iri(self, claim: Claim) -> str:
        """Return the IRI for *claim*, keyed on its predicate and object."""
        obj = claim.object_iri or claim.object_literal or ""
        return self._claim_id(claim.predicate, obj)

    @staticmethod
    def _attribute_literal(val: Any) -> str:
        """Render an attribute value as a (typed) Turtle literal."""
        # bool must be tested before int: bool is a subclass of int.
        if isinstance(val, bool):
            return f'"{str(val).lower()}"^^xsd:boolean'
        if isinstance(val, float):
            return f'"{val}"^^xsd:decimal'
        if isinstance(val, int):
            return f'"{val}"^^xsd:integer'
        return f'"{_escape_literal(val)}"'

    # ------------------------------------------------------------------
    # Serialisation to Turtle
    # ------------------------------------------------------------------
    def to_turtle(self) -> str:
        """Serialise the entity and its dependent nodes to a Turtle string.

        Emits, in order: prefix header, the entity node, one node per
        identifier, one node per claim (followed by its provenance node when
        present) and the curation-flag node when set. All string values are
        escaped before being placed in quoted literals.
        """
        # NOTE(review): these @prefix declarations carry no namespace IRIs
        # (Turtle expects `@prefix p: <iri> .`) — confirm the intended IRIs
        # and restore them; they are reproduced here unchanged.
        lines: List[str] = [
            "@prefix atlas: .",
            "@prefix atlas_data: .",
            "@prefix xsd: .",
            "",
        ]
        entity_iri = self._entity_iri()
        lines.extend(self._entity_node_lines(entity_iri))
        lines.extend(self._identifier_node_lines())
        lines.extend(self._claim_node_lines(entity_iri))
        lines.extend(self._curate_flag_node_lines(entity_iri))
        return "\n".join(lines)

    def _entity_node_lines(self, entity_iri: str) -> List[str]:
        """Build the lines of the entity node itself."""
        esc = _escape_literal
        out: List[str] = [
            "### Entity",
            f"{entity_iri}",
            " a atlas:Entity ;",
            f' atlas:atlasId "{esc(self.id)}" ;',
            f' atlas:canonicalLabel "{esc(self.label)}"@en ;',
        ]
        if self.description:
            out.append(f' atlas:canonicalDescription "{esc(self.description)}"@en ;')
        if self.type:
            out.append(f" atlas:hasCanonicalType {self.type} ;")
        out.extend(f' atlas:aliasLabel "{esc(alias)}"@en ;' for alias in self.aliases)
        out.extend(
            f" atlas:hasIdentifier {self._identifier_iri(ident)} ;"
            for ident in self.identifiers
        )
        out.extend(
            f" atlas:{key:<26} {self._attribute_literal(val)} ;"
            for key, val in self.attributes.items()
        )
        out.extend(
            f' atlas:rawJson "{esc(blob)}"^^xsd:string ;' for blob in self.raw_json
        )
        out.extend(
            f" atlas:hasClaim {self._claim_iri(claim)} ;" for claim in self.claims
        )
        # needsCuration is the last predicate unless a curate flag follows it.
        needs = f" atlas:needsCuration {str(self.needs_curation).lower()}"
        if self.curate_flag:
            out.append(needs + " ;")
            out.append(f" atlas:hasCurateFlag {entity_iri}_curate .")
        else:
            out.append(needs + " .")
        out.append("")
        return out

    def _identifier_node_lines(self) -> List[str]:
        """Build one node per identifier; empty when there are none."""
        if not self.identifiers:
            return []
        esc = _escape_literal
        out: List[str] = ["### Identifiers"]
        for ident in self.identifiers:
            out.extend(
                [
                    f"{self._identifier_iri(ident)}",
                    " a atlas:Identifier ;",
                    f' atlas:scheme "{esc(ident.scheme)}" ;',
                    f' atlas:value "{esc(ident.value)}" .',
                    "",
                ]
            )
        return out

    def _claim_node_lines(self, entity_iri: str) -> List[str]:
        """Build claim nodes, each followed by its provenance node if any."""
        if not self.claims:
            return []
        esc = _escape_literal
        out: List[str] = ["### Claims"]
        for claim in self.claims:
            cid = self._claim_iri(claim)
            pid = self._prov_id(cid)
            out.append(f"{cid}")
            out.append(" a atlas:Claim ;")
            out.append(f" atlas:claimSubjectIri {entity_iri} ;")
            out.append(f" atlas:claimPredicate {claim.predicate} ;")
            if claim.object_iri:
                out.append(f" atlas:claimObjectIri {claim.object_iri} ;")
            elif claim.object_literal is not None:
                # Skip the triple entirely when no object was supplied rather
                # than emitting the string "None" as a literal.
                out.append(
                    f' atlas:claimObjectLiteral "{esc(claim.object_literal)}" ;'
                )
            out.append(f' atlas:claimLayer "{esc(claim.layer)}" ;')
            status = f' atlas:claimStatus "{esc(claim.status)}"'
            prov = claim.provenance
            if prov:
                out.append(status + " ;")
                out.append(f" atlas:hasProvenance {pid} .")
            else:
                out.append(status + " .")
            out.append("")
            if prov:
                out.extend(
                    [
                        f"{pid}",
                        " a atlas:Provenance ;",
                        f' atlas:provenanceSource "{esc(prov.source)}" ;',
                        f' atlas:retrievalMethod "{esc(prov.method)}" ;',
                        f' atlas:confidence "{prov.confidence}"^^xsd:decimal ;',
                        f' atlas:retrievedAt "{esc(prov.retrieved_at)}"^^xsd:dateTime .',
                        "",
                    ]
                )
        return out

    def _curate_flag_node_lines(self, entity_iri: str) -> List[str]:
        """Build the curation-flag node; empty when no flag is set."""
        if not self.curate_flag:
            return []
        return [
            "### Curation flag",
            f"{entity_iri}_curate",
            " a atlas:CurateFlag ;",
            f' atlas:curationReason "{_escape_literal(self.curate_flag.reason)}"@en .',
            "",
        ]


# ---------------------------------------------------------------------------
# Usage example — reproduces joe.ttl
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    biden = Entity(
        id="atlas:1b0e7222c7730540",
        label="Joe Biden",
        description="46th President of the United States (2021\u20132025)",
        type="atlas:Person",
        aliases=["Joe Biden", "Biden", "Joseph Biden"],
        identifiers=[
            Identifier(scheme="google-mid", value="/m/012gx2"),
            Identifier(scheme="wikidata-qid", value="Q6279"),
        ],
        attributes={
            "isAlive": True,
        },
        needs_curation=True,
        curate_flag=CurateFlag(
            reason="Fine-grained Trends type '46th U.S. President' not yet adjudicated."
        ),
        claims=[
            Claim(
                predicate="atlas:hasIdentifier",
                object_iri="atlas_data:ident_google-mid__m_012gx2",
                layer="raw",
                provenance=Provenance(
                    source="google-trends",
                    method="trends-resolution",
                    confidence=0.9,
                    retrieved_at="2026-04-04T23:27:06Z",
                ),
            ),
            Claim(
                predicate="atlas:hasIdentifier",
                object_iri="atlas_data:ident_wikidata-qid_q6279",
                layer="raw",
                provenance=Provenance(
                    source="wikidata",
                    method="wbsearchentities + entitydata",
                    confidence=0.99,
                    retrieved_at="2026-04-04T23:27:09Z",
                ),
            ),
            Claim(
                predicate="atlas:hasCanonicalType",
                object_iri="atlas:Person",
                layer="derived",
            ),
        ],
    )
    biden.add_raw_json(
        "wikidata",
        {"qid": "Q6279", "label": "Joe Biden", "retrieved_at": "2026-04-04T23:27:09Z"},
    )
    biden.add_raw_json(
        "google-trends",
        {
            "mid": "/m/012gx2",
            "type": "46th U.S. President",
            "retrieved_at": "2026-04-04T23:27:06Z",
        },
    )
    print(biden.to_turtle())