| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280 |
- from __future__ import annotations
- import hashlib
- import json
- from dataclasses import dataclass, field
- from typing import Any, Dict, List, Optional
- # ---------------------------------------------------------------------------
- # Mirrors atlas:Identifier
- # atlas:scheme — controlled token: "wikidata-qid" | "google-mid" | "atlas-internal"
- # atlas:value — raw identifier string
- # ---------------------------------------------------------------------------
@dataclass
class Identifier:
    """External or internal identifier attached to an entity.

    Mirrors ``atlas:Identifier``: a (scheme, value) pair.
    """

    # Controlled token naming the identifier scheme:
    # "wikidata-qid" | "google-mid" | "atlas-internal"
    scheme: str
    # Raw identifier string as issued by the scheme, e.g. "Q6279".
    value: str
- # ---------------------------------------------------------------------------
- # Mirrors atlas:Provenance
- # Attached to a Claim to record where a fact came from.
- # ---------------------------------------------------------------------------
@dataclass
class Provenance:
    """Where a fact came from; attached to a Claim.

    Mirrors ``atlas:Provenance``.
    """

    # Originating system, e.g. "wikidata", "google-trends".
    source: str
    # How the fact was obtained, e.g. "wbsearchentities", "trends-resolution".
    method: str
    # Confidence score; documented range is 0.0 – 1.0 (not enforced here).
    confidence: float
    # ISO-8601 UTC timestamp string, e.g. "2026-04-04T23:27:09Z".
    retrieved_at: str
- # ---------------------------------------------------------------------------
- # Mirrors atlas:Claim
- # Records provenance for one direct triple on the entity.
- # The triple itself must also exist directly on the Entity — Claims are the
- # audit layer, not the only place the fact lives.
- # ---------------------------------------------------------------------------
@dataclass
class Claim:
    """Audit record for one direct triple on an entity.

    Mirrors ``atlas:Claim``. The object of the triple is carried either as an
    IRI (``object_iri``) or as a literal (``object_literal``).
    """

    # Ontology property IRI, e.g. "atlas:hasIdentifier".
    predicate: str
    # IRI object, e.g. "atlas_data:ident_qid_q6279".
    object_iri: Optional[str] = None
    # Literal object, e.g. "true".
    object_literal: Optional[str] = None
    # "raw" | "derived" | "curated"
    layer: str = "raw"
    # "active" | "superseded" | "rejected"
    status: str = "active"
    # Optional record of where the fact came from.
    provenance: Optional[Provenance] = None
- # ---------------------------------------------------------------------------
- # Mirrors atlas:CurateFlag
- # ---------------------------------------------------------------------------
@dataclass
class CurateFlag:
    """Marker that an entity needs human review.

    Mirrors ``atlas:CurateFlag``.
    """

    # Free-text explanation of why curation is needed.
    reason: str
- # ---------------------------------------------------------------------------
- # Mirrors atlas:Entity — the central node.
- #
- # aliases : flat list of surface forms (atlas:aliasLabel "Biden"@en)
- # identifiers: flat Identifier nodes (atlas:scheme / atlas:value)
- # attributes : arbitrary key-value facts (atlas:isAlive, atlas:latitude, …)
- # These map to direct datatype triples on the entity.
- # raw_json : opaque source blobs (atlas:rawJson "…"^^xsd:string)
- # One entry per source; source name lives inside the JSON blob.
- # claims : provenance audit trail — one Claim per attributed triple
- # curate_flag: set when the entity needs human review
- # ---------------------------------------------------------------------------
@dataclass
class Entity:
    """Central node of the graph; mirrors ``atlas:Entity``.

    Fields:
        id: atlas identifier, e.g. "atlas:1b0e7222c7730540".
        label: canonical label.
        type: canonical type as a prefixed name, e.g. "atlas:Person".
        description: optional canonical description.
        aliases: flat list of surface forms (emitted as ``atlas:aliasLabel``).
        identifiers: flat ``Identifier`` nodes (``atlas:scheme`` / ``atlas:value``).
        attributes: arbitrary key-value facts, emitted as direct datatype
            triples ``atlas:<key>`` on the entity.
        raw_json: opaque source blobs, one JSON string per source.
        claims: provenance audit trail, one ``Claim`` per attributed triple.
        needs_curation: emitted as ``atlas:needsCuration``.
        curate_flag: set when the entity needs human review.
    """

    id: str  # "atlas:1b0e7222c7730540"
    label: str  # canonical label
    type: Optional[str] = None  # "atlas:Person" | "atlas:Location" | …
    description: Optional[str] = None
    aliases: List[str] = field(default_factory=list)
    identifiers: List[Identifier] = field(default_factory=list)
    attributes: Dict[str, Any] = field(default_factory=dict)  # extensible facts
    raw_json: List[str] = field(default_factory=list)  # opaque blobs
    claims: List[Claim] = field(default_factory=list)
    needs_curation: bool = False
    curate_flag: Optional[CurateFlag] = None

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------
    def get_identifier(self, scheme: str) -> Optional[str]:
        """Return the value of the first identifier matching *scheme*, or None."""
        return next(
            (ident.value for ident in self.identifiers if ident.scheme == scheme),
            None,
        )

    def add_raw_json(self, source: str, data: Dict[str, Any]) -> None:
        """Serialise a source payload and append it to the raw_json blobs.

        The blob is ``{"source": source, **data}`` dumped as JSON.
        NOTE(review): a "source" key inside *data* overrides the *source*
        argument because it is unpacked later — confirm this is intended.
        """
        self.raw_json.append(json.dumps({"source": source, **data}))

    @staticmethod
    def _escape_literal(text: Any) -> str:
        """Escape *text* for use inside a double-quoted Turtle string literal.

        Backslash is escaped first so the backslashes introduced for the
        other characters are not doubled.
        """
        return (
            str(text)
            .replace("\\", "\\\\")
            .replace('"', '\\"')
            .replace("\n", "\\n")
            .replace("\r", "\\r")
            .replace("\t", "\\t")
        )

    def _entity_iri(self) -> str:
        """Return the ``atlas_data:`` IRI of this entity's node."""
        return f"atlas_data:entity_{self.id.replace('atlas:', '')}"

    def _identifier_iri(self, ident: Identifier) -> str:
        """Return the ``atlas_data:`` IRI of the node for *ident*."""
        slug = ident.value.replace("/", "_").replace(":", "_").lower()
        return f"atlas_data:ident_{ident.scheme}_{slug}"

    def _claim_id(self, predicate: str, obj: str) -> str:
        """Return the claim IRI: first 8 hex digits of SHA-1(id + predicate + obj)."""
        h = hashlib.sha1(f"{self.id}{predicate}{obj}".encode()).hexdigest()[:8]
        return f"atlas_data:claim_{h}"

    def _prov_id(self, claim_id: str) -> str:
        """Return the provenance IRI paired with *claim_id*."""
        return claim_id.replace("claim_", "prov_")

    def _claim_node_id(self, claim: Claim) -> str:
        """Return the claim IRI for *claim*, keyed on its IRI or literal object."""
        obj = claim.object_iri or claim.object_literal or ""
        return self._claim_id(claim.predicate, obj)

    # ------------------------------------------------------------------
    # Serialisation to Turtle
    # ------------------------------------------------------------------
    def to_turtle(self) -> str:
        """Serialise the entity and its dependent nodes as a Turtle document.

        Emits, in order: prefix declarations, the entity node, one node per
        identifier, one node per claim (each followed by its provenance node
        when present), and the curation-flag node when ``curate_flag`` is set.
        All string literals are escaped so quotes, backslashes and line
        breaks in the data cannot break the document.

        Returns:
            The Turtle text, lines joined with ``"\\n"``.
        """
        entity_iri = self._entity_iri()
        lines: List[str] = [
            "@prefix atlas: <http://world.eu.org/atlas_ontology#> .",
            "@prefix atlas_data: <http://world.eu.org/atlas_data#> .",
            "@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .",
            "",
        ]
        lines.extend(self._entity_node_lines(entity_iri))
        lines.extend(self._identifier_node_lines())
        lines.extend(self._claim_node_lines(entity_iri))
        lines.extend(self._curate_flag_node_lines(entity_iri))
        return "\n".join(lines)

    def _entity_node_lines(self, entity_iri: str) -> List[str]:
        """Return the Turtle lines of the entity node itself."""
        esc = self._escape_literal
        out: List[str] = [
            "### Entity",
            entity_iri,
            " a atlas:Entity ;",
            f' atlas:atlasId "{esc(self.id)}" ;',
            f' atlas:canonicalLabel "{esc(self.label)}"@en ;',
        ]
        if self.description:
            out.append(f' atlas:canonicalDescription "{esc(self.description)}"@en ;')
        if self.type:
            out.append(f" atlas:hasCanonicalType {self.type} ;")
        out.extend(f' atlas:aliasLabel "{esc(alias)}"@en ;' for alias in self.aliases)
        out.extend(
            f" atlas:hasIdentifier {self._identifier_iri(ident)} ;"
            for ident in self.identifiers
        )
        for key, val in self.attributes.items():
            # bool is tested before int because bool is a subclass of int.
            if isinstance(val, bool):
                out.append(f' atlas:{key:<26} "{str(val).lower()}"^^xsd:boolean ;')
            elif isinstance(val, float):
                # NOTE(review): str(float) can yield exponent notation
                # (e.g. "1e-05"), which is not a valid xsd:decimal lexical form.
                out.append(f' atlas:{key:<26} "{val}"^^xsd:decimal ;')
            elif isinstance(val, int):
                out.append(f' atlas:{key:<26} "{val}"^^xsd:integer ;')
            else:
                out.append(f' atlas:{key:<26} "{esc(val)}" ;')
        out.extend(f' atlas:rawJson "{esc(blob)}"^^xsd:string ;' for blob in self.raw_json)
        out.extend(
            f" atlas:hasClaim {self._claim_node_id(claim)} ;" for claim in self.claims
        )
        # needsCuration is always the last or second-to-last predicate, so it
        # decides whether the statement continues (";") or terminates (".").
        curation = f" atlas:needsCuration {str(self.needs_curation).lower()}"
        if self.curate_flag:
            out.append(curation + " ;")
            out.append(f" atlas:hasCurateFlag {entity_iri}_curate .")
        else:
            out.append(curation + " .")
        out.append("")
        return out

    def _identifier_node_lines(self) -> List[str]:
        """Return the Turtle lines of all identifier nodes (empty if none)."""
        if not self.identifiers:
            return []
        esc = self._escape_literal
        out: List[str] = ["### Identifiers"]
        for ident in self.identifiers:
            out.extend(
                [
                    self._identifier_iri(ident),
                    " a atlas:Identifier ;",
                    f' atlas:scheme "{esc(ident.scheme)}" ;',
                    f' atlas:value "{esc(ident.value)}" .',
                    "",
                ]
            )
        return out

    def _claim_node_lines(self, entity_iri: str) -> List[str]:
        """Return the Turtle lines of all claim and provenance nodes."""
        if not self.claims:
            return []
        esc = self._escape_literal
        out: List[str] = ["### Claims"]
        for claim in self.claims:
            cid = self._claim_node_id(claim)
            pid = self._prov_id(cid)
            out.append(cid)
            out.append(" a atlas:Claim ;")
            out.append(f" atlas:claimSubjectIri {entity_iri} ;")
            out.append(f" atlas:claimPredicate {claim.predicate} ;")
            if claim.object_iri:
                out.append(f" atlas:claimObjectIri {claim.object_iri} ;")
            elif claim.object_literal is not None:
                # Only emit a literal object when one exists; formatting a
                # missing object would otherwise produce the string "None".
                out.append(f' atlas:claimObjectLiteral "{esc(claim.object_literal)}" ;')
            out.append(f' atlas:claimLayer "{esc(claim.layer)}" ;')
            status = f' atlas:claimStatus "{esc(claim.status)}"'
            prov = claim.provenance
            if prov:
                out.append(status + " ;")
                out.append(f" atlas:hasProvenance {pid} .")
            else:
                out.append(status + " .")
            out.append("")
            if prov:
                out.extend(
                    [
                        pid,
                        " a atlas:Provenance ;",
                        f' atlas:provenanceSource "{esc(prov.source)}" ;',
                        f' atlas:retrievalMethod "{esc(prov.method)}" ;',
                        f' atlas:confidence "{prov.confidence}"^^xsd:decimal ;',
                        f' atlas:retrievedAt "{esc(prov.retrieved_at)}"^^xsd:dateTime .',
                        "",
                    ]
                )
        return out

    def _curate_flag_node_lines(self, entity_iri: str) -> List[str]:
        """Return the Turtle lines of the curation-flag node (empty if unset)."""
        if not self.curate_flag:
            return []
        reason = self._escape_literal(self.curate_flag.reason)
        return [
            "### Curation flag",
            f"{entity_iri}_curate",
            " a atlas:CurateFlag ;",
            f' atlas:curationReason "{reason}"@en .',
            "",
        ]
- # ---------------------------------------------------------------------------
- # Usage example — reproduces joe.ttl
- # ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Provenance of the two identifier claims, one per upstream source.
    trends_provenance = Provenance(
        source="google-trends",
        method="trends-resolution",
        confidence=0.9,
        retrieved_at="2026-04-04T23:27:06Z",
    )
    wikidata_provenance = Provenance(
        source="wikidata",
        method="wbsearchentities + entitydata",
        confidence=0.99,
        retrieved_at="2026-04-04T23:27:09Z",
    )

    # Audit trail: both identifiers (raw layer) and the derived canonical type.
    example_claims = [
        Claim(
            predicate="atlas:hasIdentifier",
            object_iri="atlas_data:ident_google-mid__m_012gx2",
            layer="raw",
            provenance=trends_provenance,
        ),
        Claim(
            predicate="atlas:hasIdentifier",
            object_iri="atlas_data:ident_wikidata-qid_q6279",
            layer="raw",
            provenance=wikidata_provenance,
        ),
        Claim(
            predicate="atlas:hasCanonicalType",
            object_iri="atlas:Person",
            layer="derived",
        ),
    ]

    biden = Entity(
        id="atlas:1b0e7222c7730540",
        label="Joe Biden",
        description="46th President of the United States (2021\u20132025)",
        type="atlas:Person",
        aliases=["Joe Biden", "Biden", "Joseph Biden"],
        identifiers=[
            Identifier(scheme="google-mid", value="/m/012gx2"),
            Identifier(scheme="wikidata-qid", value="Q6279"),
        ],
        attributes={"isAlive": True},
        needs_curation=True,
        curate_flag=CurateFlag(
            reason="Fine-grained Trends type '46th U.S. President' not yet adjudicated."
        ),
        claims=example_claims,
    )

    # Attach the opaque source payloads in the same order: wikidata, then trends.
    source_payloads = (
        (
            "wikidata",
            {"qid": "Q6279", "label": "Joe Biden", "retrieved_at": "2026-04-04T23:27:09Z"},
        ),
        (
            "google-trends",
            {"mid": "/m/012gx2", "type": "46th U.S. President", "retrieved_at": "2026-04-04T23:27:06Z"},
        ),
    )
    for source_name, payload in source_payloads:
        biden.add_raw_json(source_name, payload)

    print(biden.to_turtle())
|