atlas_store.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427
  1. """
  2. atlas_store.py — SPARQL persistence for Atlas Entity objects.
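Public functions:
    save_entity(entity, endpoint)     — insert all triples into Virtuoso
    delete_entity(atlas_id, endpoint) — remove an entity and its linked sub-nodes
    load_entity(atlas_id, endpoint)   — reconstruct an Entity from the store

Tested against Virtuoso 7.x (SPARQL 1.1 Update endpoint).
NOTE: the transport helpers in this module only accept endpoints whose URL
contains "/mcp/sse" (Virtuoso MCP over SSE); this applies to reads as well
as writes.

Dependencies:
    pip install mcp
    (SPARQLWrapper was listed here previously, but this module does not import it.)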
  10. """
  11. from __future__ import annotations
  12. import json
  13. import re
  14. import asyncio
  15. from typing import Any, Dict, List, Optional
  16. from mcp import ClientSession
  17. from mcp.client.sse import sse_client
  18. from .atlas_model import Claim, CurateFlag, Entity, Identifier, Provenance
# ---------------------------------------------------------------------------
# Namespace constants — keep in sync with atlas.ttl
# ---------------------------------------------------------------------------
# Ontology namespace, bound to the `atlas:` prefix in PREFIXES below.
ATLAS = "http://world.eu.org/atlas_ontology#"
# Data namespace, bound to the `atlas_data:` prefix; entity IRIs in this
# module are built as <ATLAS_D + "entity_" + id>.
ATLAS_D = "http://world.eu.org/atlas_data#"
# Standard XML Schema datatype and RDF syntax namespaces.
XSD = "http://www.w3.org/2001/XMLSchema#"
RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
# PREFIX header placed in front of the SPARQL queries and updates built in
# this module. The trailing backslash after the opening quotes suppresses the
# leading newline, so the text starts directly with the first PREFIX line.
PREFIXES = f"""\
PREFIX atlas: <{ATLAS}>
PREFIX atlas_data: <{ATLAS_D}>
PREFIX xsd: <{XSD}>
PREFIX rdf: <{RDF}>
"""
  32. # ---------------------------------------------------------------------------
  33. # Internal helpers
  34. # ---------------------------------------------------------------------------
  35. def _full_iri(prefixed: str) -> str:
  36. """Expand a prefixed IRI to a full IRI string."""
  37. prefixed = prefixed.strip()
  38. if prefixed.startswith("atlas_data:"):
  39. return ATLAS_D + prefixed[len("atlas_data:"):]
  40. if prefixed.startswith("atlas:"):
  41. return ATLAS + prefixed[len("atlas:"):]
  42. if prefixed.startswith("<") and prefixed.endswith(">"):
  43. return prefixed[1:-1]
  44. return prefixed
  45. def _short_iri(full: str) -> str:
  46. """Compress a full IRI back to a prefixed form."""
  47. if full.startswith(ATLAS_D):
  48. return "atlas_data:" + full[len(ATLAS_D):]
  49. if full.startswith(ATLAS):
  50. return "atlas:" + full[len(ATLAS):]
  51. return f"<{full}>"
  52. def _local(full_iri: str) -> str:
  53. """Return the local name of a full IRI (after # or last /)."""
  54. for sep in ("#", "/"):
  55. idx = full_iri.rfind(sep)
  56. if idx != -1:
  57. return full_iri[idx + 1:]
  58. return full_iri
  59. def _escape(s: str) -> str:
  60. """Escape a string for embedding in a SPARQL triple-quoted literal."""
  61. return s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
  62. async def _sparql_update(endpoint: str, query: str) -> None:
  63. if "/mcp/sse" not in endpoint:
  64. raise RuntimeError("atlas_store only supports Virtuoso MCP/SSE endpoints in this scaffold")
  65. async def _run() -> None:
  66. async with sse_client(endpoint, timeout=10, sse_read_timeout=300) as (read_stream, write_stream):
  67. async with ClientSession(read_stream, write_stream) as session:
  68. await session.initialize()
  69. result = await session.call_tool("sparql_update", {"input": {"query": query}})
  70. if result.isError:
  71. raise RuntimeError(f"sparql_update failed: {result}")
  72. await _run()
  73. async def _sparql_select(endpoint: str, query: str) -> List[Dict[str, Any]]:
  74. if "/mcp/sse" not in endpoint:
  75. raise RuntimeError("atlas_store only supports Virtuoso MCP/SSE endpoints in this scaffold")
  76. async def _run() -> List[Dict[str, Any]]:
  77. async with sse_client(endpoint, timeout=10, sse_read_timeout=300) as (read_stream, write_stream):
  78. async with ClientSession(read_stream, write_stream) as session:
  79. await session.initialize()
  80. result = await session.call_tool("sparql_query", {"input": {"query": query}})
  81. if result.isError:
  82. raise RuntimeError(f"sparql_query failed: {result}")
  83. data = result.structuredContent if result.structuredContent is not None else result.content
  84. if isinstance(data, dict):
  85. return data.get("results", {}).get("bindings", []) or []
  86. return []
  87. return await _run()
  88. # ---------------------------------------------------------------------------
  89. # Save
  90. # ---------------------------------------------------------------------------
  91. def save_entity(entity: Entity, endpoint: str) -> None:
  92. """
  93. Insert all triples for an Entity into the SPARQL store.
  94. Uses SPARQL 1.1 INSERT DATA. Call delete_entity() first if you need
  95. to replace an existing entity (full replace pattern).
  96. Args:
  97. entity: The Entity to persist.
  98. endpoint: SPARQL Update endpoint URL,
  99. e.g. "http://localhost:8890/sparql-auth"
  100. """
  101. ttl_body = _build_insert_body(entity)
  102. query = f"{PREFIXES}\nINSERT DATA {{\n{ttl_body}\n}}"
  103. _sparql_update(endpoint, query)
  104. def delete_entity(atlas_id: str, endpoint: str) -> None:
  105. """
  106. Remove all triples where the entity or any of its blank/named nodes
  107. are the subject. Run before save_entity() for a clean replace.
  108. Args:
  109. atlas_id: e.g. "atlas:1b0e7222c7730540"
  110. endpoint: SPARQL Update endpoint URL.
  111. """
  112. entity_iri = f"<{ATLAS_D}entity_{atlas_id.replace('atlas:', '')}>"
  113. # Delete triples where entity is subject, plus all linked sub-nodes
  114. # (identifiers, claims, provenance, curate flag) via a SPARQL DELETE WHERE.
  115. query = f"""{PREFIXES}
  116. DELETE {{
  117. ?s ?p ?o .
  118. }}
  119. WHERE {{
  120. {{
  121. BIND({entity_iri} AS ?s)
  122. ?s ?p ?o .
  123. }}
  124. UNION
  125. {{
  126. {entity_iri} atlas:hasIdentifier ?s .
  127. ?s ?p ?o .
  128. }}
  129. UNION
  130. {{
  131. {entity_iri} atlas:hasClaim ?s .
  132. ?s ?p ?o .
  133. }}
  134. UNION
  135. {{
  136. {entity_iri} atlas:hasClaim ?claim .
  137. ?claim atlas:hasProvenance ?s .
  138. ?s ?p ?o .
  139. }}
  140. UNION
  141. {{
  142. {entity_iri} atlas:hasCurateFlag ?s .
  143. ?s ?p ?o .
  144. }}
  145. }}"""
  146. _sparql_update(endpoint, query)
  147. def _build_insert_body(entity: Entity) -> str:
  148. """Build the triple block (no INSERT DATA wrapper) for an Entity."""
  149. lines: List[str] = []
  150. e = f"<{_full_iri(entity._entity_iri())}>"
  151. # --- Entity core ---
  152. lines += [
  153. f" {e} a atlas:Entity ;",
  154. f' atlas:atlasId "{_escape(entity.id)}" ;',
  155. f' atlas:canonicalLabel "{_escape(entity.label)}"@en ;',
  156. ]
  157. if entity.description:
  158. lines.append(f' atlas:canonicalDescription "{_escape(entity.description)}"@en ;')
  159. if entity.type:
  160. lines.append(f' atlas:hasCanonicalType <{_full_iri(entity.type)}> ;')
  161. for alias in entity.aliases:
  162. lines.append(f' atlas:aliasLabel "{_escape(alias)}"@en ;')
  163. for ident in entity.identifiers:
  164. iiri = f"<{_full_iri(entity._identifier_iri(ident))}>"
  165. lines.append(f' atlas:hasIdentifier {iiri} ;')
  166. for key, val in entity.attributes.items():
  167. if isinstance(val, bool):
  168. lines.append(f' atlas:{key} "{str(val).lower()}"^^xsd:boolean ;')
  169. elif isinstance(val, float):
  170. lines.append(f' atlas:{key} "{val}"^^xsd:decimal ;')
  171. elif isinstance(val, int):
  172. lines.append(f' atlas:{key} "{val}"^^xsd:integer ;')
  173. else:
  174. lines.append(f' atlas:{key} "{_escape(str(val))}" ;')
  175. for blob in entity.raw_json:
  176. lines.append(f' atlas:rawJson "{_escape(blob)}"^^xsd:string ;')
  177. for claim in entity.claims:
  178. obj = claim.object_iri or claim.object_literal or ""
  179. cid = f"<{_full_iri(entity._claim_id(claim.predicate, obj))}>"
  180. lines.append(f' atlas:hasClaim {cid} ;')
  181. lines.append(f' atlas:needsCuration "{str(entity.needs_curation).lower()}"^^xsd:boolean')
  182. if entity.curate_flag:
  183. lines[-1] += " ;"
  184. curate_iri = f"<{_full_iri(entity._entity_iri())}_curate>"
  185. lines.append(f' atlas:hasCurateFlag {curate_iri}')
  186. lines[-1] += " ."
  187. # --- Identifier nodes ---
  188. for ident in entity.identifiers:
  189. iiri = f"<{_full_iri(entity._identifier_iri(ident))}>"
  190. lines += [
  191. f" {iiri} a atlas:Identifier ;",
  192. f' atlas:scheme "{_escape(ident.scheme)}" ;',
  193. f' atlas:value "{_escape(ident.value)}" .',
  194. ]
  195. # --- Claim + Provenance nodes ---
  196. for claim in entity.claims:
  197. obj = claim.object_iri or claim.object_literal or ""
  198. cid_str = entity._claim_id(claim.predicate, obj)
  199. cid = f"<{_full_iri(cid_str)}>"
  200. pid = f"<{_full_iri(entity._prov_id(cid_str))}>"
  201. pred_iri = f"<{_full_iri(claim.predicate)}>"
  202. lines += [
  203. f" {cid} a atlas:Claim ;",
  204. f' atlas:claimSubjectIri {e} ;',
  205. f' atlas:claimPredicate {pred_iri} ;',
  206. ]
  207. if claim.object_iri:
  208. lines.append(f' atlas:claimObjectIri <{_full_iri(claim.object_iri)}> ;')
  209. elif claim.object_literal:
  210. lines.append(f' atlas:claimObjectLiteral "{_escape(claim.object_literal)}" ;')
  211. lines += [
  212. f' atlas:claimLayer "{claim.layer}" ;',
  213. f' atlas:claimStatus "{claim.status}"',
  214. ]
  215. if claim.provenance:
  216. lines[-1] += " ;"
  217. lines.append(f' atlas:hasProvenance {pid}')
  218. lines[-1] += " ."
  219. if claim.provenance:
  220. p = claim.provenance
  221. lines += [
  222. f" {pid} a atlas:Provenance ;",
  223. f' atlas:provenanceSource "{_escape(p.source)}" ;',
  224. f' atlas:retrievalMethod "{_escape(p.method)}" ;',
  225. f' atlas:confidence "{p.confidence}"^^xsd:decimal ;',
  226. f' atlas:retrievedAt "{p.retrieved_at}"^^xsd:dateTime .',
  227. ]
  228. # --- CurateFlag node ---
  229. if entity.curate_flag:
  230. curate_iri = f"<{_full_iri(entity._entity_iri())}_curate>"
  231. lines += [
  232. f" {curate_iri} a atlas:CurateFlag ;",
  233. f' atlas:curationReason "{_escape(entity.curate_flag.reason)}"@en .',
  234. ]
  235. return "\n".join(lines)
  236. # ---------------------------------------------------------------------------
  237. # Load
  238. # ---------------------------------------------------------------------------
  239. def load_entity(atlas_id: str, endpoint: str) -> Optional[Entity]:
  240. """
  241. Reconstruct an Entity object from the SPARQL store.
  242. Returns None if the entity does not exist.
  243. Args:
  244. atlas_id: e.g. "atlas:1b0e7222c7730540"
  245. endpoint: SPARQL Query endpoint URL,
  246. e.g. "http://localhost:8890/sparql"
  247. """
  248. entity_iri = f"<{ATLAS_D}entity_{atlas_id.replace('atlas:', '')}>"
  249. # --- 1. Core entity fields ---
  250. core = _sparql_select(endpoint, f"""{PREFIXES}
  251. SELECT ?label ?description ?type ?needsCuration WHERE {{
  252. {entity_iri} atlas:canonicalLabel ?label .
  253. OPTIONAL {{ {entity_iri} atlas:canonicalDescription ?description . }}
  254. OPTIONAL {{ {entity_iri} atlas:hasCanonicalType ?type . }}
  255. OPTIONAL {{ {entity_iri} atlas:needsCuration ?needsCuration . }}
  256. }}""")
  257. if not core:
  258. return None
  259. row = core[0]
  260. label = row["label"]["value"]
  261. description = row.get("description", {}).get("value")
  262. type_full = row.get("type", {}).get("value")
  263. entity_type = _short_iri(type_full) if type_full else None
  264. needs_curation = row.get("needsCuration", {}).get("value", "false").lower() == "true"
  265. # --- 2. Aliases ---
  266. alias_rows = _sparql_select(endpoint, f"""{PREFIXES}
  267. SELECT ?alias WHERE {{
  268. {entity_iri} atlas:aliasLabel ?alias .
  269. }}""")
  270. aliases = [r["alias"]["value"] for r in alias_rows]
  271. # --- 3. Identifiers ---
  272. ident_rows = _sparql_select(endpoint, f"""{PREFIXES}
  273. SELECT ?scheme ?value WHERE {{
  274. {entity_iri} atlas:hasIdentifier ?ident .
  275. ?ident atlas:scheme ?scheme ;
  276. atlas:value ?value .
  277. }}""")
  278. identifiers = [
  279. Identifier(scheme=r["scheme"]["value"], value=r["value"]["value"])
  280. for r in ident_rows
  281. ]
  282. # --- 4. Attributes (any atlas: datatype property not covered above) ---
  283. KNOWN = {
  284. "atlasId", "canonicalLabel", "canonicalDescription",
  285. "aliasLabel", "needsCuration", "rawJson",
  286. }
  287. attr_rows = _sparql_select(endpoint, f"""{PREFIXES}
  288. SELECT ?pred ?val WHERE {{
  289. {entity_iri} ?pred ?val .
  290. FILTER(STRSTARTS(STR(?pred), "{ATLAS}"))
  291. FILTER(isLiteral(?val))
  292. }}""")
  293. attributes: Dict[str, Any] = {}
  294. raw_json: List[str] = []
  295. for r in attr_rows:
  296. pred_local = _local(r["pred"]["value"])
  297. if pred_local in KNOWN:
  298. continue
  299. val = r["val"]["value"]
  300. dt = r["val"].get("datatype", "")
  301. if pred_local == "rawJson":
  302. raw_json.append(val)
  303. elif dt.endswith("boolean"):
  304. attributes[pred_local] = val.lower() == "true"
  305. elif dt.endswith("decimal") or dt.endswith("float") or dt.endswith("double"):
  306. attributes[pred_local] = float(val)
  307. elif dt.endswith("integer") or dt.endswith("int"):
  308. attributes[pred_local] = int(val)
  309. else:
  310. attributes[pred_local] = val
  311. # --- 5. Claims + Provenance ---
  312. claim_rows = _sparql_select(endpoint, f"""{PREFIXES}
  313. SELECT ?pred ?objIri ?objLit ?layer ?status
  314. ?provSource ?provMethod ?provConf ?provAt
  315. WHERE {{
  316. {entity_iri} atlas:hasClaim ?claim .
  317. ?claim atlas:claimPredicate ?pred ;
  318. atlas:claimLayer ?layer ;
  319. atlas:claimStatus ?status .
  320. OPTIONAL {{ ?claim atlas:claimObjectIri ?objIri . }}
  321. OPTIONAL {{ ?claim atlas:claimObjectLiteral ?objLit . }}
  322. OPTIONAL {{
  323. ?claim atlas:hasProvenance ?prov .
  324. ?prov atlas:provenanceSource ?provSource ;
  325. atlas:retrievalMethod ?provMethod ;
  326. atlas:confidence ?provConf ;
  327. atlas:retrievedAt ?provAt .
  328. }}
  329. }}""")
  330. claims: List[Claim] = []
  331. for r in claim_rows:
  332. prov = None
  333. if "provSource" in r:
  334. prov = Provenance(
  335. source=r["provSource"]["value"],
  336. method=r["provMethod"]["value"],
  337. confidence=float(r["provConf"]["value"]),
  338. retrieved_at=r["provAt"]["value"],
  339. )
  340. claims.append(Claim(
  341. predicate=_short_iri(r["pred"]["value"]),
  342. object_iri=_short_iri(r["objIri"]["value"]) if "objIri" in r else None,
  343. object_literal=r["objLit"]["value"] if "objLit" in r else None,
  344. layer=r["layer"]["value"],
  345. status=r["status"]["value"],
  346. provenance=prov,
  347. ))
  348. # --- 6. CurateFlag ---
  349. curate_flag = None
  350. curate_rows = _sparql_select(endpoint, f"""{PREFIXES}
  351. SELECT ?reason WHERE {{
  352. {entity_iri} atlas:hasCurateFlag ?flag .
  353. ?flag atlas:curationReason ?reason .
  354. }}""")
  355. if curate_rows:
  356. curate_flag = CurateFlag(reason=curate_rows[0]["reason"]["value"])
  357. return Entity(
  358. id=atlas_id,
  359. label=label,
  360. description=description,
  361. type=entity_type,
  362. aliases=aliases,
  363. identifiers=identifiers,
  364. attributes=attributes,
  365. raw_json=raw_json,
  366. claims=claims,
  367. needs_curation=needs_curation,
  368. curate_flag=curate_flag,
  369. )