# atlas_store.py
  1. """
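atlas_store.py — SPARQL persistence for Atlas Entity objects.

Public functions:
    save_entity(entity, endpoint)     — insert all triples for an Entity into the store
    delete_entity(atlas_id, endpoint) — remove an entity and the nodes linked from it
    load_entity(atlas_id, endpoint)   — reconstruct an Entity from the store

An endpoint URL containing "/mcp/sse" is reached through a remote MCP tool
call; any other URL is treated as a plain SPARQL endpoint.

Tested against Virtuoso 7.x (SPARQL 1.1 Update endpoint).
The read side works against any SPARQL 1.1 endpoint.

Dependencies:
    pip install SPARQLWrapper mcp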
  10. """
  11. from __future__ import annotations
  12. import json
  13. import re
  14. import asyncio
  15. from typing import Any, Dict, List, Optional
  16. from SPARQLWrapper import JSON, POST, SPARQLWrapper
  17. from mcp import ClientSession
  18. from mcp.client.sse import sse_client
  19. from atlas_model import Claim, CurateFlag, Entity, Identifier, Provenance
# ---------------------------------------------------------------------------
# Namespace constants — keep in sync with atlas.ttl
# ---------------------------------------------------------------------------
# Namespace behind the `atlas:` prefix (classes and properties used in the
# queries built by this module).
ATLAS = "http://world.eu.org/atlas_ontology#"
# Namespace behind the `atlas_data:` prefix; entity node IRIs are built
# under it by load_entity() and delete_entity().
ATLAS_D = "http://world.eu.org/atlas_data#"
# XML Schema datatypes namespace (xsd:boolean, xsd:decimal, ...).
XSD = "http://www.w3.org/2001/XMLSchema#"
# RDF syntax namespace; declared as a prefix below.
RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
# PREFIX header placed at the start of every SPARQL query/update string
# assembled in this module. The trailing backslash after the opening quotes
# suppresses the leading newline.
PREFIXES = f"""\
PREFIX atlas: <{ATLAS}>
PREFIX atlas_data: <{ATLAS_D}>
PREFIX xsd: <{XSD}>
PREFIX rdf: <{RDF}>
"""
  33. # ---------------------------------------------------------------------------
  34. # Internal helpers
  35. # ---------------------------------------------------------------------------
  36. def _full_iri(prefixed: str) -> str:
  37. """Expand a prefixed IRI to a full IRI string."""
  38. prefixed = prefixed.strip()
  39. if prefixed.startswith("atlas_data:"):
  40. return ATLAS_D + prefixed[len("atlas_data:"):]
  41. if prefixed.startswith("atlas:"):
  42. return ATLAS + prefixed[len("atlas:"):]
  43. if prefixed.startswith("<") and prefixed.endswith(">"):
  44. return prefixed[1:-1]
  45. return prefixed
  46. def _short_iri(full: str) -> str:
  47. """Compress a full IRI back to a prefixed form."""
  48. if full.startswith(ATLAS_D):
  49. return "atlas_data:" + full[len(ATLAS_D):]
  50. if full.startswith(ATLAS):
  51. return "atlas:" + full[len(ATLAS):]
  52. return f"<{full}>"
  53. def _local(full_iri: str) -> str:
  54. """Return the local name of a full IRI (after # or last /)."""
  55. for sep in ("#", "/"):
  56. idx = full_iri.rfind(sep)
  57. if idx != -1:
  58. return full_iri[idx + 1:]
  59. return full_iri
  60. def _escape(s: str) -> str:
  61. """Escape a string for embedding in a SPARQL triple-quoted literal."""
  62. return s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
  63. def _sparql_update(endpoint: str, query: str) -> None:
  64. # If endpoint looks like an MCP SSE URL, call the remote MCP tool.
  65. if "/mcp/sse" in endpoint:
  66. async def _run() -> None:
  67. async with sse_client(endpoint, timeout=10, sse_read_timeout=300) as (read_stream, write_stream):
  68. async with ClientSession(read_stream, write_stream) as session:
  69. await session.initialize()
  70. result = await session.call_tool("sparql_update", {"input": {"query": query}})
  71. if result.isError:
  72. raise RuntimeError(f"sparql_update failed: {result.error}")
  73. asyncio.run(_run())
  74. return
  75. # Fallback: plain SPARQL endpoint URL.
  76. sparql = SPARQLWrapper(endpoint)
  77. sparql.setMethod(POST)
  78. sparql.setQuery(query)
  79. sparql.setReturnFormat(JSON)
  80. sparql.query()
  81. def _sparql_select(endpoint: str, query: str) -> List[Dict[str, Any]]:
  82. # If endpoint looks like an MCP SSE URL, call the remote MCP tool.
  83. if "/mcp/sse" in endpoint:
  84. async def _run() -> List[Dict[str, Any]]:
  85. async with sse_client(endpoint, timeout=10, sse_read_timeout=300) as (read_stream, write_stream):
  86. async with ClientSession(read_stream, write_stream) as session:
  87. await session.initialize()
  88. result = await session.call_tool("sparql_query", {"input": {"query": query}})
  89. if result.isError:
  90. raise RuntimeError(f"sparql_query failed: {result.error}")
  91. data = result.structuredContent if result.structuredContent is not None else result.content
  92. if isinstance(data, dict):
  93. return data.get("results", {}).get("bindings", []) or []
  94. return []
  95. return asyncio.run(_run())
  96. # Fallback: plain SPARQL endpoint URL.
  97. sparql = SPARQLWrapper(endpoint)
  98. sparql.setMethod('GET')
  99. sparql.setQuery(query)
  100. sparql.setReturnFormat(JSON)
  101. res = sparql.query().convert()
  102. return res.get("results", {}).get("bindings", [])
  103. # ---------------------------------------------------------------------------
  104. # Save
  105. # ---------------------------------------------------------------------------
  106. def save_entity(entity: Entity, endpoint: str) -> None:
  107. """
  108. Insert all triples for an Entity into the SPARQL store.
  109. Uses SPARQL 1.1 INSERT DATA. Call delete_entity() first if you need
  110. to replace an existing entity (full replace pattern).
  111. Args:
  112. entity: The Entity to persist.
  113. endpoint: SPARQL Update endpoint URL,
  114. e.g. "http://localhost:8890/sparql-auth"
  115. """
  116. ttl_body = _build_insert_body(entity)
  117. query = f"{PREFIXES}\nINSERT DATA {{\n{ttl_body}\n}}"
  118. _sparql_update(endpoint, query)
  119. def delete_entity(atlas_id: str, endpoint: str) -> None:
  120. """
  121. Remove all triples where the entity or any of its blank/named nodes
  122. are the subject. Run before save_entity() for a clean replace.
  123. Args:
  124. atlas_id: e.g. "atlas:1b0e7222c7730540"
  125. endpoint: SPARQL Update endpoint URL.
  126. """
  127. entity_iri = f"<{ATLAS_D}entity_{atlas_id.replace('atlas:', '')}>"
  128. # Delete triples where entity is subject, plus all linked sub-nodes
  129. # (identifiers, claims, provenance, curate flag) via a SPARQL DELETE WHERE.
  130. query = f"""{PREFIXES}
  131. DELETE {{
  132. ?s ?p ?o .
  133. }}
  134. WHERE {{
  135. {{
  136. BIND({entity_iri} AS ?s)
  137. ?s ?p ?o .
  138. }}
  139. UNION
  140. {{
  141. {entity_iri} atlas:hasIdentifier ?s .
  142. ?s ?p ?o .
  143. }}
  144. UNION
  145. {{
  146. {entity_iri} atlas:hasClaim ?s .
  147. ?s ?p ?o .
  148. }}
  149. UNION
  150. {{
  151. {entity_iri} atlas:hasClaim ?claim .
  152. ?claim atlas:hasProvenance ?s .
  153. ?s ?p ?o .
  154. }}
  155. UNION
  156. {{
  157. {entity_iri} atlas:hasCurateFlag ?s .
  158. ?s ?p ?o .
  159. }}
  160. }}"""
  161. _sparql_update(endpoint, query)
  162. def _build_insert_body(entity: Entity) -> str:
  163. """Build the triple block (no INSERT DATA wrapper) for an Entity."""
  164. lines: List[str] = []
  165. e = f"<{_full_iri(entity._entity_iri())}>"
  166. # --- Entity core ---
  167. lines += [
  168. f" {e} a atlas:Entity ;",
  169. f' atlas:atlasId "{_escape(entity.id)}" ;',
  170. f' atlas:canonicalLabel "{_escape(entity.label)}"@en ;',
  171. ]
  172. if entity.description:
  173. lines.append(f' atlas:canonicalDescription "{_escape(entity.description)}"@en ;')
  174. if entity.type:
  175. lines.append(f' atlas:hasCanonicalType <{_full_iri(entity.type)}> ;')
  176. for alias in entity.aliases:
  177. lines.append(f' atlas:aliasLabel "{_escape(alias)}"@en ;')
  178. for ident in entity.identifiers:
  179. iiri = f"<{_full_iri(entity._identifier_iri(ident))}>"
  180. lines.append(f' atlas:hasIdentifier {iiri} ;')
  181. for key, val in entity.attributes.items():
  182. if isinstance(val, bool):
  183. lines.append(f' atlas:{key} "{str(val).lower()}"^^xsd:boolean ;')
  184. elif isinstance(val, float):
  185. lines.append(f' atlas:{key} "{val}"^^xsd:decimal ;')
  186. elif isinstance(val, int):
  187. lines.append(f' atlas:{key} "{val}"^^xsd:integer ;')
  188. else:
  189. lines.append(f' atlas:{key} "{_escape(str(val))}" ;')
  190. for blob in entity.raw_json:
  191. lines.append(f' atlas:rawJson "{_escape(blob)}"^^xsd:string ;')
  192. for claim in entity.claims:
  193. obj = claim.object_iri or claim.object_literal or ""
  194. cid = f"<{_full_iri(entity._claim_id(claim.predicate, obj))}>"
  195. lines.append(f' atlas:hasClaim {cid} ;')
  196. lines.append(f' atlas:needsCuration "{str(entity.needs_curation).lower()}"^^xsd:boolean')
  197. if entity.curate_flag:
  198. lines[-1] += " ;"
  199. curate_iri = f"<{_full_iri(entity._entity_iri())}_curate>"
  200. lines.append(f' atlas:hasCurateFlag {curate_iri}')
  201. lines[-1] += " ."
  202. # --- Identifier nodes ---
  203. for ident in entity.identifiers:
  204. iiri = f"<{_full_iri(entity._identifier_iri(ident))}>"
  205. lines += [
  206. f" {iiri} a atlas:Identifier ;",
  207. f' atlas:scheme "{_escape(ident.scheme)}" ;',
  208. f' atlas:value "{_escape(ident.value)}" .',
  209. ]
  210. # --- Claim + Provenance nodes ---
  211. for claim in entity.claims:
  212. obj = claim.object_iri or claim.object_literal or ""
  213. cid_str = entity._claim_id(claim.predicate, obj)
  214. cid = f"<{_full_iri(cid_str)}>"
  215. pid = f"<{_full_iri(entity._prov_id(cid_str))}>"
  216. pred_iri = f"<{_full_iri(claim.predicate)}>"
  217. lines += [
  218. f" {cid} a atlas:Claim ;",
  219. f' atlas:claimSubjectIri {e} ;',
  220. f' atlas:claimPredicate {pred_iri} ;',
  221. ]
  222. if claim.object_iri:
  223. lines.append(f' atlas:claimObjectIri <{_full_iri(claim.object_iri)}> ;')
  224. elif claim.object_literal:
  225. lines.append(f' atlas:claimObjectLiteral "{_escape(claim.object_literal)}" ;')
  226. lines += [
  227. f' atlas:claimLayer "{claim.layer}" ;',
  228. f' atlas:claimStatus "{claim.status}"',
  229. ]
  230. if claim.provenance:
  231. lines[-1] += " ;"
  232. lines.append(f' atlas:hasProvenance {pid}')
  233. lines[-1] += " ."
  234. if claim.provenance:
  235. p = claim.provenance
  236. lines += [
  237. f" {pid} a atlas:Provenance ;",
  238. f' atlas:provenanceSource "{_escape(p.source)}" ;',
  239. f' atlas:retrievalMethod "{_escape(p.method)}" ;',
  240. f' atlas:confidence "{p.confidence}"^^xsd:decimal ;',
  241. f' atlas:retrievedAt "{p.retrieved_at}"^^xsd:dateTime .',
  242. ]
  243. # --- CurateFlag node ---
  244. if entity.curate_flag:
  245. curate_iri = f"<{_full_iri(entity._entity_iri())}_curate>"
  246. lines += [
  247. f" {curate_iri} a atlas:CurateFlag ;",
  248. f' atlas:curationReason "{_escape(entity.curate_flag.reason)}"@en .',
  249. ]
  250. return "\n".join(lines)
  251. # ---------------------------------------------------------------------------
  252. # Load
  253. # ---------------------------------------------------------------------------
  254. def load_entity(atlas_id: str, endpoint: str) -> Optional[Entity]:
  255. """
  256. Reconstruct an Entity object from the SPARQL store.
  257. Returns None if the entity does not exist.
  258. Args:
  259. atlas_id: e.g. "atlas:1b0e7222c7730540"
  260. endpoint: SPARQL Query endpoint URL,
  261. e.g. "http://localhost:8890/sparql"
  262. """
  263. entity_iri = f"<{ATLAS_D}entity_{atlas_id.replace('atlas:', '')}>"
  264. # --- 1. Core entity fields ---
  265. core = _sparql_select(endpoint, f"""{PREFIXES}
  266. SELECT ?label ?description ?type ?needsCuration WHERE {{
  267. {entity_iri} atlas:canonicalLabel ?label .
  268. OPTIONAL {{ {entity_iri} atlas:canonicalDescription ?description . }}
  269. OPTIONAL {{ {entity_iri} atlas:hasCanonicalType ?type . }}
  270. OPTIONAL {{ {entity_iri} atlas:needsCuration ?needsCuration . }}
  271. }}""")
  272. if not core:
  273. return None
  274. row = core[0]
  275. label = row["label"]["value"]
  276. description = row.get("description", {}).get("value")
  277. type_full = row.get("type", {}).get("value")
  278. entity_type = _short_iri(type_full) if type_full else None
  279. needs_curation = row.get("needsCuration", {}).get("value", "false").lower() == "true"
  280. # --- 2. Aliases ---
  281. alias_rows = _sparql_select(endpoint, f"""{PREFIXES}
  282. SELECT ?alias WHERE {{
  283. {entity_iri} atlas:aliasLabel ?alias .
  284. }}""")
  285. aliases = [r["alias"]["value"] for r in alias_rows]
  286. # --- 3. Identifiers ---
  287. ident_rows = _sparql_select(endpoint, f"""{PREFIXES}
  288. SELECT ?scheme ?value WHERE {{
  289. {entity_iri} atlas:hasIdentifier ?ident .
  290. ?ident atlas:scheme ?scheme ;
  291. atlas:value ?value .
  292. }}""")
  293. identifiers = [
  294. Identifier(scheme=r["scheme"]["value"], value=r["value"]["value"])
  295. for r in ident_rows
  296. ]
  297. # --- 4. Attributes (any atlas: datatype property not covered above) ---
  298. KNOWN = {
  299. "atlasId", "canonicalLabel", "canonicalDescription",
  300. "aliasLabel", "needsCuration", "rawJson",
  301. }
  302. attr_rows = _sparql_select(endpoint, f"""{PREFIXES}
  303. SELECT ?pred ?val WHERE {{
  304. {entity_iri} ?pred ?val .
  305. FILTER(STRSTARTS(STR(?pred), "{ATLAS}"))
  306. FILTER(isLiteral(?val))
  307. }}""")
  308. attributes: Dict[str, Any] = {}
  309. raw_json: List[str] = []
  310. for r in attr_rows:
  311. pred_local = _local(r["pred"]["value"])
  312. if pred_local in KNOWN:
  313. continue
  314. val = r["val"]["value"]
  315. dt = r["val"].get("datatype", "")
  316. if pred_local == "rawJson":
  317. raw_json.append(val)
  318. elif dt.endswith("boolean"):
  319. attributes[pred_local] = val.lower() == "true"
  320. elif dt.endswith("decimal") or dt.endswith("float") or dt.endswith("double"):
  321. attributes[pred_local] = float(val)
  322. elif dt.endswith("integer") or dt.endswith("int"):
  323. attributes[pred_local] = int(val)
  324. else:
  325. attributes[pred_local] = val
  326. # --- 5. Claims + Provenance ---
  327. claim_rows = _sparql_select(endpoint, f"""{PREFIXES}
  328. SELECT ?pred ?objIri ?objLit ?layer ?status
  329. ?provSource ?provMethod ?provConf ?provAt
  330. WHERE {{
  331. {entity_iri} atlas:hasClaim ?claim .
  332. ?claim atlas:claimPredicate ?pred ;
  333. atlas:claimLayer ?layer ;
  334. atlas:claimStatus ?status .
  335. OPTIONAL {{ ?claim atlas:claimObjectIri ?objIri . }}
  336. OPTIONAL {{ ?claim atlas:claimObjectLiteral ?objLit . }}
  337. OPTIONAL {{
  338. ?claim atlas:hasProvenance ?prov .
  339. ?prov atlas:provenanceSource ?provSource ;
  340. atlas:retrievalMethod ?provMethod ;
  341. atlas:confidence ?provConf ;
  342. atlas:retrievedAt ?provAt .
  343. }}
  344. }}""")
  345. claims: List[Claim] = []
  346. for r in claim_rows:
  347. prov = None
  348. if "provSource" in r:
  349. prov = Provenance(
  350. source=r["provSource"]["value"],
  351. method=r["provMethod"]["value"],
  352. confidence=float(r["provConf"]["value"]),
  353. retrieved_at=r["provAt"]["value"],
  354. )
  355. claims.append(Claim(
  356. predicate=_short_iri(r["pred"]["value"]),
  357. object_iri=_short_iri(r["objIri"]["value"]) if "objIri" in r else None,
  358. object_literal=r["objLit"]["value"] if "objLit" in r else None,
  359. layer=r["layer"]["value"],
  360. status=r["status"]["value"],
  361. provenance=prov,
  362. ))
  363. # --- 6. CurateFlag ---
  364. curate_flag = None
  365. curate_rows = _sparql_select(endpoint, f"""{PREFIXES}
  366. SELECT ?reason WHERE {{
  367. {entity_iri} atlas:hasCurateFlag ?flag .
  368. ?flag atlas:curationReason ?reason .
  369. }}""")
  370. if curate_rows:
  371. curate_flag = CurateFlag(reason=curate_rows[0]["reason"]["value"])
  372. return Entity(
  373. id=atlas_id,
  374. label=label,
  375. description=description,
  376. type=entity_type,
  377. aliases=aliases,
  378. identifiers=identifiers,
  379. attributes=attributes,
  380. raw_json=raw_json,
  381. claims=claims,
  382. needs_curation=needs_curation,
  383. curate_flag=curate_flag,
  384. )