"""
atlas_store.py — SPARQL persistence for Atlas Entity objects.

Public functions:
    save_entity(entity, endpoint)    — insert all triples into Virtuoso
    load_entity(atlas_id, endpoint)  — reconstruct an Entity from the store

Tested against Virtuoso 7.x (SPARQL 1.1 Update endpoint).

Note: the transport helpers in this module talk to the store through MCP
tools over SSE and reject any endpoint URL that does not contain "/mcp/sse".

Dependencies:
    pip install mcp   # this module imports mcp.ClientSession and mcp.client.sse
"""
  11. from __future__ import annotations
  12. import json
  13. import re
  14. import asyncio
  15. import os
  16. from typing import Any, Dict, List, Optional
  17. import logging
  18. from mcp import ClientSession
  19. from mcp.client.sse import sse_client
  20. from .atlas_model import Claim, CurateFlag, Entity, Identifier, Provenance
  21. # ---------------------------------------------------------------------------
  22. # Namespace constants — keep in sync with atlas.ttl
  23. # ---------------------------------------------------------------------------
  24. ATLAS = "http://world.eu.org/atlas_ontology#"
  25. ATLAS_D = "http://world.eu.org/atlas_data#"
  26. XSD = "http://www.w3.org/2001/XMLSchema#"
  27. RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  28. PREFIXES = f"""\
  29. PREFIX atlas: <{ATLAS}>
  30. PREFIX atlas_data: <{ATLAS_D}>
  31. PREFIX xsd: <{XSD}>
  32. PREFIX rdf: <{RDF}>
  33. """
  34. DEFAULT_GRAPH_IRI = os.getenv("ATLAS_GRAPH_IRI", ATLAS_D)
  35. DEBUG_LOGS = os.getenv("ATLAS_DEBUG_LOGS", "false").lower() in {"1", "true", "yes", "on"}
  36. logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
  40. def _full_iri(prefixed: str) -> str:
  41. """Expand a prefixed IRI to a full IRI string."""
  42. prefixed = prefixed.strip()
  43. if prefixed.startswith("atlas_data:"):
  44. return ATLAS_D + prefixed[len("atlas_data:"):]
  45. if prefixed.startswith("atlas:"):
  46. return ATLAS + prefixed[len("atlas:"):]
  47. if prefixed.startswith("<") and prefixed.endswith(">"):
  48. return prefixed[1:-1]
  49. return prefixed
  50. def _short_iri(full: str) -> str:
  51. """Compress a full IRI back to a prefixed form."""
  52. if full.startswith(ATLAS_D):
  53. return "atlas_data:" + full[len(ATLAS_D):]
  54. if full.startswith(ATLAS):
  55. return "atlas:" + full[len(ATLAS):]
  56. return f"<{full}>"
  57. def _local(full_iri: str) -> str:
  58. """Return the local name of a full IRI (after # or last /)."""
  59. for sep in ("#", "/"):
  60. idx = full_iri.rfind(sep)
  61. if idx != -1:
  62. return full_iri[idx + 1:]
  63. return full_iri
  64. def _escape(s: str) -> str:
  65. """Escape a string for embedding in a SPARQL triple-quoted literal."""
  66. return s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
  67. async def _sparql_update(endpoint: str, query: str) -> None:
  68. if "/mcp/sse" not in endpoint:
  69. raise RuntimeError("atlas_store only supports Virtuoso MCP/SSE endpoints in this scaffold")
  70. async def _run() -> None:
  71. async with sse_client(endpoint, timeout=10, sse_read_timeout=300) as (read_stream, write_stream):
  72. async with ClientSession(read_stream, write_stream) as session:
  73. await session.initialize()
  74. result = await session.call_tool("sparql_update", {"input": {"query": query}})
  75. if result.isError:
  76. raise RuntimeError(f"sparql_update failed: {result}")
  77. await _run()
  78. async def _sparql_select(endpoint: str, query: str) -> List[Dict[str, Any]]:
  79. if "/mcp/sse" not in endpoint:
  80. raise RuntimeError("atlas_store only supports Virtuoso MCP/SSE endpoints in this scaffold")
  81. async def _run() -> List[Dict[str, Any]]:
  82. async with sse_client(endpoint, timeout=10, sse_read_timeout=300) as (read_stream, write_stream):
  83. async with ClientSession(read_stream, write_stream) as session:
  84. await session.initialize()
  85. result = await session.call_tool("sparql_query", {"input": {"query": query}})
  86. if result.isError:
  87. raise RuntimeError(f"sparql_query failed: {result}")
  88. data = result.structuredContent if result.structuredContent is not None else result.content
  89. if DEBUG_LOGS:
  90. if isinstance(data, dict):
  91. logger.info("sparql_select raw keys=%s", list(data.keys()))
  92. else:
  93. logger.info("sparql_select raw type=%s", type(data).__name__)
  94. # Some MCP servers return content as a list of TextContent items.
  95. if isinstance(data, list) and data:
  96. first = data[0]
  97. text = getattr(first, "text", None)
  98. if text:
  99. try:
  100. data = json.loads(text)
  101. if DEBUG_LOGS and isinstance(data, dict):
  102. logger.info("sparql_select decoded list->dict keys=%s", list(data.keys()))
  103. except Exception:
  104. if DEBUG_LOGS:
  105. logger.info("sparql_select could not decode list text as JSON")
  106. if isinstance(data, dict):
  107. bindings = data.get("results", {}).get("bindings", []) or []
  108. if DEBUG_LOGS:
  109. logger.info("sparql_select extracted bindings=%s", len(bindings) if bindings is not None else 0)
  110. return bindings
  111. return []
  112. return await _run()
  113. async def load_entity_by_subject(subject: str, endpoint: str, graph_iri: str = DEFAULT_GRAPH_IRI) -> Dict[str, Any] | None:
  114. needle = _escape((subject or "").strip())
  115. if not needle:
  116. return None
  117. label_query = f"""
  118. {PREFIXES}
  119. SELECT ?atlasId ?label ?type ?qid WHERE {{
  120. VALUES ?needle {{ "{needle}" }}
  121. GRAPH <{graph_iri}> {{
  122. ?entity a atlas:Entity ;
  123. atlas:atlasId ?atlasId ;
  124. atlas:canonicalLabel ?label .
  125. OPTIONAL {{ ?entity atlas:hasCanonicalType ?type . }}
  126. OPTIONAL {{ ?entity atlas:hasIdentifier ?ident . ?ident atlas:scheme "wikidata-qid" ; atlas:value ?qid . }}
  127. FILTER(LCASE(STR(?label)) = LCASE(STR(?needle)))
  128. }}
  129. }}
  130. LIMIT 1
  131. """.strip()
  132. alias_query = f"""
  133. {PREFIXES}
  134. SELECT ?atlasId ?label ?type ?qid ?alias WHERE {{
  135. VALUES ?needle {{ "{needle}" }}
  136. GRAPH <{graph_iri}> {{
  137. ?entity a atlas:Entity ;
  138. atlas:atlasId ?atlasId ;
  139. atlas:aliasLabel ?alias .
  140. OPTIONAL {{ ?entity atlas:canonicalLabel ?label . }}
  141. OPTIONAL {{ ?entity atlas:hasCanonicalType ?type . }}
  142. OPTIONAL {{ ?entity atlas:hasIdentifier ?ident . ?ident atlas:scheme "wikidata-qid" ; atlas:value ?qid . }}
  143. FILTER(LCASE(STR(?alias)) = LCASE(STR(?needle)))
  144. }}
  145. }}
  146. LIMIT 1
  147. """.strip()
  148. if DEBUG_LOGS:
  149. logger.info("store lookup by subject: needle=%s graph=%s", (subject or "").strip(), graph_iri)
  150. logger.info("store label query=%s", label_query)
  151. rows = await _sparql_select(endpoint, label_query)
  152. if not rows:
  153. if DEBUG_LOGS:
  154. logger.info("store alias query=%s", alias_query)
  155. rows = await _sparql_select(endpoint, alias_query)
  156. if DEBUG_LOGS:
  157. logger.info("store lookup rows=%s", len(rows) if rows else 0)
  158. if not rows:
  159. return None
  160. row = rows[0]
  161. type_value = row.get("type", {}).get("value")
  162. if type_value and type_value.startswith(ATLAS):
  163. type_value = f"atlas:{type_value.split('#', 1)[-1]}"
  164. return {
  165. "atlas_id": row.get("atlasId", {}).get("value"),
  166. "label": row.get("label", {}).get("value"),
  167. "type": type_value,
  168. "wikidata_id": row.get("qid", {}).get("value"),
  169. "alias": row.get("alias", {}).get("value"),
  170. }
  171. async def save_entity_minimal(entity: Entity, endpoint: str, graph_iri: str = DEFAULT_GRAPH_IRI) -> None:
  172. body = _build_insert_body(entity)
  173. query = f"""
  174. {PREFIXES}
  175. INSERT DATA {{
  176. GRAPH <{graph_iri}> {{
  177. {body}
  178. }}
  179. }}
  180. """.strip()
  181. await _sparql_update(endpoint, query)
# ---------------------------------------------------------------------------
# Save
# ---------------------------------------------------------------------------
  185. def save_entity(entity: Entity, endpoint: str) -> None:
  186. """
  187. Insert all triples for an Entity into the SPARQL store.
  188. Uses SPARQL 1.1 INSERT DATA. Call delete_entity() first if you need
  189. to replace an existing entity (full replace pattern).
  190. Args:
  191. entity: The Entity to persist.
  192. endpoint: SPARQL Update endpoint URL,
  193. e.g. "http://localhost:8890/sparql-auth"
  194. """
  195. ttl_body = _build_insert_body(entity)
  196. query = f"{PREFIXES}\nINSERT DATA {{\n{ttl_body}\n}}"
  197. _sparql_update(endpoint, query)
  198. def delete_entity(atlas_id: str, endpoint: str) -> None:
  199. """
  200. Remove all triples where the entity or any of its blank/named nodes
  201. are the subject. Run before save_entity() for a clean replace.
  202. Args:
  203. atlas_id: e.g. "atlas:1b0e7222c7730540"
  204. endpoint: SPARQL Update endpoint URL.
  205. """
  206. entity_iri = f"<{ATLAS_D}entity_{atlas_id.replace('atlas:', '')}>"
  207. # Delete triples where entity is subject, plus all linked sub-nodes
  208. # (identifiers, claims, provenance, curate flag) via a SPARQL DELETE WHERE.
  209. query = f"""{PREFIXES}
  210. DELETE {{
  211. ?s ?p ?o .
  212. }}
  213. WHERE {{
  214. {{
  215. BIND({entity_iri} AS ?s)
  216. ?s ?p ?o .
  217. }}
  218. UNION
  219. {{
  220. {entity_iri} atlas:hasIdentifier ?s .
  221. ?s ?p ?o .
  222. }}
  223. UNION
  224. {{
  225. {entity_iri} atlas:hasClaim ?s .
  226. ?s ?p ?o .
  227. }}
  228. UNION
  229. {{
  230. {entity_iri} atlas:hasClaim ?claim .
  231. ?claim atlas:hasProvenance ?s .
  232. ?s ?p ?o .
  233. }}
  234. UNION
  235. {{
  236. {entity_iri} atlas:hasCurateFlag ?s .
  237. ?s ?p ?o .
  238. }}
  239. }}"""
  240. _sparql_update(endpoint, query)
  241. def _build_insert_body(entity: Entity) -> str:
  242. """Build the triple block (no INSERT DATA wrapper) for an Entity."""
  243. lines: List[str] = []
  244. e = f"<{_full_iri(entity._entity_iri())}>"
  245. # --- Entity core ---
  246. lines += [
  247. f" {e} a atlas:Entity ;",
  248. f' atlas:atlasId "{_escape(entity.id)}" ;',
  249. f' atlas:canonicalLabel "{_escape(entity.label)}"@en ;',
  250. ]
  251. if entity.description:
  252. lines.append(f' atlas:canonicalDescription "{_escape(entity.description)}"@en ;')
  253. if entity.type:
  254. lines.append(f' atlas:hasCanonicalType <{_full_iri(entity.type)}> ;')
  255. for alias in entity.aliases:
  256. lines.append(f' atlas:aliasLabel "{_escape(alias)}"@en ;')
  257. for ident in entity.identifiers:
  258. iiri = f"<{_full_iri(entity._identifier_iri(ident))}>"
  259. lines.append(f' atlas:hasIdentifier {iiri} ;')
  260. for key, val in entity.attributes.items():
  261. if isinstance(val, bool):
  262. lines.append(f' atlas:{key} "{str(val).lower()}"^^xsd:boolean ;')
  263. elif isinstance(val, float):
  264. lines.append(f' atlas:{key} "{val}"^^xsd:decimal ;')
  265. elif isinstance(val, int):
  266. lines.append(f' atlas:{key} "{val}"^^xsd:integer ;')
  267. else:
  268. lines.append(f' atlas:{key} "{_escape(str(val))}" ;')
  269. for blob in entity.raw_json:
  270. lines.append(f' atlas:rawJson "{_escape(blob)}"^^xsd:string ;')
  271. for claim in entity.claims:
  272. obj = claim.object_iri or claim.object_literal or ""
  273. cid = f"<{_full_iri(entity._claim_id(claim.predicate, obj))}>"
  274. lines.append(f' atlas:hasClaim {cid} ;')
  275. lines.append(f' atlas:needsCuration "{str(entity.needs_curation).lower()}"^^xsd:boolean')
  276. if entity.curate_flag:
  277. lines[-1] += " ;"
  278. curate_iri = f"<{_full_iri(entity._entity_iri())}_curate>"
  279. lines.append(f' atlas:hasCurateFlag {curate_iri}')
  280. lines[-1] += " ."
  281. # --- Identifier nodes ---
  282. for ident in entity.identifiers:
  283. iiri = f"<{_full_iri(entity._identifier_iri(ident))}>"
  284. lines += [
  285. f" {iiri} a atlas:Identifier ;",
  286. f' atlas:scheme "{_escape(ident.scheme)}" ;',
  287. f' atlas:value "{_escape(ident.value)}" .',
  288. ]
  289. # --- Claim + Provenance nodes ---
  290. for claim in entity.claims:
  291. obj = claim.object_iri or claim.object_literal or ""
  292. cid_str = entity._claim_id(claim.predicate, obj)
  293. cid = f"<{_full_iri(cid_str)}>"
  294. pid = f"<{_full_iri(entity._prov_id(cid_str))}>"
  295. pred_iri = f"<{_full_iri(claim.predicate)}>"
  296. lines += [
  297. f" {cid} a atlas:Claim ;",
  298. f' atlas:claimSubjectIri {e} ;',
  299. f' atlas:claimPredicate {pred_iri} ;',
  300. ]
  301. if claim.object_iri:
  302. lines.append(f' atlas:claimObjectIri <{_full_iri(claim.object_iri)}> ;')
  303. elif claim.object_literal:
  304. lines.append(f' atlas:claimObjectLiteral "{_escape(claim.object_literal)}" ;')
  305. lines += [
  306. f' atlas:claimLayer "{claim.layer}" ;',
  307. f' atlas:claimStatus "{claim.status}"',
  308. ]
  309. if claim.provenance:
  310. lines[-1] += " ;"
  311. lines.append(f' atlas:hasProvenance {pid}')
  312. lines[-1] += " ."
  313. if claim.provenance:
  314. p = claim.provenance
  315. lines += [
  316. f" {pid} a atlas:Provenance ;",
  317. f' atlas:provenanceSource "{_escape(p.source)}" ;',
  318. f' atlas:retrievalMethod "{_escape(p.method)}" ;',
  319. f' atlas:confidence "{p.confidence}"^^xsd:decimal ;',
  320. f' atlas:retrievedAt "{p.retrieved_at}"^^xsd:dateTime .',
  321. ]
  322. # --- CurateFlag node ---
  323. if entity.curate_flag:
  324. curate_iri = f"<{_full_iri(entity._entity_iri())}_curate>"
  325. lines += [
  326. f" {curate_iri} a atlas:CurateFlag ;",
  327. f' atlas:curationReason "{_escape(entity.curate_flag.reason)}"@en .',
  328. ]
  329. return "\n".join(lines)
# ---------------------------------------------------------------------------
# Load
# ---------------------------------------------------------------------------
  333. def load_entity(atlas_id: str, endpoint: str) -> Optional[Entity]:
  334. """
  335. Reconstruct an Entity object from the SPARQL store.
  336. Returns None if the entity does not exist.
  337. Args:
  338. atlas_id: e.g. "atlas:1b0e7222c7730540"
  339. endpoint: SPARQL Query endpoint URL,
  340. e.g. "http://localhost:8890/sparql"
  341. """
  342. entity_iri = f"<{ATLAS_D}entity_{atlas_id.replace('atlas:', '')}>"
  343. # --- 1. Core entity fields ---
  344. core = _sparql_select(endpoint, f"""{PREFIXES}
  345. SELECT ?label ?description ?type ?needsCuration WHERE {{
  346. {entity_iri} atlas:canonicalLabel ?label .
  347. OPTIONAL {{ {entity_iri} atlas:canonicalDescription ?description . }}
  348. OPTIONAL {{ {entity_iri} atlas:hasCanonicalType ?type . }}
  349. OPTIONAL {{ {entity_iri} atlas:needsCuration ?needsCuration . }}
  350. }}""")
  351. if not core:
  352. return None
  353. row = core[0]
  354. label = row["label"]["value"]
  355. description = row.get("description", {}).get("value")
  356. type_full = row.get("type", {}).get("value")
  357. entity_type = _short_iri(type_full) if type_full else None
  358. needs_curation = row.get("needsCuration", {}).get("value", "false").lower() == "true"
  359. # --- 2. Aliases ---
  360. alias_rows = _sparql_select(endpoint, f"""{PREFIXES}
  361. SELECT ?alias WHERE {{
  362. {entity_iri} atlas:aliasLabel ?alias .
  363. }}""")
  364. aliases = [r["alias"]["value"] for r in alias_rows]
  365. # --- 3. Identifiers ---
  366. ident_rows = _sparql_select(endpoint, f"""{PREFIXES}
  367. SELECT ?scheme ?value WHERE {{
  368. {entity_iri} atlas:hasIdentifier ?ident .
  369. ?ident atlas:scheme ?scheme ;
  370. atlas:value ?value .
  371. }}""")
  372. identifiers = [
  373. Identifier(scheme=r["scheme"]["value"], value=r["value"]["value"])
  374. for r in ident_rows
  375. ]
  376. # --- 4. Attributes (any atlas: datatype property not covered above) ---
  377. KNOWN = {
  378. "atlasId", "canonicalLabel", "canonicalDescription",
  379. "aliasLabel", "needsCuration", "rawJson",
  380. }
  381. attr_rows = _sparql_select(endpoint, f"""{PREFIXES}
  382. SELECT ?pred ?val WHERE {{
  383. {entity_iri} ?pred ?val .
  384. FILTER(STRSTARTS(STR(?pred), "{ATLAS}"))
  385. FILTER(isLiteral(?val))
  386. }}""")
  387. attributes: Dict[str, Any] = {}
  388. raw_json: List[str] = []
  389. for r in attr_rows:
  390. pred_local = _local(r["pred"]["value"])
  391. if pred_local in KNOWN:
  392. continue
  393. val = r["val"]["value"]
  394. dt = r["val"].get("datatype", "")
  395. if pred_local == "rawJson":
  396. raw_json.append(val)
  397. elif dt.endswith("boolean"):
  398. attributes[pred_local] = val.lower() == "true"
  399. elif dt.endswith("decimal") or dt.endswith("float") or dt.endswith("double"):
  400. attributes[pred_local] = float(val)
  401. elif dt.endswith("integer") or dt.endswith("int"):
  402. attributes[pred_local] = int(val)
  403. else:
  404. attributes[pred_local] = val
  405. # --- 5. Claims + Provenance ---
  406. claim_rows = _sparql_select(endpoint, f"""{PREFIXES}
  407. SELECT ?pred ?objIri ?objLit ?layer ?status
  408. ?provSource ?provMethod ?provConf ?provAt
  409. WHERE {{
  410. {entity_iri} atlas:hasClaim ?claim .
  411. ?claim atlas:claimPredicate ?pred ;
  412. atlas:claimLayer ?layer ;
  413. atlas:claimStatus ?status .
  414. OPTIONAL {{ ?claim atlas:claimObjectIri ?objIri . }}
  415. OPTIONAL {{ ?claim atlas:claimObjectLiteral ?objLit . }}
  416. OPTIONAL {{
  417. ?claim atlas:hasProvenance ?prov .
  418. ?prov atlas:provenanceSource ?provSource ;
  419. atlas:retrievalMethod ?provMethod ;
  420. atlas:confidence ?provConf ;
  421. atlas:retrievedAt ?provAt .
  422. }}
  423. }}""")
  424. claims: List[Claim] = []
  425. for r in claim_rows:
  426. prov = None
  427. if "provSource" in r:
  428. prov = Provenance(
  429. source=r["provSource"]["value"],
  430. method=r["provMethod"]["value"],
  431. confidence=float(r["provConf"]["value"]),
  432. retrieved_at=r["provAt"]["value"],
  433. )
  434. claims.append(Claim(
  435. predicate=_short_iri(r["pred"]["value"]),
  436. object_iri=_short_iri(r["objIri"]["value"]) if "objIri" in r else None,
  437. object_literal=r["objLit"]["value"] if "objLit" in r else None,
  438. layer=r["layer"]["value"],
  439. status=r["status"]["value"],
  440. provenance=prov,
  441. ))
  442. # --- 6. CurateFlag ---
  443. curate_flag = None
  444. curate_rows = _sparql_select(endpoint, f"""{PREFIXES}
  445. SELECT ?reason WHERE {{
  446. {entity_iri} atlas:hasCurateFlag ?flag .
  447. ?flag atlas:curationReason ?reason .
  448. }}""")
  449. if curate_rows:
  450. curate_flag = CurateFlag(reason=curate_rows[0]["reason"]["value"])
  451. return Entity(
  452. id=atlas_id,
  453. label=label,
  454. description=description,
  455. type=entity_type,
  456. aliases=aliases,
  457. identifiers=identifiers,
  458. attributes=attributes,
  459. raw_json=raw_json,
  460. claims=claims,
  461. needs_curation=needs_curation,
  462. curate_flag=curate_flag,
  463. )