atlas_store.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550
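"""
atlas_store.py — SPARQL persistence for Atlas Entity objects.

Public functions:
    save_entity(entity, endpoint)              — insert all triples for an entity
    save_entity_minimal(entity, endpoint)      — async insert into a named graph
    delete_entity(atlas_id, endpoint)          — remove an entity and its linked nodes
    load_entity(atlas_id, endpoint)            — reconstruct an Entity from the store
    load_entity_by_subject(subject, endpoint)  — async lookup by label or alias

Tested against Virtuoso 7.x (SPARQL 1.1 Update endpoint).
Both reads and updates are sent through an MCP/SSE endpoint; endpoint URLs
that do not contain "/mcp/sse" are rejected.

Dependencies:
    pip install mcp
"""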
  11. from __future__ import annotations
  12. import json
  13. import re
  14. import asyncio
  15. import os
  16. from typing import Any, Dict, List, Optional
  17. import logging
  18. from mcp import ClientSession
  19. from mcp.client.sse import sse_client
  20. from .atlas_model import Claim, CurateFlag, Entity, Identifier, Provenance
  21. # ---------------------------------------------------------------------------
  22. # Namespace constants — keep in sync with atlas.ttl
  23. # ---------------------------------------------------------------------------
  24. ATLAS = "http://world.eu.org/atlas_ontology#"
  25. ATLAS_D = "http://world.eu.org/atlas_data#"
  26. XSD = "http://www.w3.org/2001/XMLSchema#"
  27. RDF = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
  28. PREFIXES = f"""\
  29. PREFIX atlas: <{ATLAS}>
  30. PREFIX atlas_data: <{ATLAS_D}>
  31. PREFIX xsd: <{XSD}>
  32. PREFIX rdf: <{RDF}>
  33. """
  34. DEFAULT_GRAPH_IRI = os.getenv("ATLAS_GRAPH_IRI", ATLAS_D)
  35. DEBUG_LOGS = os.getenv("ATLAS_DEBUG_LOGS", "false").lower() in {"1", "true", "yes", "on"}
  36. logger = logging.getLogger(__name__)
  37. # ---------------------------------------------------------------------------
  38. # Internal helpers
  39. # ---------------------------------------------------------------------------
  40. def _full_iri(prefixed: str) -> str:
  41. """Expand a prefixed IRI to a full IRI string."""
  42. prefixed = prefixed.strip()
  43. if prefixed.startswith("atlas_data:"):
  44. return ATLAS_D + prefixed[len("atlas_data:"):]
  45. if prefixed.startswith("atlas:"):
  46. return ATLAS + prefixed[len("atlas:"):]
  47. if prefixed.startswith("<") and prefixed.endswith(">"):
  48. return prefixed[1:-1]
  49. return prefixed
  50. def _short_iri(full: str) -> str:
  51. """Compress a full IRI back to a prefixed form."""
  52. if full.startswith(ATLAS_D):
  53. return "atlas_data:" + full[len(ATLAS_D):]
  54. if full.startswith(ATLAS):
  55. return "atlas:" + full[len(ATLAS):]
  56. return f"<{full}>"
  57. def _local(full_iri: str) -> str:
  58. """Return the local name of a full IRI (after # or last /)."""
  59. for sep in ("#", "/"):
  60. idx = full_iri.rfind(sep)
  61. if idx != -1:
  62. return full_iri[idx + 1:]
  63. return full_iri
  64. def _escape(s: str) -> str:
  65. """Escape a string for embedding in a SPARQL triple-quoted literal."""
  66. return s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
  67. def _parse_boolean_literal(value: str | None, default: bool = True) -> bool:
  68. if value is None:
  69. return default
  70. v = str(value).strip().lower()
  71. return v in {"1", "true", "yes", "y", "t"}
  72. async def _sparql_update(endpoint: str, query: str) -> None:
  73. if "/mcp/sse" not in endpoint:
  74. raise RuntimeError("atlas_store only supports Virtuoso MCP/SSE endpoints in this scaffold")
  75. async def _run() -> None:
  76. async with sse_client(endpoint, timeout=10, sse_read_timeout=300) as (read_stream, write_stream):
  77. async with ClientSession(read_stream, write_stream) as session:
  78. await session.initialize()
  79. result = await session.call_tool("sparql_update", {"input": {"query": query}})
  80. if result.isError:
  81. raise RuntimeError(f"sparql_update failed: {result}")
  82. await _run()
  83. async def _sparql_select(endpoint: str, query: str) -> List[Dict[str, Any]]:
  84. if "/mcp/sse" not in endpoint:
  85. raise RuntimeError("atlas_store only supports Virtuoso MCP/SSE endpoints in this scaffold")
  86. async def _run() -> List[Dict[str, Any]]:
  87. async with sse_client(endpoint, timeout=10, sse_read_timeout=300) as (read_stream, write_stream):
  88. async with ClientSession(read_stream, write_stream) as session:
  89. await session.initialize()
  90. result = await session.call_tool("sparql_query", {"input": {"query": query}})
  91. if result.isError:
  92. raise RuntimeError(f"sparql_query failed: {result}")
  93. data = result.structuredContent if result.structuredContent is not None else result.content
  94. if DEBUG_LOGS:
  95. if isinstance(data, dict):
  96. logger.info("sparql_select raw keys=%s", list(data.keys()))
  97. else:
  98. logger.info("sparql_select raw type=%s", type(data).__name__)
  99. # Some MCP servers return content as a list of TextContent items.
  100. if isinstance(data, list) and data:
  101. first = data[0]
  102. text = getattr(first, "text", None)
  103. if text:
  104. try:
  105. data = json.loads(text)
  106. if DEBUG_LOGS and isinstance(data, dict):
  107. logger.info("sparql_select decoded list->dict keys=%s", list(data.keys()))
  108. except Exception:
  109. if DEBUG_LOGS:
  110. logger.info("sparql_select could not decode list text as JSON")
  111. if isinstance(data, dict):
  112. bindings = data.get("results", {}).get("bindings", []) or []
  113. if DEBUG_LOGS:
  114. logger.info("sparql_select extracted bindings=%s", len(bindings) if bindings is not None else 0)
  115. return bindings
  116. return []
  117. return await _run()
  118. async def load_entity_by_subject(subject: str, endpoint: str, graph_iri: str = DEFAULT_GRAPH_IRI) -> Dict[str, Any] | None:
  119. needle = _escape((subject or "").strip())
  120. if not needle:
  121. return None
  122. label_query = f"""
  123. {PREFIXES}
  124. SELECT ?atlasId ?label ?desc ?type ?qid ?needsCuration WHERE {{
  125. VALUES ?needle {{ "{needle}" }}
  126. GRAPH <{graph_iri}> {{
  127. ?entity a atlas:Entity ;
  128. atlas:atlasId ?atlasId ;
  129. atlas:canonicalLabel ?label .
  130. OPTIONAL {{ ?entity atlas:canonicalDescription ?desc . }}
  131. OPTIONAL {{ ?entity atlas:hasCanonicalType ?type . }}
  132. OPTIONAL {{ ?entity atlas:needsCuration ?needsCuration . }}
  133. OPTIONAL {{ ?entity atlas:hasIdentifier ?ident . ?ident atlas:scheme "wikidata-qid" ; atlas:value ?qid . }}
  134. FILTER(LCASE(STR(?label)) = LCASE(STR(?needle)))
  135. }}
  136. }}
  137. LIMIT 1
  138. """.strip()
  139. alias_query = f"""
  140. {PREFIXES}
  141. SELECT ?atlasId ?label ?desc ?type ?qid ?alias ?needsCuration WHERE {{
  142. VALUES ?needle {{ "{needle}" }}
  143. GRAPH <{graph_iri}> {{
  144. ?entity a atlas:Entity ;
  145. atlas:atlasId ?atlasId ;
  146. atlas:aliasLabel ?alias .
  147. OPTIONAL {{ ?entity atlas:canonicalLabel ?label . }}
  148. OPTIONAL {{ ?entity atlas:canonicalDescription ?desc . }}
  149. OPTIONAL {{ ?entity atlas:hasCanonicalType ?type . }}
  150. OPTIONAL {{ ?entity atlas:needsCuration ?needsCuration . }}
  151. OPTIONAL {{ ?entity atlas:hasIdentifier ?ident . ?ident atlas:scheme "wikidata-qid" ; atlas:value ?qid . }}
  152. FILTER(LCASE(STR(?alias)) = LCASE(STR(?needle)))
  153. }}
  154. }}
  155. LIMIT 1
  156. """.strip()
  157. if DEBUG_LOGS:
  158. logger.info("store lookup by subject: needle=%s graph=%s", (subject or "").strip(), graph_iri)
  159. logger.info("store label query=%s", label_query)
  160. rows = await _sparql_select(endpoint, label_query)
  161. if not rows:
  162. if DEBUG_LOGS:
  163. logger.info("store alias query=%s", alias_query)
  164. rows = await _sparql_select(endpoint, alias_query)
  165. if DEBUG_LOGS:
  166. logger.info("store lookup rows=%s", len(rows) if rows else 0)
  167. if not rows:
  168. return None
  169. row = rows[0]
  170. type_value = row.get("type", {}).get("value")
  171. if type_value and type_value.startswith(ATLAS):
  172. type_value = f"atlas:{type_value.split('#', 1)[-1]}"
  173. return {
  174. "atlas_id": row.get("atlasId", {}).get("value"),
  175. "label": row.get("label", {}).get("value"),
  176. "description": (row.get("desc", {}) or {}).get("value"),
  177. "type": type_value,
  178. "wikidata_id": row.get("qid", {}).get("value"),
  179. "alias": row.get("alias", {}).get("value"),
  180. # If the triple is missing, treat it as needsCuration=true for this early-stage workflow.
  181. "needs_curation": _parse_boolean_literal(row.get("needsCuration", {}).get("value"), default=True),
  182. }
  183. async def save_entity_minimal(entity: Entity, endpoint: str, graph_iri: str = DEFAULT_GRAPH_IRI) -> None:
  184. body = _build_insert_body(entity)
  185. query = f"""
  186. {PREFIXES}
  187. INSERT DATA {{
  188. GRAPH <{graph_iri}> {{
  189. {body}
  190. }}
  191. }}
  192. """.strip()
  193. await _sparql_update(endpoint, query)
  194. # ---------------------------------------------------------------------------
  195. # Save
  196. # ---------------------------------------------------------------------------
  197. def save_entity(entity: Entity, endpoint: str) -> None:
  198. """
  199. Insert all triples for an Entity into the SPARQL store.
  200. Uses SPARQL 1.1 INSERT DATA. Call delete_entity() first if you need
  201. to replace an existing entity (full replace pattern).
  202. Args:
  203. entity: The Entity to persist.
  204. endpoint: SPARQL Update endpoint URL,
  205. e.g. "http://localhost:8890/sparql-auth"
  206. """
  207. ttl_body = _build_insert_body(entity)
  208. query = f"{PREFIXES}\nINSERT DATA {{\n{ttl_body}\n}}"
  209. _sparql_update(endpoint, query)
  210. def delete_entity(atlas_id: str, endpoint: str) -> None:
  211. """
  212. Remove all triples where the entity or any of its blank/named nodes
  213. are the subject. Run before save_entity() for a clean replace.
  214. Args:
  215. atlas_id: e.g. "atlas:1b0e7222c7730540"
  216. endpoint: SPARQL Update endpoint URL.
  217. """
  218. entity_iri = f"<{ATLAS_D}entity_{atlas_id.replace('atlas:', '')}>"
  219. # Delete triples where entity is subject, plus all linked sub-nodes
  220. # (identifiers, claims, provenance, curate flag) via a SPARQL DELETE WHERE.
  221. query = f"""{PREFIXES}
  222. DELETE {{
  223. ?s ?p ?o .
  224. }}
  225. WHERE {{
  226. {{
  227. BIND({entity_iri} AS ?s)
  228. ?s ?p ?o .
  229. }}
  230. UNION
  231. {{
  232. {entity_iri} atlas:hasIdentifier ?s .
  233. ?s ?p ?o .
  234. }}
  235. UNION
  236. {{
  237. {entity_iri} atlas:hasClaim ?s .
  238. ?s ?p ?o .
  239. }}
  240. UNION
  241. {{
  242. {entity_iri} atlas:hasClaim ?claim .
  243. ?claim atlas:hasProvenance ?s .
  244. ?s ?p ?o .
  245. }}
  246. UNION
  247. {{
  248. {entity_iri} atlas:hasCurateFlag ?s .
  249. ?s ?p ?o .
  250. }}
  251. }}"""
  252. _sparql_update(endpoint, query)
  253. def _build_insert_body(entity: Entity) -> str:
  254. """Build the triple block (no INSERT DATA wrapper) for an Entity."""
  255. lines: List[str] = []
  256. e = f"<{_full_iri(entity._entity_iri())}>"
  257. # --- Entity core ---
  258. lines += [
  259. f" {e} a atlas:Entity ;",
  260. f' atlas:atlasId "{_escape(entity.id)}" ;',
  261. f' atlas:canonicalLabel "{_escape(entity.label)}"@en ;',
  262. ]
  263. if entity.description:
  264. lines.append(f' atlas:canonicalDescription "{_escape(entity.description)}"@en ;')
  265. if entity.type:
  266. lines.append(f' atlas:hasCanonicalType <{_full_iri(entity.type)}> ;')
  267. for alias in entity.aliases:
  268. lines.append(f' atlas:aliasLabel "{_escape(alias)}"@en ;')
  269. for ident in entity.identifiers:
  270. iiri = f"<{_full_iri(entity._identifier_iri(ident))}>"
  271. lines.append(f' atlas:hasIdentifier {iiri} ;')
  272. for key, val in entity.attributes.items():
  273. if isinstance(val, bool):
  274. lines.append(f' atlas:{key} "{str(val).lower()}"^^xsd:boolean ;')
  275. elif isinstance(val, float):
  276. lines.append(f' atlas:{key} "{val}"^^xsd:decimal ;')
  277. elif isinstance(val, int):
  278. lines.append(f' atlas:{key} "{val}"^^xsd:integer ;')
  279. else:
  280. lines.append(f' atlas:{key} "{_escape(str(val))}" ;')
  281. for blob in entity.raw_json:
  282. lines.append(f' atlas:rawJson "{_escape(blob)}"^^xsd:string ;')
  283. for claim in entity.claims:
  284. obj = claim.object_iri or claim.object_literal or ""
  285. cid = f"<{_full_iri(entity._claim_id(claim.predicate, obj))}>"
  286. lines.append(f' atlas:hasClaim {cid} ;')
  287. lines.append(f' atlas:needsCuration "{str(entity.needs_curation).lower()}"^^xsd:boolean')
  288. if entity.curate_flag:
  289. lines[-1] += " ;"
  290. curate_iri = f"<{_full_iri(entity._entity_iri())}_curate>"
  291. lines.append(f' atlas:hasCurateFlag {curate_iri}')
  292. lines[-1] += " ."
  293. # --- Identifier nodes ---
  294. for ident in entity.identifiers:
  295. iiri = f"<{_full_iri(entity._identifier_iri(ident))}>"
  296. lines += [
  297. f" {iiri} a atlas:Identifier ;",
  298. f' atlas:scheme "{_escape(ident.scheme)}" ;',
  299. f' atlas:value "{_escape(ident.value)}" .',
  300. ]
  301. # --- Claim + Provenance nodes ---
  302. for claim in entity.claims:
  303. obj = claim.object_iri or claim.object_literal or ""
  304. cid_str = entity._claim_id(claim.predicate, obj)
  305. cid = f"<{_full_iri(cid_str)}>"
  306. pid = f"<{_full_iri(entity._prov_id(cid_str))}>"
  307. pred_iri = f"<{_full_iri(claim.predicate)}>"
  308. lines += [
  309. f" {cid} a atlas:Claim ;",
  310. f' atlas:claimSubjectIri {e} ;',
  311. f' atlas:claimPredicate {pred_iri} ;',
  312. ]
  313. if claim.object_iri:
  314. lines.append(f' atlas:claimObjectIri <{_full_iri(claim.object_iri)}> ;')
  315. elif claim.object_literal:
  316. lines.append(f' atlas:claimObjectLiteral "{_escape(claim.object_literal)}" ;')
  317. lines += [
  318. f' atlas:claimLayer "{claim.layer}" ;',
  319. f' atlas:claimStatus "{claim.status}"',
  320. ]
  321. if claim.provenance:
  322. lines[-1] += " ;"
  323. lines.append(f' atlas:hasProvenance {pid}')
  324. lines[-1] += " ."
  325. if claim.provenance:
  326. p = claim.provenance
  327. lines += [
  328. f" {pid} a atlas:Provenance ;",
  329. f' atlas:provenanceSource "{_escape(p.source)}" ;',
  330. f' atlas:retrievalMethod "{_escape(p.method)}" ;',
  331. f' atlas:confidence "{p.confidence}"^^xsd:decimal ;',
  332. f' atlas:retrievedAt "{p.retrieved_at}"^^xsd:dateTime .',
  333. ]
  334. # --- CurateFlag node ---
  335. if entity.curate_flag:
  336. curate_iri = f"<{_full_iri(entity._entity_iri())}_curate>"
  337. lines += [
  338. f" {curate_iri} a atlas:CurateFlag ;",
  339. f' atlas:curationReason "{_escape(entity.curate_flag.reason)}"@en .',
  340. ]
  341. return "\n".join(lines)
  342. # ---------------------------------------------------------------------------
  343. # Load
  344. # ---------------------------------------------------------------------------
  345. def load_entity(atlas_id: str, endpoint: str) -> Optional[Entity]:
  346. """
  347. Reconstruct an Entity object from the SPARQL store.
  348. Returns None if the entity does not exist.
  349. Args:
  350. atlas_id: e.g. "atlas:1b0e7222c7730540"
  351. endpoint: SPARQL Query endpoint URL,
  352. e.g. "http://localhost:8890/sparql"
  353. """
  354. entity_iri = f"<{ATLAS_D}entity_{atlas_id.replace('atlas:', '')}>"
  355. # --- 1. Core entity fields ---
  356. core = _sparql_select(endpoint, f"""{PREFIXES}
  357. SELECT ?label ?description ?type ?needsCuration WHERE {{
  358. {entity_iri} atlas:canonicalLabel ?label .
  359. OPTIONAL {{ {entity_iri} atlas:canonicalDescription ?description . }}
  360. OPTIONAL {{ {entity_iri} atlas:hasCanonicalType ?type . }}
  361. OPTIONAL {{ {entity_iri} atlas:needsCuration ?needsCuration . }}
  362. }}""")
  363. if not core:
  364. return None
  365. row = core[0]
  366. label = row["label"]["value"]
  367. description = row.get("description", {}).get("value")
  368. type_full = row.get("type", {}).get("value")
  369. entity_type = _short_iri(type_full) if type_full else None
  370. needs_curation = row.get("needsCuration", {}).get("value", "false").lower() == "true"
  371. # --- 2. Aliases ---
  372. alias_rows = _sparql_select(endpoint, f"""{PREFIXES}
  373. SELECT ?alias WHERE {{
  374. {entity_iri} atlas:aliasLabel ?alias .
  375. }}""")
  376. aliases = [r["alias"]["value"] for r in alias_rows]
  377. # --- 3. Identifiers ---
  378. ident_rows = _sparql_select(endpoint, f"""{PREFIXES}
  379. SELECT ?scheme ?value WHERE {{
  380. {entity_iri} atlas:hasIdentifier ?ident .
  381. ?ident atlas:scheme ?scheme ;
  382. atlas:value ?value .
  383. }}""")
  384. identifiers = [
  385. Identifier(scheme=r["scheme"]["value"], value=r["value"]["value"])
  386. for r in ident_rows
  387. ]
  388. # --- 4. Attributes (any atlas: datatype property not covered above) ---
  389. KNOWN = {
  390. "atlasId", "canonicalLabel", "canonicalDescription",
  391. "aliasLabel", "needsCuration", "rawJson",
  392. }
  393. attr_rows = _sparql_select(endpoint, f"""{PREFIXES}
  394. SELECT ?pred ?val WHERE {{
  395. {entity_iri} ?pred ?val .
  396. FILTER(STRSTARTS(STR(?pred), "{ATLAS}"))
  397. FILTER(isLiteral(?val))
  398. }}""")
  399. attributes: Dict[str, Any] = {}
  400. raw_json: List[str] = []
  401. for r in attr_rows:
  402. pred_local = _local(r["pred"]["value"])
  403. if pred_local in KNOWN:
  404. continue
  405. val = r["val"]["value"]
  406. dt = r["val"].get("datatype", "")
  407. if pred_local == "rawJson":
  408. raw_json.append(val)
  409. elif dt.endswith("boolean"):
  410. attributes[pred_local] = val.lower() == "true"
  411. elif dt.endswith("decimal") or dt.endswith("float") or dt.endswith("double"):
  412. attributes[pred_local] = float(val)
  413. elif dt.endswith("integer") or dt.endswith("int"):
  414. attributes[pred_local] = int(val)
  415. else:
  416. attributes[pred_local] = val
  417. # --- 5. Claims + Provenance ---
  418. claim_rows = _sparql_select(endpoint, f"""{PREFIXES}
  419. SELECT ?pred ?objIri ?objLit ?layer ?status
  420. ?provSource ?provMethod ?provConf ?provAt
  421. WHERE {{
  422. {entity_iri} atlas:hasClaim ?claim .
  423. ?claim atlas:claimPredicate ?pred ;
  424. atlas:claimLayer ?layer ;
  425. atlas:claimStatus ?status .
  426. OPTIONAL {{ ?claim atlas:claimObjectIri ?objIri . }}
  427. OPTIONAL {{ ?claim atlas:claimObjectLiteral ?objLit . }}
  428. OPTIONAL {{
  429. ?claim atlas:hasProvenance ?prov .
  430. ?prov atlas:provenanceSource ?provSource ;
  431. atlas:retrievalMethod ?provMethod ;
  432. atlas:confidence ?provConf ;
  433. atlas:retrievedAt ?provAt .
  434. }}
  435. }}""")
  436. claims: List[Claim] = []
  437. for r in claim_rows:
  438. prov = None
  439. if "provSource" in r:
  440. prov = Provenance(
  441. source=r["provSource"]["value"],
  442. method=r["provMethod"]["value"],
  443. confidence=float(r["provConf"]["value"]),
  444. retrieved_at=r["provAt"]["value"],
  445. )
  446. claims.append(Claim(
  447. predicate=_short_iri(r["pred"]["value"]),
  448. object_iri=_short_iri(r["objIri"]["value"]) if "objIri" in r else None,
  449. object_literal=r["objLit"]["value"] if "objLit" in r else None,
  450. layer=r["layer"]["value"],
  451. status=r["status"]["value"],
  452. provenance=prov,
  453. ))
  454. # --- 6. CurateFlag ---
  455. curate_flag = None
  456. curate_rows = _sparql_select(endpoint, f"""{PREFIXES}
  457. SELECT ?reason WHERE {{
  458. {entity_iri} atlas:hasCurateFlag ?flag .
  459. ?flag atlas:curationReason ?reason .
  460. }}""")
  461. if curate_rows:
  462. curate_flag = CurateFlag(reason=curate_rows[0]["reason"]["value"])
  463. return Entity(
  464. id=atlas_id,
  465. label=label,
  466. description=description,
  467. type=entity_type,
  468. aliases=aliases,
  469. identifiers=identifiers,
  470. attributes=attributes,
  471. raw_json=raw_json,
  472. claims=claims,
  473. needs_curation=needs_curation,
  474. curate_flag=curate_flag,
  475. )