|
|
@@ -2,18 +2,21 @@ from __future__ import annotations
|
|
|
|
|
|
import hashlib
|
|
|
import os
|
|
|
+import logging
|
|
|
from dataclasses import dataclass
|
|
|
from typing import Any
|
|
|
|
|
|
-from .atlas_model import CurateFlag, Entity, Identifier
|
|
|
-from .atlas_store import _sparql_select, _sparql_update
|
|
|
+from .atlas_model import Entity, Identifier
|
|
|
+from .atlas_store import load_entity_by_subject, save_entity_minimal
|
|
|
from .wikidata import WikidataSearch
|
|
|
|
|
|
|
|
|
ATLAS = "http://world.eu.org/atlas_ontology#"
|
|
|
-ATLAS_D = "http://world.eu.org/atlas_data#"
|
|
|
DEFAULT_ENDPOINT = os.getenv("ATLAS_VIRTUOSO_MCP_SSE_URL", "http://192.168.0.249:8501/mcp/sse")
|
|
|
DEFAULT_UPDATE_ENDPOINT = os.getenv("ATLAS_VIRTUOSO_MCP_SSE_URL", DEFAULT_ENDPOINT)
|
|
|
+DEBUG_LOGS = os.getenv("ATLAS_DEBUG_LOGS", "false").lower() in {"1", "true", "yes", "on"}
|
|
|
+
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
def _hash_id(subject: str) -> str:
|
|
|
@@ -21,40 +24,7 @@ def _hash_id(subject: str) -> str:
|
|
|
|
|
|
|
|
|
def _entity_iri(atlas_id: str) -> str:
|
|
|
- return f"<{ATLAS_D}entity_{atlas_id}>"
|
|
|
-
|
|
|
-
|
|
|
-def _escape_literal(value: str) -> str:
|
|
|
- return value.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
|
|
|
-
|
|
|
-
|
|
|
-def _label_lookup_query(subject: str) -> str:
|
|
|
- safe = _escape_literal(subject)
|
|
|
- return f"""
|
|
|
-PREFIX atlas: <{ATLAS}>
|
|
|
-SELECT ?atlasId ?label ?type ?qid ?alias WHERE {{
|
|
|
- VALUES ?needle {{ \"{safe}\" }}
|
|
|
- {{
|
|
|
- ?entity a atlas:Entity ;
|
|
|
- atlas:atlasId ?atlasId ;
|
|
|
- atlas:canonicalLabel ?label .
|
|
|
- OPTIONAL {{ ?entity atlas:hasCanonicalType ?type . }}
|
|
|
- OPTIONAL {{ ?entity atlas:hasIdentifier ?ident . ?ident atlas:scheme \"wikidata-qid\" ; atlas:value ?qid . }}
|
|
|
- FILTER(LCASE(STR(?label)) = LCASE(?needle))
|
|
|
- }}
|
|
|
- UNION
|
|
|
- {{
|
|
|
- ?entity a atlas:Entity ;
|
|
|
- atlas:atlasId ?atlasId ;
|
|
|
- atlas:aliasLabel ?alias .
|
|
|
- OPTIONAL {{ ?entity atlas:canonicalLabel ?label . }}
|
|
|
- OPTIONAL {{ ?entity atlas:hasCanonicalType ?type . }}
|
|
|
- OPTIONAL {{ ?entity atlas:hasIdentifier ?ident . ?ident atlas:scheme \"wikidata-qid\" ; atlas:value ?qid . }}
|
|
|
- FILTER(LCASE(STR(?alias)) = LCASE(?needle))
|
|
|
- }}
|
|
|
-}}
|
|
|
-LIMIT 1
|
|
|
-""".strip()
|
|
|
+ return f"atlas_data:entity_{atlas_id}"
|
|
|
|
|
|
|
|
|
async def _wikidata_lookup(subject: str) -> dict[str, Any] | None:
|
|
|
@@ -64,53 +34,36 @@ async def _wikidata_lookup(subject: str) -> dict[str, Any] | None:
|
|
|
return items[0] if items else None
|
|
|
|
|
|
|
|
|
+def _infer_atlas_type(label: str | None, description: str | None) -> str:
|
|
|
+ text = f"{label or ''} {description or ''}".lower()
|
|
|
+ if any(k in text for k in ["president", "person", "singer", "composer", "human", "actor", "writer"]):
|
|
|
+ return "atlas:Person"
|
|
|
+ if any(k in text for k in ["city", "town", "village", "country", "state", "location", "place"]):
|
|
|
+ return "atlas:Location"
|
|
|
+ if any(k in text for k in ["company", "organization", "organisation", "institution", "foundation", "band"]):
|
|
|
+ return "atlas:Organization"
|
|
|
+ return "atlas:Other"
|
|
|
+
|
|
|
+
|
|
|
def _entity_from_wikidata(subject: str, wd: dict[str, Any]) -> Entity:
|
|
|
atlas_id = _hash_id(subject)
|
|
|
label = wd.get("label") or subject
|
|
|
description = wd.get("description")
|
|
|
qid = wd.get("id")
|
|
|
- entity_type = wd.get("type") or "Thing"
|
|
|
+ entity_type = _infer_atlas_type(label, description)
|
|
|
|
|
|
ent = Entity(
|
|
|
id=atlas_id,
|
|
|
label=label,
|
|
|
description=description,
|
|
|
- type=f"atlas:{entity_type}" if not entity_type.startswith("atlas:") else entity_type,
|
|
|
+ type=entity_type,
|
|
|
aliases=[subject] if subject.lower() != label.lower() else [],
|
|
|
identifiers=[Identifier(scheme="wikidata-qid", value=qid)] if qid else [],
|
|
|
- needs_curation=False,
|
|
|
+ needs_curation=True,
|
|
|
)
|
|
|
return ent
|
|
|
|
|
|
|
|
|
-def _entity_to_turtle(entity: Entity) -> str:
|
|
|
- lines = []
|
|
|
- e = _entity_iri(entity.id)
|
|
|
- lines.append(f"{e}")
|
|
|
- lines.append(" a atlas:Entity ;")
|
|
|
- lines.append(f' atlas:atlasId "{_escape_literal(entity.id)}" ;')
|
|
|
- lines.append(f' atlas:canonicalLabel "{_escape_literal(entity.label)}"@en ;')
|
|
|
- if entity.description:
|
|
|
- lines.append(f' atlas:canonicalDescription "{_escape_literal(entity.description)}"@en ;')
|
|
|
- if entity.type:
|
|
|
- lines.append(f" atlas:hasCanonicalType {entity.type} ;")
|
|
|
- for alias in entity.aliases:
|
|
|
- lines.append(f' atlas:aliasLabel "{_escape_literal(alias)}"@en ;')
|
|
|
- for ident in entity.identifiers:
|
|
|
- ident_iri = f"<{ATLAS_D}ident_{ident.scheme}_{_hash_id(ident.value)}>.".rstrip(".")
|
|
|
- lines.append(f" atlas:hasIdentifier {ident_iri} ;")
|
|
|
- lines.append(f' atlas:needsCuration "{str(entity.needs_curation).lower()}"^^xsd:boolean .')
|
|
|
- lines.append("")
|
|
|
- for ident in entity.identifiers:
|
|
|
- ident_iri = f"<{ATLAS_D}ident_{ident.scheme}_{_hash_id(ident.value)}>"
|
|
|
- lines.append(f"{ident_iri}")
|
|
|
- lines.append(" a atlas:Identifier ;")
|
|
|
- lines.append(f' atlas:scheme "{_escape_literal(ident.scheme)}" ;')
|
|
|
- lines.append(f' atlas:value "{_escape_literal(ident.value)}" .')
|
|
|
- lines.append("")
|
|
|
- return "\n".join(lines)
|
|
|
-
|
|
|
-
|
|
|
def _flatten_exception_details(exc: BaseException) -> list[str]:
|
|
|
parts = [f"{type(exc).__name__}: {exc}"]
|
|
|
nested = getattr(exc, "exceptions", None)
|
|
|
@@ -121,32 +74,11 @@ def _flatten_exception_details(exc: BaseException) -> list[str]:
|
|
|
|
|
|
|
|
|
async def _persist_entity(entity: Entity) -> None:
|
|
|
- ttl = _entity_to_turtle(entity)
|
|
|
- query = f"""
|
|
|
-PREFIX atlas: <{ATLAS}>
|
|
|
-PREFIX atlas_data: <{ATLAS_D}>
|
|
|
-PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
|
|
|
-INSERT DATA {{
|
|
|
- GRAPH <{ATLAS_D}> {{
|
|
|
-{ttl}
|
|
|
- }}
|
|
|
-}}
|
|
|
-""".strip()
|
|
|
- await _sparql_update(DEFAULT_UPDATE_ENDPOINT, query)
|
|
|
+ await save_entity_minimal(entity, DEFAULT_UPDATE_ENDPOINT)
|
|
|
|
|
|
|
|
|
async def _load_entity(subject: str) -> dict[str, Any] | None:
|
|
|
- rows = await _sparql_select(DEFAULT_ENDPOINT, _label_lookup_query(subject))
|
|
|
- if not rows:
|
|
|
- return None
|
|
|
- row = rows[0]
|
|
|
- return {
|
|
|
- "atlas_id": row.get("atlasId", {}).get("value"),
|
|
|
- "label": row.get("label", {}).get("value"),
|
|
|
- "type": row.get("type", {}).get("value"),
|
|
|
- "wikidata_id": row.get("qid", {}).get("value"),
|
|
|
- "alias": row.get("alias", {}).get("value"),
|
|
|
- }
|
|
|
+ return await load_entity_by_subject(subject, DEFAULT_ENDPOINT)
|
|
|
|
|
|
|
|
|
@dataclass
|
|
|
@@ -164,8 +96,13 @@ class ResolveService:
|
|
|
if not subject:
|
|
|
return {"status": "not_found"}
|
|
|
|
|
|
+ if DEBUG_LOGS:
|
|
|
+ logger.info("resolve start subject=%s", subject)
|
|
|
+
|
|
|
stored = await self.load_entity_fn(subject)
|
|
|
if stored:
|
|
|
+ if DEBUG_LOGS:
|
|
|
+ logger.info("store hit subject=%s atlas_id=%s", subject, stored.get("atlas_id"))
|
|
|
return {
|
|
|
"status": "resolved",
|
|
|
"atlas_id": stored.get("atlas_id"),
|
|
|
@@ -177,9 +114,19 @@ class ResolveService:
|
|
|
|
|
|
wd = await self.wikidata_lookup_fn(subject)
|
|
|
if not wd:
|
|
|
+ if DEBUG_LOGS:
|
|
|
+ logger.info("wikidata miss subject=%s", subject)
|
|
|
return {"status": "not_found"}
|
|
|
|
|
|
entity = _entity_from_wikidata(subject, wd)
|
|
|
+ if DEBUG_LOGS:
|
|
|
+ logger.info(
|
|
|
+ "wikidata hit subject=%s qid=%s atlas_id=%s type=%s",
|
|
|
+ subject,
|
|
|
+ wd.get("id"),
|
|
|
+ entity.id,
|
|
|
+ entity.type,
|
|
|
+ )
|
|
|
await self.persist_entity_fn(entity)
|
|
|
|
|
|
return {
|