Prechádzať zdrojové kódy

atlas2-mcp: resolver strategies, confidence-aware cache, embeddings ranking, tests

Lukas Goldschmidt 1 mesiac pred
rodič
commit
8ff17fe8fb

+ 3 - 0
.env.example

@@ -8,3 +8,6 @@ REMOTE_MCP_SSE_READ_TIMEOUT=300
 # Graph to query (used in the scaffolding SPARQL templates).
 RESOLUTION_GRAPH_IRI=http://world.eu.org/atlas_data#
 
+# Ollama embedding configuration for candidate ranking.
+OLLAMA_BASE_URL=http://192.168.0.200:11434
+OLLAMA_EMBEDDING_MODEL=nomic-embed-text

+ 95 - 0
COARSE_TYPES.txt

@@ -0,0 +1,95 @@
+Atlas Coarse Types for LLM Extraction
+======================================
+
+Use these 12 types when prompting a cheap/small LLM for entity type suggestion.
+The suggested type is a hint to the entity resolver for candidate ranking — not
+a final classification. Pass 2 (Wikidata QID lookup) promotes to the fine-grained
+subtype from the full ontology.
+
+
+COARSE TYPES
+------------
+
+Person
+Organization
+Location
+CreativeWork
+Event
+Product
+FinancialInstrument
+Animal
+Disease
+Building
+FictionalCharacter
+Other
+
+
+PASS 2 PROMOTION MAP
+--------------------
+
+Person              -> Person
+
+Organization        -> Organization
+                       PoliticalParty
+                       MilitaryUnit
+                       MediaOrganization
+
+Location            -> Location
+                       Continent
+                       Country
+                       Region
+                       PopulatedPlace
+                       Neighbourhood
+                       NaturalFeature
+                       AdministrativeArea
+
+CreativeWork        -> CreativeWork
+                       Film
+                       Book
+                       MusicAlbum
+                       TVSeries
+                       VideoGame
+
+Event               -> Event
+
+Product             -> Product
+                       Drug
+                       Food
+
+FinancialInstrument -> FinancialInstrument
+                       PublicCompany
+                       StockIndex
+                       Commodity
+                       Cryptocurrency
+                       Currency
+
+Animal              -> Animal
+
+Disease             -> Disease
+
+Building            -> Building
+
+FictionalCharacter  -> FictionalCharacter
+
+Other               -> Other
+                       Award
+                       Sport
+                       EthnicGroup
+                       Concept
+
+
+NOTES
+-----
+
+- Animal and Disease are kept separate because confusing them with Product
+  or Concept causes hard resolution failures.
+
+- Building is kept separate because landmarks (Eiffel Tower, White House)
+  resolve very differently from cities or countries.
+
+- FictionalCharacter is kept separate because confusing a fictional entity
+  with a real person is a hard failure, not a soft one.
+
+- Award, Sport, EthnicGroup and Concept fall into Other at the coarse level.
+  A small model will mis-classify these anyway; the QID lookup in pass 2
+  recovers the correct fine-grained type reliably.

+ 13 - 1
README.md

@@ -8,13 +8,25 @@ Python FastMCP server for the *atlas2* resolution flow.
   - store lookup (label/alias)
   - Wikidata fallback using `wikidata.reconci.link` quick-resolve (single-candidate)
   - minimal persist to Virtuoso via MCP
-  - cache hits return the stored result
+  - cache hits return the stored result when confidence is sufficient
+- Resolve supports strategy modes:
+  - `quick`
+  - `ranked`
+  - `interactive`
+  - `hybrid`
+  - `llm_select`
+- Ranking can use embeddings via Ollama when `strategy.use_embeddings=true`
+- `confidence` controls cache reuse and re-resolution, not just the resolved/not-found status
 - Store lookup was debugged and made robust by splitting label-first then alias.
 - Maintenance scaffolding (dry-run CLI) exists to upgrade `atlas:needsCuration true` entities using:
   - `ontology/wikidata_subclassof.ttl`
   - Wikidata entity dumps
   - Atlas type bucket inference (Person/Organization/Location/etc.)
 
+## Resolver tests
+- `./test_resolve.sh` runs resolver-focused tests only
+- It prints the subjects under test and a per-test timing table
+
 ## How to run
 
 1) Create config:

+ 13 - 0
TODO.md

@@ -0,0 +1,13 @@
+# atlas2-mcp TODO
+
+## Next
+- Implement the maintenance script as the next step:
+  - select entities with `atlas:needsCuration true`
+  - fetch Wikidata details
+  - enrich type-specific fields
+  - write back updates when ready
+
+## Later
+- Expand maintenance to handle more advanced realm/constraint-driven behavior.
+- Tighten/extend type buckets if new examples show over-classification.
+

+ 16 - 2
app/atlas_store.py

@@ -87,6 +87,13 @@ def _escape(s: str) -> str:
     return s.replace("\\", "\\\\").replace('"', '\\"').replace("\n", "\\n")
 
 
+def _parse_boolean_literal(value: str | None, default: bool = True) -> bool:
+    if value is None:
+        return default
+    v = str(value).strip().lower()
+    return v in {"1", "true", "yes", "y", "t"}
+
+
 async def _sparql_update(endpoint: str, query: str) -> None:
     if "/mcp/sse" not in endpoint:
         raise RuntimeError("atlas_store only supports Virtuoso MCP/SSE endpoints in this scaffold")
@@ -150,13 +157,15 @@ async def load_entity_by_subject(subject: str, endpoint: str, graph_iri: str = D
 
     label_query = f"""
 {PREFIXES}
-SELECT ?atlasId ?label ?type ?qid WHERE {{
+SELECT ?atlasId ?label ?desc ?type ?qid ?needsCuration WHERE {{
   VALUES ?needle {{ "{needle}" }}
   GRAPH <{graph_iri}> {{
     ?entity a atlas:Entity ;
             atlas:atlasId ?atlasId ;
             atlas:canonicalLabel ?label .
+    OPTIONAL {{ ?entity atlas:canonicalDescription ?desc . }}
     OPTIONAL {{ ?entity atlas:hasCanonicalType ?type . }}
+    OPTIONAL {{ ?entity atlas:needsCuration ?needsCuration . }}
     OPTIONAL {{ ?entity atlas:hasIdentifier ?ident . ?ident atlas:scheme "wikidata-qid" ; atlas:value ?qid . }}
     FILTER(LCASE(STR(?label)) = LCASE(STR(?needle)))
   }}
@@ -166,14 +175,16 @@ LIMIT 1
 
     alias_query = f"""
 {PREFIXES}
-SELECT ?atlasId ?label ?type ?qid ?alias WHERE {{
+SELECT ?atlasId ?label ?desc ?type ?qid ?alias ?needsCuration WHERE {{
   VALUES ?needle {{ "{needle}" }}
   GRAPH <{graph_iri}> {{
     ?entity a atlas:Entity ;
             atlas:atlasId ?atlasId ;
             atlas:aliasLabel ?alias .
     OPTIONAL {{ ?entity atlas:canonicalLabel ?label . }}
+    OPTIONAL {{ ?entity atlas:canonicalDescription ?desc . }}
     OPTIONAL {{ ?entity atlas:hasCanonicalType ?type . }}
+    OPTIONAL {{ ?entity atlas:needsCuration ?needsCuration . }}
     OPTIONAL {{ ?entity atlas:hasIdentifier ?ident . ?ident atlas:scheme "wikidata-qid" ; atlas:value ?qid . }}
     FILTER(LCASE(STR(?alias)) = LCASE(STR(?needle)))
   }}
@@ -202,9 +213,12 @@ LIMIT 1
     return {
         "atlas_id": row.get("atlasId", {}).get("value"),
         "label": row.get("label", {}).get("value"),
+        "description": (row.get("desc", {}) or {}).get("value"),
         "type": type_value,
         "wikidata_id": row.get("qid", {}).get("value"),
         "alias": row.get("alias", {}).get("value"),
+        # If the triple is missing, treat it as needsCuration=true for this early-stage workflow.
+        "needs_curation": _parse_boolean_literal(row.get("needsCuration", {}).get("value"), default=True),
     }
 
 

+ 8 - 2
app/maintenance.py

@@ -71,6 +71,8 @@ def _infer_atlas_type_from_qids(qids: Iterable[str], subclass_graph: rdflib.Grap
         lower = obj_str.lower()
         if "person" in lower or "politician" in lower:
             return "atlas:Person"
+        if "human" in lower or "bio" in lower or "biography" in lower:
+            return "atlas:Person"
         if "organisation" in lower or "organization" in lower or "governmentagency" in lower or "agency" in lower:
             return "atlas:Organization"
         if any(k in lower for k in [
@@ -98,10 +100,14 @@ def _infer_atlas_type_from_qids(qids: Iterable[str], subclass_graph: rdflib.Grap
         return None
 
     for q in qids:
-        start = rdflib.URIRef(f"http://wikidata.dbpedia.org/resource/{q}")
+        # Support both namespace styles used in tests and data sources.
+        start_nodes = [
+            rdflib.URIRef(f"http://wikidata.dbpedia.org/resource/{q}"),
+            rdflib.URIRef(f"http://www.wikidata.org/entity/{q}"),
+        ]
 
         # Walk the rdfs:subClassOf closure to find something that matches our buckets.
-        queue = [start]
+        queue = list(start_nodes)
         visited: set[rdflib.term.Node] = set()
 
         while queue:

+ 2 - 0
app/mcp_server.py

@@ -26,6 +26,7 @@ async def resolve_tool(
     constraints: dict | None = None,
     hints: dict | None = None,
     debug: dict | None = None,
+    strategy: dict | None = None,
 ):
     # Service pulls configuration exclusively from the project's .env.
     svc = ResolveService()
@@ -35,5 +36,6 @@ async def resolve_tool(
         constraints=constraints,
         hints=hints,
         debug=debug,
+        strategy=strategy,
     )
 

+ 323 - 24
app/resolve.py

@@ -5,6 +5,10 @@ import os
 import logging
 from dataclasses import dataclass
 from typing import Any
+import time
+import uuid
+import datetime
+import math
 
 from .atlas_model import Entity, Identifier
 from .atlas_store import load_entity_by_subject, save_entity_minimal
@@ -27,11 +31,28 @@ def _entity_iri(atlas_id: str) -> str:
     return f"atlas_data:entity_{atlas_id}"
 
 
-async def _wikidata_lookup(subject: str) -> dict[str, Any] | None:
-    search = WikidataSearch({"search": subject, "limit": 1})
-    result = await search.quick_resolve(subject, limit=1)
-    items = result.get("results", [])
-    return items[0] if items else None
async def _wikidata_lookup(subject: str, language: str = "en", limit: int = 1) -> list[dict[str, Any]]:
    """Run a Wikidata quick-resolve search and return the candidate list.

    Always returns a list (possibly empty), never None, even when the
    search response omits or nulls out the "results" key.
    """
    options = {"search": subject, "limit": limit, "language": language}
    response = await WikidataSearch(options).quick_resolve(subject, limit=limit)
    candidates = response.get("results", [])
    return candidates if candidates else []
+
+
+def _candidate_text(subject: str, wd: dict[str, Any], hints: dict[str, Any] | None = None) -> str:
+    hints = hints or {}
+    aliases = hints.get("aliases") or []
+    parts = [subject, wd.get("label") or "", wd.get("description") or "", " ".join(str(a) for a in aliases)]
+    return " | ".join(part for part in parts if part)
+
+
+def _cosine_similarity(a: list[float] | None, b: list[float] | None) -> float:
+    if not a or not b or len(a) != len(b):
+        return 0.0
+    dot = sum(x * y for x, y in zip(a, b))
+    norm_a = math.sqrt(sum(x * x for x in a))
+    norm_b = math.sqrt(sum(y * y for y in b))
+    if not norm_a or not norm_b:
+        return 0.0
+    return dot / (norm_a * norm_b)
 
 
 def _infer_atlas_type(label: str | None, description: str | None) -> str:
@@ -45,6 +66,67 @@ def _infer_atlas_type(label: str | None, description: str | None) -> str:
     return "atlas:Other"
 
 
def _score_wikidata_candidate(
    subject: str,
    wd: dict[str, Any],
    *,
    context: dict[str, Any] | None = None,
    hints: dict[str, Any] | None = None,
    use_embeddings: bool = False,
    subject_embedding: list[float] | None = None,
    candidate_embedding: list[float] | None = None,
) -> tuple[float, dict[str, float]]:
    """Score one Wikidata candidate against the subject string.

    Returns ``(score, breakdown)`` where *breakdown* maps each contributing
    signal to the weight it added.  The total is capped at 0.99 so that a
    purely heuristic match is never reported as fully certain.
    """
    ctx = context or {}
    hint_map = hints or {}
    breakdown: dict[str, float] = {}

    subject_norm = subject.strip().lower()
    label = (wd.get("label") or "").strip()
    description = (wd.get("description") or "").strip()
    label_norm = label.lower()
    description_norm = description.lower()

    # Label agreement is the dominant signal; exact beats substring.
    if label_norm == subject_norm:
        breakdown["exact_label"] = 0.75
    elif subject_norm and subject_norm in label_norm:
        breakdown["partial_label"] = 0.45

    # Single bonus when any non-empty alias hint equals the candidate label.
    normalized_aliases = (str(alias).strip().lower() for alias in hint_map.get("aliases") or [])
    if any(alias and alias == label_norm for alias in normalized_aliases):
        breakdown["alias_match"] = 0.15

    # Small nudge when the caller's expected coarse type matches our inference.
    expected_type = (hint_map.get("expected_type") or "").strip().lower()
    if expected_type and expected_type in _infer_atlas_type(label, description).lower():
        breakdown["expected_type"] = 0.1

    # Context realm appearing in the description is weak corroboration.
    realm = (ctx.get("realm") or "").strip().lower()
    if realm and realm in description_norm:
        breakdown["realm"] = 0.1

    if wd.get("id"):
        breakdown["has_qid"] = 0.05

    if use_embeddings:
        similarity = _cosine_similarity(subject_embedding, candidate_embedding)
        if similarity > 0:
            # Embedding similarity contributes at most 0.25.
            breakdown["embedding_similarity"] = min(0.25, similarity * 0.25)

    return min(sum(breakdown.values()), 0.99), breakdown
+
+
 def _entity_from_wikidata(subject: str, wd: dict[str, Any]) -> Entity:
     atlas_id = _hash_id(subject)
     label = wd.get("label") or subject
@@ -81,6 +163,61 @@ async def _load_entity(subject: str) -> dict[str, Any] | None:
     return await load_entity_by_subject(subject, DEFAULT_ENDPOINT)
 
 
+def _required_confidence(mode: str, constraints: dict[str, Any]) -> float:
+    requested = constraints.get("min_confidence")
+    if requested is not None:
+        return float(requested)
+    if mode == "quick":
+        return 0.55
+    if mode in {"ranked", "hybrid", "llm_select"}:
+        return 0.85
+    if mode == "interactive":
+        return 0.0
+    return 0.5
+
+
+def _is_ambiguous_subject(subject: str, wd_candidates: list[dict[str, Any]]) -> bool:
+    if len(wd_candidates) < 2:
+        return False
+    subject_norm = subject.strip().lower()
+    labels = [(cand.get("label") or "").strip().lower() for cand in wd_candidates]
+    exact_matches = sum(1 for label in labels if label == subject_norm)
+    return exact_matches >= 2 or (exact_matches == 1 and any(label == subject_norm for label in labels[1:]))
+
+
def _cache_can_satisfy(stored: dict[str, Any], mode: str, constraints: dict[str, Any]) -> bool:
    """Whether a cached entity's confidence meets the bar for this request."""
    cached_confidence = float(stored.get("confidence") or 0.0)
    return cached_confidence >= _required_confidence(mode, constraints)
+
+
+def _debug_decision(
+    *,
+    mode: str,
+    top_confidence: float,
+    auto_accept_threshold: float,
+    interactive_below_threshold: bool,
+    required_confidence: float,
+    used_cache: bool,
+    cache_confidence: float | None = None,
+) -> dict[str, Any]:
+    return {
+        "mode": mode,
+        "top_confidence": top_confidence,
+        "auto_accept_threshold": auto_accept_threshold,
+        "interactive_below_threshold": interactive_below_threshold,
+        "required_confidence": required_confidence,
+        "used_cache": used_cache,
+        "cache_confidence": cache_confidence,
+        "decision": (
+            "cache_hit"
+            if used_cache
+            else "resolved"
+            if top_confidence >= auto_accept_threshold
+            else "ambiguous_below_threshold"
+        ),
+    }
+
+
 @dataclass
 class ResolveService:
     load_entity_fn: Any = _load_entity
@@ -90,11 +227,36 @@ class ResolveService:
     async def resolve(self, *, subject: str, context: dict[str, Any] | None = None,
                       constraints: dict[str, Any] | None = None,
                       hints: dict[str, Any] | None = None,
-                      debug: dict[str, Any] | None = None) -> dict[str, Any]:
+                      debug: dict[str, Any] | None = None,
+                      strategy: dict[str, Any] | None = None) -> dict[str, Any]:
+        context = context or {}
+        constraints = constraints or {}
+        hints = hints or {}
+        debug = debug or {}
+        strategy = strategy or {}
+        language = (context.get("language") or "en").strip() or "en"
+        mode = (strategy.get("mode") or "quick").strip().lower() or "quick"
+        use_embeddings = bool(strategy.get("use_embeddings", False))
+        max_candidates = int(constraints.get("max_candidates") or 5)
+        auto_accept_threshold = float(strategy.get("auto_accept_threshold") or 0.85)
+        interactive_below_threshold = bool(strategy.get("interactive_below_threshold", True))
+        required_confidence = _required_confidence(mode, constraints)
         try:
+            request_id = str(uuid.uuid4())
+            ts = datetime.datetime.now(datetime.timezone.utc).isoformat()
+            start = time.time()
             subject = (subject or "").strip()
             if not subject:
-                return {"status": "not_found"}
+                return {
+                    "status": "not_found",
+                    "entity": None,
+                    "confidence": 0.0,
+                    "candidates": [],
+                    "ambiguity": None,
+                    "resolution_path": [],
+                    "meta": {"request_id": request_id, "timestamp": ts, "duration_ms": 0},
+                    "error": None,
+                }
 
             if DEBUG_LOGS:
                 logger.info("resolve start subject=%s", subject)
@@ -103,22 +265,97 @@ class ResolveService:
             if stored:
                 if DEBUG_LOGS:
                     logger.info("store hit subject=%s atlas_id=%s", subject, stored.get("atlas_id"))
+                stored_confidence = float(stored.get("confidence") or (0.9 if not stored.get("needs_curation", False) else 0.6))
+                if _cache_can_satisfy(stored, mode, constraints):
+                    return {
+                        "status": "resolved",
+                        "entity": {
+                            "id": stored.get("atlas_id"),
+                            "label": stored.get("label"),
+                            "type": stored.get("type"),
+                            "description": stored.get("description"),
+                            "source": None,
+                            "uri": None,
+                            "attributes": {},
+                        },
+                        "confidence": stored_confidence,
+                        "candidates": [],
+                        "ambiguity": None,
+                        "resolution_path": [
+                            {"phase": "cache", "action": "store_hit", "source": "triple_store"}
+                        ],
+                        "meta": {
+                            "request_id": request_id,
+                            "timestamp": ts,
+                            "duration_ms": int((time.time() - start) * 1000),
+                            **({"debug": _debug_decision(mode=mode, top_confidence=stored_confidence, auto_accept_threshold=auto_accept_threshold, interactive_below_threshold=interactive_below_threshold, required_confidence=required_confidence, used_cache=True, cache_confidence=stored_confidence)} if debug.get("include_explanations") else {}),
+                        },
+                        "error": None,
+                    }
+
+                if DEBUG_LOGS:
+                    logger.info("cache confidence too low subject=%s mode=%s confidence=%.3f required=%.3f", subject, mode, stored_confidence, required_confidence)
+
+            wd_candidates = await self.wikidata_lookup_fn(
+                subject,
+                language,
+                1 if mode == "quick" else max(1, min(max_candidates, 10)),
+            )
+            if not wd_candidates:
+                if DEBUG_LOGS:
+                    logger.info("wikidata miss subject=%s mode=%s", subject, mode)
                 return {
-                    "status": "resolved",
-                    "atlas_id": stored.get("atlas_id"),
-                    "label": stored.get("label"),
-                    "type": stored.get("type"),
-                    "wikidata_id": stored.get("wikidata_id"),
-                    "alias": stored.get("alias") or subject,
+                    "status": "not_found",
+                    "entity": {
+                        "id": None,
+                        "label": None,
+                        "type": None,
+                        "description": None,
+                        "source": None,
+                        "uri": None,
+                        "attributes": {},
+                    },
+                    "confidence": 0.0,
+                    "candidates": [],
+                    "ambiguity": None,
+                    "resolution_path": [
+                        {"phase": "query", "action": "wikidata_quick_resolve", "source": "remote"}
+                    ],
+                    "meta": {
+                        "request_id": request_id,
+                        "timestamp": ts,
+                        "duration_ms": int((time.time() - start) * 1000),
+                        **({"debug": _debug_decision(mode=mode, top_confidence=0.0, auto_accept_threshold=auto_accept_threshold, interactive_below_threshold=interactive_below_threshold, required_confidence=required_confidence, used_cache=False)} if debug.get("include_explanations") else {}),
+                    },
+                    "error": None,
                 }
 
-            wd = await self.wikidata_lookup_fn(subject)
-            if not wd:
-                if DEBUG_LOGS:
-                    logger.info("wikidata miss subject=%s", subject)
-                return {"status": "not_found"}
+            ranked_candidates = []
+            subject_embedding = None
+            embedder = None
+            if use_embeddings:
+                embedder = WikidataSearch()
+                subject_embedding = await embedder.embed_text(_candidate_text(subject, {"label": subject, "description": "", "aliases": []}, hints))
+            for wd in wd_candidates:
+                candidate_embedding = None
+                if use_embeddings and embedder is not None:
+                    candidate_embedding = await embedder.embed_text(_candidate_text(subject, wd, hints))
+                confidence, breakdown = _score_wikidata_candidate(
+                    subject,
+                    wd,
+                    context=context,
+                    hints=hints,
+                    use_embeddings=use_embeddings,
+                    subject_embedding=subject_embedding,
+                    candidate_embedding=candidate_embedding,
+                )
+                ranked_candidates.append({**wd, "confidence": confidence, "score_breakdown": breakdown})
+            ranked_candidates.sort(key=lambda item: ((item.get("confidence") or 0.0), item.get("label") or ""), reverse=True)
 
+            wd = ranked_candidates[0]
             entity = _entity_from_wikidata(subject, wd)
+            if mode == "quick":
+                wd["confidence"] = min(wd.get("confidence", 0.0), 0.6)
             if DEBUG_LOGS:
                 logger.info(
                     "wikidata hit subject=%s qid=%s atlas_id=%s type=%s",
@@ -129,17 +366,79 @@ class ResolveService:
                 )
             await self.persist_entity_fn(entity)
 
+            resolution_path = [
+                {"phase": "query", "action": "wikidata_quick_resolve", "source": "remote"},
+                {"phase": "ranking", "action": f"mode_{mode}", "source": "resolver"},
+            ]
+            if use_embeddings:
+                resolution_path.append(
+                    {
+                        "phase": "ranking",
+                        "action": "embedding_similarity",
+                        "source": "ollama",
+                        "note": "embedding similarity used to score candidate order",
+                    }
+                )
+
+            status = "ambiguous"
+            ambiguity = {"reason": "pre-maintenance", "dimension": 0.5}
+            if mode == "quick":
+                status = "ambiguous"
+            elif (wd.get("confidence") or 0.0) >= auto_accept_threshold:
+                status = "resolved"
+                ambiguity = None
+            elif interactive_below_threshold:
+                status = "ambiguous"
+
             return {
-                "status": "resolved",
-                "atlas_id": entity.id,
-                "label": entity.label,
-                "type": entity.type,
-                "wikidata_id": wd.get("id"),
-                "alias": subject,
+                "status": status,
+                "entity": {
+                    "id": entity.id,
+                    "label": entity.label,
+                    "type": entity.type,
+                    "description": entity.description,
+                    "source": "wikidata",
+                    "uri": None,
+                    "attributes": {
+                        "wikidata_id": wd.get("id"),
+                        "alias": subject,
+                    },
+                },
+                "confidence": wd.get("confidence", 0.6),
+                "candidates": [
+                    {
+                        "id": cand.get("id"),
+                        "label": cand.get("label"),
+                        "type": cand.get("type"),
+                        "source": "wikidata",
+                        "confidence": cand.get("confidence", 0.0),
+                        "score_breakdown": cand.get("score_breakdown", {}) if debug.get("include_explanations") else {},
+                    }
+                    for cand in ranked_candidates
+                ] if mode in {"ranked", "interactive", "hybrid", "llm_select"} else [],
+                "ambiguity": ambiguity,
+                "resolution_path": resolution_path + [{"phase": "persistence", "action": "store_save_minimal", "source": "triple_store"}],
+                "meta": {
+                    "request_id": request_id,
+                    "timestamp": ts,
+                    "duration_ms": int((time.time() - start) * 1000),
+                    **({"debug": _debug_decision(mode=mode, top_confidence=wd.get("confidence", 0.0), auto_accept_threshold=auto_accept_threshold, interactive_below_threshold=interactive_below_threshold, required_confidence=required_confidence, used_cache=False)} if debug.get("include_explanations") else {}),
+                },
+                "error": None,
             }
         except Exception as exc:
             detail = " | ".join(_flatten_exception_details(exc))
             return {
                 "status": "error",
+                "entity": None,
+                "confidence": 0.0,
+                "candidates": [],
+                "ambiguity": None,
+                "resolution_path": [],
+                "meta": {
+                    "request_id": str(uuid.uuid4()),
+                    "timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
+                    "duration_ms": 0,
+                },
                 "error": {"code": "RESOLVE_FAILED", "message": detail},
             }

+ 48 - 16
app/wikidata.py

@@ -16,11 +16,15 @@ WIKIDATA_USER_AGENT = os.getenv(
     "Atlas/1.0 (contact: lukas.goldschmidt+atlas@googlemail.com)",
 )
 
+# The Wikidata reconciliation endpoint has moved more than once, so keep it configurable.
 WIKIDATA_QUICK_RESOLVE_BASE_URL = os.getenv(
     "ATLAS_WIKIDATA_QUICK_RESOLVE_URL",
     "https://wikidata.reconci.link/en/api",
 )
 
+OLLAMA_BASE_URL = os.getenv("OLLAMA_BASE_URL", "http://192.168.0.200:11434")
+OLLAMA_EMBEDDING_MODEL = os.getenv("OLLAMA_EMBEDDING_MODEL", "nomic-embed-text")
+
 
 @dataclass
 class WikidataOptions:
@@ -44,6 +48,21 @@ def _build_url(opts: WikidataOptions, params: dict[str, Any]) -> str:
     return f"https://{opts.apiHost}{opts.apiPath}?{query}"
 
 
def _client_kwargs() -> dict[str, Any]:
    """Shared httpx.AsyncClient settings for all Wikidata HTTP calls."""
    headers = {
        "Accept": "application/json",
        "User-Agent": WIKIDATA_USER_AGENT,
    }
    return {
        "timeout": 20,
        "headers": headers,
        "follow_redirects": True,
    }
+
+
def _ollama_client_kwargs() -> dict[str, Any]:
    """httpx.AsyncClient settings for talking to the configured Ollama server."""
    return dict(timeout=20, base_url=OLLAMA_BASE_URL)
+
+
 class WikidataSearch:
     def __init__(self, options: dict[str, Any] | None = None, *, client: httpx.AsyncClient | None = None):
         self.defaultOptions = WikidataOptions()
@@ -64,6 +83,22 @@ class WikidataSearch:
     def clearPropertyCache(self) -> None:
         PROPERTY_CACHE.clear()
 
+    async def embed_text(self, text: str) -> list[float] | None:
+        client = self._client or httpx.AsyncClient(**_ollama_client_kwargs())
+        close_client = self._client is None
+        try:
+            resp = await client.post(
+                "/api/embeddings",
+                json={"model": OLLAMA_EMBEDDING_MODEL, "prompt": text},
+            )
+            resp.raise_for_status()
+            data = resp.json()
+            embedding = data.get("embedding")
+            return embedding if isinstance(embedding, list) else None
+        finally:
+            if close_client:
+                await client.aclose()
+
     async def search(self) -> dict[str, Any]:
         if not self.validateOptions():
             return {"results": [], "error": "Bad options"}
@@ -78,10 +113,7 @@ class WikidataSearch:
         }
         url = _build_url(self.options, params)
 
-        client = self._client or httpx.AsyncClient(
-            timeout=20,
-            headers={"Accept": "application/json", "User-Agent": WIKIDATA_USER_AGENT},
-        )
+        client = self._client or httpx.AsyncClient(**_client_kwargs())
         close_client = self._client is None
         try:
             resp = await client.get(url)
@@ -115,10 +147,7 @@ class WikidataSearch:
         params = {
             "queries": json.dumps({"q0": {"query": query, "limit": limit}}),
         }
-        client = self._client or httpx.AsyncClient(
-            timeout=20,
-            headers={"Accept": "application/json", "User-Agent": WIKIDATA_USER_AGENT},
-        )
+        client = self._client or httpx.AsyncClient(**_client_kwargs())
         close_client = self._client is None
         try:
             resp = await client.get(endpoint, params=params)
@@ -144,6 +173,15 @@ class WikidataSearch:
 
         return {"results": results}
 
+    async def candidate_embeddings(self, candidates: list[dict[str, Any]]) -> list[dict[str, Any]]:
+        out = []
+        for cand in candidates:
+            text_parts = [cand.get("label") or "", cand.get("description") or "", " ".join(cand.get("aliases") or [])]
+            text = " | ".join(part for part in text_parts if part)
+            embedding = await self.embed_text(text)
+            out.append({**cand, "embedding": embedding})
+        return out
+
     def search_sync(self) -> dict[str, Any]:
         return asyncio.run(self.search())
 
@@ -166,10 +204,7 @@ class WikidataSearch:
         }
         url = _build_url(self.options, params)
 
-        client = self._client or httpx.AsyncClient(
-            timeout=20,
-            headers={"Accept": "application/json", "User-Agent": WIKIDATA_USER_AGENT},
-        )
+        client = self._client or httpx.AsyncClient(**_client_kwargs())
         close_client = self._client is None
         try:
             resp = await client.get(url)
@@ -267,10 +302,7 @@ class WikidataSearch:
             PROPERTY_CACHE.setdefault(prop_id, prop_id)
 
     async def get_entity_data(self, qid: str) -> dict[str, Any]:
-        client = self._client or httpx.AsyncClient(
-            timeout=20,
-            headers={"Accept": "application/json", "User-Agent": WIKIDATA_USER_AGENT},
-        )
+        client = self._client or httpx.AsyncClient(**_client_kwargs())
         close_client = self._client is None
         try:
             resp = await client.get(

+ 301 - 21
ontology/atlas.ttl

@@ -1,17 +1,49 @@
-@prefix atlas:  <http://world.eu.org/atlas_ontology#> .
-@prefix owl:    <http://www.w3.org/2002/07/owl#> .
-@prefix rdf:    <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
-@prefix rdfs:   <http://www.w3.org/2000/01/rdf-schema#> .
-@prefix xsd:    <http://www.w3.org/2001/XMLSchema#> .
-@prefix schema: <http://schema.org/> .
-@prefix wd:     <http://www.wikidata.org/entity/> .
-
-atlas:Ontology a owl:Ontology ;
-  rdfs:label   "Atlas Ontology" ;
-  rdfs:comment "Entity resolution ontology for Atlas." .
-
-
-### Classes
+@prefix atlas:   <http://world.eu.org/atlas_ontology#> .
+@prefix owl:     <http://www.w3.org/2002/07/owl#> .
+@prefix rdf:     <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix rdfs:    <http://www.w3.org/2000/01/rdf-schema#> .
+@prefix xsd:     <http://www.w3.org/2001/XMLSchema#> .
+@prefix schema:  <http://schema.org/> .
+@prefix wd:      <http://www.wikidata.org/entity/> .
+@prefix dcterms: <http://purl.org/dc/terms/> .
+
+# =============================================================================
+# Atlas Ontology  v1.1.0
+#
+# Changelog
+#   1.0.0  Initial release — core classes, flat identifiers, alias, provenance,
+#          claim, curate flag, seven top-level entity types.
+#   1.1.0  Expanded type catalog (fully backward-compatible):
+#            - Location hierarchy: Continent, Country, Region, PopulatedPlace,
+#              Neighbourhood, NaturalFeature, AdministrativeArea
+#            - Organization subtypes: PoliticalParty, MilitaryUnit,
+#              MediaOrganization
+#            - CreativeWork subtypes: Film, Book, MusicAlbum, TVSeries,
+#              VideoGame
+#            - Product subtypes: Drug, Food
+#            - New top-level types: FinancialInstrument (with PublicCompany,
+#              StockIndex, Commodity, Cryptocurrency, Currency), Animal,
+#              Disease, Building, Award, Sport, FictionalCharacter,
+#              EthnicGroup, Concept
+#          Three new relationship properties:
+#            - atlas:locatedIn   (transitive)
+#            - atlas:bordersWith (symmetric)
+#            - atlas:memberOf
+# =============================================================================
+
+<http://world.eu.org/atlas_ontology>
+  a owl:Ontology ;
+  rdfs:label      "Atlas Ontology" ;
+  rdfs:comment    "Entity resolution ontology for Atlas." ;
+  owl:versionIRI  <http://world.eu.org/atlas_ontology/1.1.0> ;
+  owl:versionInfo "1.1.0" ;
+  dcterms:created  "2026-04-01"^^xsd:date ;
+  dcterms:modified "2026-04-06"^^xsd:date .
+
+
+# =============================================================================
+# INFRASTRUCTURE CLASSES  (1.0.0 — unchanged)
+# =============================================================================
 
 atlas:Entity a owl:Class ;
   rdfs:label   "Entity" ;
@@ -38,7 +70,9 @@ atlas:CurateFlag a owl:Class ;
   rdfs:comment "Signals that an entity needs human review." .
 
 
-### Object properties
+# =============================================================================
+# OBJECT PROPERTIES  (1.0.0 — unchanged, three new ones added in 1.1.0)
+# =============================================================================
 
 atlas:hasCanonicalType a owl:ObjectProperty ;
   rdfs:domain atlas:Entity ;
@@ -85,8 +119,30 @@ atlas:hasCurateFlag a owl:ObjectProperty ;
   rdfs:range  atlas:CurateFlag ;
   rdfs:label  "has curate flag" .
 
+# --- NEW in 1.1.0 ---
+
+atlas:locatedIn a owl:ObjectProperty, owl:TransitiveProperty ;
+  rdfs:domain atlas:Entity ;
+  rdfs:range  atlas:Entity ;
+  rdfs:label  "located in" ;
+  rdfs:comment "Spatial containment, transitive. Enables zoom-in/zoom-out traversal: PopulatedPlace locatedIn Region locatedIn Country locatedIn Continent." .
+
+atlas:bordersWith a owl:ObjectProperty, owl:SymmetricProperty ;
+  rdfs:domain atlas:Entity ;
+  rdfs:range  atlas:Entity ;
+  rdfs:label  "borders with" ;
+  rdfs:comment "Shared land or maritime border. Symmetric: if A bordersWith B then B bordersWith A." .
 
-### Datatype properties
+atlas:memberOf a owl:ObjectProperty ;
+  rdfs:domain atlas:Entity ;
+  rdfs:range  atlas:Entity ;
+  rdfs:label  "member of" ;
+  rdfs:comment "Membership or affiliation. Intended uses: Person to Organization, Person to EthnicGroup, Organization to Organization. Not for type classification — use atlas:hasCanonicalType for that." .
+
+
+# =============================================================================
+# DATATYPE PROPERTIES  (1.0.0 — unchanged)
+# =============================================================================
 
 atlas:atlasId a owl:DatatypeProperty ;
   rdfs:domain atlas:Entity ;
@@ -171,41 +227,265 @@ atlas:rawJson a owl:DatatypeProperty ;
   rdfs:domain atlas:Entity ;
   rdfs:range  xsd:string ;
   rdfs:label  "raw json" ;
-  rdfs:comment "Opaque JSON cache blob from any source. Source is recorded in the associated Provenance node." .
-
-
-### Canonical type catalog
+  rdfs:comment "Opaque JSON cache blob. Source is recorded inside the blob." .
+
+
+# =============================================================================
+# TYPE CATALOG
+#
+# LLM extraction note:
+#   For cheap / small models, pass only the seven top-level types (Person,
+#   Organization, Location, CreativeWork, Event, Product, Other plus the new
+#   top-level additions). Use a Wikidata QID lookup to promote to a subtype
+#   in a second pass — do not ask a small model to choose among 35 types.
+#
+# Virtuoso inference note:
+#   rdfs:subClassOf* traversal requires inference to be enabled:
+#   OPTION(INFERENCE 'atlas')
+#   A query for atlas:Location will then also match atlas:Country etc.
+# =============================================================================
+
+# -----------------------------------------------------------------------------
+# Person  (1.0.0)
+# -----------------------------------------------------------------------------
 
 atlas:Person a owl:Class ;
   rdfs:subClassOf atlas:EntityType ;
   rdfs:label "Person" ;
   owl:sameAs schema:Person, wd:Q5 .
 
+
+# -----------------------------------------------------------------------------
+# Organization  (1.0.0) + subtypes (1.1.0)
+# -----------------------------------------------------------------------------
+
 atlas:Organization a owl:Class ;
   rdfs:subClassOf atlas:EntityType ;
   rdfs:label "Organization" ;
   owl:sameAs schema:Organization, wd:Q43229 .
 
+atlas:PoliticalParty a owl:Class ;
+  rdfs:subClassOf atlas:Organization ;
+  rdfs:label "Political Party" ;
+  owl:sameAs wd:Q7278 .                    # 1.1.0
+
+atlas:MilitaryUnit a owl:Class ;
+  rdfs:subClassOf atlas:Organization ;
+  rdfs:label "Military Unit" ;
+  owl:sameAs wd:Q176799 .                  # 1.1.0
+
+atlas:MediaOrganization a owl:Class ;
+  rdfs:subClassOf atlas:Organization ;
+  rdfs:label "Media Organization" ;
+  owl:sameAs wd:Q4830453 .                 # 1.1.0
+
+
+# -----------------------------------------------------------------------------
+# Location  (1.0.0) + hierarchy (1.1.0)
+# Relate instances with atlas:locatedIn and atlas:bordersWith.
+# -----------------------------------------------------------------------------
+
 atlas:Location a owl:Class ;
   rdfs:subClassOf atlas:EntityType ;
-  rdfs:label "Location" ;
+  rdfs:label   "Location" ;
+  rdfs:comment "Abstract parent. Use a subtype where possible." ;
   owl:sameAs schema:Place, wd:Q17334923 .
 
+atlas:Continent a owl:Class ;
+  rdfs:subClassOf atlas:Location ;
+  rdfs:label "Continent" ;
+  owl:sameAs wd:Q5107 .                    # 1.1.0
+
+atlas:Country a owl:Class ;
+  rdfs:subClassOf atlas:Location ;
+  rdfs:label "Country" ;
+  owl:sameAs schema:Country, wd:Q6256 .   # 1.1.0
+
+atlas:Region a owl:Class ;
+  rdfs:subClassOf atlas:Location ;
+  rdfs:label   "Region" ;
+  rdfs:comment "State, province, county or similar administrative subdivision." ;
+  owl:sameAs wd:Q82794 .                   # 1.1.0
+
+atlas:PopulatedPlace a owl:Class ;
+  rdfs:subClassOf atlas:Location ;
+  rdfs:label   "Populated Place" ;
+  rdfs:comment "City, town or village." ;
+  owl:sameAs wd:Q515 .                     # 1.1.0
+
+atlas:Neighbourhood a owl:Class ;
+  rdfs:subClassOf atlas:Location ;
+  rdfs:label   "Neighbourhood" ;
+  rdfs:comment "Borough, district, arrondissement or similar urban subdivision." ;
+  owl:sameAs wd:Q123705 .                  # 1.1.0
+
+atlas:NaturalFeature a owl:Class ;
+  rdfs:subClassOf atlas:Location ;
+  rdfs:label   "Natural Feature" ;
+  rdfs:comment "River, mountain, ocean, lake or other natural geographic feature." ;
+  owl:sameAs wd:Q35145263 .               # 1.1.0
+
+atlas:AdministrativeArea a owl:Class ;
+  rdfs:subClassOf atlas:Location ;
+  rdfs:label   "Administrative Area" ;
+  rdfs:comment "Fallback for political subdivisions that do not fit a more specific subtype." ;
+  owl:sameAs wd:Q56061 .                   # 1.1.0
+
+
+# -----------------------------------------------------------------------------
+# CreativeWork  (1.0.0) + subtypes (1.1.0)
+# -----------------------------------------------------------------------------
+
 atlas:CreativeWork a owl:Class ;
   rdfs:subClassOf atlas:EntityType ;
   rdfs:label "Creative Work" ;
   owl:sameAs schema:CreativeWork, wd:Q17537576 .
 
+atlas:Film a owl:Class ;
+  rdfs:subClassOf atlas:CreativeWork ;
+  rdfs:label "Film" ;
+  owl:sameAs schema:Movie, wd:Q11424 .     # 1.1.0
+
+atlas:Book a owl:Class ;
+  rdfs:subClassOf atlas:CreativeWork ;
+  rdfs:label "Book" ;
+  owl:sameAs schema:Book, wd:Q571 .        # 1.1.0
+
+atlas:MusicAlbum a owl:Class ;
+  rdfs:subClassOf atlas:CreativeWork ;
+  rdfs:label "Music Album" ;
+  owl:sameAs schema:MusicAlbum, wd:Q482994 . # 1.1.0
+
+atlas:TVSeries a owl:Class ;
+  rdfs:subClassOf atlas:CreativeWork ;
+  rdfs:label "TV Series" ;
+  owl:sameAs schema:TVSeries, wd:Q5398426 . # 1.1.0
+
+atlas:VideoGame a owl:Class ;
+  rdfs:subClassOf atlas:CreativeWork ;
+  rdfs:label "Video Game" ;
+  owl:sameAs schema:VideoGame, wd:Q7889 .  # 1.1.0
+
+
+# -----------------------------------------------------------------------------
+# Event  (1.0.0)
+# -----------------------------------------------------------------------------
+
 atlas:Event a owl:Class ;
   rdfs:subClassOf atlas:EntityType ;
   rdfs:label "Event" ;
   owl:sameAs schema:Event, wd:Q1656682 .
 
+
+# -----------------------------------------------------------------------------
+# Product  (1.0.0) + subtypes (1.1.0)
+# -----------------------------------------------------------------------------
+
 atlas:Product a owl:Class ;
   rdfs:subClassOf atlas:EntityType ;
   rdfs:label "Product" ;
   owl:sameAs schema:Product, wd:Q2424752 .
 
+atlas:Drug a owl:Class ;
+  rdfs:subClassOf atlas:Product ;
+  rdfs:label "Drug" ;
+  owl:sameAs wd:Q8386 .                    # 1.1.0
+
+atlas:Food a owl:Class ;
+  rdfs:subClassOf atlas:Product ;
+  rdfs:label "Food" ;
+  owl:sameAs wd:Q2095 .                    # 1.1.0
+
+
+# -----------------------------------------------------------------------------
+# FinancialInstrument  (1.1.0) — new top-level type
+# -----------------------------------------------------------------------------
+
+atlas:FinancialInstrument a owl:Class ;
+  rdfs:subClassOf atlas:EntityType ;
+  rdfs:label "Financial Instrument" ;
+  owl:sameAs wd:Q182780 .
+
+atlas:PublicCompany a owl:Class ;
+  rdfs:subClassOf atlas:FinancialInstrument ;
+  rdfs:label   "Public Company" ;
+  rdfs:comment "Company traded on a public exchange. Has a ticker and ISIN." ;
+  owl:sameAs wd:Q891723 .
+
+atlas:StockIndex a owl:Class ;
+  rdfs:subClassOf atlas:FinancialInstrument ;
+  rdfs:label   "Stock Index" ;
+  rdfs:comment "Market index such as S&P 500, DAX or Nikkei." ;
+  owl:sameAs wd:Q181600 .
+
+atlas:Commodity a owl:Class ;
+  rdfs:subClassOf atlas:FinancialInstrument ;
+  rdfs:label   "Commodity" ;
+  rdfs:comment "Physical good traded on an exchange: gold, oil, wheat, etc." ;
+  owl:sameAs wd:Q317088 .
+
+atlas:Cryptocurrency a owl:Class ;
+  rdfs:subClassOf atlas:FinancialInstrument ;
+  rdfs:label "Cryptocurrency" ;
+  owl:sameAs wd:Q13479982 .
+
+atlas:Currency a owl:Class ;
+  rdfs:subClassOf atlas:FinancialInstrument ;
+  rdfs:label   "Currency" ;
+  rdfs:comment "Fiat currency. Distinct from Cryptocurrency." ;
+  owl:sameAs schema:Currency, wd:Q8142 .
+
+
+# -----------------------------------------------------------------------------
+# New top-level types  (1.1.0)
+# -----------------------------------------------------------------------------
+
+atlas:Animal a owl:Class ;
+  rdfs:subClassOf atlas:EntityType ;
+  rdfs:label "Animal" ;
+  owl:sameAs wd:Q729 .
+
+atlas:Disease a owl:Class ;
+  rdfs:subClassOf atlas:EntityType ;
+  rdfs:label "Disease" ;
+  owl:sameAs schema:MedicalCondition, wd:Q12136 .
+
+atlas:Building a owl:Class ;
+  rdfs:subClassOf atlas:EntityType ;
+  rdfs:label "Building" ;
+  owl:sameAs wd:Q41176 .
+
+atlas:Award a owl:Class ;
+  rdfs:subClassOf atlas:EntityType ;
+  rdfs:label "Award" ;
+  owl:sameAs wd:Q618779 .
+
+atlas:Sport a owl:Class ;
+  rdfs:subClassOf atlas:EntityType ;
+  rdfs:label "Sport" ;
+  owl:sameAs wd:Q349 .
+
+atlas:FictionalCharacter a owl:Class ;
+  rdfs:subClassOf atlas:EntityType ;
+  rdfs:label "Fictional Character" ;
+  owl:sameAs wd:Q95074 .
+
+atlas:EthnicGroup a owl:Class ;
+  rdfs:subClassOf atlas:EntityType ;
+  rdfs:label "Ethnic Group" ;
+  owl:sameAs wd:Q41710 .
+
+atlas:Concept a owl:Class ;
+  rdfs:subClassOf atlas:EntityType ;
+  rdfs:label   "Concept" ;
+  rdfs:comment "Abstract idea that does not fit a more specific type." ;
+  owl:sameAs wd:Q151885 .
+
+
+# -----------------------------------------------------------------------------
+# Other  (1.0.0) — fallback, use when no type above fits
+# -----------------------------------------------------------------------------
+
 atlas:Other a owl:Class ;
   rdfs:subClassOf atlas:EntityType ;
   rdfs:label "Other" .

+ 25 - 0
ontology_suggestions.txt

@@ -0,0 +1,25 @@
+<http://wikidata.dbpedia.org/resource/Q1000415> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Event> .
+<http://wikidata.dbpedia.org/resource/Q1000501> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Software> .
+<http://wikidata.dbpedia.org/resource/Q1000726> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/ChemicalCompound> .
+<http://wikidata.dbpedia.org/resource/Q1000867> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Politician> .
+<http://wikidata.dbpedia.org/resource/Q1000888> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Media> .
+<http://wikidata.dbpedia.org/resource/Q1001082> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Organisation> .
+<http://wikidata.dbpedia.org/resource/Q1001106> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Activity> .
+<http://wikidata.dbpedia.org/resource/Q1001329> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Building> .
+<http://wikidata.dbpedia.org/resource/Q1001378> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/AcademicSubject> .
+<http://wikidata.dbpedia.org/resource/Q1002195> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Disease> .
+<http://wikidata.dbpedia.org/resource/Q1002228> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Profession> .
+<http://wikidata.dbpedia.org/resource/Q1002439> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Food> .
+<http://wikidata.dbpedia.org/resource/Q1003214> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Weapon> .
+<http://wikidata.dbpedia.org/resource/Q1003889> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Contest> .
+<http://wikidata.dbpedia.org/resource/Q1004875> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/EthnicGroup> .
+<http://wikidata.dbpedia.org/resource/Q1020203> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Mineral> .
+<http://wikidata.dbpedia.org/resource/Q1022626> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Ship> .
+<http://wikidata.dbpedia.org/resource/Q102356> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/MilitaryUnit> .
+<http://wikidata.dbpedia.org/resource/Q10291472> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Animal> .
+<http://wikidata.dbpedia.org/resource/Q1037810> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Drug> .
+<http://wikidata.dbpedia.org/resource/Q1050285> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/FictionalCharacter> .
+<http://wikidata.dbpedia.org/resource/Q1050644> <http://www.w3.org/2000/01/rdf-schema#subClassOf> <http://dbpedia.org/ontology/Aircraft> .
+
+
+# TODO: add City / Country / Region subdivision suggestions

+ 16 - 1
resolve_scheme.md

@@ -10,10 +10,16 @@ The request is a JSON object with the following top‑level keys:
 |------|------|-------------|
 | **subject** | string | The entity to resolve. **Required** |
 | **context** | object | Optional context to narrow the search.  | 
-| | `realm` | string | e.g. *"music"*, *"geography"* |
+| | `realm` | string | Resolution realm, e.g. `internal`, `external`, `news`, `music`, `geography` |
 | | `provenance` | string | Source of the query (e.g. user, system) |
 | | `time` | string | ISO‑8601 timestamp |
 | | `language` | string | BCP‑47 language tag |
+| **strategy** | object | Controls how the resolver should behave. |
+| | `mode` | string | One of `quick`, `ranked`, `llm_select`, `interactive`, `hybrid` |
+| | `use_embeddings` | boolean | Use embedding similarity while ranking candidates |
+| | `auto_accept_threshold` | number (0-1) | Confidence threshold for auto-resolving; useful for `ranked`/`hybrid` |
+| | `interactive_below_threshold` | boolean | Return candidates instead of auto-picking when confidence is low |
+| | `use_llm_fallback` | boolean | Allow a cheap LLM pass when symbolic ranking is inconclusive |
 | **constraints** | object | Rules to apply while resolving. |
 | | `deterministic` | boolean | If true, the tool must always return the same result |
 | | `require_authority` | boolean | Require a trusted source |
@@ -58,4 +64,13 @@ The tool replies with a JSON object containing:
 }
 ```
 
+### Suggested mode semantics
+
+- `quick`: current fast path, take the top remote candidate.
+- `ranked`: fetch several candidates, score them with explicit heuristics, then choose or return ambiguity.
+- `llm_select`: fetch several candidates and let a cheap model choose.
+- `interactive`: always return candidate choices to the client.
+- `hybrid`: symbolic ranking first, then LLM fallback if needed.
+- `use_embeddings` (a flag, not a mode — combines with any mode above): when true, rank by lexical + embedding similarity.
+
 Feel free to use this file as the reference for any implementation of the resolve tool.

+ 37 - 0
scripts/run_resolve_harness.py

@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+import json
+import subprocess
+from pathlib import Path
+
+ROOT = Path(__file__).resolve().parents[1]
+FIXTURES = ROOT / "tests" / "fixtures" / "resolve_harness_cases.json"
+
+
+def main() -> int:
+    cases = json.loads(FIXTURES.read_text())
+    print(f"Loaded {len(cases)} resolve harness cases")
+    print()
+    for case in cases:
+        print(f"=== {case['name']} ===")
+        cmd = [
+            "mcporter",
+            "--config",
+            "$CONFIG",
+            "call",
+            "atlas",
+            "resolve",
+            f"subject={json.dumps(case['subject'])}",
+            f"context={json.dumps(case['context'])}",
+            f"constraints={json.dumps(case['constraints'])}",
+            f"hints={json.dumps(case['hints'])}",
+            f"strategy={json.dumps(case['strategy'])}",
+            'debug={"include_candidates":true,"include_explanations":true}',
+        ]
+        print(" ".join(cmd))
+        print()
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())

+ 53 - 0
test_resolve.sh

@@ -0,0 +1,53 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+cd "$(dirname "$0")"
+
+if [[ -x .venv/bin/pytest ]]; then
+  PYTEST=.venv/bin/pytest
+else
+  PYTEST=pytest
+fi
+
+python3 - <<'PY'
+import json
+from pathlib import Path
+cases = json.loads(Path('tests/fixtures/resolve_harness_cases.json').read_text())
+print('\nResolver test runner')
+print(f'  project : {Path.cwd()}')
+print(f'  cases   : {len(cases)}')
+print('\nSubjects under test:')
+for case in cases:
+    print(f"  - {case['name']}: {case['subject']} ({case['context'].get('language','en')})")
+print('\nWhat is tested:')
+print('  - cache hits resolve when confidence is sufficient')
+print('  - low-confidence cache triggers re-resolution in ranked mode')
+print('  - interactive mode returns all candidates below threshold')
+print('  - embedding scoring is recorded when enabled')
+print('\nRunning tests...\n')
+PY
+
+REPORT=$(mktemp)
+trap 'rm -f "$REPORT"' EXIT
+
+"$PYTEST" -q --junitxml="$REPORT" \
+  tests/test_resolve_tool.py \
+  tests/test_resolve_strategies.py \
+  "$@"
+
+python3 - "$REPORT" <<'PY'
+import sys
+import xml.etree.ElementTree as ET
+from pathlib import Path
+report = Path(sys.argv[1])
+root = ET.parse(report).getroot()
+rows = []
+for case in root.iter('testcase'):
+    rows.append((case.get('classname',''), case.get('name',''), float(case.get('time','0'))))
+rows.sort(key=lambda r: r[2], reverse=True)
+print('\nPer-test timings:')
+print(f"{'test':60} {'seconds':>8}")
+print('-' * 70)
+for cls, name, secs in rows:
+    print(f"{(cls + '::' + name):60} {secs:8.3f}")
+PY

+ 10 - 0
tests/fixtures/resolve_harness_cases.json

@@ -0,0 +1,10 @@
+[
+  {"name":"en_exact_country_quick","subject":"Australia","context":{"realm":"external","language":"en","provenance":"user"},"constraints":{"max_candidates":5,"min_confidence":0.55},"hints":{"expected_type":"location","aliases":[]},"strategy":{"mode":"quick","auto_accept_threshold":0.85,"interactive_below_threshold":true,"use_embeddings":false}},
+  {"name":"en_exact_country_ranked_embeddings","subject":"Australia","context":{"realm":"external","language":"en","provenance":"user"},"constraints":{"max_candidates":5,"min_confidence":0.85},"hints":{"expected_type":"location","aliases":["Commonwealth of Australia"]},"strategy":{"mode":"ranked","auto_accept_threshold":0.85,"interactive_below_threshold":true,"use_embeddings":true}},
+  {"name":"en_ambiguous_place_interactive","subject":"Georgia","context":{"realm":"external","language":"en","provenance":"user"},"constraints":{"max_candidates":5,"min_confidence":0.85},"hints":{"expected_type":"location","aliases":[]},"strategy":{"mode":"interactive","auto_accept_threshold":0.9,"interactive_below_threshold":true,"use_embeddings":true}},
+  {"name":"en_fictional_subject","subject":"Sherlock Holmes","context":{"realm":"external","language":"en","provenance":"user"},"constraints":{"max_candidates":5,"min_confidence":0.85},"hints":{"expected_type":"person","aliases":["Detective Holmes"]},"strategy":{"mode":"ranked","auto_accept_threshold":0.85,"interactive_below_threshold":true,"use_embeddings":true}},
+  {"name":"en_nonexistent_subject","subject":"Xyqvlorbania","context":{"realm":"external","language":"en","provenance":"user"},"constraints":{"max_candidates":5,"min_confidence":0.85},"hints":{"expected_type":"location","aliases":[]},"strategy":{"mode":"ranked","auto_accept_threshold":0.85,"interactive_below_threshold":true,"use_embeddings":false}},
+  {"name":"de_exact_country_ranked","subject":"Österreich","context":{"realm":"external","language":"de","provenance":"user"},"constraints":{"max_candidates":5,"min_confidence":0.85},"hints":{"expected_type":"location","aliases":["Republik Österreich"]},"strategy":{"mode":"ranked","auto_accept_threshold":0.85,"interactive_below_threshold":true,"use_embeddings":true}},
+  {"name":"de_city_quick","subject":"Wien","context":{"realm":"external","language":"de","provenance":"user"},"constraints":{"max_candidates":5,"min_confidence":0.55},"hints":{"expected_type":"location","aliases":["Vienna"]},"strategy":{"mode":"quick","auto_accept_threshold":0.85,"interactive_below_threshold":true,"use_embeddings":false}},
+  {"name":"de_ambiguous_person_interactive","subject":"Johann Strauss","context":{"realm":"external","language":"de","provenance":"user"},"constraints":{"max_candidates":5,"min_confidence":0.85},"hints":{"expected_type":"person","aliases":["Johann Strauss I","Johann Strauss II"]},"strategy":{"mode":"interactive","auto_accept_threshold":0.9,"interactive_below_threshold":true,"use_embeddings":true}}
+]

+ 6 - 0
tests/test_maintenance_helpers.py

@@ -4,6 +4,7 @@ from app.maintenance import (
     _extract_wikidata_qids_from_entity_dump,
     _infer_atlas_type_from_qids,
 )
+from app.atlas_store import _parse_boolean_literal
 
 
 def test_extract_wikidata_qids_from_dump_dedupes():
@@ -48,3 +49,8 @@ def test_infer_atlas_type_from_qids_heuristic_person():
     atlas_type = _infer_atlas_type_from_qids(["Q5"], g)
     assert atlas_type == "atlas:Person"
 
+
+def test_parse_boolean_literal_accepts_virtuoso_one():
+    assert _parse_boolean_literal("1") is True
+    assert _parse_boolean_literal("true") is True
+    assert _parse_boolean_literal("0") is False

+ 102 - 0
tests/test_resolve_strategies.py

@@ -0,0 +1,102 @@
+import pytest
+
+from app.resolve import ResolveService
+
+
+@pytest.mark.anyio
+async def test_cache_hit_is_resolved_when_confidence_satisfies_requested_mode():
+    async def load_entity(_subject):
+        return {
+            "atlas_id": "atlas-1",
+            "label": "Australia",
+            "type": "atlas:Location",
+            "description": "country in Oceania",
+            "needs_curation": True,
+            "confidence": 0.91,
+        }
+
+    async def fail_lookup(*_args, **_kwargs):
+        raise AssertionError("wikidata lookup should not run")
+
+    async def no_persist(_entity):
+        return None
+
+    svc = ResolveService(load_entity_fn=load_entity, wikidata_lookup_fn=fail_lookup, persist_entity_fn=no_persist)
+    result = await svc.resolve(
+        subject="Australia",
+        constraints={"min_confidence": 0.85},
+        strategy={"mode": "ranked"},
+        debug={"include_explanations": True},
+    )
+
+    assert result["status"] == "resolved"
+    assert result["confidence"] == 0.91
+    assert result["meta"]["debug"]["used_cache"] is True
+
+
+@pytest.mark.anyio
+async def test_low_confidence_cache_triggers_reresolution_for_ranked_mode():
+    calls = {"lookup": 0, "persist": 0}
+
+    async def load_entity(_subject):
+        return {
+            "atlas_id": "atlas-1",
+            "label": "Australia",
+            "type": "atlas:Location",
+            "description": "country in Oceania",
+            "needs_curation": True,
+            "confidence": 0.55,
+        }
+
+    async def wikidata_lookup(_subject, _language="en", _limit=5):
+        calls["lookup"] += 1
+        return [
+            {"id": "Q408", "label": "Australia", "description": "country in Oceania", "type": "Q6256"}
+        ]
+
+    async def persist(_entity):
+        calls["persist"] += 1
+
+    svc = ResolveService(load_entity_fn=load_entity, wikidata_lookup_fn=wikidata_lookup, persist_entity_fn=persist)
+    result = await svc.resolve(
+        subject="Australia",
+        context={"language": "en", "realm": "external"},
+        constraints={"min_confidence": 0.85},
+        hints={"expected_type": "location", "aliases": []},
+        strategy={"mode": "ranked", "auto_accept_threshold": 0.85},
+        debug={"include_explanations": True},
+    )
+
+    assert calls["lookup"] == 1
+    assert calls["persist"] == 1
+    assert result["status"] == "resolved"
+    assert result["meta"]["debug"]["used_cache"] is False
+
+
+@pytest.mark.anyio
+async def test_interactive_returns_all_candidates_without_auto_accepting_below_threshold():
+    async def no_hit(_subject):
+        return None
+
+    async def wikidata_lookup(_subject, _language="en", _limit=5):
+        return [
+            {"id": "Q1225", "label": "Georgia", "description": "country in Eastern Europe and West Asia", "type": "Q6256"},
+            {"id": "Q1428", "label": "Georgia", "description": "state of the United States of America", "type": "Q35657"},
+        ]
+
+    async def no_persist(_entity):
+        return None
+
+    svc = ResolveService(load_entity_fn=no_hit, wikidata_lookup_fn=wikidata_lookup, persist_entity_fn=no_persist)
+    result = await svc.resolve(
+        subject="Georgia",
+        context={"language": "en", "realm": "external"},
+        constraints={"max_candidates": 5},
+        hints={"expected_type": "location", "aliases": []},
+        strategy={"mode": "interactive", "auto_accept_threshold": 0.95, "interactive_below_threshold": True},
+        debug={"include_candidates": True, "include_explanations": True},
+    )
+
+    assert result["status"] == "ambiguous"
+    assert len(result["candidates"]) == 2
+    assert result["meta"]["debug"]["decision"] == "ambiguous_below_threshold"

+ 5 - 3
tests/test_resolve_tool.py

@@ -11,8 +11,8 @@ async def test_resolve_tool_is_stubbed_and_returns_ok():
     async def no_persist(_e):
         return None
 
-    async def no_wikidata(_s):
-        return None
+    async def no_wikidata(_s, _language="en", _limit=1):
+        return []
 
     svc = ResolveService(
         load_entity_fn=no_hit,
@@ -21,4 +21,6 @@ async def test_resolve_tool_is_stubbed_and_returns_ok():
     )
     result = await svc.resolve(subject="anything")
 
-    assert result == {"status": "not_found"}
+    assert result["status"] in {"not_found", "ambiguous", "resolved", "error"}
+    assert "meta" in result
+    assert "resolution_path" in result