atlas.py 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. """Atlas semantic core for entity resolution and enrichment."""
  2. from __future__ import annotations
  3. from datetime import datetime, timezone
  4. from app.cache import EntityCache
  5. from app.entity_normalize import normalize_entity
  6. from app.ids import claim_hash, entity_hash
  7. from app.models import (
  8. AtlasAlias,
  9. AtlasClaim,
  10. AtlasClaimObject,
  11. AtlasEntity,
  12. AtlasEnrichmentDataset,
  13. AtlasProvenance,
  14. )
  15. from app.trends_resolution import resolve_entity_via_trends
  16. from app.type_classifier import TypeClassification, classify_entity_type
  17. from app.storage_service import AtlasStorageService
  18. from app.virtuoso_store import VirtuosoEntityStore
  19. from app.wikidata_lookup import lookup_wikidata
# Module-level instances shared by every call in this module.
# Entity cache consulted first by resolve_entity; constructed with max_entries=512.
_entity_cache = EntityCache(max_entries=512)
# Virtuoso-backed entity store queried on a cache miss; constructed with
# max_cache_entries=256.
_virtuoso_store = VirtuosoEntityStore(max_cache_entries=256)
# Storage service that resolve_entity writes resolved entities to.
_storage = AtlasStorageService()
  23. def _now_date() -> str:
  24. return datetime.now(timezone.utc).date().isoformat()
  25. async def resolve_entity(subject: str, context: str | None = None) -> AtlasEntity:
  26. normalized = normalize_entity(subject)
  27. token = normalized.strip().lower()
  28. cached = _entity_cache.get(token)
  29. if cached is not None:
  30. try:
  31. await _storage.write_entity(cached)
  32. except Exception:
  33. pass
  34. return cached
  35. virt_hit = await _virtuoso_store.lookup(token)
  36. if virt_hit is not None:
  37. # Make the returned raw payload reflect the original caller input
  38. # (so tests and UI/debug output stay stable).
  39. if isinstance(virt_hit.raw_payload, dict):
  40. virt_hit.raw_payload.setdefault("source", "virtuoso")
  41. virt_hit.raw_payload["raw"] = subject
  42. virt_hit.raw_payload["normalized"] = normalized
  43. _entity_cache.store(virt_hit, extra_tokens=[subject, normalized])
  44. try:
  45. await _storage.write_entity(virt_hit)
  46. except Exception:
  47. pass
  48. return virt_hit
  49. resolution = resolve_entity_via_trends(subject)
  50. classification = await classify_entity_type(subject, resolution, context)
  51. wikidata = await lookup_wikidata(subject)
  52. entity = _entity_from_resolution(subject, resolution, classification, wikidata)
  53. _entity_cache.store(entity, extra_tokens=[subject, normalized])
  54. try:
  55. await _storage.write_entity(entity)
  56. except Exception:
  57. pass
  58. return entity
  59. def _entity_from_resolution(subject: str, resolution: dict, classification: TypeClassification, wikidata: dict | None = None) -> AtlasEntity:
  60. canonical_label = (
  61. resolution.get("canonical_label")
  62. or resolution.get("normalized")
  63. or subject.strip()
  64. )
  65. canonical_type = (
  66. classification.canonical_type
  67. or resolution.get("type")
  68. or "unknown"
  69. )
  70. # atlas_id is opaque identity: hash-part only, never semantic content.
  71. atlas_id = f"atlas:{entity_hash((resolution.get('mid') or '').strip(), (wikidata or {}).get('qid') or '', canonical_label.strip().lower())}"
  72. trends_prov = AtlasProvenance(
  73. source=resolution.get("source") or "resolver",
  74. retrieval_method="trends-resolution",
  75. confidence=0.9 if resolution.get("mid") else 0.3,
  76. retrieved_at=resolution.get("retrieved_at"),
  77. )
  78. wikidata_prov = (
  79. AtlasProvenance(
  80. source="wikidata",
  81. retrieval_method="wbsearchentities + entitydata",
  82. confidence=0.99,
  83. retrieved_at=wikidata.get("retrieved_at"),
  84. )
  85. if wikidata and wikidata.get("qid")
  86. else None
  87. )
  88. claims: list[AtlasClaim] = []
  89. mid = resolution.get("mid")
  90. if mid:
  91. claims.append(
  92. AtlasClaim(
  93. claim_id=f"clm_raw_ident_mid_{claim_hash(atlas_id, 'atlas:hasIdentifier', mid, 'raw')}",
  94. subject=atlas_id,
  95. predicate="atlas:hasIdentifier",
  96. object=AtlasClaimObject(kind="identifier", id_type="mid", value=mid),
  97. layer="raw",
  98. provenance=trends_prov,
  99. created_at=_now_date(),
  100. )
  101. )
  102. if wikidata and wikidata.get("qid"):
  103. claims.append(
  104. AtlasClaim(
  105. claim_id=f"clm_raw_ident_qid_{claim_hash(atlas_id, 'atlas:hasIdentifier', wikidata['qid'], 'raw')}",
  106. subject=atlas_id,
  107. predicate="atlas:hasIdentifier",
  108. object=AtlasClaimObject(kind="identifier", id_type="qid", value=wikidata["qid"]),
  109. layer="raw",
  110. provenance=wikidata_prov,
  111. created_at=_now_date(),
  112. )
  113. )
  114. claims.append(
  115. AtlasClaim(
  116. claim_id=f"clm_drv_canonical_type_{claim_hash(atlas_id, 'atlas:hasCanonicalType', canonical_type, 'derived')}",
  117. subject=atlas_id,
  118. predicate="atlas:hasCanonicalType",
  119. object=AtlasClaimObject(kind="type", value=f"atlas:{canonical_type}"),
  120. layer="derived",
  121. provenance=classification.provenance,
  122. created_at=_now_date(),
  123. )
  124. )
  125. payload = dict(resolution)
  126. if wikidata:
  127. payload["wikidata"] = {
  128. "wikidata_status": "hit",
  129. "source": "wikidata",
  130. "qid": wikidata.get("qid"),
  131. "label": wikidata.get("label"),
  132. "description": wikidata.get("description"),
  133. "retrieved_at": wikidata.get("retrieved_at"),
  134. }
  135. else:
  136. payload["wikidata"] = {"wikidata_status": "missing", "source": "wikidata", "retrieved_at": None}
  137. return AtlasEntity(
  138. atlas_id=atlas_id,
  139. canonical_label=canonical_label,
  140. canonical_description=(wikidata or {}).get("description"),
  141. entity_type=canonical_type,
  142. aliases=[AtlasAlias(label=subject.strip() or canonical_label)],
  143. claims=claims,
  144. raw_payload=payload,
  145. needs_curation=classification.needs_curation,
  146. )
  147. def enrich_entity(entity: AtlasEntity, constraints=None, depth: int = 1) -> AtlasEnrichmentDataset:
  148. return AtlasEnrichmentDataset(
  149. seed_entity=entity,
  150. related_entities=[],
  151. query_context=constraints or {},
  152. depth=depth,
  153. )