manifest.py 3.1 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. """
  2. manifest.py — records every ingested file as a JSON manifest.
  3. Enables safe re-ingestion, deletion, and status checks.
  4. """
  5. from __future__ import annotations
  6. import json
  7. import hashlib
  8. import logging
  9. from dataclasses import dataclass, field, asdict
  10. from datetime import datetime, timezone
  11. from pathlib import Path
  12. from .config import cfg
  13. log = logging.getLogger(__name__)
  14. @dataclass
  15. class Manifest:
  16. source_file: str
  17. file_hash: str # SHA-256 of original PDF
  18. ingested_at: str
  19. doc_type: str # "structured" | "flat"
  20. doc_title: str
  21. chapters_detected: int
  22. memories: dict # { "book_summary": 1, "chapter_summary": N, "content": M }
  23. memory_ids: list[str] # all mem0 IDs — for future deletion
  24. status: str # "complete" | "partial" | "failed"
  25. def compute_file_hash(path: str | Path) -> str:
  26. sha = hashlib.sha256()
  27. with open(path, "rb") as f:
  28. for chunk in iter(lambda: f.read(65536), b""):
  29. sha.update(chunk)
  30. return sha.hexdigest()
  31. def already_ingested(file_hash: str) -> bool:
  32. """Check if a file with this hash has already been ingested successfully."""
  33. manifests_dir = Path(cfg.books_manifests)
  34. for manifest_file in manifests_dir.glob("*.json"):
  35. try:
  36. data = json.loads(manifest_file.read_text())
  37. if data.get("file_hash") == file_hash and data.get("status") == "complete":
  38. log.info("Skipping already-ingested file (hash match): %s", manifest_file.name)
  39. return True
  40. except (json.JSONDecodeError, KeyError):
  41. continue
  42. return False
  43. def save_manifest(manifest: Manifest) -> Path:
  44. """Write manifest JSON to the manifests directory."""
  45. manifests_dir = Path(cfg.books_manifests)
  46. manifests_dir.mkdir(parents=True, exist_ok=True)
  47. # Filename: stem + timestamp to avoid collisions on re-ingestion
  48. timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
  49. stem = Path(manifest.source_file).stem[:40] # truncate long filenames
  50. filename = f"{stem}_{timestamp}.json"
  51. out_path = manifests_dir / filename
  52. out_path.write_text(json.dumps(asdict(manifest), indent=2))
  53. log.info("Manifest saved: %s", out_path)
  54. return out_path
  55. def build_manifest(
  56. source_file: str,
  57. file_hash: str,
  58. doc_type: str,
  59. doc_title: str,
  60. chapters_detected: int,
  61. book_summary_id: str | None,
  62. chapter_summary_ids: list[str],
  63. content_ids: list[str],
  64. status: str = "complete",
  65. ) -> Manifest:
  66. all_ids = [i for i in [book_summary_id] + chapter_summary_ids + content_ids if i]
  67. return Manifest(
  68. source_file=source_file,
  69. file_hash=file_hash,
  70. ingested_at=datetime.now(timezone.utc).isoformat(),
  71. doc_type=doc_type,
  72. doc_title=doc_title,
  73. chapters_detected=chapters_detected,
  74. memories={
  75. "book_summary": 1 if book_summary_id else 0,
  76. "chapter_summary": len(chapter_summary_ids),
  77. "content": len(content_ids),
  78. },
  79. memory_ids=all_ids,
  80. status=status,
  81. )