""" manifest.py — records every ingested file as a JSON manifest. Enables safe re-ingestion, deletion, and status checks. """ from __future__ import annotations import json import hashlib import logging from dataclasses import dataclass, field, asdict from datetime import datetime, timezone from pathlib import Path from .config import cfg log = logging.getLogger(__name__) @dataclass class Manifest: source_file: str file_hash: str # SHA-256 of original PDF ingested_at: str doc_type: str # "structured" | "flat" doc_title: str chapters_detected: int memories: dict # { "book_summary": 1, "chapter_summary": N, "content": M } memory_ids: list[str] # all mem0 IDs — for future deletion status: str # "complete" | "partial" | "failed" def compute_file_hash(path: str | Path) -> str: sha = hashlib.sha256() with open(path, "rb") as f: for chunk in iter(lambda: f.read(65536), b""): sha.update(chunk) return sha.hexdigest() def already_ingested(file_hash: str) -> bool: """Check if a file with this hash has already been ingested successfully.""" manifests_dir = Path(cfg.books_manifests) for manifest_file in manifests_dir.glob("*.json"): try: data = json.loads(manifest_file.read_text()) if data.get("file_hash") == file_hash and data.get("status") == "complete": log.info("Skipping already-ingested file (hash match): %s", manifest_file.name) return True except (json.JSONDecodeError, KeyError): continue return False def save_manifest(manifest: Manifest) -> Path: """Write manifest JSON to the manifests directory.""" manifests_dir = Path(cfg.books_manifests) manifests_dir.mkdir(parents=True, exist_ok=True) # Filename: stem + timestamp to avoid collisions on re-ingestion timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") stem = Path(manifest.source_file).stem[:40] # truncate long filenames filename = f"{stem}_{timestamp}.json" out_path = manifests_dir / filename out_path.write_text(json.dumps(asdict(manifest), indent=2)) log.info("Manifest saved: %s", out_path) return out_path def build_manifest( source_file: str, file_hash: str, doc_type: str, doc_title: str, chapters_detected: int, book_summary_id: str | None, chapter_summary_ids: list[str], content_ids: list[str], status: str = "complete", ) -> Manifest: all_ids = [i for i in [book_summary_id] + chapter_summary_ids + content_ids if i] return Manifest( source_file=source_file, file_hash=file_hash, ingested_at=datetime.now(timezone.utc).isoformat(), doc_type=doc_type, doc_title=doc_title, chapters_detected=chapters_detected, memories={ "book_summary": 1 if book_summary_id else 0, "chapter_summary": len(chapter_summary_ids), "content": len(content_ids), }, memory_ids=all_ids, status=status, )