| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- """
- manifest.py — records every ingested file as a JSON manifest.
- Enables safe re-ingestion, deletion, and status checks.
- """
- from __future__ import annotations
- import json
- import hashlib
- import logging
- from dataclasses import dataclass, field, asdict
- from datetime import datetime, timezone
- from pathlib import Path
- from .config import cfg
# Module-level logger, named after this module so records can be filtered per module.
log = logging.getLogger(__name__)
@dataclass
class Manifest:
    """Record describing one ingested file; serialised to JSON on disk.

    Attributes:
        source_file: Name or path of the file that was ingested.
        file_hash: SHA-256 of the original PDF.
        ingested_at: Timestamp string for when the ingestion happened.
        doc_type: Either "structured" or "flat".
        doc_title: Title of the document.
        chapters_detected: Number of chapters detected.
        memories: Count per memory kind, shaped like
            { "book_summary": 1, "chapter_summary": N, "content": M }.
        memory_ids: All mem0 IDs, kept for future deletion.
        status: One of "complete", "partial" or "failed".
    """

    # Field order is part of the positional constructor signature; keep it stable.
    source_file: str
    file_hash: str
    ingested_at: str
    doc_type: str
    doc_title: str
    chapters_detected: int
    memories: dict
    memory_ids: list[str]
    status: str
def compute_file_hash(path: str | Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is opened in binary mode and consumed in 64 KiB pieces, so the
    whole file never has to be held in memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                # An empty read means end of file.
                break
            digest.update(block)
    return digest.hexdigest()
def already_ingested(file_hash: str) -> bool:
    """Check if a file with this hash has already been ingested successfully.

    Scans every ``*.json`` file in ``cfg.books_manifests`` and stops at the
    first manifest whose ``file_hash`` equals *file_hash* and whose ``status``
    is ``"complete"``.

    Manifests that cannot be read, cannot be decoded, are not valid JSON, or
    do not contain a JSON object are skipped, so a single damaged file never
    aborts the check.

    Args:
        file_hash: Hash to look for (compared against each manifest's
            ``file_hash`` entry).

    Returns:
        True if a matching, completed manifest exists; False otherwise.
    """
    manifests_dir = Path(cfg.books_manifests)
    for manifest_file in manifests_dir.glob("*.json"):
        try:
            data = json.loads(manifest_file.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError;
            # OSError covers files that vanished or are not readable.
            log.warning("Ignoring unreadable manifest %s: %s", manifest_file.name, exc)
            continue
        if not isinstance(data, dict):
            # Valid JSON that is not an object (e.g. a list or null) has no
            # fields to compare and would otherwise raise AttributeError on .get.
            continue
        if data.get("file_hash") == file_hash and data.get("status") == "complete":
            log.info("Skipping already-ingested file (hash match): %s", manifest_file.name)
            return True
    return False
def save_manifest(manifest: Manifest) -> Path:
    """Write manifest JSON to the manifests directory.

    The file is named ``<stem>_<UTC timestamp>.json``, where the stem is the
    source file's stem truncated to 40 characters. The timestamp only has
    one-second resolution and the stem is truncated, so two saves can map to
    the same name. The file is therefore created exclusively, and on a clash a
    numeric suffix (``_1``, ``_2``, ...) is appended instead of overwriting
    the existing manifest.

    Args:
        manifest: The manifest to serialise.

    Returns:
        Path of the file that was written.
    """
    manifests_dir = Path(cfg.books_manifests)
    manifests_dir.mkdir(parents=True, exist_ok=True)

    # Filename: stem + timestamp to avoid collisions on re-ingestion
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    stem = Path(manifest.source_file).stem[:40]  # truncate long filenames

    # Serialise before touching the filesystem so a serialisation error
    # cannot leave an empty manifest file behind.
    payload = json.dumps(asdict(manifest), indent=2)

    attempt = 0
    while True:
        suffix = f"_{attempt}" if attempt else ""
        out_path = manifests_dir / f"{stem}_{timestamp}{suffix}.json"
        try:
            # Mode "x" fails if the file exists, so an earlier manifest is
            # never silently replaced.
            with out_path.open("x", encoding="utf-8") as fh:
                fh.write(payload)
        except FileExistsError:
            attempt += 1
            continue
        break

    log.info("Manifest saved: %s", out_path)
    return out_path
def build_manifest(
    source_file: str,
    file_hash: str,
    doc_type: str,
    doc_title: str,
    chapters_detected: int,
    book_summary_id: str | None,
    chapter_summary_ids: list[str],
    content_ids: list[str],
    status: str = "complete",
) -> Manifest:
    """Assemble a Manifest from the results of one ingestion run.

    ``memory_ids`` is the book summary ID followed by the chapter summary IDs
    and the content IDs, with falsy entries (``None``, empty strings) dropped.
    ``memories`` holds per-kind counts taken from the lengths of the lists as
    passed in, and ``ingested_at`` is the current UTC time in ISO format.
    """
    # Concatenate in a fixed order, then drop falsy placeholders.
    candidate_ids = [book_summary_id] + chapter_summary_ids + content_ids
    kept_ids = list(filter(None, candidate_ids))

    created_at = datetime.now(timezone.utc).isoformat()

    counts = {
        "book_summary": int(bool(book_summary_id)),
        "chapter_summary": len(chapter_summary_ids),
        "content": len(content_ids),
    }

    return Manifest(
        source_file=source_file,
        file_hash=file_hash,
        ingested_at=created_at,
        doc_type=doc_type,
        doc_title=doc_title,
        chapters_detected=chapters_detected,
        memories=counts,
        memory_ids=kept_ids,
        status=status,
    )
|