4 months ago · f9373b6058
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,55 @@
 
				+# Environment
			
 
				+.env
			
 
				+.env.local
			
 
				+.env.*.local
			
 
				+
			
 
				+# Python
			
 
				+__pycache__/
			
 
				+*.py[cod]
			
 
				+*$py.class
			
 
				+*.so
			
 
				+.Python
			
 
				+build/
			
 
				+dist/
			
 
				+*.egg-info/
			
 
				+.eggs/
			
 
				+pip-wheel-metadata/
			
 
				+*.egg
			
 
				+
			
 
				+# Virtual environments
			
 
				+.venv/
			
 
				+venv/
			
 
				+env/
			
 
				+ENV/
			
 
				+
			
 
				+# IDE
			
 
				+.vscode/
			
 
				+.idea/
			
 
				+*.swp
			
 
				+*.swo
			
 
				+.DS_Store
			
 
				+Thumbs.db
			
 
				+
			
 
				+# Book folders (data, not code)
			
 
				+books/inbox/*
			
 
				+books/processing/*
			
 
				+books/done/*
			
 
				+books/manifests/*
			
 
				+
			
 
				+# Keep folder structure but not contents
			
 
				+!books/inbox/.gitkeep
			
 
				+!books/processing/.gitkeep
			
 
				+!books/done/.gitkeep
			
 
				+!books/manifests/.gitkeep
			
 
				+
			
 
				+# Logs
			
 
				+*.log
			
 
				+logs/
			
 
				+
			
 
				+# Docker
			
 
				+docker-compose.override.yml
			
 
				+
			
 
				+# Testing
			
 
				+.pytest_cache/
			
 
				+.coverage
			
 
				+htmlcov/
			
--- a/PROJECT.md
+++ b/PROJECT.md
@@ -0,0 +1,181 @@
 
				+# PROJECT.md — book-ingestor
			
 
				+
			
 
				+## Vision
			
 
				+
			
 
				+Feed structured knowledge into a mem0 memory server so an AI agent can recall it naturally in conversation — no explicit RAG retrieval, no "search the knowledge base" prompts. The agent simply *knows* what it has read.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Architecture
			
 
				+
			
 
				+### Pipeline Overview
			
 
				+
			
 
				+```
			
 
				+File detected (watchdog)
			
 
				+    │
			
 
				+    ▼
			
 
				+[detector.py]
			
 
				+    LLM-free structure analysis via PyMuPDF:
			
 
				+    - Font size variance → heading detection
			
 
				+    - Bold flags + positioning → chapter boundaries
			
 
				+    - Flat if no structural signals found
			
 
				+    │
			
 
				+    ├─── STRUCTURED PATH ──────────────────────────┐
			
 
				+    │    Extract: book title, chapters, paragraphs  │
			
 
				+    │    Summarize: book (1 Groq call)              │
			
 
				+    │    Summarize: each chapter (N Groq calls)     │
			
 
				+    │    Chunk: paragraphs → content memories       │
			
 
				+    │                                               │
			
 
				+    └─── FLAT PATH ─────────────────────────────────┤
			
 
				+         Semantic/sliding window chunking           │
			
 
				+         Summarize: whole doc (1-3 Groq calls)      │
			
 
				+         Chunk: paragraphs → content memories       │
			
 
				+                                                    │
			
 
				+                                                    ▼
			
 
				+                                        [mem0_writer.py]
			
 
				+                                        POST /memories (layered)
			
 
				+                                                    │
			
 
				+                                                    ▼
			
 
				+                                        [manifest.py]
			
 
				+                                        Save manifest JSON
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Memory Schema
			
 
				+
			
 
				+Every memory POSTed to mem0 carries structured metadata:
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "messages": [{"role": "user", "content": "<memory text>"}],
			
 
				+  "agent_id": "knowledge_base",
			
 
				+  "metadata": {
			
 
				+    "source_file": "sapiens.pdf",
			
 
				+    "source_type": "book",
			
 
				+    "memory_type": "chapter_summary",
			
 
				+    "chapter": 4,
			
 
				+    "chapter_title": "The Storytelling Animal",
			
 
				+    "page_start": 67,
			
 
				+    "page_end": 71,
			
 
				+    "ingested_at": "2026-03-11T10:00:00Z"
			
 
				+  }
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+### Memory Types
			
 
				+
			
 
				+| `memory_type` | Count per doc | Purpose |
			
 
				+|--------------|---------------|---------|
			
 
				+| `book_summary` | 1 | High-level overview, broad questions |
			
 
				+| `chapter_summary` | N (structured docs) | Mid-level recall by topic |
			
 
				+| `content` | M | Specific facts, quotes, details |
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Module Responsibilities
			
 
				+
			
 
				+| Module | Role | LLM? |
			
 
				+|--------|------|-------|
			
 
				+| `watchdog_runner.py` | Watches `inbox/`, triggers pipeline | No |
			
 
				+| `pipeline.py` | Orchestrates the full flow | No |
			
 
				+| `detector.py` | Detects document structure via PyMuPDF | No |
			
 
				+| `chunker.py` | Splits text into token-sized chunks | No |
			
 
				+| `summarizer.py` | Generates summaries via Groq/Llama 4 | ✅ Yes |
			
 
				+| `mem0_writer.py` | POSTs memories to mem0 REST API | No |
			
 
				+| `manifest.py` | Tracks ingested files and memory IDs | No |
			
 
				+| `config.py` | Loads `.env`, exposes typed settings | No |
			
 
				+
			
 
				+**Rule:** Only `summarizer.py` calls an LLM. Everything else is pure Python.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Token Budget
			
 
				+
			
 
				+Estimated token usage per ~300-page book using Groq/Llama 4:
			
 
				+
			
 
				+| Operation | Calls | Input tokens | Output tokens |
			
 
				+|-----------|-------|-------------|---------------|
			
 
				+| Book summary | 1 | ~2,000 | ~500 |
			
 
				+| Chapter summaries (20 ch) | 20 | ~20,000 | ~6,000 |
			
 
				+| Flat doc summary | 1–3 | ~6,000 | ~1,500 |
			
 
				+| **Total (structured)** | ~21 | ~22,000 | ~6,500 |
			
 
				+
			
 
				+At Groq free tier rates: effectively **$0.00** for most books.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Manifest Format
			
 
				+
			
 
				+`books/manifests/sapiens_2026-03-11.json`
			
 
				+
			
 
				+```json
			
 
				+{
			
 
				+  "source_file": "sapiens.pdf",
			
 
				+  "ingested_at": "2026-03-11T10:23:00Z",
			
 
				+  "document_type": "structured",
			
 
				+  "chapters_detected": 20,
			
 
				+  "memories_created": {
			
 
				+    "book_summary": 1,
			
 
				+    "chapter_summary": 20,
			
 
				+    "content": 187
			
 
				+  },
			
 
				+  "mem0_memory_ids": ["abc123", "def456", "..."],
			
 
				+  "status": "complete"
			
 
				+}
			
 
				+```
			
 
				+
			
 
				+The manifest enables clean **deletion**: purge all `mem0_memory_ids` to fully remove a book from memory.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Development Phases
			
 
				+
			
 
				+### Phase 1 — Core Pipeline (current)
			
 
				+- [x] Project structure & config
			
 
				+- [ ] `detector.py` — structure detection
			
 
				+- [ ] `chunker.py` — token-aware chunking
			
 
				+- [ ] `summarizer.py` — Groq/Llama 4 summarization
			
 
				+- [ ] `mem0_writer.py` — mem0 REST integration
			
 
				+- [ ] `manifest.py` — ingestion tracking
			
 
				+- [ ] `pipeline.py` — full orchestration
			
 
				+- [ ] `watchdog_runner.py` — folder watcher + Rich terminal UI
			
 
				+
			
 
				+### Phase 2 — Extended Formats
			
 
				+- [ ] Markdown and plain text ingestion
			
 
				+- [ ] EPUB support
			
 
				+- [ ] Scanned PDF OCR (via Tesseract or Llama 4 vision)
			
 
				+
			
 
				+### Phase 3 — Docker
			
 
				+- [ ] `Dockerfile`
			
 
				+- [ ] `docker-compose.yml` with `books/` volume mount
			
 
				+- [ ] Health check endpoint
			
 
				+
			
 
				+### Phase 4 — Management
			
 
				+- [ ] CLI tool: `book-ingestor delete sapiens.pdf`
			
 
				+- [ ] CLI tool: `book-ingestor list` — show all ingested books
			
 
				+- [ ] Re-ingest on file change (hash-based deduplication)
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Dependencies (planned)
			
 
				+
			
 
				+```
			
 
				+pymupdf          # PDF text + structure extraction
			
 
				+watchdog         # Folder monitoring
			
 
				+groq             # Groq Python SDK
			
 
				+tiktoken         # Token counting (no LLM)
			
 
				+requests         # mem0 REST calls
			
 
				+python-dotenv    # .env loading
			
 
				+rich             # Terminal UI / progress display
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Design Principles
			
 
				+
			
 
				+1. **Python does the heavy lifting** — structure detection, chunking, and file management are pure Python. LLMs only summarize.
			
 
				+2. **Token frugality** — we never send more to Groq than necessary. Chunk boundaries are computed locally.
			
 
				+3. **Idempotent ingestion** — manifests make it safe to re-run. Duplicate detection via file hash is planned (see Phase 4).
			
 
				+4. **Network-only coupling** — the only external dependencies at runtime are the mem0 server URL and the Groq API. No shared filesystem is required.
			
 
				+5. **Docker-ready by design** — folder paths are configurable, stateless between runs.
			
--- a/README.md
+++ b/README.md
@@ -0,0 +1,123 @@
 
				+# 📚 book-ingestor
			
 
				+
			
 
				+> *"The agent reads the book so you don't have to explain it."*
			
 
				+
			
 
				+A standalone Python service that watches a folder for PDFs (and other text documents), intelligently processes them into layered memories, and feeds them into a [mem0](https://github.com/mem0ai/mem0) server via its REST API.
			
 
				+
			
 
				+The result: your AI agent doesn't *search* for knowledge — it simply *knows* it.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## How it works
			
 
				+
			
 
				+```
			
 
				+📂 books/inbox/          ← drop a PDF here
			
 
				+        ↓  (watchdog detects new file)
			
 
				+🔍 Structure Detection   ← is this a book with chapters, or a flat doc?
			
 
				+        ↓
			
 
				+✂️  Chunking             ← smart paragraph/semantic chunking (no LLM used)
			
 
				+        ↓
			
 
				+🧠 Summarization         ← Groq/Llama generates book + chapter summaries
			
 
				+        ↓
			
 
				+💾 mem0 /memories        ← layered memories POSTed to your mem0 server
			
 
				+        ↓
			
 
				+📂 books/done/           ← file archived, manifest saved
			
 
				+```
			
 
				+
			
 
				+Memories are stored in layers:
			
 
				+- **Book summary** — one high-level memory for the whole document
			
 
				+- **Chapter summaries** — one memory per chapter/section (structured docs)
			
 
				+- **Content chunks** — paragraph-level memories for fine-grained recall
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Requirements
			
 
				+
			
 
				+- Python 3.11+
			
 
				+- A running [mem0 server](https://github.com/mem0ai/mem0) accessible on your LAN
			
 
				+- A [Groq API key](https://console.groq.com/) (free tier is plenty)
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Quick Start
			
 
				+
			
 
				+```bash
			
 
				+git clone https://github.com/yourname/book-ingestor.git
			
 
				+cd book-ingestor
			
 
				+cp .env.example .env        # fill in your values
			
 
				+pip install -r requirements.txt
			
 
				+python -m book_ingestor.watchdog_runner
			
 
				+```
			
 
				+
			
 
				+Drop a PDF into `books/inbox/` and watch it get ingested.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Configuration
			
 
				+
			
 
				+All config lives in `.env`:
			
 
				+
			
 
				+```env
			
 
				+MEM0_BASE_URL=http://192.168.0.200:8420
			
 
				+MEM0_AGENT_ID=knowledge_base
			
 
				+GROQ_API_KEY=your_groq_key_here
			
 
				+GROQ_MODEL=meta-llama/llama-4-scout-17b-16e-instruct
			
 
				+BOOKS_INBOX=./books/inbox
			
 
				+BOOKS_PROCESSING=./books/processing
			
 
				+BOOKS_DONE=./books/done
			
 
				+BOOKS_MANIFESTS=./books/manifests
			
 
				+CHUNK_SIZE_TOKENS=350
			
 
				+LOG_LEVEL=INFO
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Folder Structure
			
 
				+
			
 
				+```
			
 
				+book-ingestor/
			
 
				+├── books/
			
 
				+│   ├── inbox/          ← drop zone (watched)
			
 
				+│   ├── processing/     ← in-flight (do not touch)
			
 
				+│   ├── done/           ← archived originals
			
 
				+│   └── manifests/      ← JSON record per ingested book
			
 
				+├── book_ingestor/
			
 
				+│   ├── watchdog_runner.py
			
 
				+│   ├── pipeline.py
			
 
				+│   ├── detector.py
			
 
				+│   ├── chunker.py
			
 
				+│   ├── summarizer.py
			
 
				+│   ├── mem0_writer.py
			
 
				+│   ├── manifest.py
			
 
				+│   └── config.py
			
 
				+├── .env.example
			
 
				+├── requirements.txt
			
 
				+├── PROJECT.md
			
 
				+└── README.md
			
 
				+```
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Supported File Types
			
 
				+
			
 
				+| Format | Status |
			
 
				+|--------|--------|
			
 
				+| PDF (text-based) | ✅ |
			
 
				+| PDF (scanned/image) | 🔜 (OCR planned) |
			
 
				+| Markdown (.md) | 🔜 |
			
 
				+| Plain text (.txt) | 🔜 |
			
 
				+| EPUB | 🔜 |
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## Notes
			
 
				+
			
 
				+- This project is **completely independent** of OpenClaw or any specific AI agent — it only talks to mem0.
			
 
				+- Any machine on the LAN with network access to your mem0 server can run this.
			
 
				+- Docker support is planned for a future release.
			
 
				+
			
 
				+---
			
 
				+
			
 
				+## License
			
 
				+
			
 
				+MIT