1
0

2 کامیت‌ها 2f0c8659b0 ... 4122e5f338

نویسنده SHA1 پیام تاریخ
  Lukas Goldschmidt 4122e5f338 pdf reading improved 1 روز پیش
  Lukas Goldschmidt 8220079693 request throttle 2 روز پیش
5 فایل تغییر یافته به همراه 93 افزوده شده و 6 حذف شده
  1. 11 0
      .env.example
  2. 10 1
      book_ingestor/config.py
  3. 29 1
      book_ingestor/detector.py
  4. 27 3
      book_ingestor/mem0_writer.py
  5. 16 1
      book_ingestor/pipeline.py

+ 11 - 0
.env.example

@@ -1,6 +1,8 @@
 # mem0 server (your LAN address)
 MEM0_BASE_URL=http://192.168.0.200:8420
 MEM0_AGENT_ID=knowledge_base
+# book-ingestor posts to /knowledge — change if your server uses a different path
+# MEM0_KNOWLEDGE_PATH=/knowledge
 
 # Groq
 GROQ_API_KEY=your_groq_key_here
@@ -15,5 +17,14 @@ BOOKS_MANIFESTS=./books/manifests
 # Chunking
 CHUNK_SIZE_TOKENS=350
 
+# Max sections before falling back to flat processing
+# Prevents token burn on crappy OCR'd PDFs with hundreds of fake chapters
+MAX_SECTIONS=64
+
+# Throttling — delay in seconds between chunk POSTs (0 = no delay)
+# Increase if Ollama/nomic embedder is pegging your GPU
+# 0.5 = 2 chunks/sec, 1.0 = 1 chunk/sec
+INGEST_DELAY=0.5
+
 # Logging: DEBUG | INFO | WARNING
 LOG_LEVEL=INFO

+ 10 - 1
book_ingestor/config.py

@@ -29,6 +29,13 @@ class Config:
     # Chunking
     chunk_size_tokens: int
 
+    # Safety cap — docs with more sections than this are treated as flat
+    # Prevents token burn on crappy OCR'd PDFs with hundreds of fake chapters
+    max_sections: int
+
+    # Throttling — delay between POSTs to spare the embedder/GPU
+    ingest_delay: float
+
     # Logging
     log_level: str
 
@@ -50,9 +57,11 @@ def load_config() -> Config:
         books_done=os.getenv("BOOKS_DONE", "./books/done"),
         books_manifests=os.getenv("BOOKS_MANIFESTS", "./books/manifests"),
         chunk_size_tokens=int(os.getenv("CHUNK_SIZE_TOKENS", "350")),
+        max_sections=int(os.getenv("MAX_SECTIONS", "64")),  # default kept in sync with .env.example
+        ingest_delay=float(os.getenv("INGEST_DELAY", "0.5")),
         log_level=os.getenv("LOG_LEVEL", "INFO"),
     )
 
 
 # Singleton — import this everywhere
-cfg = load_config()
+cfg = load_config()

+ 29 - 1
book_ingestor/detector.py

@@ -41,8 +41,18 @@ class DocumentStructure:
 
 # Heading candidates: bold text significantly larger than body font
 HEADING_FONT_RATIO = 1.15        # heading must be ≥ 15% larger than median body size
-MIN_HEADING_LENGTH = 3
+MIN_HEADING_LENGTH = 4           # minimum chars
 MAX_HEADING_LENGTH = 120
+MIN_HEADING_WORDS = 2            # must have at least 2 real words
+MIN_SECTION_TEXT_CHARS = 200     # sections with less content than this are likely TOC artifacts
+
+# Single-word headings that ARE valid chapter titles — whitelist
+VALID_SINGLE_WORD_HEADINGS = {
+    "introduction", "preface", "foreword", "prologue", "epilogue",
+    "conclusion", "appendix", "bibliography", "glossary", "index",
+    "summary", "abstract", "acknowledgements", "acknowledgments",
+    "contents", "overview", "background", "references", "afterword",
+}
 MIN_SECTIONS_FOR_STRUCTURED = 2  # need at least 2 headings to call it structured
 
 # Patterns that strongly suggest a chapter heading
@@ -169,6 +179,24 @@ def _is_heading_span(span: dict, body_size: float) -> bool:
         if len(text) > 40:
             return False
 
+    # Filter single-word headings — almost always TOC entries, running headers, or OCR noise
+    # Exception: known valid single-word headings like INTRODUCTION, PREFACE etc.
+    words = [w for w in text.split() if w.strip('\'".,;:-')]
+    clean_words = [w for w in words if any(c.isalpha() for c in w)]
+    if len(clean_words) < MIN_HEADING_WORDS:
+        if text.strip('\'".,;:- ').lower() not in VALID_SINGLE_WORD_HEADINGS:
+            return False
+
+    # Filter OCR fragments starting with punctuation — e.g. "'THE", "—Section"
+    if text[0] in '\'".,;:-—–':
+        return False
+
+    # Filter running headers: single capitalised common noun, likely page header
+    # e.g. "Company", "Suberling" — real headings are phrases or have "CHAPTER/PART" etc.
+    if len(clean_words) == 1 and clean_words[0][0].isupper() and len(clean_words[0]) < 20:
+        # Strip punctuation before the whitelist lookup — the earlier filter strips it,
+        # so without this, whitelisted headings like "Introduction." pass above but get
+        # wrongly rejected here.
+        if clean_words[0].strip('\'".,;:-').lower() not in VALID_SINGLE_WORD_HEADINGS:
+            return False
+
     # Filter TOC leaders: lines mostly filled with dots, dashes, underscores
     # e.g. "Chapter 1 ............. 12" or "——————————"
     filler_chars = set(".-_·•=~*#/\\|")

+ 27 - 3
book_ingestor/mem0_writer.py

@@ -10,6 +10,7 @@ Server expects: { text, user_id, metadata, infer }
 from __future__ import annotations
 
 import logging
+import time
 from datetime import datetime, timezone
 
 import requests
@@ -75,17 +76,23 @@ def write_content_chunk(chunk: Chunk, doc_title: str) -> str | None:
             "chunk_index": chunk.chunk_index,
             "token_count": chunk.token_count,
         },
-        infer=True,
+        infer=False,
     )
 
 
 def write_content_chunks_batch(chunks: list[Chunk], doc_title: str) -> list[str]:
-    """POST multiple content chunks. Returns list of successful memory IDs."""
+    """
+    POST multiple content chunks. Returns list of successful memory IDs.
+    Throttled by INGEST_DELAY to avoid hammering the Ollama embedder.
+    """
     ids = []
-    for chunk in chunks:
+    for i, chunk in enumerate(chunks):
         mem_id = write_content_chunk(chunk, doc_title)
         if mem_id:
             ids.append(mem_id)
+        # Throttle after every chunk except the last — give the GPU breathing room
+        if cfg.ingest_delay > 0 and i < len(chunks) - 1:
+            time.sleep(cfg.ingest_delay)
     return ids
 
 
@@ -99,6 +106,22 @@ def _base_meta(source_file: str, memory_type: str) -> dict:
     }
 
 
+def _sanitize_meta(metadata: dict) -> dict:
+    """
+    ChromaDB only accepts str, int, float, bool as metadata values.
+    Remove None values and convert anything else to str.
+    """
+    clean = {}
+    for k, v in metadata.items():
+        if v is None:
+            continue  # drop None entirely
+        if isinstance(v, (str, int, float, bool)):
+            clean[k] = v
+        else:
+            clean[k] = str(v)  # last resort conversion
+    return clean
+
+
 def _post(text: str, metadata: dict, infer: bool) -> str | None:
     """
     POST a single entry to the /knowledge endpoint.
@@ -111,6 +134,7 @@ def _post(text: str, metadata: dict, infer: bool) -> str | None:
         "infer": infer,
     }
 
+    # Assign into the payload: `payload` was already built above and holds a reference
+    # to the original dict — rebinding the local name `metadata` here would leave the
+    # unsanitized dict in the POST body.
+    payload["metadata"] = _sanitize_meta(metadata)
     url = f"{cfg.mem0_base_url}/knowledge"
 
     try:

+ 16 - 1
book_ingestor/pipeline.py

@@ -91,6 +91,21 @@ def _run_pipeline(pdf_path: Path, file_hash: str) -> bool:
         if structure.doc_type == "structured":
             console.print(f"  [green]✓[/] {len(structure.sections)} sections detected")
 
+        # ── Sanity check: too many sections = crappy PDF, force flat ────────
+        # NOTE(review): the flat fallback below rebuilds full_text from section raw_text
+        # only — confirm the detector assigns pre-heading front matter to a section,
+        # otherwise that text is silently dropped.
+        if structure.doc_type == "structured" and len(structure.sections) > cfg.max_sections:
+            log.warning(
+                "%s yielded %d sections (max=%d) — looks like OCR noise, "
+                "falling back to flat processing",
+                structure.source_file, len(structure.sections), cfg.max_sections
+            )
+            console.print(
+                f"  [yellow]⚠ {len(structure.sections)} sections detected — "
+                f"exceeds MAX_SECTIONS={cfg.max_sections}, treating as flat[/]"
+            )
+            structure.doc_type = "flat"
+            structure.full_text = "\n\n".join(s.raw_text for s in structure.sections)
+            structure.sections = []
+
         # ── Route to appropriate sub-pipeline ───────────────────────────────
         if structure.doc_type == "structured":
             return _structured_pipeline(pdf_path, structure, file_hash, progress)
@@ -229,4 +244,4 @@ def _print_summary(memories: dict, title: str) -> None:
         f"[cyan]{memories.get('chapter_summary', 0)}[/] chapters · "
         f"[cyan]{memories.get('content', 0)}[/] chunks · "
         f"[bold]{total}[/] total\n"
-    )
+    )