SHA1
--- a/.env.example
+++ b/.env.example
@@ -1,6 +1,8 @@
 
															 # mem0 server (your LAN address)
														
 
															 MEM0_BASE_URL=http://192.168.0.200:8420
														
 
															 MEM0_AGENT_ID=knowledge_base
														
 
															+# book-ingestor posts to /knowledge. NOTE(review): mem0_writer._post appears to hard-code this path, so confirm MEM0_KNOWLEDGE_PATH is actually read before relying on it
														
 
															+# MEM0_KNOWLEDGE_PATH=/knowledge
														
 
															 # Groq
														
 
															 GROQ_API_KEY=your_groq_key_here
														
@@ -15,5 +17,14 @@ BOOKS_MANIFESTS=./books/manifests
 
															 # Chunking
														
 
															 CHUNK_SIZE_TOKENS=350
														
 
															+# Max sections before falling back to flat processing (the code default when unset is 60)
														
 
															+# Prevents token burn on crappy OCR'd PDFs with hundreds of fake chapters
														
 
															+MAX_SECTIONS=64
														
 
															+
														
 
															+# Throttling — delay in seconds between chunk POSTs (0 = no delay)
														
 
															+# Increase if Ollama/nomic embedder is pegging your GPU
														
 
															+# 0.5 = at most 2 chunks/sec, 1.0 = at most 1 chunk/sec (the delay is added on top of each request's own duration)
														
 
															+INGEST_DELAY=0.5
														
 
															+
														
 
															 # Logging: DEBUG | INFO | WARNING
														
 
															 LOG_LEVEL=INFO
														
--- a/book_ingestor/config.py
+++ b/book_ingestor/config.py
@@ -29,6 +29,13 @@ class Config:
 
															     # Chunking
														
 
															     chunk_size_tokens: int
														
 
															+    # Safety cap — docs with more sections than this are treated as flat
														
 
															+    # Prevents token burn on crappy OCR'd PDFs with hundreds of fake chapters
														
 
															+    max_sections: int
														
 
															+
														
 
															+    # Throttling — delay between POSTs to spare the embedder/GPU
														
 
															+    ingest_delay: float
														
 
															+
														
 
															     # Logging
														
 
															     log_level: str
														
@@ -50,9 +57,11 @@ def load_config() -> Config:
 
															         books_done=os.getenv("BOOKS_DONE", "./books/done"),
														
 
															         books_manifests=os.getenv("BOOKS_MANIFESTS", "./books/manifests"),
														
 
															         chunk_size_tokens=int(os.getenv("CHUNK_SIZE_TOKENS", "350")),
														
 
															+        max_sections=int(os.getenv("MAX_SECTIONS", "60")),
														
 
															+        ingest_delay=float(os.getenv("INGEST_DELAY", "0.5")),
														
 
															         log_level=os.getenv("LOG_LEVEL", "INFO"),
														
 
															     )
														
 
															 # Singleton — import this everywhere
														
 
															-cfg = load_config()
														
 
															+cfg = load_config()
														
--- a/book_ingestor/detector.py
+++ b/book_ingestor/detector.py
@@ -41,8 +41,18 @@ class DocumentStructure:
 
															 # Heading candidates: bold text significantly larger than body font
														
 
															 HEADING_FONT_RATIO = 1.15        # heading must be ≥ 15% larger than median body size
														
 
															-MIN_HEADING_LENGTH = 3
														
 
															+MIN_HEADING_LENGTH = 4           # minimum chars
														
 
															 MAX_HEADING_LENGTH = 120
														
 
															+MIN_HEADING_WORDS = 2            # must have at least 2 real words
														
 
															+MIN_SECTION_TEXT_CHARS = 200     # sections with less content than this are likely TOC artifacts
														
 
															+
														
 
															+# Single-word headings that ARE valid chapter titles — whitelist
														
 
															+VALID_SINGLE_WORD_HEADINGS = {
														
 
															+    "introduction", "preface", "foreword", "prologue", "epilogue",
														
 
															+    "conclusion", "appendix", "bibliography", "glossary", "index",
														
 
															+    "summary", "abstract", "acknowledgements", "acknowledgments",
														
 
															+    "contents", "overview", "background", "references", "afterword",
														
 
															+}
														
 
															 MIN_SECTIONS_FOR_STRUCTURED = 2  # need at least 2 headings to call it structured
														
 
															 # Patterns that strongly suggest a chapter heading
														
@@ -169,6 +179,24 @@ def _is_heading_span(span: dict, body_size: float) -> bool:
 
															         if len(text) > 40:
														
 
															             return False
														
 
															+    # Filter single-word headings — almost always TOC entries, running headers, or OCR noise
														
 
															+    # Exception: known valid single-word headings like INTRODUCTION, PREFACE etc.
														
 
															+    words = [w for w in text.split() if w.strip('\'".,;:-')]
														
 
															+    clean_words = [w for w in words if any(c.isalpha() for c in w)]
														
 
															+    if len(clean_words) < MIN_HEADING_WORDS:
														
 
															+        if text.strip('\'".,;:- ').lower() not in VALID_SINGLE_WORD_HEADINGS:
														
 
															+            return False
														
 
															+
														
 
															+    # Filter OCR fragments starting with punctuation — e.g. "'THE", "—Section"
														
 
															+    if text[0] in '\'".,;:-—–':
														
 
															+        return False
														
 
															+
														
 
															+    # Filter running headers: single capitalised common noun, likely page header
														
 
															+    # e.g. "Company", "Suberling" — real headings are phrases or have "CHAPTER/PART" etc.
														
 
															+    if len(clean_words) == 1 and clean_words[0][0].isupper() and len(clean_words[0]) < 20:
														
 
															+        if clean_words[0].lower() not in VALID_SINGLE_WORD_HEADINGS:
														
 
															+            return False
														
 
															+
														
 
															     # Filter TOC leaders: lines mostly filled with dots, dashes, underscores
														
 
															     # e.g. "Chapter 1 ............. 12" or "——————————"
														
 
															     filler_chars = set(".-_·•=~*#/\\|")
														
--- a/book_ingestor/mem0_writer.py
+++ b/book_ingestor/mem0_writer.py
@@ -10,6 +10,7 @@ Server expects: { text, user_id, metadata, infer }
 
															 from __future__ import annotations
														
 
															 import logging
														
 
															+import time
														
 
															 from datetime import datetime, timezone
														
 
															 import requests
														
@@ -75,17 +76,23 @@ def write_content_chunk(chunk: Chunk, doc_title: str) -> str | None:
 
															             "chunk_index": chunk.chunk_index,
														
 
															             "token_count": chunk.token_count,
														
 
															         },
														
 
															-        infer=True,
														
 
															+        infer=False,
														
 
															     )
														
 
															 def write_content_chunks_batch(chunks: list[Chunk], doc_title: str) -> list[str]:
														
 
															-    """POST multiple content chunks. Returns list of successful memory IDs."""
														
 
															+    """
														
 
															+    POST multiple content chunks. Returns list of successful memory IDs.
														
 
															+    Throttled by INGEST_DELAY to avoid hammering the Ollama embedder.
														
 
															+    """
														
 
															     ids = []
														
 
															-    for chunk in chunks:
														
 
															+    for i, chunk in enumerate(chunks):
														
 
															         mem_id = write_content_chunk(chunk, doc_title)
														
 
															         if mem_id:
														
 
															             ids.append(mem_id)
														
 
															+        # Throttle after every chunk except the last — give the GPU breathing room
														
 
															+        if cfg.ingest_delay > 0 and i < len(chunks) - 1:
														
 
															+            time.sleep(cfg.ingest_delay)
														
 
															     return ids
														
@@ -99,6 +106,22 @@ def _base_meta(source_file: str, memory_type: str) -> dict:
 
															     }
														
 
															+def _sanitize_meta(metadata: dict) -> dict:
														
 
															+    """
														
 
															+    ChromaDB only accepts str, int, float, bool as metadata values.
														
 
															+    Remove None values and convert anything else to str.
														
 
															+    """
														
 
															+    clean = {}
														
 
															+    for k, v in metadata.items():
														
 
															+        if v is None:
														
 
															+            continue  # drop None entirely
														
 
															+        if isinstance(v, (str, int, float, bool)):
														
 
															+            clean[k] = v
														
 
															+        else:
														
 
															+            clean[k] = str(v)  # last resort conversion
														
 
															+    return clean
														
 
															+
														
 
															+
														
 
															 def _post(text: str, metadata: dict, infer: bool) -> str | None:
														
 
															     """
														
 
															     POST a single entry to the /knowledge endpoint.
														
@@ -111,6 +134,7 @@ def _post(text: str, metadata: dict, infer: bool) -> str | None:
 
															         "infer": infer,
														
 
															     }
														
 
															+    metadata = _sanitize_meta(metadata)
														
 
															     url = f"{cfg.mem0_base_url}/knowledge"
														
 
															     try:
														
--- a/book_ingestor/pipeline.py
+++ b/book_ingestor/pipeline.py
@@ -91,6 +91,21 @@ def _run_pipeline(pdf_path: Path, file_hash: str) -> bool:
 
															         if structure.doc_type == "structured":
														
 
															             console.print(f"  [green]✓[/] {len(structure.sections)} sections detected")
														
 
															+        # ── Sanity check: too many sections = crappy PDF, force flat ────────
														
 
															+        if structure.doc_type == "structured" and len(structure.sections) > cfg.max_sections:
														
 
															+            log.warning(
														
 
															+                "%s yielded %d sections (max=%d) — looks like OCR noise, "
														
 
															+                "falling back to flat processing",
														
 
															+                structure.source_file, len(structure.sections), cfg.max_sections
														
 
															+            )
														
 
															+            console.print(
														
 
															+                f"  [yellow]⚠ {len(structure.sections)} sections detected — "
														
 
															+                f"exceeds MAX_SECTIONS={cfg.max_sections}, treating as flat[/]"
														
 
															+            )
														
 
															+            structure.doc_type = "flat"
														
 
															+            structure.full_text = "\n\n".join(s.raw_text for s in structure.sections)
														
 
															+            structure.sections = []
														
 
															+
														
 
															         # ── Route to appropriate sub-pipeline ───────────────────────────────
														
 
															         if structure.doc_type == "structured":
														
 
															             return _structured_pipeline(pdf_path, structure, file_hash, progress)
														
@@ -229,4 +244,4 @@ def _print_summary(memories: dict, title: str) -> None:
 
															         f"[cyan]{memories.get('chapter_summary', 0)}[/] chapters · "
														
 
															         f"[cyan]{memories.get('content', 0)}[/] chunks · "
														
 
															         f"[bold]{total}[/] total\n"
														
 
															-    )
														
 
															+    )
作者	SHA1 備註	提交日期
Lukas Goldschmidt	4122e5f338 pdf reading improved	4 月之前
Lukas Goldschmidt	8220079693 request throttle	4 月之前