1
0

2 کامیت‌ها 2f0c8659b0 ... 4122e5f338

نویسنده SHA1 پیام تاریخ
  Lukas Goldschmidt 4122e5f338 pdf reading improved 1 روز پیش
  Lukas Goldschmidt 8220079693 request throttle 2 روز پیش
5 فایل تغییر یافته به همراه 93 افزوده شده و 6 حذف شده
  1. 11 0
      .env.example
  2. 10 1
      book_ingestor/config.py
  3. 29 1
      book_ingestor/detector.py
  4. 27 3
      book_ingestor/mem0_writer.py
  5. 16 1
      book_ingestor/pipeline.py

+ 11 - 0
.env.example

@@ -1,6 +1,8 @@
 # mem0 server (your LAN address)
 MEM0_BASE_URL=http://192.168.0.200:8420
 MEM0_AGENT_ID=knowledge_base
+# book-ingestor posts to /knowledge — change if your server uses a different path
+# MEM0_KNOWLEDGE_PATH=/knowledge
 
 # Groq
 GROQ_API_KEY=your_groq_key_here
@@ -15,5 +17,14 @@ BOOKS_MANIFESTS=./books/manifests
 # Chunking
 CHUNK_SIZE_TOKENS=350
 
+# Max sections before falling back to flat processing
+# Prevents token burn on crappy OCR'd PDFs with hundreds of fake chapters
+MAX_SECTIONS=64
+
+# Throttling — delay in seconds between chunk POSTs (0 = no delay)
+# Increase if Ollama/nomic embedder is pegging your GPU
+# 0.5 = 2 chunks/sec, 1.0 = 1 chunk/sec
+INGEST_DELAY=0.5
+
 # Logging: DEBUG | INFO | WARNING
 LOG_LEVEL=INFO

+ 10 - 1
book_ingestor/config.py

@@ -29,6 +29,13 @@ class Config:
     # Chunking
     chunk_size_tokens: int
 
+    # Safety cap — docs with more sections than this are treated as flat
+    # Prevents token burn on crappy OCR'd PDFs with hundreds of fake chapters
+    max_sections: int
+
+    # Throttling — delay between POSTs to spare the embedder/GPU
+    ingest_delay: float
+
     # Logging
     log_level: str
 
@@ -50,9 +57,11 @@ def load_config() -> Config:
         books_done=os.getenv("BOOKS_DONE", "./books/done"),
         books_manifests=os.getenv("BOOKS_MANIFESTS", "./books/manifests"),
         chunk_size_tokens=int(os.getenv("CHUNK_SIZE_TOKENS", "350")),
+        max_sections=int(os.getenv("MAX_SECTIONS", "64")),  # default kept in sync with .env.example
+        ingest_delay=float(os.getenv("INGEST_DELAY", "0.5")),
         log_level=os.getenv("LOG_LEVEL", "INFO"),
     )
 
 
 # Singleton — import this everywhere
-cfg = load_config()
+cfg = load_config()

+ 29 - 1
book_ingestor/detector.py

@@ -41,8 +41,18 @@ class DocumentStructure:
 
 # Heading candidates: bold text significantly larger than body font
 HEADING_FONT_RATIO = 1.15        # heading must be ≥ 15% larger than median body size
-MIN_HEADING_LENGTH = 3
+MIN_HEADING_LENGTH = 4           # minimum chars
 MAX_HEADING_LENGTH = 120
+MIN_HEADING_WORDS = 2            # must have at least 2 real words
+MIN_SECTION_TEXT_CHARS = 200     # sections with less content than this are likely TOC artifacts
+
+# Single-word headings that ARE valid chapter titles — whitelist
+VALID_SINGLE_WORD_HEADINGS = {
+    "introduction", "preface", "foreword", "prologue", "epilogue",
+    "conclusion", "appendix", "bibliography", "glossary", "index",
+    "summary", "abstract", "acknowledgements", "acknowledgments",
+    "contents", "overview", "background", "references", "afterword",
+}
 MIN_SECTIONS_FOR_STRUCTURED = 2  # need at least 2 headings to call it structured
 
 # Patterns that strongly suggest a chapter heading
@@ -169,6 +179,24 @@ def _is_heading_span(span: dict, body_size: float) -> bool:
         if len(text) > 40:
             return False
 
+    # Filter single-word headings — almost always TOC entries, running headers, or OCR noise
+    # Exception: known valid single-word headings like INTRODUCTION, PREFACE etc.
+    words = [w for w in text.split() if w.strip('\'".,;:-')]
+    clean_words = [w for w in words if any(c.isalpha() for c in w)]
+    if len(clean_words) < MIN_HEADING_WORDS:
+        if text.strip('\'".,;:- ').lower() not in VALID_SINGLE_WORD_HEADINGS:
+            return False
+
+    # Filter OCR fragments starting with punctuation — e.g. "'THE", "—Section"
+    if text[0] in '\'".,;:-—–':
+        return False
+
+    # Filter running headers: single capitalised common noun, likely page header
+    # e.g. "Company", "Suberling" — real headings are phrases or have "CHAPTER/PART" etc.
+    if len(clean_words) == 1 and clean_words[0][0].isupper() and len(clean_words[0]) < 20:
+        # Strip punctuation before the whitelist lookup — the earlier filter strips it,
+        # so without this, whitelisted headings like "Introduction." pass above but get
+        # wrongly rejected here.
+        if clean_words[0].strip('\'".,;:-').lower() not in VALID_SINGLE_WORD_HEADINGS:
+            return False
+
     # Filter TOC leaders: lines mostly filled with dots, dashes, underscores
     # e.g. "Chapter 1 ............. 12" or "——————————"
     filler_chars = set(".-_·•=~*#/\\|")

+ 27 - 3
book_ingestor/mem0_writer.py

@@ -10,6 +10,7 @@ Server expects: { text, user_id, metadata, infer }
 from __future__ import annotations
 
 import logging
+import time
 from datetime import datetime, timezone
 
 import requests
@@ -75,17 +76,23 @@ def write_content_chunk(chunk: Chunk, doc_title: str) -> str | None:
             "chunk_index": chunk.chunk_index,
             "token_count": chunk.token_count,
         },
-        infer=True,
+        infer=False,
     )
 
 
 def write_content_chunks_batch(chunks: list[Chunk], doc_title: str) -> list[str]:
-    """POST multiple content chunks. Returns list of successful memory IDs."""
+    """
+    POST multiple content chunks. Returns list of successful memory IDs.
+    Throttled by INGEST_DELAY to avoid hammering the Ollama embedder.
+    """
     ids = []
-    for chunk in chunks:
+    for i, chunk in enumerate(chunks):
         mem_id = write_content_chunk(chunk, doc_title)
         if mem_id:
             ids.append(mem_id)
+        # Throttle after every chunk except the last — give the GPU breathing room
+        if cfg.ingest_delay > 0 and i < len(chunks) - 1:
+            time.sleep(cfg.ingest_delay)
     return ids
 
 
@@ -99,6 +106,22 @@ def _base_meta(source_file: str, memory_type: str) -> dict:
     }
 
 
+def _sanitize_meta(metadata: dict) -> dict:
+    """
+    ChromaDB only accepts str, int, float, bool as metadata values.
+    Remove None values and convert anything else to str.
+    """
+    clean = {}
+    for k, v in metadata.items():
+        if v is None:
+            continue  # drop None entirely
+        if isinstance(v, (str, int, float, bool)):
+            clean[k] = v
+        else:
+            clean[k] = str(v)  # last resort conversion
+    return clean
+
+
 def _post(text: str, metadata: dict, infer: bool) -> str | None:
     """
     POST a single entry to the /knowledge endpoint.
@@ -111,6 +134,7 @@ def _post(text: str, metadata: dict, infer: bool) -> str | None:
         "infer": infer,
     }
 
+    # Assign into the payload: `payload` was already built above and holds a reference
+    # to the original dict — rebinding the local name `metadata` here would leave the
+    # unsanitized dict in the POST body.
+    payload["metadata"] = _sanitize_meta(metadata)
     url = f"{cfg.mem0_base_url}/knowledge"
 
     try:

+ 16 - 1
book_ingestor/pipeline.py

@@ -91,6 +91,21 @@ def _run_pipeline(pdf_path: Path, file_hash: str) -> bool:
         if structure.doc_type == "structured":
             console.print(f"  [green]✓[/] {len(structure.sections)} sections detected")
 
+        # ── Sanity check: too many sections = crappy PDF, force flat ────────
+        # NOTE(review): the flat fallback below rebuilds full_text from section raw_text
+        # only — confirm the detector assigns pre-heading front matter to a section,
+        # otherwise that text is silently dropped.
+        if structure.doc_type == "structured" and len(structure.sections) > cfg.max_sections:
+            log.warning(
+                "%s yielded %d sections (max=%d) — looks like OCR noise, "
+                "falling back to flat processing",
+                structure.source_file, len(structure.sections), cfg.max_sections
+            )
+            console.print(
+                f"  [yellow]⚠ {len(structure.sections)} sections detected — "
+                f"exceeds MAX_SECTIONS={cfg.max_sections}, treating as flat[/]"
+            )
+            structure.doc_type = "flat"
+            structure.full_text = "\n\n".join(s.raw_text for s in structure.sections)
+            structure.sections = []
+
         # ── Route to appropriate sub-pipeline ───────────────────────────────
         if structure.doc_type == "structured":
             return _structured_pipeline(pdf_path, structure, file_hash, progress)
@@ -229,4 +244,4 @@ def _print_summary(memories: dict, title: str) -> None:
         f"[cyan]{memories.get('chapter_summary', 0)}[/] chapters · "
         f"[cyan]{memories.get('content', 0)}[/] chunks · "
         f"[bold]{total}[/] total\n"
-    )
+    )