Explorar o código

ascii junk filter

Lukas Goldschmidt hai 2 días
pai
achega
2f0c8659b0
Modificouse 1 ficheiro con 18 adicións e 1 borrado
  1. 18 1
      book_ingestor/detector.py

+ 18 - 1
book_ingestor/detector.py

@@ -169,6 +169,23 @@ def _is_heading_span(span: dict, body_size: float) -> bool:
         if len(text) > 40:
             return False
 
+    # Filter TOC leaders: lines mostly filled with dots, dashes, underscores
+    # e.g. "Chapter 1 ............. 12" or "——————————"
+    filler_chars = set(".-_·•=~*#/\\|")
+    non_space = [c for c in text if not c.isspace()]
+    if non_space and sum(1 for c in non_space if c in filler_chars) / len(non_space) > 0.4:
+        return False
+
+    # Filter ASCII art: very low ratio of letters to total chars
+    letter_ratio = sum(1 for c in text if c.isalpha()) / max(len(text), 1)
+    if letter_ratio < 0.3:
+        return False
+
+    # Filter short TOC entries ending in a page number
+    # e.g. ". — D  6" or "Section 4  42"
+    if len(text) < 15 and text[-1].isdigit():
+        return False
+
     return True
 
 
@@ -269,4 +286,4 @@ def _extract_full_text(doc: fitz.Document) -> str:
         text = _extract_page_text(doc, page_num)
         if text:
             parts.append(text)
-    return "\n\n".join(parts)
+    return "\n\n".join(parts)