4 月之前 · 2f0c8659b0
--- a/book_ingestor/detector.py
+++ b/book_ingestor/detector.py
@@ -169,6 +169,23 @@ def _is_heading_span(span: dict, body_size: float) -> bool:
 
				         if len(text) > 40:
			
 
				             return False
			
 
				 
			
 
				+    # Filter TOC leaders: lines where over 40% of non-space chars are filler (dots, dashes, underscores, etc.)
			
 
				+    # e.g. "Chapter 1 ............. 12" or "----------" (em-dash runs like "——————" are caught by the letter-ratio check below)
			
 
				+    filler_chars = set(".-_·•=~*#/\\|")
			
 
				+    non_space = [c for c in text if not c.isspace()]
			
 
				+    if non_space and sum(1 for c in non_space if c in filler_chars) / len(non_space) > 0.4:
			
 
				+        return False
			
 
				+
			
 
				+    # Filter ASCII art: very low ratio of letters to total chars
			
 
				+    letter_ratio = sum(1 for c in text if c.isalpha()) / max(len(text), 1)
			
 
				+    if letter_ratio < 0.3:
			
 
				+        return False
			
 
				+
			
 
				+    # Filter short TOC entries ending in a page number
			
 
				+    # e.g. ". — D  6" or "Section 4  42"
			
 
				+    if len(text) < 15 and text[-1].isdigit():
			
 
				+        return False
			
 
				+
			
 
				     return True
			
 
				 
			
 
				 
			
@@ -269,4 +286,4 @@ def _extract_full_text(doc: fitz.Document) -> str:
 
				         text = _extract_page_text(doc, page_num)
			
 
				         if text:
			
 
				             parts.append(text)
			
 
				-    return "\n\n".join(parts)
			
 
				+    return "\n\n".join(parts)