|
|
@@ -169,6 +169,23 @@ def _is_heading_span(span: dict, body_size: float) -> bool:
|
|
|
if len(text) > 40:
|
|
|
return False
|
|
|
|
|
|
+ # Filter TOC leaders: lines mostly filled with dots, dashes, underscores
|
|
|
+ # e.g. "Chapter 1 ............. 12" or "——————————"
|
|
|
+ filler_chars = set(".-_·•=~*#/\\|")
|
|
|
+ non_space = [c for c in text if not c.isspace()]
|
|
|
+ if non_space and sum(1 for c in non_space if c in filler_chars) / len(non_space) > 0.4:
|
|
|
+ return False
|
|
|
+
|
|
|
+ # Filter ASCII art: very low ratio of letters to total chars
|
|
|
+ letter_ratio = sum(1 for c in text if c.isalpha()) / max(len(text), 1)
|
|
|
+ if letter_ratio < 0.3:
|
|
|
+ return False
|
|
|
+
|
|
|
+ # Filter short TOC entries ending in a page number
|
|
|
+ # e.g. ". — D 6" or "Section 4 42"
|
|
|
+ if len(text) < 15 and text[-1].isdigit():
|
|
|
+ return False
|
|
|
+
|
|
|
return True
|
|
|
|
|
|
|
|
|
@@ -269,4 +286,4 @@ def _extract_full_text(doc: fitz.Document) -> str:
|
|
|
text = _extract_page_text(doc, page_num)
|
|
|
if text:
|
|
|
parts.append(text)
|
|
|
- return "\n\n".join(parts)
|
|
|
+ return "\n\n".join(parts)
|