|
@@ -41,8 +41,18 @@ class DocumentStructure:
|
|
|
|
|
|
|
|
# Heading candidates: bold text significantly larger than body font
|
|
# Heading candidates: bold text significantly larger than body font
|
|
|
HEADING_FONT_RATIO = 1.15 # heading must be ≥ 15% larger than median body size
|
|
HEADING_FONT_RATIO = 1.15 # heading must be ≥ 15% larger than median body size
|
|
|
-MIN_HEADING_LENGTH = 3
|
|
|
|
|
|
|
+MIN_HEADING_LENGTH = 4 # minimum chars
|
|
|
MAX_HEADING_LENGTH = 120
|
|
MAX_HEADING_LENGTH = 120
|
|
|
|
|
+MIN_HEADING_WORDS = 2 # must have at least 2 real words
|
|
|
|
|
+MIN_SECTION_TEXT_CHARS = 200 # sections with less content than this are likely TOC artifacts
|
|
|
|
|
+
|
|
|
|
|
+# Single-word headings that ARE valid chapter titles — whitelist
|
|
|
|
|
+VALID_SINGLE_WORD_HEADINGS = {
|
|
|
|
|
+ "introduction", "preface", "foreword", "prologue", "epilogue",
|
|
|
|
|
+ "conclusion", "appendix", "bibliography", "glossary", "index",
|
|
|
|
|
+ "summary", "abstract", "acknowledgements", "acknowledgments",
|
|
|
|
|
+ "contents", "overview", "background", "references", "afterword",
|
|
|
|
|
+}
|
|
|
MIN_SECTIONS_FOR_STRUCTURED = 2 # need at least 2 headings to call it structured
|
|
MIN_SECTIONS_FOR_STRUCTURED = 2 # need at least 2 headings to call it structured
|
|
|
|
|
|
|
|
# Patterns that strongly suggest a chapter heading
|
|
# Patterns that strongly suggest a chapter heading
|
|
@@ -169,6 +179,24 @@ def _is_heading_span(span: dict, body_size: float) -> bool:
|
|
|
if len(text) > 40:
|
|
if len(text) > 40:
|
|
|
return False
|
|
return False
|
|
|
|
|
|
|
|
|
|
+ # Filter headings with fewer than MIN_HEADING_WORDS letter-containing words — almost always TOC entries, running headers, or OCR noise
|
|
|
|
|
+ # Exception: known valid single-word headings like INTRODUCTION, PREFACE etc.
|
|
|
|
|
+ words = [w for w in text.split() if w.strip('\'".,;:-')]
|
|
|
|
|
+ clean_words = [w for w in words if any(c.isalpha() for c in w)]
|
|
|
|
|
+ if len(clean_words) < MIN_HEADING_WORDS:
|
|
|
|
|
+ if text.strip('\'".,;:- ').lower() not in VALID_SINGLE_WORD_HEADINGS:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ # Filter OCR fragments starting with punctuation — e.g. "'THE", "—Section"
|
|
|
|
|
+ if text[0] in '\'".,;:-—–':
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
|
|
+ # Filter running headers: a lone capitalised word (under 20 chars) not in the whitelist, likely a page header
|
|
|
|
|
+ # e.g. "Company", "Suberling" — real headings are phrases or have "CHAPTER/PART" etc.
|
|
|
|
|
+ if len(clean_words) == 1 and clean_words[0][0].isupper() and len(clean_words[0]) < 20:
|
|
|
|
|
+ if clean_words[0].lower() not in VALID_SINGLE_WORD_HEADINGS:
|
|
|
|
|
+ return False
|
|
|
|
|
+
|
|
|
# Filter TOC leaders: lines mostly filled with dots, dashes, underscores
|
|
# Filter TOC leaders: lines mostly filled with dots, dashes, underscores
|
|
|
# e.g. "Chapter 1 ............. 12" or "——————————"
|
|
# e.g. "Chapter 1 ............. 12" or "——————————"
|
|
|
filler_chars = set(".-_·•=~*#/\\|")
|
|
filler_chars = set(".-_·•=~*#/\\|")
|