"""
detector.py — analyses a PDF and determines its structure purely via PyMuPDF.

No LLM calls. Font sizes, bold flags, and text positioning do all the work.

Returns a DocumentStructure describing:
- whether the doc is structured (has chapters/headings) or flat
- extracted chapters with their page ranges and raw text
- document title (best guess)
"""
from __future__ import annotations

import re
from dataclasses import dataclass, field
from pathlib import Path

import fitz  # PyMuPDF
  14. # ── Data models ────────────────────────────────────────────────────────────────
  15. @dataclass
  16. class Section:
  17. title: str
  18. chapter_number: int | None # None for flat docs
  19. page_start: int
  20. page_end: int
  21. raw_text: str # full extracted text for this section
  22. @dataclass
  23. class DocumentStructure:
  24. source_file: str
  25. doc_type: str # "structured" | "flat"
  26. title: str
  27. sections: list[Section] = field(default_factory=list)
  28. full_text: str = "" # populated for flat docs
# ── Constants ──────────────────────────────────────────────────────────────────
# Heading candidates: bold text significantly larger than body font
HEADING_FONT_RATIO = 1.15  # heading must be ≥ 15% larger than median body size
MIN_HEADING_LENGTH = 4  # minimum chars for a heading candidate
MAX_HEADING_LENGTH = 120  # anything longer is body text, not a heading
MIN_HEADING_WORDS = 2  # must have at least 2 real words
# NOTE(review): MIN_SECTION_TEXT_CHARS is not referenced anywhere in this file —
# confirm it is used by a caller or intended for a future filtering pass.
MIN_SECTION_TEXT_CHARS = 200  # sections with less content than this are likely TOC artifacts
# Single-word headings that ARE valid chapter titles — whitelist
VALID_SINGLE_WORD_HEADINGS = {
    "introduction", "preface", "foreword", "prologue", "epilogue",
    "conclusion", "appendix", "bibliography", "glossary", "index",
    "summary", "abstract", "acknowledgements", "acknowledgments",
    "contents", "overview", "background", "references", "afterword",
}
MIN_SECTIONS_FOR_STRUCTURED = 2  # need at least 2 headings to call it structured
# Patterns that strongly suggest a chapter heading.
# NOTE(review): CHAPTER_PATTERNS is not referenced in this file — confirm callers.
CHAPTER_PATTERNS = [
    re.compile(r"^\s*(chapter|part|section|unit|lesson)\s+[\dIVXivx]+", re.IGNORECASE),
    re.compile(r"^\s*[\dIVXivx]+[\.\)]\s+\w"),  # "1. Introduction" or "IV) Conclusion"
]
  49. # ── Main entry point ───────────────────────────────────────────────────────────
  50. def detect_structure(pdf_path: str | Path) -> DocumentStructure:
  51. """
  52. Analyse a PDF and return a DocumentStructure.
  53. Always succeeds — falls back to flat doc if structure detection fails.
  54. """
  55. path = Path(pdf_path)
  56. doc = fitz.open(str(path))
  57. try:
  58. title = _extract_title(doc)
  59. body_font_size = _median_body_font_size(doc)
  60. headings = _extract_headings(doc, body_font_size)
  61. if len(headings) >= MIN_SECTIONS_FOR_STRUCTURED:
  62. sections = _build_sections(doc, headings)
  63. return DocumentStructure(
  64. source_file=path.name,
  65. doc_type="structured",
  66. title=title,
  67. sections=sections,
  68. )
  69. else:
  70. full_text = _extract_full_text(doc)
  71. return DocumentStructure(
  72. source_file=path.name,
  73. doc_type="flat",
  74. title=title,
  75. full_text=full_text,
  76. )
  77. finally:
  78. doc.close()
  79. # ── Internal helpers ───────────────────────────────────────────────────────────
  80. def _extract_title(doc: fitz.Document) -> str:
  81. """Try PDF metadata first, then largest text on page 1."""
  82. meta_title = (doc.metadata or {}).get("title", "").strip()
  83. if meta_title and len(meta_title) > 3:
  84. return meta_title
  85. # Fallback: largest font on first page
  86. if doc.page_count == 0:
  87. return "Unknown Document"
  88. page = doc[0]
  89. blocks = page.get_text("dict")["blocks"]
  90. candidates = []
  91. for block in blocks:
  92. if block.get("type") != 0: # type 0 = text
  93. continue
  94. for line in block.get("lines", []):
  95. for span in line.get("spans", []):
  96. text = span["text"].strip()
  97. if MIN_HEADING_LENGTH < len(text) < MAX_HEADING_LENGTH:
  98. candidates.append((span["size"], text))
  99. if candidates:
  100. candidates.sort(reverse=True)
  101. return candidates[0][1]
  102. return Path(doc.name).stem.replace("_", " ").replace("-", " ").title()
  103. def _median_body_font_size(doc: fitz.Document) -> float:
  104. """
  105. Compute the median font size across the document.
  106. This represents 'body text' since it will dominate page count.
  107. Sample up to 10 pages for speed.
  108. """
  109. sizes = []
  110. sample_pages = min(doc.page_count, 10)
  111. for page_num in range(sample_pages):
  112. page = doc[page_num]
  113. for block in page.get_text("dict")["blocks"]:
  114. if block.get("type") != 0:
  115. continue
  116. for line in block.get("lines", []):
  117. for span in line.get("spans", []):
  118. if span["text"].strip():
  119. sizes.append(span["size"])
  120. if not sizes:
  121. return 12.0
  122. sizes.sort()
  123. return sizes[len(sizes) // 2]
  124. def _is_heading_span(span: dict, body_size: float) -> bool:
  125. """Heuristic: is this text span a section heading?"""
  126. text = span["text"].strip()
  127. if not (MIN_HEADING_LENGTH <= len(text) <= MAX_HEADING_LENGTH):
  128. return False
  129. # Must be larger than body OR explicitly bold with any size increase
  130. size = span["size"]
  131. flags = span.get("flags", 0)
  132. is_bold = bool(flags & 2**4) # PyMuPDF bold flag
  133. is_larger = size >= body_size * HEADING_FONT_RATIO
  134. if not (is_larger or (is_bold and size >= body_size)):
  135. return False
  136. # Filter out obvious non-headings
  137. if text.endswith(",") or text.endswith(";"):
  138. return False
  139. if sum(1 for c in text if c.isupper()) / max(len(text), 1) > 0.85:
  140. # ALL CAPS long sentences are likely decorative, not headings
  141. if len(text) > 40:
  142. return False
  143. # Filter single-word headings — almost always TOC entries, running headers, or OCR noise
  144. # Exception: known valid single-word headings like INTRODUCTION, PREFACE etc.
  145. words = [w for w in text.split() if w.strip('\'".,;:-')]
  146. clean_words = [w for w in words if any(c.isalpha() for c in w)]
  147. if len(clean_words) < MIN_HEADING_WORDS:
  148. if text.strip('\'".,;:- ').lower() not in VALID_SINGLE_WORD_HEADINGS:
  149. return False
  150. # Filter OCR fragments starting with punctuation — e.g. "'THE", "—Section"
  151. if text[0] in '\'".,;:-—–':
  152. return False
  153. # Filter running headers: single capitalised common noun, likely page header
  154. # e.g. "Company", "Suberling" — real headings are phrases or have "CHAPTER/PART" etc.
  155. if len(clean_words) == 1 and clean_words[0][0].isupper() and len(clean_words[0]) < 20:
  156. if clean_words[0].lower() not in VALID_SINGLE_WORD_HEADINGS:
  157. return False
  158. # Filter TOC leaders: lines mostly filled with dots, dashes, underscores
  159. # e.g. "Chapter 1 ............. 12" or "——————————"
  160. filler_chars = set(".-_·•=~*#/\\|")
  161. non_space = [c for c in text if not c.isspace()]
  162. if non_space and sum(1 for c in non_space if c in filler_chars) / len(non_space) > 0.4:
  163. return False
  164. # Filter ASCII art: very low ratio of letters to total chars
  165. letter_ratio = sum(1 for c in text if c.isalpha()) / max(len(text), 1)
  166. if letter_ratio < 0.3:
  167. return False
  168. # Filter short TOC entries ending in a page number
  169. # e.g. ". — D 6" or "Section 4 42"
  170. if len(text) < 15 and text[-1].isdigit():
  171. return False
  172. return True
  173. def _extract_headings(doc: fitz.Document, body_size: float) -> list[dict]:
  174. """
  175. Walk every page and collect candidate headings with their page number.
  176. Returns list of {page, text, size, y_pos}
  177. """
  178. headings = []
  179. seen_texts = set()
  180. for page_num in range(doc.page_count):
  181. page = doc[page_num]
  182. for block in page.get_text("dict")["blocks"]:
  183. if block.get("type") != 0:
  184. continue
  185. for line in block.get("lines", []):
  186. for span in line.get("spans", []):
  187. text = span["text"].strip()
  188. if _is_heading_span(span, body_size) and text not in seen_texts:
  189. headings.append({
  190. "page": page_num,
  191. "text": text,
  192. "size": span["size"],
  193. "y": span["origin"][1],
  194. })
  195. seen_texts.add(text)
  196. break # one heading per line is enough
  197. return headings
  198. def _extract_page_text(doc: fitz.Document, page_num: int) -> str:
  199. """Extract clean plain text from a single page."""
  200. return doc[page_num].get_text("text").strip()
  201. def _build_sections(doc: fitz.Document, headings: list[dict]) -> list[Section]:
  202. """
  203. Given a list of headings with page numbers, build Section objects
  204. by extracting text between consecutive headings.
  205. """
  206. sections = []
  207. for idx, heading in enumerate(headings):
  208. start_page = heading["page"]
  209. end_page = headings[idx + 1]["page"] - 1 if idx + 1 < len(headings) else doc.page_count - 1
  210. end_page = max(start_page, end_page)
  211. raw_text_parts = []
  212. for p in range(start_page, end_page + 1):
  213. raw_text_parts.append(_extract_page_text(doc, p))
  214. # Try to detect chapter number from heading text
  215. chapter_num = _parse_chapter_number(heading["text"], idx)
  216. sections.append(Section(
  217. title=heading["text"],
  218. chapter_number=chapter_num,
  219. page_start=start_page + 1, # 1-indexed for humans
  220. page_end=end_page + 1,
  221. raw_text="\n".join(raw_text_parts).strip(),
  222. ))
  223. return sections
  224. def _parse_chapter_number(text: str, fallback: int) -> int:
  225. """Try to extract a numeric chapter number from a heading string."""
  226. # "Chapter 3", "3.", "III."
  227. match = re.search(r"\b(\d+)\b", text)
  228. if match:
  229. return int(match.group(1))
  230. roman = re.search(r"\b(I{1,3}|IV|V|VI{0,3}|IX|X{1,3})\b", text)
  231. if roman:
  232. return _roman_to_int(roman.group(1))
  233. return fallback + 1
  234. def _roman_to_int(s: str) -> int:
  235. values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
  236. s = s.upper()
  237. total = 0
  238. for i, c in enumerate(s):
  239. if i + 1 < len(s) and values[c] < values[s[i + 1]]:
  240. total -= values[c]
  241. else:
  242. total += values[c]
  243. return total
  244. def _extract_full_text(doc: fitz.Document) -> str:
  245. """Extract all text from the document for flat processing."""
  246. parts = []
  247. for page_num in range(doc.page_count):
  248. text = _extract_page_text(doc, page_num)
  249. if text:
  250. parts.append(text)
  251. return "\n\n".join(parts)