detector.py 9.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. """
detector.py — analyses a PDF and determines its structure purely via PyMuPDF.
No LLM calls. Font sizes, bold flags, and text positions do all the work.

Returns a DocumentStructure describing:
- whether the document is structured (has chapters/headings) or flat
- the extracted chapters with their page ranges and raw text
- the document title (best guess)
  8. """
  9. from __future__ import annotations
  10. import re
  11. from dataclasses import dataclass, field
  12. from pathlib import Path
  13. import fitz # PyMuPDF
# ── Data models ────────────────────────────────────────────────────────────────
  15. @dataclass
  16. class Section:
  17. title: str
  18. chapter_number: int | None # None for flat docs
  19. page_start: int
  20. page_end: int
  21. raw_text: str # full extracted text for this section
  22. @dataclass
  23. class DocumentStructure:
  24. source_file: str
  25. doc_type: str # "structured" | "flat"
  26. title: str
  27. sections: list[Section] = field(default_factory=list)
  28. full_text: str = "" # populated for flat docs
# ── Constants ──────────────────────────────────────────────────────────────────
  30. # Heading candidates: bold text significantly larger than body font
  31. HEADING_FONT_RATIO = 1.15 # heading must be ≥ 15% larger than median body size
  32. MIN_HEADING_LENGTH = 3
  33. MAX_HEADING_LENGTH = 120
  34. MIN_SECTIONS_FOR_STRUCTURED = 2 # need at least 2 headings to call it structured
  35. # Patterns that strongly suggest a chapter heading
  36. CHAPTER_PATTERNS = [
  37. re.compile(r"^\s*(chapter|part|section|unit|lesson)\s+[\dIVXivx]+", re.IGNORECASE),
  38. re.compile(r"^\s*[\dIVXivx]+[\.\)]\s+\w"), # "1. Introduction" or "IV) Conclusion"
  39. ]
# ── Main entry point ───────────────────────────────────────────────────────────
  41. def detect_structure(pdf_path: str | Path) -> DocumentStructure:
  42. """
  43. Analyse a PDF and return a DocumentStructure.
  44. Always succeeds — falls back to flat doc if structure detection fails.
  45. """
  46. path = Path(pdf_path)
  47. doc = fitz.open(str(path))
  48. try:
  49. title = _extract_title(doc)
  50. body_font_size = _median_body_font_size(doc)
  51. headings = _extract_headings(doc, body_font_size)
  52. if len(headings) >= MIN_SECTIONS_FOR_STRUCTURED:
  53. sections = _build_sections(doc, headings)
  54. return DocumentStructure(
  55. source_file=path.name,
  56. doc_type="structured",
  57. title=title,
  58. sections=sections,
  59. )
  60. else:
  61. full_text = _extract_full_text(doc)
  62. return DocumentStructure(
  63. source_file=path.name,
  64. doc_type="flat",
  65. title=title,
  66. full_text=full_text,
  67. )
  68. finally:
  69. doc.close()
# ── Internal helpers ───────────────────────────────────────────────────────────
  71. def _extract_title(doc: fitz.Document) -> str:
  72. """Try PDF metadata first, then largest text on page 1."""
  73. meta_title = (doc.metadata or {}).get("title", "").strip()
  74. if meta_title and len(meta_title) > 3:
  75. return meta_title
  76. # Fallback: largest font on first page
  77. if doc.page_count == 0:
  78. return "Unknown Document"
  79. page = doc[0]
  80. blocks = page.get_text("dict")["blocks"]
  81. candidates = []
  82. for block in blocks:
  83. if block.get("type") != 0: # type 0 = text
  84. continue
  85. for line in block.get("lines", []):
  86. for span in line.get("spans", []):
  87. text = span["text"].strip()
  88. if MIN_HEADING_LENGTH < len(text) < MAX_HEADING_LENGTH:
  89. candidates.append((span["size"], text))
  90. if candidates:
  91. candidates.sort(reverse=True)
  92. return candidates[0][1]
  93. return Path(doc.name).stem.replace("_", " ").replace("-", " ").title()
  94. def _median_body_font_size(doc: fitz.Document) -> float:
  95. """
  96. Compute the median font size across the document.
  97. This represents 'body text' since it will dominate page count.
  98. Sample up to 10 pages for speed.
  99. """
  100. sizes = []
  101. sample_pages = min(doc.page_count, 10)
  102. for page_num in range(sample_pages):
  103. page = doc[page_num]
  104. for block in page.get_text("dict")["blocks"]:
  105. if block.get("type") != 0:
  106. continue
  107. for line in block.get("lines", []):
  108. for span in line.get("spans", []):
  109. if span["text"].strip():
  110. sizes.append(span["size"])
  111. if not sizes:
  112. return 12.0
  113. sizes.sort()
  114. return sizes[len(sizes) // 2]
  115. def _is_heading_span(span: dict, body_size: float) -> bool:
  116. """Heuristic: is this text span a section heading?"""
  117. text = span["text"].strip()
  118. if not (MIN_HEADING_LENGTH <= len(text) <= MAX_HEADING_LENGTH):
  119. return False
  120. # Must be larger than body OR explicitly bold with any size increase
  121. size = span["size"]
  122. flags = span.get("flags", 0)
  123. is_bold = bool(flags & 2**4) # PyMuPDF bold flag
  124. is_larger = size >= body_size * HEADING_FONT_RATIO
  125. if not (is_larger or (is_bold and size >= body_size)):
  126. return False
  127. # Filter out obvious non-headings
  128. if text.endswith(",") or text.endswith(";"):
  129. return False
  130. if sum(1 for c in text if c.isupper()) / max(len(text), 1) > 0.85:
  131. # ALL CAPS long sentences are likely decorative, not headings
  132. if len(text) > 40:
  133. return False
  134. return True
  135. def _extract_headings(doc: fitz.Document, body_size: float) -> list[dict]:
  136. """
  137. Walk every page and collect candidate headings with their page number.
  138. Returns list of {page, text, size, y_pos}
  139. """
  140. headings = []
  141. seen_texts = set()
  142. for page_num in range(doc.page_count):
  143. page = doc[page_num]
  144. for block in page.get_text("dict")["blocks"]:
  145. if block.get("type") != 0:
  146. continue
  147. for line in block.get("lines", []):
  148. for span in line.get("spans", []):
  149. text = span["text"].strip()
  150. if _is_heading_span(span, body_size) and text not in seen_texts:
  151. headings.append({
  152. "page": page_num,
  153. "text": text,
  154. "size": span["size"],
  155. "y": span["origin"][1],
  156. })
  157. seen_texts.add(text)
  158. break # one heading per line is enough
  159. return headings
  160. def _extract_page_text(doc: fitz.Document, page_num: int) -> str:
  161. """Extract clean plain text from a single page."""
  162. return doc[page_num].get_text("text").strip()
  163. def _build_sections(doc: fitz.Document, headings: list[dict]) -> list[Section]:
  164. """
  165. Given a list of headings with page numbers, build Section objects
  166. by extracting text between consecutive headings.
  167. """
  168. sections = []
  169. for idx, heading in enumerate(headings):
  170. start_page = heading["page"]
  171. end_page = headings[idx + 1]["page"] - 1 if idx + 1 < len(headings) else doc.page_count - 1
  172. end_page = max(start_page, end_page)
  173. raw_text_parts = []
  174. for p in range(start_page, end_page + 1):
  175. raw_text_parts.append(_extract_page_text(doc, p))
  176. # Try to detect chapter number from heading text
  177. chapter_num = _parse_chapter_number(heading["text"], idx)
  178. sections.append(Section(
  179. title=heading["text"],
  180. chapter_number=chapter_num,
  181. page_start=start_page + 1, # 1-indexed for humans
  182. page_end=end_page + 1,
  183. raw_text="\n".join(raw_text_parts).strip(),
  184. ))
  185. return sections
  186. def _parse_chapter_number(text: str, fallback: int) -> int:
  187. """Try to extract a numeric chapter number from a heading string."""
  188. # "Chapter 3", "3.", "III."
  189. match = re.search(r"\b(\d+)\b", text)
  190. if match:
  191. return int(match.group(1))
  192. roman = re.search(r"\b(I{1,3}|IV|V|VI{0,3}|IX|X{1,3})\b", text)
  193. if roman:
  194. return _roman_to_int(roman.group(1))
  195. return fallback + 1
  196. def _roman_to_int(s: str) -> int:
  197. values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
  198. s = s.upper()
  199. total = 0
  200. for i, c in enumerate(s):
  201. if i + 1 < len(s) and values[c] < values[s[i + 1]]:
  202. total -= values[c]
  203. else:
  204. total += values[c]
  205. return total
  206. def _extract_full_text(doc: fitz.Document) -> str:
  207. """Extract all text from the document for flat processing."""
  208. parts = []
  209. for page_num in range(doc.page_count):
  210. text = _extract_page_text(doc, page_num)
  211. if text:
  212. parts.append(text)
  213. return "\n\n".join(parts)