"""
detector.py — analyses a PDF and determines its structure purely via PyMuPDF.

No LLM calls. Font sizes, bold flags, and text positioning do all the work.

Returns a DocumentStructure describing:
  - whether the doc is structured (has chapters/headings) or flat
  - extracted chapters with their page ranges and raw text
  - document title (best guess)
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field
from pathlib import Path

import fitz  # PyMuPDF


# ── Data models ────────────────────────────────────────────────────────────────

@dataclass
class Section:
    """One heading-delimited slice of a structured document."""

    title: str                  # heading text exactly as extracted from the PDF
    chapter_number: int | None  # None for flat docs
    page_start: int             # 1-indexed, inclusive
    page_end: int               # 1-indexed, inclusive
    raw_text: str               # full extracted text for this section


@dataclass
class DocumentStructure:
    """Result of analysing one PDF with :func:`detect_structure`."""

    source_file: str            # file name only (no directory part)
    doc_type: str               # "structured" | "flat"
    title: str
    sections: list[Section] = field(default_factory=list)
    full_text: str = ""         # populated for flat docs


# ── Constants ──────────────────────────────────────────────────────────────────

# Heading candidates: bold text significantly larger than body font
HEADING_FONT_RATIO = 1.15        # heading must be ≥ 15% larger than median body size
MIN_HEADING_LENGTH = 3
MAX_HEADING_LENGTH = 120
MIN_SECTIONS_FOR_STRUCTURED = 2  # need at least 2 headings to call it structured

# How many leading pages are sampled when estimating the body font size, and
# the size assumed when those pages contain no text at all.
BODY_FONT_SAMPLE_PAGES = 10
DEFAULT_BODY_FONT_SIZE = 12.0

# Patterns that strongly suggest a chapter heading
# NOTE(review): no function in this module consults CHAPTER_PATTERNS;
# presumably it is imported elsewhere — confirm before removing.
CHAPTER_PATTERNS = [
    re.compile(r"^\s*(chapter|part|section|unit|lesson)\s+[\dIVXivx]+", re.IGNORECASE),
    re.compile(r"^\s*[\dIVXivx]+[\.\)]\s+\w"),  # "1. Introduction" or "IV) Conclusion"
]

# PyMuPDF bold flag (bit 4 of a span's "flags" value)
_BOLD_FLAG = 1 << 4

# Characters that make up TOC leaders, rules and similar decoration.
_FILLER_CHARS = frozenset(".-_·•=~*#/\\|")

# Roman numerals 1–39 (I … XXXIX). Every alternative may match the empty
# string, so users of this fragment must pair it with a look-ahead for a
# numeral letter and a trailing (?!\w) to rule out empty/partial matches.
_ROMAN_BODY = r"X{0,3}(?:IX|IV|V?I{0,3})"

# Stand-alone upper-case Roman numeral anywhere in a heading.
_ROMAN_NUMERAL_RE = re.compile(r"\b(?=[IVX])(" + _ROMAN_BODY + r")(?!\w)")

# A number (Arabic, or Roman in either case) directly after a chapter keyword.
_KEYWORD_NUMBER_RE = re.compile(
    r"\b(?:chapter|part|section|unit|lesson)\s+"
    r"(?:(\d+)\b|(?=[ivx])(" + _ROMAN_BODY + r")(?!\w))",
    re.IGNORECASE,
)

_ARABIC_NUMBER_RE = re.compile(r"\b(\d+)\b")


# ── Main entry point ───────────────────────────────────────────────────────────

def detect_structure(pdf_path: str | Path) -> DocumentStructure:
    """
    Analyse a PDF and return a DocumentStructure.

    The document is reported as "structured" when at least
    MIN_SECTIONS_FOR_STRUCTURED headings are found; otherwise it falls back
    to a "flat" result carrying the whole text in ``full_text``.

    Errors raised while opening the file (missing or unreadable PDF) are not
    caught here and propagate to the caller. The document handle is always
    closed before returning.
    """
    path = Path(pdf_path)
    doc = fitz.open(str(path))
    try:
        title = _extract_title(doc)
        body_font_size = _median_body_font_size(doc)
        headings = _extract_headings(doc, body_font_size)

        if len(headings) < MIN_SECTIONS_FOR_STRUCTURED:
            return DocumentStructure(
                source_file=path.name,
                doc_type="flat",
                title=title,
                full_text=_extract_full_text(doc),
            )

        return DocumentStructure(
            source_file=path.name,
            doc_type="structured",
            title=title,
            sections=_build_sections(doc, headings),
        )
    finally:
        doc.close()


# ── Internal helpers ───────────────────────────────────────────────────────────

def _extract_title(doc: fitz.Document) -> str:
    """
    Try PDF metadata first, then the largest text on page 1.

    Falls back to a prettified file stem, and to "Unknown Document" when the
    document has no pages or no usable name.
    """
    # The "title" entry may be present but None, so default after the lookup
    # rather than through dict.get's default argument.
    meta_title = ((doc.metadata or {}).get("title") or "").strip()
    if len(meta_title) > 3:
        return meta_title

    # Fallback: largest font on first page
    if doc.page_count == 0:
        return "Unknown Document"

    candidates = []
    for block in doc[0].get_text("dict")["blocks"]:
        if block.get("type") != 0:  # type 0 = text
            continue
        for line in block.get("lines", []):
            for span in line.get("spans", []):
                text = span["text"].strip()
                if MIN_HEADING_LENGTH < len(text) < MAX_HEADING_LENGTH:
                    candidates.append((span["size"], text))

    if candidates:
        # max() keeps the first of several equally large spans, i.e. the one
        # that comes first in extraction order, instead of breaking ties
        # alphabetically.
        return max(candidates, key=lambda candidate: candidate[0])[1]

    stem = Path(doc.name or "").stem
    return stem.replace("_", " ").replace("-", " ").title() or "Unknown Document"


def _median_body_font_size(doc: fitz.Document) -> float:
    """
    Compute the median font size across the document.

    Every non-blank span counts once, so the result approximates the 'body
    text' size as long as body spans outnumber heading spans. Only the first
    BODY_FONT_SAMPLE_PAGES pages are sampled for speed. Returns
    DEFAULT_BODY_FONT_SIZE when the sampled pages contain no text.
    """
    sizes = []
    for page_num in range(min(doc.page_count, BODY_FONT_SAMPLE_PAGES)):
        for block in doc[page_num].get_text("dict")["blocks"]:
            if block.get("type") != 0:
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    if span["text"].strip():
                        sizes.append(span["size"])

    if not sizes:
        return DEFAULT_BODY_FONT_SIZE

    sizes.sort()
    # Upper median for even-length samples.
    return sizes[len(sizes) // 2]


def _is_heading_span(span: dict, body_size: float) -> bool:
    """
    Heuristic: is this text span a section heading?

    ``span`` is a span dict from ``page.get_text("dict")``; only its "text",
    "size" and optional "flags" entries are read.
    """
    text = span["text"].strip()
    if not (MIN_HEADING_LENGTH <= len(text) <= MAX_HEADING_LENGTH):
        return False

    # Must be larger than body OR explicitly bold at body size or above
    size = span["size"]
    is_bold = bool(span.get("flags", 0) & _BOLD_FLAG)
    is_larger = size >= body_size * HEADING_FONT_RATIO
    if not (is_larger or (is_bold and size >= body_size)):
        return False

    # Filter out obvious non-headings
    if text.endswith((",", ";")):
        return False

    # ALL CAPS long sentences are likely decorative, not headings
    upper_ratio = sum(1 for c in text if c.isupper()) / max(len(text), 1)
    if upper_ratio > 0.85 and len(text) > 40:
        return False

    # Filter TOC leaders: lines mostly filled with dots, dashes, underscores
    # e.g. "Chapter 1 ............. 12" or "——————————"
    non_space = [c for c in text if not c.isspace()]
    if non_space:
        filler_ratio = sum(1 for c in non_space if c in _FILLER_CHARS) / len(non_space)
        if filler_ratio > 0.4:
            return False

    # Filter ASCII art: very low ratio of letters to total chars
    letter_ratio = sum(1 for c in text if c.isalpha()) / max(len(text), 1)
    if letter_ratio < 0.3:
        return False

    # Filter short TOC entries ending in a page number
    # e.g. ". — D 6" or "Section 4 42"
    if len(text) < 15 and text[-1].isdigit():
        return False

    return True


def _extract_headings(doc: fitz.Document, body_size: float) -> list[dict]:
    """
    Walk every page and collect candidate headings with their page number.

    A heading text is reported only the first time it is seen, so repeated
    text (for example running headers) yields a single entry.

    Returns list of {page, text, size, y}, where ``page`` is 0-indexed and
    ``y`` is the vertical coordinate of the span's origin.
    """
    headings = []
    seen_texts = set()

    for page_num in range(doc.page_count):
        for block in doc[page_num].get_text("dict")["blocks"]:
            if block.get("type") != 0:
                continue
            for line in block.get("lines", []):
                for span in line.get("spans", []):
                    text = span["text"].strip()
                    if text in seen_texts or not _is_heading_span(span, body_size):
                        continue
                    headings.append({
                        "page": page_num,
                        "text": text,
                        "size": span["size"],
                        "y": span["origin"][1],
                    })
                    seen_texts.add(text)
                    break  # one heading per line is enough

    return headings


def _extract_page_text(doc: fitz.Document, page_num: int) -> str:
    """Extract plain text from a single (0-indexed) page, stripped of outer whitespace."""
    return doc[page_num].get_text("text").strip()


def _build_sections(doc: fitz.Document, headings: list[dict]) -> list[Section]:
    """
    Given a list of headings with page numbers, build Section objects by
    extracting text between consecutive headings.

    Sections work at page granularity: each one holds the full text of every
    page from its own heading's page up to the page before the next heading
    (or the last page of the document for the final heading). Headings that
    share a page therefore produce sections with the same page text.
    """
    sections = []
    last_page = doc.page_count - 1

    for idx, heading in enumerate(headings):
        start_page = heading["page"]
        if idx + 1 < len(headings):
            end_page = headings[idx + 1]["page"] - 1
        else:
            end_page = last_page
        # The next heading may sit on the same page; never end before the start.
        end_page = max(start_page, end_page)

        page_texts = [
            _extract_page_text(doc, page_num)
            for page_num in range(start_page, end_page + 1)
        ]

        sections.append(Section(
            title=heading["text"],
            # Try to detect chapter number from heading text
            chapter_number=_parse_chapter_number(heading["text"], idx),
            page_start=start_page + 1,  # 1-indexed for humans
            page_end=end_page + 1,
            raw_text="\n".join(page_texts).strip(),
        ))

    return sections


def _parse_chapter_number(text: str, fallback: int) -> int:
    """
    Try to extract a chapter number from a heading string.

    Checked in order:
      1. a number right after a chapter keyword ("Chapter 3", "part iv"),
         Arabic or Roman, case-insensitive;
      2. the first stand-alone Arabic number ("3. Methods");
      3. the first stand-alone upper-case Roman numeral from I to XXXIX
         ("XII. Results").

    ``fallback`` is the heading's 0-based position; ``fallback + 1`` is
    returned when no number is found.
    """
    keyword = _KEYWORD_NUMBER_RE.search(text)
    if keyword:
        digits, roman = keyword.groups()
        return int(digits) if digits else _roman_to_int(roman)

    arabic = _ARABIC_NUMBER_RE.search(text)
    if arabic:
        return int(arabic.group(1))

    roman = _ROMAN_NUMERAL_RE.search(text)
    if roman:
        return _roman_to_int(roman.group(1))

    return fallback + 1


def _roman_to_int(s: str) -> int:
    """
    Convert a Roman numeral (any case) to an integer.

    Uses the subtractive rule: a symbol smaller than its right neighbour is
    subtracted, otherwise added. The input is not validated; a character
    outside IVXLCDM raises KeyError.
    """
    values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
    numeral = s.upper()
    total = 0
    for i, symbol in enumerate(numeral):
        value = values[symbol]
        if i + 1 < len(numeral) and value < values[numeral[i + 1]]:
            total -= value
        else:
            total += value
    return total


def _extract_full_text(doc: fitz.Document) -> str:
    """Extract all text from the document for flat processing.

    Pages with no text are skipped; the rest are joined with a blank line.
    """
    page_texts = (
        _extract_page_text(doc, page_num) for page_num in range(doc.page_count)
    )
    return "\n\n".join(text for text in page_texts if text)