"""
detector.py — analyses a PDF and determines its structure purely via PyMuPDF.

No LLM calls: font sizes, bold flags, and text positioning do all the work.

Returns a DocumentStructure describing:
  - whether the document is structured (has chapters/headings) or flat
  - the extracted chapters with their page ranges and raw text
  - the document title (best guess)
"""
- from __future__ import annotations
- import re
- from dataclasses import dataclass, field
- from pathlib import Path
- import fitz # PyMuPDF
# ── Data models ────────────────────────────────────────────────────────────────
@dataclass
class Section:
    """A single detected chapter/section of a document."""
    title: str
    chapter_number: int | None  # None for flat (unstructured) docs
    page_start: int             # 1-indexed, inclusive
    page_end: int               # 1-indexed, inclusive
    raw_text: str               # full extracted text for this section
@dataclass
class DocumentStructure:
    """The result of structure detection for one PDF."""
    source_file: str  # basename of the input PDF
    doc_type: str     # "structured" | "flat"
    title: str        # best-guess document title
    sections: list[Section] = field(default_factory=list)  # structured docs only
    full_text: str = ""  # populated for flat docs only
# ── Constants ──────────────────────────────────────────────────────────────────
# A heading candidate must be bold and/or noticeably larger than the body font.
HEADING_FONT_RATIO = 1.15  # heading must be ≥ 15% larger than median body size
MIN_HEADING_LENGTH = 4     # minimum characters in a heading
MAX_HEADING_LENGTH = 120   # maximum characters in a heading
MIN_HEADING_WORDS = 2      # headings need at least two real words
# NOTE(review): MIN_SECTION_TEXT_CHARS is currently not referenced anywhere in
# this module — confirm whether a short-section filter was intended.
MIN_SECTION_TEXT_CHARS = 200  # sections with less content are likely TOC artifacts

# Single-word headings that ARE valid chapter titles — whitelist.
VALID_SINGLE_WORD_HEADINGS = {
    "abstract", "acknowledgements", "acknowledgments", "afterword",
    "appendix", "background", "bibliography", "conclusion", "contents",
    "epilogue", "foreword", "glossary", "index", "introduction",
    "overview", "preface", "prologue", "references", "summary",
}

# Need at least this many headings before a document counts as structured.
MIN_SECTIONS_FOR_STRUCTURED = 2

# Patterns that strongly suggest a chapter heading.
CHAPTER_PATTERNS = [
    re.compile(r"^\s*(chapter|part|section|unit|lesson)\s+[\dIVXivx]+", re.IGNORECASE),
    re.compile(r"^\s*[\dIVXivx]+[\.\)]\s+\w"),  # "1. Introduction" or "IV) Conclusion"
]
# ── Main entry point ───────────────────────────────────────────────────────────
def detect_structure(pdf_path: str | Path) -> DocumentStructure:
    """Analyse a PDF and return a DocumentStructure.

    Always succeeds — when fewer than MIN_SECTIONS_FOR_STRUCTURED headings are
    found, the document is treated as flat and its full text is extracted.
    """
    path = Path(pdf_path)
    doc = fitz.open(str(path))
    try:
        title = _extract_title(doc)
        headings = _extract_headings(doc, _median_body_font_size(doc))
        if len(headings) < MIN_SECTIONS_FOR_STRUCTURED:
            # Not enough headings: fall back to a flat document.
            return DocumentStructure(
                source_file=path.name,
                doc_type="flat",
                title=title,
                full_text=_extract_full_text(doc),
            )
        return DocumentStructure(
            source_file=path.name,
            doc_type="structured",
            title=title,
            sections=_build_sections(doc, headings),
        )
    finally:
        doc.close()
- # ── Internal helpers ───────────────────────────────────────────────────────────
- def _extract_title(doc: fitz.Document) -> str:
- """Try PDF metadata first, then largest text on page 1."""
- meta_title = (doc.metadata or {}).get("title", "").strip()
- if meta_title and len(meta_title) > 3:
- return meta_title
- # Fallback: largest font on first page
- if doc.page_count == 0:
- return "Unknown Document"
- page = doc[0]
- blocks = page.get_text("dict")["blocks"]
- candidates = []
- for block in blocks:
- if block.get("type") != 0: # type 0 = text
- continue
- for line in block.get("lines", []):
- for span in line.get("spans", []):
- text = span["text"].strip()
- if MIN_HEADING_LENGTH < len(text) < MAX_HEADING_LENGTH:
- candidates.append((span["size"], text))
- if candidates:
- candidates.sort(reverse=True)
- return candidates[0][1]
- return Path(doc.name).stem.replace("_", " ").replace("-", " ").title()
- def _median_body_font_size(doc: fitz.Document) -> float:
- """
- Compute the median font size across the document.
- This represents 'body text' since it will dominate page count.
- Sample up to 10 pages for speed.
- """
- sizes = []
- sample_pages = min(doc.page_count, 10)
- for page_num in range(sample_pages):
- page = doc[page_num]
- for block in page.get_text("dict")["blocks"]:
- if block.get("type") != 0:
- continue
- for line in block.get("lines", []):
- for span in line.get("spans", []):
- if span["text"].strip():
- sizes.append(span["size"])
- if not sizes:
- return 12.0
- sizes.sort()
- return sizes[len(sizes) // 2]
def _is_heading_span(span: dict, body_size: float) -> bool:
    """Heuristic filter: does this PyMuPDF text span look like a section heading?

    A span must first clear a font gate (noticeably larger than body text, or
    bold at >= body size), then survive content filters that weed out TOC
    lines, running headers, OCR noise, and decorative text.

    Fix: spans matching CHAPTER_PATTERNS (e.g. "Chapter 3", "Part IV",
    "1. Introduction") were previously rejected by the word-count filter
    ("Chapter 3" has only one alphabetic word) and the trailing-page-number
    filter (short text ending in a digit), even though CHAPTER_PATTERNS exists
    precisely to recognise such headings — it was never consulted. They now
    short-circuit the content filters once the font gate and the TOC-leader
    check have passed.
    """
    text = span["text"].strip()
    if not (MIN_HEADING_LENGTH <= len(text) <= MAX_HEADING_LENGTH):
        return False

    # Font gate: larger than body, OR explicitly bold with at least body size.
    size = span["size"]
    flags = span.get("flags", 0)
    is_bold = bool(flags & 2**4)  # PyMuPDF bold flag
    is_larger = size >= body_size * HEADING_FONT_RATIO
    if not (is_larger or (is_bold and size >= body_size)):
        return False

    # TOC leaders: lines mostly filled with dots, dashes, underscores —
    # e.g. "Chapter 1 ............. 12" or "——————————". Checked before the
    # chapter-pattern bypass so dotted TOC chapter lines stay excluded.
    filler_chars = set(".-_·•=~*#/\\|")
    non_space = [c for c in text if not c.isspace()]
    if non_space and sum(1 for c in non_space if c in filler_chars) / len(non_space) > 0.4:
        return False

    # Explicit chapter markers are the strongest heading signal: accept now.
    # NOTE(review): a dot-less TOC line like "Chapter 1  12" passes this
    # bypass, but it still had to clear the font gate above, which body-sized
    # TOC text rarely does.
    if any(pattern.match(text) for pattern in CHAPTER_PATTERNS):
        return True

    # Sentence fragments ending mid-clause are not headings.
    if text.endswith(",") or text.endswith(";"):
        return False

    # Long ALL-CAPS runs are likely decorative, not headings.
    if sum(1 for c in text if c.isupper()) / max(len(text), 1) > 0.85:
        if len(text) > 40:
            return False

    # Single-word headings — almost always TOC entries, running headers, or
    # OCR noise. Exception: whitelisted titles like INTRODUCTION, PREFACE.
    words = [w for w in text.split() if w.strip('\'".,;:-')]
    clean_words = [w for w in words if any(c.isalpha() for c in w)]
    if len(clean_words) < MIN_HEADING_WORDS:
        if text.strip('\'".,;:- ').lower() not in VALID_SINGLE_WORD_HEADINGS:
            return False

    # OCR fragments starting with punctuation — e.g. "'THE", "—Section".
    if text[0] in '\'".,;:-—–':
        return False

    # Running headers: a single capitalised common noun, e.g. "Company".
    if len(clean_words) == 1 and clean_words[0][0].isupper() and len(clean_words[0]) < 20:
        if clean_words[0].lower() not in VALID_SINGLE_WORD_HEADINGS:
            return False

    # ASCII art / rules: very low ratio of letters to total characters.
    letter_ratio = sum(1 for c in text if c.isalpha()) / max(len(text), 1)
    if letter_ratio < 0.3:
        return False

    # Short TOC entries ending in a page number — e.g. ". — D 6".
    if len(text) < 15 and text[-1].isdigit():
        return False

    return True
- def _extract_headings(doc: fitz.Document, body_size: float) -> list[dict]:
- """
- Walk every page and collect candidate headings with their page number.
- Returns list of {page, text, size, y_pos}
- """
- headings = []
- seen_texts = set()
- for page_num in range(doc.page_count):
- page = doc[page_num]
- for block in page.get_text("dict")["blocks"]:
- if block.get("type") != 0:
- continue
- for line in block.get("lines", []):
- for span in line.get("spans", []):
- text = span["text"].strip()
- if _is_heading_span(span, body_size) and text not in seen_texts:
- headings.append({
- "page": page_num,
- "text": text,
- "size": span["size"],
- "y": span["origin"][1],
- })
- seen_texts.add(text)
- break # one heading per line is enough
- return headings
- def _extract_page_text(doc: fitz.Document, page_num: int) -> str:
- """Extract clean plain text from a single page."""
- return doc[page_num].get_text("text").strip()
- def _build_sections(doc: fitz.Document, headings: list[dict]) -> list[Section]:
- """
- Given a list of headings with page numbers, build Section objects
- by extracting text between consecutive headings.
- """
- sections = []
- for idx, heading in enumerate(headings):
- start_page = heading["page"]
- end_page = headings[idx + 1]["page"] - 1 if idx + 1 < len(headings) else doc.page_count - 1
- end_page = max(start_page, end_page)
- raw_text_parts = []
- for p in range(start_page, end_page + 1):
- raw_text_parts.append(_extract_page_text(doc, p))
- # Try to detect chapter number from heading text
- chapter_num = _parse_chapter_number(heading["text"], idx)
- sections.append(Section(
- title=heading["text"],
- chapter_number=chapter_num,
- page_start=start_page + 1, # 1-indexed for humans
- page_end=end_page + 1,
- raw_text="\n".join(raw_text_parts).strip(),
- ))
- return sections
- def _parse_chapter_number(text: str, fallback: int) -> int:
- """Try to extract a numeric chapter number from a heading string."""
- # "Chapter 3", "3.", "III."
- match = re.search(r"\b(\d+)\b", text)
- if match:
- return int(match.group(1))
- roman = re.search(r"\b(I{1,3}|IV|V|VI{0,3}|IX|X{1,3})\b", text)
- if roman:
- return _roman_to_int(roman.group(1))
- return fallback + 1
- def _roman_to_int(s: str) -> int:
- values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
- s = s.upper()
- total = 0
- for i, c in enumerate(s):
- if i + 1 < len(s) and values[c] < values[s[i + 1]]:
- total -= values[c]
- else:
- total += values[c]
- return total
- def _extract_full_text(doc: fitz.Document) -> str:
- """Extract all text from the document for flat processing."""
- parts = []
- for page_num in range(doc.page_count):
- text = _extract_page_text(doc, page_num)
- if text:
- parts.append(text)
- return "\n\n".join(parts)
|