"""
detector.py — analyses a PDF and determines its structure purely via PyMuPDF.

No LLM calls. Font sizes, bold flags, and text positioning do all the work.

Returns a DocumentStructure describing:
- whether the doc is structured (has chapters/headings) or flat
- extracted chapters with their page ranges and raw text
- document title (best guess)
"""
from __future__ import annotations

import re
from dataclasses import dataclass, field
from pathlib import Path

import fitz  # PyMuPDF
  14. # ── Data models ────────────────────────────────────────────────────────────────
  15. @dataclass
  16. class Section:
  17. title: str
  18. chapter_number: int | None # None for flat docs
  19. page_start: int
  20. page_end: int
  21. raw_text: str # full extracted text for this section
  22. @dataclass
  23. class DocumentStructure:
  24. source_file: str
  25. doc_type: str # "structured" | "flat"
  26. title: str
  27. sections: list[Section] = field(default_factory=list)
  28. full_text: str = "" # populated for flat docs
# ── Constants ──────────────────────────────────────────────────────────────────
# Heading candidates: bold text significantly larger than body font
HEADING_FONT_RATIO = 1.15  # heading must be ≥ 15% larger than median body size
MIN_HEADING_LENGTH = 4  # minimum chars for a heading candidate
MAX_HEADING_LENGTH = 120  # anything longer is body text, not a heading
MIN_HEADING_WORDS = 2  # must have at least 2 real words
# NOTE(review): MIN_SECTION_TEXT_CHARS is not referenced anywhere in this file —
# confirm it is used by a caller or intended for a future filtering pass.
MIN_SECTION_TEXT_CHARS = 200  # sections with less content than this are likely TOC artifacts
# Single-word headings that ARE valid chapter titles — whitelist
VALID_SINGLE_WORD_HEADINGS = {
    "introduction", "preface", "foreword", "prologue", "epilogue",
    "conclusion", "appendix", "bibliography", "glossary", "index",
    "summary", "abstract", "acknowledgements", "acknowledgments",
    "contents", "overview", "background", "references", "afterword",
}
MIN_SECTIONS_FOR_STRUCTURED = 2  # need at least 2 headings to call it structured
# Patterns that strongly suggest a chapter heading.
# NOTE(review): CHAPTER_PATTERNS is not referenced in this file — confirm callers.
CHAPTER_PATTERNS = [
    re.compile(r"^\s*(chapter|part|section|unit|lesson)\s+[\dIVXivx]+", re.IGNORECASE),
    re.compile(r"^\s*[\dIVXivx]+[\.\)]\s+\w"),  # "1. Introduction" or "IV) Conclusion"
]
  49. # ── Main entry point ───────────────────────────────────────────────────────────
  50. def detect_structure(pdf_path: str | Path) -> DocumentStructure:
  51. """
  52. Analyse a PDF and return a DocumentStructure.
  53. Always succeeds — falls back to flat doc if structure detection fails.
  54. """
  55. path = Path(pdf_path)
  56. doc = fitz.open(str(path))
  57. try:
  58. title = _extract_title(doc)
  59. body_font_size = _median_body_font_size(doc)
  60. headings = _extract_headings(doc, body_font_size)
  61. if len(headings) >= MIN_SECTIONS_FOR_STRUCTURED:
  62. sections = _build_sections(doc, headings)
  63. return DocumentStructure(
  64. source_file=path.name,
  65. doc_type="structured",
  66. title=title,
  67. sections=sections,
  68. )
  69. else:
  70. full_text = _extract_full_text(doc)
  71. return DocumentStructure(
  72. source_file=path.name,
  73. doc_type="flat",
  74. title=title,
  75. full_text=full_text,
  76. )
  77. finally:
  78. doc.close()
  79. # ── Internal helpers ───────────────────────────────────────────────────────────
  80. def _extract_title(doc: fitz.Document) -> str:
  81. """Try PDF metadata first, then largest text on page 1."""
  82. meta_title = (doc.metadata or {}).get("title", "").strip()
  83. if meta_title and len(meta_title) > 3:
  84. return meta_title
  85. # Fallback: largest font on first page
  86. if doc.page_count == 0:
  87. return "Unknown Document"
  88. page = doc[0]
  89. blocks = page.get_text("dict")["blocks"]
  90. candidates = []
  91. for block in blocks:
  92. if block.get("type") != 0: # type 0 = text
  93. continue
  94. for line in block.get("lines", []):
  95. for span in line.get("spans", []):
  96. text = span["text"].strip()
  97. if MIN_HEADING_LENGTH < len(text) < MAX_HEADING_LENGTH:
  98. candidates.append((span["size"], text))
  99. if candidates:
  100. candidates.sort(reverse=True)
  101. return candidates[0][1]
  102. return Path(doc.name).stem.replace("_", " ").replace("-", " ").title()
  103. def _median_body_font_size(doc: fitz.Document) -> float:
  104. """
  105. Compute the median font size across the document.
  106. This represents 'body text' since it will dominate page count.
  107. Sample up to 10 pages for speed.
  108. """
  109. sizes = []
  110. sample_pages = min(doc.page_count, 10)
  111. for page_num in range(sample_pages):
  112. page = doc[page_num]
  113. for block in page.get_text("dict")["blocks"]:
  114. if block.get("type") != 0:
  115. continue
  116. for line in block.get("lines", []):
  117. for span in line.get("spans", []):
  118. if span["text"].strip():
  119. sizes.append(span["size"])
  120. if not sizes:
  121. return 12.0
  122. sizes.sort()
  123. return sizes[len(sizes) // 2]
  124. def _is_heading_span(span: dict, body_size: float) -> bool:
  125. """Heuristic: is this text span a section heading?"""
  126. text = span["text"].strip()
  127. if not (MIN_HEADING_LENGTH <= len(text) <= MAX_HEADING_LENGTH):
  128. return False
  129. # Must be larger than body OR explicitly bold with any size increase
  130. size = span["size"]
  131. flags = span.get("flags", 0)
  132. is_bold = bool(flags & 2**4) # PyMuPDF bold flag
  133. is_larger = size >= body_size * HEADING_FONT_RATIO
  134. if not (is_larger or (is_bold and size >= body_size)):
  135. return False
  136. # Filter out obvious non-headings
  137. if text.endswith(",") or text.endswith(";"):
  138. return False
  139. if sum(1 for c in text if c.isupper()) / max(len(text), 1) > 0.85:
  140. # ALL CAPS long sentences are likely decorative, not headings
  141. if len(text) > 40:
  142. return False
  143. # Filter single-word headings — almost always TOC entries, running headers, or OCR noise
  144. # Exception: known valid single-word headings like INTRODUCTION, PREFACE etc.
  145. words = [w for w in text.split() if w.strip('\'".,;:-')]
  146. clean_words = [w for w in words if any(c.isalpha() for c in w)]
  147. if len(clean_words) < MIN_HEADING_WORDS:
  148. if text.strip('\'".,;:- ').lower() not in VALID_SINGLE_WORD_HEADINGS:
  149. return False
  150. # Filter OCR fragments starting with punctuation — e.g. "'THE", "—Section"
  151. if text[0] in '\'".,;:-—–':
  152. return False
  153. # Filter running headers: single capitalised common noun, likely page header
  154. # e.g. "Company", "Suberling" — real headings are phrases or have "CHAPTER/PART" etc.
  155. if len(clean_words) == 1 and clean_words[0][0].isupper() and len(clean_words[0]) < 20:
  156. if clean_words[0].lower() not in VALID_SINGLE_WORD_HEADINGS:
  157. return False
  158. # Filter TOC leaders: lines mostly filled with dots, dashes, underscores
  159. # e.g. "Chapter 1 ............. 12" or "——————————"
  160. filler_chars = set(".-_·•=~*#/\\|")
  161. non_space = [c for c in text if not c.isspace()]
  162. if non_space and sum(1 for c in non_space if c in filler_chars) / len(non_space) > 0.4:
  163. return False
  164. # Filter ASCII art: very low ratio of letters to total chars
  165. letter_ratio = sum(1 for c in text if c.isalpha()) / max(len(text), 1)
  166. if letter_ratio < 0.3:
  167. return False
  168. # Filter short TOC entries ending in a page number
  169. # e.g. ". — D 6" or "Section 4 42"
  170. if len(text) < 15 and text[-1].isdigit():
  171. return False
  172. return True
  173. def _extract_headings(doc: fitz.Document, body_size: float) -> list[dict]:
  174. """
  175. Walk every page and collect candidate headings with their page number.
  176. Returns list of {page, text, size, y_pos}
  177. """
  178. headings = []
  179. seen_texts = set()
  180. for page_num in range(doc.page_count):
  181. page = doc[page_num]
  182. for block in page.get_text("dict")["blocks"]:
  183. if block.get("type") != 0:
  184. continue
  185. for line in block.get("lines", []):
  186. for span in line.get("spans", []):
  187. text = span["text"].strip()
  188. if _is_heading_span(span, body_size) and text not in seen_texts:
  189. headings.append({
  190. "page": page_num,
  191. "text": text,
  192. "size": span["size"],
  193. "y": span["origin"][1],
  194. })
  195. seen_texts.add(text)
  196. break # one heading per line is enough
  197. return headings
  198. def _extract_page_text(doc: fitz.Document, page_num: int) -> str:
  199. """Extract clean plain text from a single page."""
  200. return doc[page_num].get_text("text").strip()
  201. def _build_sections(doc: fitz.Document, headings: list[dict]) -> list[Section]:
  202. """
  203. Given a list of headings with page numbers, build Section objects
  204. by extracting text between consecutive headings.
  205. """
  206. sections = []
  207. for idx, heading in enumerate(headings):
  208. start_page = heading["page"]
  209. end_page = headings[idx + 1]["page"] - 1 if idx + 1 < len(headings) else doc.page_count - 1
  210. end_page = max(start_page, end_page)
  211. raw_text_parts = []
  212. for p in range(start_page, end_page + 1):
  213. raw_text_parts.append(_extract_page_text(doc, p))
  214. # Try to detect chapter number from heading text
  215. chapter_num = _parse_chapter_number(heading["text"], idx)
  216. sections.append(Section(
  217. title=heading["text"],
  218. chapter_number=chapter_num,
  219. page_start=start_page + 1, # 1-indexed for humans
  220. page_end=end_page + 1,
  221. raw_text="\n".join(raw_text_parts).strip(),
  222. ))
  223. return sections
  224. def _parse_chapter_number(text: str, fallback: int) -> int:
  225. """Try to extract a numeric chapter number from a heading string."""
  226. # "Chapter 3", "3.", "III."
  227. match = re.search(r"\b(\d+)\b", text)
  228. if match:
  229. return int(match.group(1))
  230. roman = re.search(r"\b(I{1,3}|IV|V|VI{0,3}|IX|X{1,3})\b", text)
  231. if roman:
  232. return _roman_to_int(roman.group(1))
  233. return fallback + 1
  234. def _roman_to_int(s: str) -> int:
  235. values = {"I": 1, "V": 5, "X": 10, "L": 50, "C": 100, "D": 500, "M": 1000}
  236. s = s.upper()
  237. total = 0
  238. for i, c in enumerate(s):
  239. if i + 1 < len(s) and values[c] < values[s[i + 1]]:
  240. total -= values[c]
  241. else:
  242. total += values[c]
  243. return total
  244. def _extract_full_text(doc: fitz.Document) -> str:
  245. """Extract all text from the document for flat processing."""
  246. parts = []
  247. for page_num in range(doc.page_count):
  248. text = _extract_page_text(doc, page_num)
  249. if text:
  250. parts.append(text)
  251. return "\n\n".join(parts)