chunker.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181
  1. """
  2. chunker.py — splits text into token-sized chunks, purely in Python.
  3. No LLM calls. Uses tiktoken for accurate token counting.
  4. Strategy:
  5. - Split on paragraph boundaries first (double newline)
  6. - If a paragraph exceeds chunk_size, split on sentence boundaries
  7. - If a sentence exceeds chunk_size, hard-split on token count
  8. - Chunks carry their source metadata (page range, section title)
  9. """
  10. from __future__ import annotations
  11. import re
  12. from dataclasses import dataclass
  13. import tiktoken
  14. from .config import cfg
  15. # Use cl100k_base — matches most modern LLMs well enough for counting
  16. _ENCODER = tiktoken.get_encoding("cl100k_base")
  17. # ── Data model ─────────────────────────────────────────────────────────────────
@dataclass
class Chunk:
    """A token-bounded slice of a document section, with provenance metadata."""
    text: str                     # the chunk body itself
    token_count: int              # tiktoken token count of `text` (cl100k_base)
    source_file: str              # file this text was extracted from
    section_title: str | None     # None for flat docs
    chapter_number: int | None    # optional chapter index; None when absent
    page_start: int               # first page of the parent section
    page_end: int                 # last page of the parent section
    chunk_index: int              # position within parent section
  28. # ── Public API ─────────────────────────────────────────────────────────────────
  29. def chunk_section(
  30. text: str,
  31. source_file: str,
  32. section_title: str | None = None,
  33. chapter_number: int | None = None,
  34. page_start: int = 0,
  35. page_end: int = 0,
  36. chunk_size: int | None = None,
  37. ) -> list[Chunk]:
  38. """
  39. Chunk a block of text into token-sized pieces.
  40. Returns a list of Chunk objects with metadata attached.
  41. """
  42. size = chunk_size or cfg.chunk_size_tokens
  43. paragraphs = _split_paragraphs(text)
  44. raw_chunks = _build_chunks(paragraphs, size)
  45. return [
  46. Chunk(
  47. text=raw,
  48. token_count=count_tokens(raw),
  49. source_file=source_file,
  50. section_title=section_title,
  51. chapter_number=chapter_number,
  52. page_start=page_start,
  53. page_end=page_end,
  54. chunk_index=idx,
  55. )
  56. for idx, raw in enumerate(raw_chunks)
  57. if raw.strip()
  58. ]
  59. def count_tokens(text: str) -> int:
  60. """Count tokens in a string using tiktoken."""
  61. return len(_ENCODER.encode(text))
  62. # ── Internal helpers ───────────────────────────────────────────────────────────
  63. def _split_paragraphs(text: str) -> list[str]:
  64. """Split on blank lines, clean up whitespace."""
  65. paragraphs = re.split(r"\n\s*\n", text)
  66. return [p.strip() for p in paragraphs if p.strip()]
  67. def _split_sentences(text: str) -> list[str]:
  68. """
  69. Rough sentence splitter — handles common abbreviations.
  70. Good enough for chunking purposes without an NLP library.
  71. """
  72. # Protect common abbreviations from splitting
  73. protected = re.sub(
  74. r"\b(Mr|Mrs|Ms|Dr|Prof|Sr|Jr|vs|etc|approx|dept|est|fig|govt|inc|ltd|no|vol)\.",
  75. r"\1<DOT>",
  76. text,
  77. flags=re.IGNORECASE,
  78. )
  79. # Split on sentence-ending punctuation followed by whitespace + capital
  80. sentences = re.split(r"(?<=[.!?])\s+(?=[A-Z\"\'])", protected)
  81. # Restore protected dots
  82. return [s.replace("<DOT>", ".").strip() for s in sentences if s.strip()]
  83. def _build_chunks(paragraphs: list[str], max_tokens: int) -> list[str]:
  84. """
  85. Greedily build chunks by accumulating paragraphs.
  86. Respects max_tokens boundary, spilling over to sentences then hard splits.
  87. """
  88. chunks: list[str] = []
  89. current_parts: list[str] = []
  90. current_tokens = 0
  91. for para in paragraphs:
  92. para_tokens = count_tokens(para)
  93. if para_tokens > max_tokens:
  94. # Paragraph too big — flush current buffer, then split paragraph
  95. if current_parts:
  96. chunks.append(" ".join(current_parts))
  97. current_parts, current_tokens = [], 0
  98. chunks.extend(_split_large_paragraph(para, max_tokens))
  99. continue
  100. if current_tokens + para_tokens > max_tokens:
  101. # Would exceed limit — flush and start fresh
  102. if current_parts:
  103. chunks.append(" ".join(current_parts))
  104. current_parts = [para]
  105. current_tokens = para_tokens
  106. else:
  107. current_parts.append(para)
  108. current_tokens += para_tokens
  109. if current_parts:
  110. chunks.append(" ".join(current_parts))
  111. return chunks
  112. def _split_large_paragraph(para: str, max_tokens: int) -> list[str]:
  113. """Split an oversized paragraph at sentence boundaries."""
  114. sentences = _split_sentences(para)
  115. chunks: list[str] = []
  116. current_parts: list[str] = []
  117. current_tokens = 0
  118. for sent in sentences:
  119. sent_tokens = count_tokens(sent)
  120. if sent_tokens > max_tokens:
  121. # Single sentence too long — hard split by tokens
  122. if current_parts:
  123. chunks.append(" ".join(current_parts))
  124. current_parts, current_tokens = [], 0
  125. chunks.extend(_hard_split(sent, max_tokens))
  126. continue
  127. if current_tokens + sent_tokens > max_tokens:
  128. if current_parts:
  129. chunks.append(" ".join(current_parts))
  130. current_parts = [sent]
  131. current_tokens = sent_tokens
  132. else:
  133. current_parts.append(sent)
  134. current_tokens += sent_tokens
  135. if current_parts:
  136. chunks.append(" ".join(current_parts))
  137. return chunks
  138. def _hard_split(text: str, max_tokens: int) -> list[str]:
  139. """Last resort: split a string into max_tokens-sized pieces by token index."""
  140. tokens = _ENCODER.encode(text)
  141. chunks = []
  142. for i in range(0, len(tokens), max_tokens):
  143. chunk_tokens = tokens[i: i + max_tokens]
  144. chunks.append(_ENCODER.decode(chunk_tokens))
  145. return chunks