Improve text splitter for non-English documents (#1326)
* Add tests to verify that no section ever exceeds 500 tokens (a sketch of such a test appears after the changed-files summary below)

* Improve the test so we parametrize it for each PDF instead of failing on the first one.

* Add Arabic book

* Formatting fixes

* Verbose assertion messages with the file name

* Implement a recursive splitter when sections are too large.

* Add 5 percent overlap to the recursive splitter

* Resolve some of the PR feedback

* Make the tests use the text splitter's max tokens per section instead of hardcoded values

* Find a better split position in the central third of the text

* Correct the boundary check and fix the split position to come after the full stop

* Remove some silly line breaks

* Update import

* Reformatting

* Make the overlap percent and section size defaults module level constants

* Reformatted PDFs using the online PDF parser for better accessibility

* Add an RTL test

* Add a Korean test file about a mouse that goes to the big city. Add a test for table overlapping.

* Add a snapshot of the content sections from the test data PDFs

* Fix my formatting

* Sort the keys first

---------

Co-authored-by: Pamela Fox <[email protected]>
tonybaloney and pamelafox authored Mar 6, 2024
1 parent afbe70c commit b04fc66
Showing 10 changed files with 834 additions and 10 deletions.
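The commit messages above describe a per-document token-limit test: parametrized over each test PDF, asserting against the splitter's configured maximum rather than a hardcoded value, and naming the file in the failure message. A rough sketch of that idea follows; the test-data path, the pre-extracted text files, and the Page construction are assumptions for illustration, not the repository's actual test code.

# Hypothetical sketch of the per-document token-limit test described above;
# paths and fixtures are assumed, only SentenceTextSplitter and bpe come from this commit.
from pathlib import Path

import pytest

from scripts.prepdocslib.page import Page
from scripts.prepdocslib.textsplitter import SentenceTextSplitter, bpe

TEST_TEXTS = sorted(Path("tests/test-data").glob("*.txt"))  # assumed pre-extracted PDF text


@pytest.mark.parametrize("text_path", TEST_TEXTS, ids=lambda p: p.name)
def test_sections_stay_under_token_limit(text_path: Path):
    splitter = SentenceTextSplitter(has_image_embeddings=False)
    pages = [Page(page_num=0, offset=0, text=text_path.read_text(encoding="utf-8"))]
    for section in splitter.split_pages(pages):
        token_count = len(bpe.encode(section.text))
        # Include the file name so a failure identifies the offending document
        assert token_count <= splitter.max_tokens_per_section, (
            f"{text_path.name}: {token_count} tokens exceeds {splitter.max_tokens_per_section}"
        )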
1 change: 1 addition & 0 deletions requirements-dev.txt
@@ -9,6 +9,7 @@ coverage
 playwright
 pytest-cov
 pytest-playwright
+pytest-snapshot
 pre-commit
 locust
 pip-tools
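The new pytest-snapshot dependency backs the snapshot test of content sections mentioned in the commit messages. A minimal sketch of how that plugin is typically used; the snapshot directory, file name, and stand-in data are illustrative assumptions.

# Minimal pytest-snapshot usage sketch (assumed, not the repo's exact test).
# Record with `pytest --snapshot-update`, then verify with plain `pytest`.
import json


def test_content_sections_match_snapshot(snapshot):
    sections = {"example.pdf": ["first section text", "second section text"]}  # stand-in data
    snapshot.snapshot_dir = "tests/snapshots"
    # Sort the keys first so the snapshot is stable across runs
    snapshot.assert_match(json.dumps(sections, sort_keys=True, indent=2), "sections.json")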
116 changes: 108 additions & 8 deletions scripts/prepdocslib/textsplitter.py
@@ -1,6 +1,8 @@
 from abc import ABC
 from typing import Generator, List
 
+import tiktoken
+
 from .page import Page, SplitPage
 
 
@@ -16,20 +18,118 @@ def split_pages(self, pages: List[Page]) -> Generator[SplitPage, None, None]:
         yield  # pragma: no cover - this is necessary for mypy to type check
 
 
+ENCODING_MODEL = "text-embedding-ada-002"
+
+STANDARD_WORD_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]
+
+# See W3C document https://www.w3.org/TR/jlreq/#cl-01
+CJK_WORD_BREAKS = [
+    "、",
+    ",",
+    ";",
+    ":",
+    "(",
+    ")",
+    "【",
+    "】",
+    "「",
+    "」",
+    "『",
+    "』",
+    "〔",
+    "〕",
+    "〈",
+    "〉",
+    "《",
+    "》",
+    "〖",
+    "〗",
+    "〘",
+    "〙",
+    "〚",
+    "〛",
+    "〝",
+    "〞",
+    "〟",
+    "〰",
+    "–",
+    "—",
+    "‘",
+    "’",
+    "‚",
+    "‛",
+    "“",
+    "”",
+    "„",
+    "‟",
+    "‹",
+    "›",
+]
+
+STANDARD_SENTENCE_ENDINGS = [".", "!", "?"]
+
+# See CL05 and CL06, based on JIS X 4051:2004
+# https://www.w3.org/TR/jlreq/#cl-04
+CJK_SENTENCE_ENDINGS = ["。", "!", "?", "‼", "⁇", "⁈", "⁉"]
+
+# NB: the text-embedding-3-XX models use the same BPE as text-embedding-ada-002
+bpe = tiktoken.encoding_for_model(ENCODING_MODEL)
+
+DEFAULT_OVERLAP_PERCENT = 10  # See semantic search article for 10% overlap performance
+DEFAULT_SECTION_LENGTH = 1000  # Roughly 400-500 tokens for English
+
+
 class SentenceTextSplitter(TextSplitter):
     """
     Class that splits pages into smaller chunks. This is required because embedding models may not be able to analyze an entire page at once
     """
 
-    def __init__(self, has_image_embeddings: bool, verbose: bool = False):
-        self.sentence_endings = [".", "!", "?"]
-        self.word_breaks = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]
-        self.max_section_length = 1000
+    def __init__(self, has_image_embeddings: bool, verbose: bool = False, max_tokens_per_section: int = 500):
+        self.sentence_endings = STANDARD_SENTENCE_ENDINGS + CJK_SENTENCE_ENDINGS
+        self.word_breaks = STANDARD_WORD_BREAKS + CJK_WORD_BREAKS
+        self.max_section_length = DEFAULT_SECTION_LENGTH
         self.sentence_search_limit = 100
-        self.section_overlap = 100
+        self.max_tokens_per_section = max_tokens_per_section
+        # 10% of the section length, per the overlap guidance cited above
+        self.section_overlap = self.max_section_length * DEFAULT_OVERLAP_PERCENT // 100
         self.verbose = verbose
         self.has_image_embeddings = has_image_embeddings
 
+    def split_page_by_max_tokens(self, page_num: int, text: str) -> Generator[SplitPage, None, None]:
+        """
+        Recursively splits page by maximum number of tokens to better handle languages with higher token/word ratios.
+        """
+        tokens = bpe.encode(text)
+        if len(tokens) <= self.max_tokens_per_section:
+            # Section is already within max tokens, return
+            yield SplitPage(page_num=page_num, text=text)
+        else:
+            # Start from the center and try to find the closest sentence ending by spiralling outward.
+            # If we get to the outer thirds, just split in half with a 5% overlap
+            start = int(len(text) // 2)
+            pos = 0
+            boundary = int(len(text) // 3)
+            split_position = -1
+            while start - pos > boundary:
+                if text[start - pos] in self.sentence_endings:
+                    split_position = start - pos
+                    break
+                elif text[start + pos] in self.sentence_endings:
+                    split_position = start + pos
+                    break
+                else:
+                    pos += 1
+
+            if split_position > 0:
+                first_half = text[: split_position + 1]
+                second_half = text[split_position + 1 :]
+            else:
+                # No sentence ending in the central third: split near the middle, with the
+                # halves overlapping by roughly DEFAULT_OVERLAP_PERCENT / 2 percent of the
+                # text so nothing is lost at the seam
+                first_half = text[: int(len(text) // (2.0 - (DEFAULT_OVERLAP_PERCENT / 100)))]
+                second_half = text[int(len(text) // (2.0 + (DEFAULT_OVERLAP_PERCENT / 100))) :]
+            yield from self.split_page_by_max_tokens(page_num, first_half)
+            yield from self.split_page_by_max_tokens(page_num, second_half)
+
     def split_pages(self, pages: List[Page]) -> Generator[SplitPage, None, None]:
         # Chunking is disabled when using GPT4V. To be updated in the future.
         if self.has_image_embeddings:
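A note on the fallback slice arithmetic above: the denominators must straddle 2.0 so the halves actually overlap (a denominator of 1.0 - 0.1 would index past the end of the string and leave an empty second half). With DEFAULT_OVERLAP_PERCENT = 10, the cut points land at len(text) // 1.9 and len(text) // 2.1, sharing roughly 5% of the text, which matches the "5 percent overlap" commit message. A quick check:

# Sanity check of the fallback cut points (illustrative, not part of the repo)
text_len = 1000
first_end = int(text_len // (2.0 - 0.1))     # 526: first half runs past the midpoint
second_start = int(text_len // (2.0 + 0.1))  # 476: second half starts before the midpoint
assert first_end > second_start              # the halves overlap...
assert first_end - second_start == 50        # ...by ~5% of the text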
@@ -49,7 +149,7 @@ def find_page(offset):
 
         length = len(all_text)
         if length <= self.max_section_length:
-            yield SplitPage(page_num=find_page(0), text=all_text)
+            yield from self.split_page_by_max_tokens(page_num=find_page(0), text=all_text)
             return
 
         start = 0
@@ -91,7 +191,7 @@ def find_page(offset):
                 start += 1
 
             section_text = all_text[start:end]
-            yield SplitPage(page_num=find_page(start), text=section_text)
+            yield from self.split_page_by_max_tokens(page_num=find_page(start), text=section_text)
 
             last_table_start = section_text.rfind("<table")
             if last_table_start > 2 * self.sentence_search_limit and last_table_start > section_text.rfind("</table"):
@@ -107,7 +207,7 @@ def find_page(offset):
                 start = end - self.section_overlap
 
         if start + self.section_overlap < end:
-            yield SplitPage(page_num=find_page(start), text=all_text[start:end])
+            yield from self.split_page_by_max_tokens(page_num=find_page(start), text=all_text[start:end])
 
 
 class SimpleTextSplitter(TextSplitter):
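Taken together, these changes mean callers of SentenceTextSplitter should never receive a section above max_tokens_per_section, whatever the script. A brief usage sketch; the import paths and sample text are assumptions:

# Illustrative use of the updated splitter; import paths and sample text are made up.
from scripts.prepdocslib.page import Page
from scripts.prepdocslib.textsplitter import SentenceTextSplitter, bpe

splitter = SentenceTextSplitter(has_image_embeddings=False, max_tokens_per_section=500)
# CJK text crosses 500 tokens well before 1000 characters, so the recursive
# token-based split kicks in where the old length-only check did not.
long_cjk_page = Page(page_num=0, offset=0, text="東京は日本の首都です。" * 300)
for section in splitter.split_pages([long_cjk_page]):
    assert len(bpe.encode(section.text)) <= 500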