[#30] Implement text_split as a generator, while deprecating text_splitter
uogbuji committed Apr 20, 2024
1 parent c6e6abf commit de56be6
Showing 2 changed files with 202 additions and 14 deletions.
2 changes: 1 addition & 1 deletion pylib/__about__.py
@@ -3,4 +3,4 @@
# SPDX-License-Identifier: Apache-2.0
# ogbujipt.about

__version__ = "0.8.0"
__version__ = '0.8.1'
214 changes: 201 additions & 13 deletions pylib/text_helper.py
@@ -9,25 +9,207 @@
import warnings
from typing import Iterator


def text_split(text: str, chunk_size: int, separator: str='\n\n', len_func=len) -> Iterator[str]:
    '''
    Split string and generate the sequence of chunks

    >>> from ogbujipt.text_helper import text_split
    >>> list(text_split('She sells seashells by the seashore', chunk_size=5, separator=' '))
    ['She', 'sells', 'seashells', 'by', 'the', 'seashore']
    >>> list(text_split('She sells seashells by the seashore', chunk_size=10, separator=' '))
    ['She sells', 'seashells', 'by the', 'seashore']

    # Notice case sensitivity, plus the fact that the separator is not included in the chunks
    >>> list(text_split('She sells seashells by the seashore', chunk_size=5, separator='s'))
    ['She ', 'ells ', 'ea', 'hell', ' by the ', 'ea', 'hore']

    Args:
        text (str): String to be split into chunks
        chunk_size (int): Guidance on the maximum length (as measured by len_func) of each chunk
        separator (str, optional): String that already splits "text" into sections
        len_func (callable, optional): Function to measure length, len() by default

    Yields:
        chunk (str): Successive chunks of the text provided
    '''
    assert separator, 'Separator must be non-empty'

    if ((not isinstance(text, str))
            or (not isinstance(separator, str))):
        raise ValueError(f'text and separator must be strings.\n'
                         f'Got {text.__class__} for text and {separator.__class__} for separator')

    if ((not isinstance(chunk_size, int)) or (chunk_size <= 0)):
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size}.')

    # Split up the text by the separator
    # FIXME: Need a step for escaping regex
    sep_pat = re.compile(separator)
    fine_split = re.split(sep_pat, text)
    separator_len = len_func(separator)

    if len(fine_split) <= 1:
        warnings.warn(f'No splits detected. Perhaps a problem with separator? ({repr(separator)})?')

    curr_chunk = []
    chunk_len = 0

    for fs in fine_split:
        if not fs: continue  # noqa E701
        len_fs = len_func(fs)
        # if len_fs > chunk_size:
        #     warnings.warn(f'One of the splits is larger than the chunk size. '
        #                   f'Consider increasing the chunk size or splitting the text differently.')

        if chunk_len + len_fs + separator_len > chunk_size:
            yield separator.join(curr_chunk)
            curr_chunk, chunk_len = [fs], len_fs
        else:
            curr_chunk.append(fs)
            chunk_len += len_fs + separator_len

    if curr_chunk:
        yield separator.join(curr_chunk)
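
Since text_split() is now a generator, chunks can be consumed lazily rather than materialized as a list. A minimal usage sketch against this version of the module (the sample text is invented for illustration):

from ogbujipt.text_helper import text_split

# Any long string will do; this one is fabricated for the example
doc = 'She sells seashells by the seashore. ' * 1000

# Chunks arrive one at a time, so iteration can stop early
# without splitting the entire document up front
for i, chunk in enumerate(text_split(doc, chunk_size=100, separator=' ')):
    print(f'chunk {i}: {chunk[:40]}...')
    if i >= 2:
        break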


def text_split_fuzzy(text: str,
                     chunk_size: int,
                     chunk_overlap: int=0,
                     separator: str='\n\n',
                     len_func=len
                     ) -> list[str]:
    '''
    Split string into a sequence of chunks in a "fuzzy" manner. Will generally not split the text into sequences
    of chunk_size length, but will instead preserve some overlap on either side of the given separators.
    This results in slightly larger chunks than the chunk_size number by itself suggests.

    >>> from ogbujipt.text_helper import text_split_fuzzy
    >>> chunks = text_split_fuzzy('she sells seashells by the seashore', chunk_size=50, chunk_overlap=10, separator=' ')

    Args:
        text (str): (Multiline) String to be split into chunks
        chunk_size (int): Number of characters to include per chunk
        chunk_overlap (int, optional): Number of characters to overlap at the edges of chunks
        separator (str, optional): String that already splits "text" into sections
        len_func (callable, optional): Function to measure length, len() by default

    Returns:
        chunks (List[str]): List of chunks of the text provided
    '''
    assert separator, 'Separator must be non-empty'

    if ((not isinstance(text, str))
            or (not isinstance(separator, str))):
        raise ValueError(f'text and separator must be strings.\n'
                         f'Got {text.__class__} for text and {separator.__class__} for separator')

    if chunk_overlap == 0:
        msg = 'chunk_overlap must be a positive integer. For no overlap, use text_split() instead'
        raise ValueError(msg)

    if ((not isinstance(chunk_size, int))
            or (not isinstance(chunk_overlap, int))
            or (chunk_size <= 0)
            or (chunk_overlap < 0)
            or (chunk_size < chunk_overlap)):
        raise ValueError(f'chunk_size must be a positive integer, '
                         f'chunk_overlap must be a non-negative integer, and '
                         f'chunk_size must be greater than chunk_overlap.\n'
                         f'Got {chunk_size} chunk_size and {chunk_overlap} chunk_overlap.')

    # Split up the text by the separator
    # FIXME: Need a step for escaping regex
    sep_pat = re.compile(separator)
    fine_split = re.split(sep_pat, text)
    separator_len = len_func(separator)

    if len(fine_split) <= 1:
        warnings.warn(f'No splits detected. Perhaps a problem with separator? ({repr(separator)})?')

    # Combine the small pieces into medium size chunks
    # chunks will accumulate processed text chunks as we go along
    # curr_chunk will be a list of subchunks comprising the main, current chunk
    # back_overlap will be appended, once ready, to the end of the previous chunk (if any)
    # fwd_overlap will be prepended, once ready, to the start of the next chunk
    chunks = []
    curr_chunk, curr_chunk_len = [], 0
    back_overlap, back_overlap_len = None, 0  # None signals not yet gathering
    fwd_overlap, fwd_overlap_len = None, 0

    for s in fine_split:
        if not s: continue  # noqa E701
        split_len = len_func(s) + separator_len
        # Check for full back_overlap (if relevant, i.e. back_overlap isn't None)
        if back_overlap is not None and (back_overlap_len + split_len > chunk_overlap):  # noqa: F821
            chunks[-1].extend(back_overlap)
            back_overlap, back_overlap_len = None, 0

        # Will adding this split take us into overlap room?
        if curr_chunk_len + split_len > (chunk_size - chunk_overlap):
            fwd_overlap, fwd_overlap_len = [], 0  # Start gathering

        # Will adding this split take us over chunk size?
        if curr_chunk_len + split_len > chunk_size:
            # If so, complete current chunk & start a new one

            # fwd_overlap should be non-None at this point, so check empty
            if not fwd_overlap and curr_chunk:
                # If empty, look back to make sure there is some overlap
                fwd_overlap.append(curr_chunk[-1])

            chunks.append(curr_chunk)
            # fwd_overlap intentionally not counted in running chunk length
            curr_chunk, curr_chunk_len = fwd_overlap, 0
            back_overlap, back_overlap_len = [], 0  # Start gathering
            fwd_overlap, fwd_overlap_len = None, 0  # Stop gathering

        if fwd_overlap is not None:
            fwd_overlap.append(s)
            fwd_overlap_len += split_len

        if back_overlap is not None:
            back_overlap.append(s)
            back_overlap_len += split_len

        curr_chunk.append(s)
        curr_chunk_len += split_len

    # Done with the splits; use the final back_overlap, if any
    if back_overlap:
        chunks[-1].extend(back_overlap)
    # Don't drop any final, partial chunk still being accumulated
    if curr_chunk:
        chunks.append(curr_chunk)

    # Concatenate all the split parts of all the chunks
    chunks = [separator.join(c) for c in chunks]

    # Handle degenerate case where no splits found & chunk size too large
    # Just becomes one big chunk
    if not chunks:
        chunks = [text]

    return chunks
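
A small illustration of the fuzzy behavior (the values are arbitrary; exact boundaries depend on chunk_size and chunk_overlap):

from ogbujipt.text_helper import text_split_fuzzy

text = 'alpha bravo charlie delta echo foxtrot golf hotel'
for chunk in text_split_fuzzy(text, chunk_size=20, chunk_overlap=8, separator=' '):
    print(repr(chunk))
# Adjacent chunks should share some words at their edges, which is why
# each chunk can run somewhat longer than chunk_size suggests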


def text_splitter(text: str,
                  chunk_size: int,
                  chunk_overlap: int=0,
                  separator: str='\n\n',
                  len_func=len
                  ) -> list[str]:
    '''
    Deprecated: use text_split_fuzzy() instead.

    Split string into a sequence of chunks in a "fuzzy" manner. Will generally not split the text into sequences
    of chunk_size length, but will instead preserve some overlap on either side of the given separators.
    This results in slightly larger chunks than the chunk_size number by itself suggests.

    >>> from ogbujipt.text_helper import text_splitter
    >>> from PyPDF2 import PdfReader
    >>> pdf_reader = PdfReader('monopoly-board-game-manual.pdf')
    >>> text = ''.join((page.extract_text() for page in pdf_reader.pages))
    >>> chunks = text_splitter(text, chunk_size=500, separator='\n')

    Args:
        text (str): (Multiline) String to be split into chunks
@@ -43,13 +225,19 @@ def text_splitter(text: str,
    Returns:
        chunks (List[str]): List of chunks of the text provided
    '''
    warnings.warn('text_splitter() is deprecated. Use text_split_fuzzy() instead.')

    assert separator, 'Separator must be non-empty'

    if ((not isinstance(text, str))
            or (not isinstance(separator, str))):
        raise ValueError(f'text and separator must be strings.\n'
                         f'Got {text.__class__} for text and {separator.__class__} for separator')

    if chunk_overlap == 0:
        msg = 'chunk_overlap must be a positive integer. For no overlap, use text_split() instead'
        raise ValueError(msg)

    if ((not isinstance(chunk_size, int))
            or (not isinstance(chunk_overlap, int))
            or (chunk_size <= 0)
@@ -67,11 +255,11 @@ def text_splitter(text: str,
    separator_len = len_func(separator)

    if len(fine_split) <= 1:
        warnings.warn(f'No splits detected. Perhaps a problem with separator? ({repr(separator)})?')

    # Combine the small pieces into medium size chunks
    # chunks will accumulate processed text chunks as we go along
    # curr_chunk will be a list of subchunks comprising the main, current chunk
    # back_overlap will be appended, once ready, to the end of the previous chunk (if any)
    # fwd_overlap will be prepended, once ready, to the start of the next chunk
    chunks = []
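
For callers migrating off the deprecated entry point, a sketch of the transition (parameter values are illustrative; the warning text is the one added in this commit):

import warnings

from ogbujipt.text_helper import text_split_fuzzy, text_splitter

# The old call still works, but now emits a deprecation message
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    chunks = text_splitter('one two three four five', chunk_size=10, chunk_overlap=3, separator=' ')
    assert any('deprecated' in str(w.message) for w in caught)

# Equivalent call against the replacement API
chunks = text_split_fuzzy('one two three four five', chunk_size=10, chunk_overlap=3, separator=' ')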
