diff --git a/pylib/__about__.py b/pylib/__about__.py
index c6a6895..b9bfa3d 100644
--- a/pylib/__about__.py
+++ b/pylib/__about__.py
@@ -3,4 +3,4 @@
 # SPDX-License-Identifier: Apache-2.0
 # ogbujipt.about
 
-__version__ = "0.8.0"
+__version__ = '0.8.1'
diff --git a/pylib/text_helper.py b/pylib/text_helper.py
index c5cd29b..5fcfecf 100644
--- a/pylib/text_helper.py
+++ b/pylib/text_helper.py
@@ -9,25 +9,207 @@
 import warnings
 
 
-def text_splitter(text: str,
+def text_split(text: str, chunk_size: int, separator: str='\n\n', len_func=len) -> list[str]:
+    '''
+    Split string and generate the sequence of chunks
+
+    >>> from ogbujipt.text_helper import text_split
+    >>> list(text_split('She sells seashells by the seashore', chunk_size=5, separator=' '))
+    ['She', 'sells', 'seashells', 'by', 'the', 'seashore']
+    >>> list(text_split('She sells seashells by the seashore', chunk_size=10, separator=' '))
+    ['She sells', 'seashells', 'by the', 'seashore']
+    # Notice the case sensitivity, and that separators at chunk boundaries are dropped
+    >>> list(text_split('She sells seashells by the seashore', chunk_size=5, separator='s'))
+    ['She ', 'ells ', 'ea', 'hell', ' by the ', 'ea', 'hore']
+
+    Args:
+        text (str): String to be split into chunks
+
+        chunk_size (int): Guidance on maximum length (based on len_func) of each chunk
+
+        separator (str, optional): String that already splits "text" into sections
+
+        len_func (callable, optional): Function to measure length, len() by default
+
+    Yields:
+        chunks (str): Successive chunks of the text provided
+    '''
+    assert separator, 'Separator must be non-empty'
+
+    if ((not isinstance(text, str))
+            or (not isinstance(separator, str))):
+        raise ValueError(f'text and separator must be strings.\n'
+                         f'Got {text.__class__} for text and {separator.__class__} for separator')
+
+    if ((not isinstance(chunk_size, int)) or (chunk_size <= 0)):
+        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size}.')
+
+    # Split up the text by the separator
+    # FIXME: Need a step for escaping regex
+    sep_pat = re.compile(separator)
+    fine_split = re.split(sep_pat, text)
+    separator_len = len_func(separator)
+
+    if len(fine_split) <= 1:
+        warnings.warn(f'No splits detected. Perhaps a problem with the separator? ({repr(separator)})')
+
+    curr_chunk = []
+    chunk_len = 0
+
+    for fs in fine_split:
+        if not fs: continue  # noqa E701
+        len_fs = len_func(fs)
+        # if len_fs > chunk_size:
+        #     warnings.warn(f'One of the splits is larger than the chunk size. '
+        #                   f'Consider increasing the chunk size or splitting the text differently.')
+
+        if chunk_len + len_fs + separator_len > chunk_size:
+            yield separator.join(curr_chunk)
+            curr_chunk, chunk_len = [fs], len_fs
+        else:
+            curr_chunk.append(fs)
+            chunk_len += len_fs + separator_len
+
+    if curr_chunk:
+        yield separator.join(curr_chunk)
+
+
+def text_split_fuzzy(text: str,
         chunk_size: int,
         chunk_overlap: int=0,
         separator: str='\n\n',
         len_func=len
         ) -> list[str]:
     '''
-    Split string into a set of chunks
+    Split string into a sequence of chunks in a "fuzzy" manner. Will generally not split the text into sequences
+    of chunk_size length, but will instead preserve some overlap on either side of the given separators.
+    This results in slightly larger chunks than the chunk_size number by itself suggests.
+
+    >>> from ogbujipt.text_helper import text_split_fuzzy
+    >>> chunks = text_split_fuzzy('she sells seashells by the seashore', chunk_size=50, chunk_overlap=10, separator=' ')
+
+    Args:
+        text (str): (Multiline) String to be split into chunks
+
+        chunk_size (int): Number of characters to include per chunk
+
+        chunk_overlap (int, optional): Number of characters to overlap at the edges of chunks
+
+        separator (str, optional): String that already splits "text" into sections
+
+        len_func (callable, optional): Function to measure length, len() by default
+
+    Returns:
+        chunks (List[str]): List of chunks of the text provided
+    '''
+    assert separator, 'Separator must be non-empty'
+
+    if ((not isinstance(text, str))
+            or (not isinstance(separator, str))):
+        raise ValueError(f'text and separator must be strings.\n'
+                         f'Got {text.__class__} for text and {separator.__class__} for separator')
+
+    if chunk_overlap == 0:
+        msg = 'chunk_overlap must be a positive integer. For no overlap, use text_split() instead'
+        raise ValueError(msg)
+
+    if ((not isinstance(chunk_size, int))
+            or (not isinstance(chunk_overlap, int))
+            or (chunk_size <= 0)
+            or (chunk_overlap < 0)
+            or (chunk_size < chunk_overlap)):
+        raise ValueError(f'chunk_size must be a positive integer, '
+                         f'chunk_overlap must be a non-negative integer, and '
+                         f'chunk_size must be greater than chunk_overlap.\n'
+                         f'Got {chunk_size} chunk_size and {chunk_overlap} chunk_overlap.')
+
+    # Split up the text by the separator
+    # FIXME: Need a step for escaping regex
+    sep_pat = re.compile(separator)
+    fine_split = re.split(sep_pat, text)
+    separator_len = len_func(separator)
 
-    Note that this function is "fuzzy"; it will not necessarily split the text into perfectly chunk_size length chunks,
-    and will preserve items between the given separators. this results in slightly larger chunks than requested.
+    if len(fine_split) <= 1:
+        warnings.warn(f'No splits detected. Perhaps a problem with the separator? ({repr(separator)})')
 
-    Much like langchain's CharTextSplitter.py
+    # Combine the small pieces into medium size chunks
+    # chunks will accumulate processed text chunks as we go along
+    # curr_chunk will be a list of subchunks comprising the main, current chunk
+    # back_overlap will be appended, once ready, to the end of the previous chunk (if any)
+    # fwd_overlap will be prepended, once ready, to the start of the next chunk
+    chunks = []
+    curr_chunk, curr_chunk_len = [], 0
+    back_overlap, back_overlap_len = None, 0  # None signals not yet gathering
+    fwd_overlap, fwd_overlap_len = None, 0
+
+    for s in fine_split:
+        if not s: continue  # noqa E701
+        split_len = len_func(s) + separator_len
+        # Check for full back_overlap (if relevant, i.e. back_overlap isn't None)
+        if back_overlap is not None and (back_overlap_len + split_len > chunk_overlap):  # noqa: F821
+            chunks[-1].extend(back_overlap)
+            back_overlap, back_overlap_len = None, 0
+
+        # Will adding this split take us into overlap room?
+        if curr_chunk_len + split_len > (chunk_size - chunk_overlap):
+            fwd_overlap, fwd_overlap_len = [], 0  # Start gathering
+
+        # Will adding this split take us over chunk size?
+        if curr_chunk_len + split_len > chunk_size:
+            # If so, complete current chunk & start a new one
+
+            # fwd_overlap should be non-None at this point, so check empty
+            if not fwd_overlap and curr_chunk:
+                # If empty, look back to make sure there is some overlap
+                fwd_overlap.append(curr_chunk[-1])
+
+            chunks.append(curr_chunk)
+            # fwd_overlap intentionally not counted in running chunk length
+            curr_chunk, curr_chunk_len = fwd_overlap, 0
+            back_overlap, back_overlap_len = [], 0  # Start gathering
+            fwd_overlap, fwd_overlap_len = None, 0  # Stop gathering
+
+        if fwd_overlap is not None:
+            fwd_overlap.append(s)
+            fwd_overlap_len += split_len
+
+        if back_overlap is not None:
+            back_overlap.append(s)
+            back_overlap_len += split_len
+
+        curr_chunk.append(s)
+        curr_chunk_len += split_len
+
+    # Done with the splits; use the final back_overlap, if any
+    if back_overlap:
+        chunks[-1].extend(back_overlap)
+
+    # Don't drop the final chunk still in progress
+    if curr_chunk:
+        chunks.append(curr_chunk)
+
+    # Concatenate all the split parts of all the chunks
+    chunks = [separator.join(c) for c in chunks]
+
+    # Handle degenerate case where no splits found & chunk size too large
+    # Just becomes one big chunk
+    if not chunks:
+        chunks = [text]
+
+    return chunks
+
+
+def text_splitter(text: str,
+        chunk_size: int,
+        chunk_overlap: int=0,
+        separator: str='\n\n',
+        len_func=len
+        ) -> list[str]:
+    '''
+    Split string into a sequence of chunks in a "fuzzy" manner. Will generally not split the text into sequences
+    of chunk_size length, but will instead preserve some overlap on either side of the given separators.
+    This results in slightly larger chunks than the chunk_size number by itself suggests.
+
     >>> from ogbujipt.text_helper import text_splitter
-    >>> from PyPDF2 import PdfReader
-    >>> pdf_reader = PdfReader('monopoly-board-game-manual.pdf')
-    >>> text = ''.join((page.extract_text() for page in pdf_reader.pages))
-    >>> chunks = text_splitter(text, chunk_size=500, separator='\n')
+    >>> chunks = text_splitter('she sells seashells by the seashore', chunk_size=50, chunk_overlap=10, separator=' ')
 
     Args:
         text (str): (Multiline) String to be split into chunks
@@ -43,6 +225,8 @@ def text_splitter(text: str,
     Returns:
         chunks (List[str]): List of chunks of the text provided
     '''
+    warnings.warn('text_splitter() is deprecated. Use text_split_fuzzy() instead.')
+
     assert separator, 'Separator must be non-empty'
 
     if ((not isinstance(text, str))
@@ -50,6 +234,10 @@ def text_splitter(text: str,
         raise ValueError(f'text and separator must be strings.\n'
                          f'Got {text.__class__} for text and {separator.__class__} for separator')
 
+    if chunk_overlap == 0:
+        msg = 'chunk_overlap must be a positive integer. For no overlap, use text_split() instead'
+        raise ValueError(msg)
+
     if ((not isinstance(chunk_size, int))
             or (not isinstance(chunk_overlap, int))
             or (chunk_size <= 0)
@@ -67,11 +255,11 @@ def text_splitter(text: str,
     separator_len = len_func(separator)
 
     if len(fine_split) <= 1:
-        warnings.warn(f'No splits detected. Problem with separator ({repr(separator)})?')
+        warnings.warn(f'No splits detected. Perhaps a problem with the separator? ({repr(separator)})')
 
-    # Combine the small pieces into medium size chunks to send to LLM
-    # Initialize accumulators; chunks will be the target list of the chunks so far
-    # curr_chunk will be a list of parts comprising the main, current chunk
+    # Combine the small pieces into medium size chunks
+    # chunks will accumulate processed text chunks as we go along
+    # curr_chunk will be a list of subchunks comprising the main, current chunk
     # back_overlap will be appended, once ready, to the end of the previous chunk (if any)
     # fwd_overlap will be prepended, once ready, to the start of the next chunk
     chunks = []
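For orientation, a minimal usage sketch of the two new helpers this patch introduces; the sample text and parameter values below are illustrative only, and note that text_split yields chunks lazily while text_split_fuzzy requires a positive chunk_overlap:

    from ogbujipt.text_helper import text_split, text_split_fuzzy

    doc = 'She sells seashells by the seashore'  # illustrative sample text

    # text_split is a generator; chunks do not overlap
    chunks = list(text_split(doc, chunk_size=10, separator=' '))

    # text_split_fuzzy returns a list; chunk_overlap must be a positive integer less than chunk_size
    fuzzy_chunks = text_split_fuzzy(doc, chunk_size=20, chunk_overlap=5, separator=' ')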