[#30] Implement text_split as a generator, while deprecating text_splitter
uogbuji committed Apr 20, 2024
1 parent c6e6abf commit de56be6
Showing 2 changed files with 202 additions and 14 deletions.
2 changes: 1 addition & 1 deletion pylib/__about__.py
@@ -3,4 +3,4 @@
# SPDX-License-Identifier: Apache-2.0
# ogbujipt.about

__version__ = "0.8.0"
__version__ = '0.8.1'
214 changes: 201 additions & 13 deletions pylib/text_helper.py
@@ -9,25 +9,207 @@
import warnings
from typing import Iterator


def text_split(text: str, chunk_size: int, separator: str='\n\n', len_func=len) -> Iterator[str]:
    '''
    Split string and generate the sequence of chunks

    >>> from ogbujipt.text_helper import text_split
    >>> list(text_split('She sells seashells by the seashore', chunk_size=5, separator=' '))
    ['She', 'sells', 'seashells', 'by', 'the', 'seashore']
    >>> list(text_split('She sells seashells by the seashore', chunk_size=10, separator=' '))
    ['She sells', 'seashells', 'by the', 'seashore']

    # Notice case sensitivity, plus the fact that the separator is not included in the chunks
    >>> list(text_split('She sells seashells by the seashore', chunk_size=5, separator='s'))
    ['She ', 'ells ', 'ea', 'hell', ' by the ', 'ea', 'hore']

    Args:
        text (str): String to be split into chunks
        chunk_size (int): Guidance on the maximum length (as measured by len_func) of each chunk
        separator (str, optional): String that already splits "text" into sections
        len_func (callable, optional): Function to measure length, len() by default

    Yields:
        chunk (str): Successive chunks of the text provided
    '''
    assert separator, 'Separator must be non-empty'

    if ((not isinstance(text, str))
            or (not isinstance(separator, str))):
        raise ValueError(f'text and separator must be strings.\n'
                         f'Got {text.__class__} for text and {separator.__class__} for separator')

    if ((not isinstance(chunk_size, int)) or (chunk_size <= 0)):
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size}.')

    # Split up the text by the separator
    # FIXME: Need a step for escaping regex
    sep_pat = re.compile(separator)
    fine_split = re.split(sep_pat, text)
    separator_len = len_func(separator)

    if len(fine_split) <= 1:
        warnings.warn(f'No splits detected. Perhaps a problem with separator? ({repr(separator)})?')

    curr_chunk = []
    chunk_len = 0

    for fs in fine_split:
        if not fs: continue  # noqa E701
        len_fs = len_func(fs)
        # if len_fs > chunk_size:
        #     warnings.warn(f'One of the splits is larger than the chunk size. '
        #                   f'Consider increasing the chunk size or splitting the text differently.')

        if chunk_len + len_fs + separator_len > chunk_size:
            yield separator.join(curr_chunk)
            curr_chunk, chunk_len = [fs], len_fs
        else:
            curr_chunk.append(fs)
            chunk_len += len_fs + separator_len

    if curr_chunk:
        yield separator.join(curr_chunk)
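
Since text_split() is now a generator, chunks can be consumed lazily rather than materialized as a list. A minimal usage sketch against this version of the module (the sample text is invented for illustration):

from ogbujipt.text_helper import text_split

# Any long string will do; this one is fabricated for the example
doc = 'She sells seashells by the seashore. ' * 1000

# Chunks arrive one at a time, so iteration can stop early
# without splitting the entire document up front
for i, chunk in enumerate(text_split(doc, chunk_size=100, separator=' ')):
    print(f'chunk {i}: {chunk[:40]}...')
    if i >= 2:
        break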


def text_split_fuzzy(text: str,
                     chunk_size: int,
                     chunk_overlap: int=0,
                     separator: str='\n\n',
                     len_func=len
                     ) -> list[str]:
    '''
    Split string into a sequence of chunks in a "fuzzy" manner. Will generally not split the text into sequences
    of chunk_size length, but will instead preserve some overlap on either side of the given separators.
    This results in slightly larger chunks than the chunk_size number by itself suggests.

    >>> from ogbujipt.text_helper import text_split_fuzzy
    >>> chunks = text_split_fuzzy('she sells seashells by the seashore', chunk_size=50, chunk_overlap=10, separator=' ')

    Args:
        text (str): (Multiline) String to be split into chunks
        chunk_size (int): Number of characters to include per chunk
        chunk_overlap (int, optional): Number of characters to overlap at the edges of chunks
        separator (str, optional): String that already splits "text" into sections
        len_func (callable, optional): Function to measure length, len() by default

    Returns:
        chunks (List[str]): List of chunks of the text provided
    '''
    assert separator, 'Separator must be non-empty'

    if ((not isinstance(text, str))
            or (not isinstance(separator, str))):
        raise ValueError(f'text and separator must be strings.\n'
                         f'Got {text.__class__} for text and {separator.__class__} for separator')

    if chunk_overlap == 0:
        msg = 'chunk_overlap must be a positive integer. For no overlap, use text_split() instead'
        raise ValueError(msg)

    if ((not isinstance(chunk_size, int))
            or (not isinstance(chunk_overlap, int))
            or (chunk_size <= 0)
            or (chunk_overlap < 0)
            or (chunk_size < chunk_overlap)):
        raise ValueError(f'chunk_size must be a positive integer, '
                         f'chunk_overlap must be a non-negative integer, and '
                         f'chunk_size must be greater than chunk_overlap.\n'
                         f'Got {chunk_size} chunk_size and {chunk_overlap} chunk_overlap.')

    # Split up the text by the separator
    # FIXME: Need a step for escaping regex
    sep_pat = re.compile(separator)
    fine_split = re.split(sep_pat, text)
    separator_len = len_func(separator)

    if len(fine_split) <= 1:
        warnings.warn(f'No splits detected. Perhaps a problem with separator? ({repr(separator)})?')

    # Combine the small pieces into medium size chunks
    # chunks will accumulate processed text chunks as we go along
    # curr_chunk will be a list of subchunks comprising the main, current chunk
    # back_overlap will be appended, once ready, to the end of the previous chunk (if any)
    # fwd_overlap will be prepended, once ready, to the start of the next chunk
    chunks = []
    curr_chunk, curr_chunk_len = [], 0
    back_overlap, back_overlap_len = None, 0  # None signals not yet gathering
    fwd_overlap, fwd_overlap_len = None, 0

    for s in fine_split:
        if not s: continue  # noqa E701
        split_len = len_func(s) + separator_len
        # Check for full back_overlap (if relevant, i.e. back_overlap isn't None)
        if back_overlap is not None and (back_overlap_len + split_len > chunk_overlap):  # noqa: F821
            chunks[-1].extend(back_overlap)
            back_overlap, back_overlap_len = None, 0

        # Will adding this split take us into overlap room?
        if curr_chunk_len + split_len > (chunk_size - chunk_overlap):
            fwd_overlap, fwd_overlap_len = [], 0  # Start gathering

        # Will adding this split take us over chunk size?
        if curr_chunk_len + split_len > chunk_size:
            # If so, complete current chunk & start a new one

            # fwd_overlap should be non-None at this point, so check empty
            if not fwd_overlap and curr_chunk:
                # If empty, look back to make sure there is some overlap
                fwd_overlap.append(curr_chunk[-1])

            chunks.append(curr_chunk)
            # fwd_overlap intentionally not counted in running chunk length
            curr_chunk, curr_chunk_len = fwd_overlap, 0
            back_overlap, back_overlap_len = [], 0  # Start gathering
            fwd_overlap, fwd_overlap_len = None, 0  # Stop gathering

        if fwd_overlap is not None:
            fwd_overlap.append(s)
            fwd_overlap_len += split_len

        if back_overlap is not None:
            back_overlap.append(s)
            back_overlap_len += split_len

        curr_chunk.append(s)
        curr_chunk_len += split_len

    # Done with the splits; use the final back_overlap, if any
    if back_overlap:
        chunks[-1].extend(back_overlap)
    # Don't drop any final, partial chunk still being accumulated
    if curr_chunk:
        chunks.append(curr_chunk)

    # Concatenate all the split parts of all the chunks
    chunks = [separator.join(c) for c in chunks]

    # Handle degenerate case where no splits found & chunk size too large
    # Just becomes one big chunk
    if not chunks:
        chunks = [text]

    return chunks
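
A small illustration of the fuzzy behavior (the values are arbitrary; exact boundaries depend on chunk_size and chunk_overlap):

from ogbujipt.text_helper import text_split_fuzzy

text = 'alpha bravo charlie delta echo foxtrot golf hotel'
for chunk in text_split_fuzzy(text, chunk_size=20, chunk_overlap=8, separator=' '):
    print(repr(chunk))
# Adjacent chunks should share some words at their edges, which is why
# each chunk can run somewhat longer than chunk_size suggests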


def text_splitter(text: str,
                  chunk_size: int,
                  chunk_overlap: int=0,
                  separator: str='\n\n',
                  len_func=len
                  ) -> list[str]:
    '''
    Deprecated: use text_split_fuzzy() instead.

    Split string into a sequence of chunks in a "fuzzy" manner. Will generally not split the text into sequences
    of chunk_size length, but will instead preserve some overlap on either side of the given separators.
    This results in slightly larger chunks than the chunk_size number by itself suggests.

    >>> from ogbujipt.text_helper import text_splitter
    >>> from PyPDF2 import PdfReader
    >>> pdf_reader = PdfReader('monopoly-board-game-manual.pdf')
    >>> text = ''.join((page.extract_text() for page in pdf_reader.pages))
    >>> chunks = text_splitter(text, chunk_size=500, separator='\n')

    Args:
        text (str): (Multiline) String to be split into chunks
@@ -43,13 +225,19 @@ def text_splitter(text: str,
    Returns:
        chunks (List[str]): List of chunks of the text provided
    '''
    warnings.warn('text_splitter() is deprecated. Use text_split_fuzzy() instead.')

    assert separator, 'Separator must be non-empty'

    if ((not isinstance(text, str))
            or (not isinstance(separator, str))):
        raise ValueError(f'text and separator must be strings.\n'
                         f'Got {text.__class__} for text and {separator.__class__} for separator')

    if chunk_overlap == 0:
        msg = 'chunk_overlap must be a positive integer. For no overlap, use text_split() instead'
        raise ValueError(msg)

    if ((not isinstance(chunk_size, int))
            or (not isinstance(chunk_overlap, int))
            or (chunk_size <= 0)
@@ -67,11 +255,11 @@ def text_splitter(text: str,
    separator_len = len_func(separator)

    if len(fine_split) <= 1:
        warnings.warn(f'No splits detected. Perhaps a problem with separator? ({repr(separator)})?')

    # Combine the small pieces into medium size chunks
    # chunks will accumulate processed text chunks as we go along
    # curr_chunk will be a list of subchunks comprising the main, current chunk
    # back_overlap will be appended, once ready, to the end of the previous chunk (if any)
    # fwd_overlap will be prepended, once ready, to the start of the next chunk
    chunks = []
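
For callers migrating off the deprecated entry point, a sketch of the transition (parameter values are illustrative; the warning text is the one added in this commit):

import warnings

from ogbujipt.text_helper import text_split_fuzzy, text_splitter

# The old call still works, but now emits a deprecation message
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    chunks = text_splitter('one two three four five', chunk_size=10, chunk_overlap=3, separator=' ')
    assert any('deprecated' in str(w.message) for w in caught)

# Equivalent call against the replacement API
chunks = text_split_fuzzy('one two three four five', chunk_size=10, chunk_overlap=3, separator=' ')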
