feat(chunking): add overlap on chunk-splits (Unstructured-IO#2305)

scanny · web-flow · commit eb1b022ff868 · 2023-12-22T20:35:18.000Z
There are two distinct overlap operations with completely different
implementations. This is "intra-chunk" overlap, applying overlap to
chunks resulting from text-splitting an oversized element.

For example, if an oversized element had the text "abcd efgh ijkl mnop
qrst" and was split at 15 chars with an overlap of 5, it would produce
"abcd efgh ijkl" and "ijkl mnop qrst". Inter-chunk overlap, which is
taken from the prior chunk and applied at the beginning of the string
(before "abcd"), is handled by a separate operation in the next PR.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,7 +1,9 @@
-## 0.11.7-dev0
+## 0.11.7-dev1
 
 ### Enhancements
 
+* **Add intra-chunk overlap capability.** Implement overlap for split-chunks where text-splitting is used to divide an oversized chunk into two or more chunks that fit in the chunking window. Note this capability is not yet available from the API but will shortly be made accessible using a new `overlap` kwarg on partition functions.
+
 ### Features
 
 ### Fixes
diff --git a/setup.cfg b/setup.cfg
@@ -2,6 +2,7 @@
 license_files = LICENSE.md
 
 [flake8]
+ignore = E203,W503
 max-line-length = 100
 exclude =
     .venv
diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py
@@ -149,23 +149,33 @@ class Describe_TextSplitter:
     """Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
 
     def it_splits_on_a_preferred_separator_when_it_can(self):
-        opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=("\n", " "))
+        opts = ChunkingOptions.new(
+            max_characters=50, text_splitting_separators=("\n", " "), overlap=10
+        )
         split = _TextSplitter(opts)
         text = (
-            "Lorem ipsum dolor amet consectetur adipiscing.\n"
-            "In rhoncus ipsum sed lectus porta volutpat."
+            "Lorem ipsum dolor amet consectetur adipiscing.  \n  "
+            "In rhoncus ipsum sed lectus porta."
         )
 
         s, remainder = split(text)
+
+        # -- trailing whitespace is stripped from split --
         assert s == "Lorem ipsum dolor amet consectetur adipiscing."
-        assert remainder == "In rhoncus ipsum sed lectus porta volutpat."
+        # -- leading whitespace is stripped from remainder
+        # -- overlap is separated by single space
+        # -- overlap-prefix is computed on arbitrary character boundary
+        # -- overlap-prefix len includes space separator (text portion is one less than specified)
+        assert remainder == "ipiscing. In rhoncus ipsum sed lectus porta."
         # --
         s, remainder = split(remainder)
-        assert s == "In rhoncus ipsum sed lectus porta volutpat."
+        assert s == "ipiscing. In rhoncus ipsum sed lectus porta."
         assert remainder == ""
 
     def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
-        opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " "))
+        opts = ChunkingOptions.new(
+            max_characters=40, text_splitting_separators=("\n", " "), overlap=10
+        )
         split = _TextSplitter(opts)
         text = (
             "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
@@ -174,28 +184,34 @@ def and_it_splits_on_the_next_available_separator_when_the_first_is_not_availabl
 
         s, remainder = split(text)
         assert s == "Lorem ipsum dolor amet consectetur"
-        assert remainder == "adipiscing. In rhoncus ipsum sed lectus porta volutpat."
+        assert remainder == "nsectetur adipiscing. In rhoncus ipsum sed lectus porta volutpat."
         # --
         s, remainder = split(remainder)
-        assert s == "adipiscing. In rhoncus ipsum sed lectus"
-        assert remainder == "porta volutpat."
+        assert s == "nsectetur adipiscing. In rhoncus ipsum"
+        assert remainder == "cus ipsum sed lectus porta volutpat."
         # --
         s, remainder = split(remainder)
-        assert s == "porta volutpat."
+        assert s == "cus ipsum sed lectus porta volutpat."
         assert remainder == ""
 
     def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
-        opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " "))
+        opts = ChunkingOptions.new(
+            max_characters=30, text_splitting_separators=("\n", " "), overlap=10
+        )
         split = _TextSplitter(opts)
         text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
 
         s, remainder = split(text)
-        assert s == "Loremipsumdolorametconsecteturadipiscing"
-        assert remainder == "elit. In rhoncus ipsum sed lectus porta."
+        assert s == "Loremipsumdolorametconsectetur"
+        assert remainder == "onsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
         # --
         s, remainder = split(remainder)
-        assert s == "elit. In rhoncus ipsum sed lectus porta."
-        assert remainder == ""
+        assert s == "onsecteturadipiscingelit. In"
+        assert remainder == "gelit. In rhoncus ipsum sed lectus porta."
+        # --
+        s, remainder = split(remainder)
+        assert s == "gelit. In rhoncus ipsum sed"
+        assert remainder == "ipsum sed lectus porta."
 
     @pytest.mark.parametrize(
         "text",
@@ -205,7 +221,7 @@ def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
         ],
     )
     def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
-        opts = ChunkingOptions.new(max_characters=46)
+        opts = ChunkingOptions.new(max_characters=46, overlap=10)
         split = _TextSplitter(opts)
 
         s, remainder = split(text)
@@ -214,7 +230,7 @@ def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
         assert remainder == ""
 
     def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
-        opts = ChunkingOptions.new(max_characters=38)
+        opts = ChunkingOptions.new(max_characters=38, overlap=10)
         split = _TextSplitter(opts)
         text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
 
@@ -223,17 +239,19 @@ def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
         assert s == "Loremipsumdolorametconsecteturadipisci"
         assert len(s) == 38
 
-    @pytest.mark.parametrize("separators", [("\n", " "), ()])
+    @pytest.mark.parametrize("separators", [("\n", " "), (" ",)])
     def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
-        opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=separators)
+        opts = ChunkingOptions.new(
+            max_characters=50, text_splitting_separators=separators, overlap=10
+        )
         split = _TextSplitter(opts)
-        text = "Lorem ipsum dolor amet consectetur adipiscing.       In rhoncus ipsum sed lectus."
-        #       |------------------------------------------------^  50-chars
+        text = "Lorem ipsum dolor amet consectetur adipiscing.   \n\n In rhoncus ipsum sed lectus."
+        #       |-------------------------------------------------^  50-chars
 
         s, remainder = split(text)
 
         assert s == "Lorem ipsum dolor amet consectetur adipiscing."
-        assert remainder == "In rhoncus ipsum sed lectus."
+        assert remainder == "ipiscing. In rhoncus ipsum sed lectus."
 
 
 # ================================================================================================
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.11.7-dev0"  # pragma: no cover
+__version__ = "0.11.7-dev1"  # pragma: no cover
diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py
@@ -286,17 +286,23 @@ def __call__(self, s: str) -> Tuple[str, str]:
         if len(s) <= maxlen:
             return s, ""
 
-        for p, length in self._patterns:
+        for p, sep_len in self._patterns:
             # -- length of separator must be added to include that separator when it happens to be
             # -- located exactly at maxlen. Otherwise the search-from-end regex won't find it.
-            fragment, remainder = self._split_from_maxlen(p, maxlen + length, s)
-            if not fragment:
+            fragment, remainder = self._split_from_maxlen(p, sep_len, s)
+            if (
+                # -- no available split with this separator --
+                not fragment
+                # -- split did not progress, consuming part of the string --
+                or len(remainder) >= len(s)
+            ):
                 continue
             return fragment.rstrip(), remainder.lstrip()
 
         # -- the terminal "" pattern is not actually executed via regex since its implementation is
-        # -- trivial and provides a hard back-stop here in this method.
-        return s[:maxlen].rstrip(), s[maxlen:].lstrip()
+        # -- trivial and provides a hard back-stop here in this method. No separator is used between
+        # -- tail and remainder on arb-char split.
+        return s[:maxlen].rstrip(), s[maxlen - self._opts.overlap :].lstrip()
 
     @lazyproperty
     def _patterns(self) -> Tuple[Tuple[regex.Pattern[str], int], ...]:
@@ -312,21 +318,47 @@ def _patterns(self) -> Tuple[Tuple[regex.Pattern[str], int], ...]:
         separators = self._opts.text_splitting_separators
         return tuple((regex.compile(f"(?r){sep}"), len(sep)) for sep in separators)
 
-    @staticmethod
-    def _split_from_maxlen(pattern: regex.Pattern[str], maxlen: int, s: str) -> Tuple[str, str]:
+    def _split_from_maxlen(
+        self, pattern: regex.Pattern[str], sep_len: int, s: str
+    ) -> Tuple[str, str]:
         """Return (split, remainder) pair split from `s` on the right-most match before `maxlen`.
 
-        Returns `"", s` if no suitable match was found. The first string in the pair will never be
-        longer than `maxlen` and there is no longer split available using `pattern`.
+        Returns `"", s` if no suitable match was found. Also returns `"", s` if splitting on this
+        separator produces a split shorter than the required overlap (which would produce an
+        infinite loop).
+
+        `split` will never be longer than `maxlen` and there is no longer split available using
+        `pattern`.
 
         The separator is removed and does not appear in either the split or remainder.
         """
-        match = pattern.search(s[:maxlen])
+        maxlen, overlap = self._opts.hard_max, self._opts.overlap
+
+        # -- A split not longer than overlap will not progress (infinite loop). On the right side,
+        # -- need to extend search range to include a separator located exactly at maxlen.
+        match = pattern.search(s, pos=overlap + 1, endpos=maxlen + sep_len)
         if match is None:
             return "", s
-        start: int = match.start()
-        end: int = match.end()
-        return s[:start], s[end:]
+
+        # -- characterize match location
+        match_start, match_end = match.span()
+        # -- matched separator is replaced by single-space in overlap string --
+        separator = " "
+
+        # -- in multi-space situation, fragment may have trailing whitespace because match is from
+        # -- right to left
+        fragment = s[:match_start].rstrip()
+        # -- remainder can have leading space when match is on "\n" followed by spaces --
+        raw_remainder = s[match_end:].lstrip()
+
+        if overlap <= len(separator):
+            return fragment, raw_remainder
+
+        # -- compute overlap --
+        tail_len = overlap - len(separator)
+        tail = fragment[-tail_len:].lstrip()
+        overlapped_remainder = tail + separator + raw_remainder
+        return fragment, overlapped_remainder
 
 
 # ================================================================================================

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.11.7-dev0" # pragma: no cover`
	`1`	`+__version__ = "0.11.7-dev1" # pragma: no cover`