Skip to content

Commit eb1b022

Browse files
authored
feat(chunking): add overlap on chunk-splits (Unstructured-IO#2305)
There are two distinct overlap operations with completely different implementations. This commit adds "intra-chunk" overlap, which applies overlap between the chunks that result from text-splitting an oversized element. For example, if an oversized element had the text "abcd efgh ijkl mnop qrst" and was split at 15 chars with an overlap of 5, it would produce "abcd efgh ijkl" and "ijkl mnop qrst". Inter-chunk overlap, which is taken from the prior chunk and applied at the beginning of the string (before "abcd"), is handled by a separate operation in the next PR.
1 parent 5c0043a commit eb1b022

File tree

5 files changed

+90
-37
lines changed

5 files changed

+90
-37
lines changed

CHANGELOG.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
## 0.11.7-dev0
1+
## 0.11.7-dev1
22

33
### Enhancements
44

5+
* **Add intra-chunk overlap capability.** Implement overlap for split-chunks where text-splitting is used to divide an oversized chunk into two or more chunks that fit in the chunking window. Note this capability is not yet available from the API but will shortly be made accessible using a new `overlap` kwarg on partition functions.
6+
57
### Features
68

79
### Fixes

setup.cfg

+1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
license_files = LICENSE.md
33

44
[flake8]
5+
ignore = E203,W503
56
max-line-length = 100
67
exclude =
78
.venv

test_unstructured/chunking/test_base.py

+40-22
Original file line numberDiff line numberDiff line change
@@ -149,23 +149,33 @@ class Describe_TextSplitter:
149149
"""Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
150150

151151
def it_splits_on_a_preferred_separator_when_it_can(self):
152-
opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=("\n", " "))
152+
opts = ChunkingOptions.new(
153+
max_characters=50, text_splitting_separators=("\n", " "), overlap=10
154+
)
153155
split = _TextSplitter(opts)
154156
text = (
155-
"Lorem ipsum dolor amet consectetur adipiscing.\n"
156-
"In rhoncus ipsum sed lectus porta volutpat."
157+
"Lorem ipsum dolor amet consectetur adipiscing. \n "
158+
"In rhoncus ipsum sed lectus porta."
157159
)
158160

159161
s, remainder = split(text)
162+
163+
# -- trailing whitespace is stripped from split --
160164
assert s == "Lorem ipsum dolor amet consectetur adipiscing."
161-
assert remainder == "In rhoncus ipsum sed lectus porta volutpat."
165+
# -- leading whitespace is stripped from remainder
166+
# -- overlap is separated by single space
167+
# -- overlap-prefix is computed on arbitrary character boundary
168+
# -- overlap-prefix len includes space separator (text portion is one less than specified)
169+
assert remainder == "ipiscing. In rhoncus ipsum sed lectus porta."
162170
# --
163171
s, remainder = split(remainder)
164-
assert s == "In rhoncus ipsum sed lectus porta volutpat."
172+
assert s == "ipiscing. In rhoncus ipsum sed lectus porta."
165173
assert remainder == ""
166174

167175
def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available(self):
168-
opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " "))
176+
opts = ChunkingOptions.new(
177+
max_characters=40, text_splitting_separators=("\n", " "), overlap=10
178+
)
169179
split = _TextSplitter(opts)
170180
text = (
171181
"Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
@@ -174,28 +184,34 @@ def and_it_splits_on_the_next_available_separator_when_the_first_is_not_availabl
174184

175185
s, remainder = split(text)
176186
assert s == "Lorem ipsum dolor amet consectetur"
177-
assert remainder == "adipiscing. In rhoncus ipsum sed lectus porta volutpat."
187+
assert remainder == "nsectetur adipiscing. In rhoncus ipsum sed lectus porta volutpat."
178188
# --
179189
s, remainder = split(remainder)
180-
assert s == "adipiscing. In rhoncus ipsum sed lectus"
181-
assert remainder == "porta volutpat."
190+
assert s == "nsectetur adipiscing. In rhoncus ipsum"
191+
assert remainder == "cus ipsum sed lectus porta volutpat."
182192
# --
183193
s, remainder = split(remainder)
184-
assert s == "porta volutpat."
194+
assert s == "cus ipsum sed lectus porta volutpat."
185195
assert remainder == ""
186196

187197
def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
188-
opts = ChunkingOptions.new(max_characters=40, text_splitting_separators=("\n", " "))
198+
opts = ChunkingOptions.new(
199+
max_characters=30, text_splitting_separators=("\n", " "), overlap=10
200+
)
189201
split = _TextSplitter(opts)
190202
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
191203

192204
s, remainder = split(text)
193-
assert s == "Loremipsumdolorametconsecteturadipiscing"
194-
assert remainder == "elit. In rhoncus ipsum sed lectus porta."
205+
assert s == "Loremipsumdolorametconsectetur"
206+
assert remainder == "onsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
195207
# --
196208
s, remainder = split(remainder)
197-
assert s == "elit. In rhoncus ipsum sed lectus porta."
198-
assert remainder == ""
209+
assert s == "onsecteturadipiscingelit. In"
210+
assert remainder == "gelit. In rhoncus ipsum sed lectus porta."
211+
# --
212+
s, remainder = split(remainder)
213+
assert s == "gelit. In rhoncus ipsum sed"
214+
assert remainder == "ipsum sed lectus porta."
199215

200216
@pytest.mark.parametrize(
201217
"text",
@@ -205,7 +221,7 @@ def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
205221
],
206222
)
207223
def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
208-
opts = ChunkingOptions.new(max_characters=46)
224+
opts = ChunkingOptions.new(max_characters=46, overlap=10)
209225
split = _TextSplitter(opts)
210226

211227
s, remainder = split(text)
@@ -214,7 +230,7 @@ def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
214230
assert remainder == ""
215231

216232
def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
217-
opts = ChunkingOptions.new(max_characters=38)
233+
opts = ChunkingOptions.new(max_characters=38, overlap=10)
218234
split = _TextSplitter(opts)
219235
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
220236

@@ -223,17 +239,19 @@ def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
223239
assert s == "Loremipsumdolorametconsecteturadipisci"
224240
assert len(s) == 38
225241

226-
@pytest.mark.parametrize("separators", [("\n", " "), ()])
242+
@pytest.mark.parametrize("separators", [("\n", " "), (" ",)])
227243
def it_strips_whitespace_around_the_split(self, separators: Sequence[str]):
228-
opts = ChunkingOptions.new(max_characters=50, text_splitting_separators=separators)
244+
opts = ChunkingOptions.new(
245+
max_characters=50, text_splitting_separators=separators, overlap=10
246+
)
229247
split = _TextSplitter(opts)
230-
text = "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus."
231-
# |------------------------------------------------^ 50-chars
248+
text = "Lorem ipsum dolor amet consectetur adipiscing. \n\n In rhoncus ipsum sed lectus."
249+
# |-------------------------------------------------^ 50-chars
232250

233251
s, remainder = split(text)
234252

235253
assert s == "Lorem ipsum dolor amet consectetur adipiscing."
236-
assert remainder == "In rhoncus ipsum sed lectus."
254+
assert remainder == "ipiscing. In rhoncus ipsum sed lectus."
237255

238256

239257
# ================================================================================================

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.11.7-dev0" # pragma: no cover
1+
__version__ = "0.11.7-dev1" # pragma: no cover

unstructured/chunking/base.py

+45-13
Original file line numberDiff line numberDiff line change
@@ -286,17 +286,23 @@ def __call__(self, s: str) -> Tuple[str, str]:
286286
if len(s) <= maxlen:
287287
return s, ""
288288

289-
for p, length in self._patterns:
289+
for p, sep_len in self._patterns:
290290
# -- length of separator must be added to include that separator when it happens to be
291291
# -- located exactly at maxlen. Otherwise the search-from-end regex won't find it.
292-
fragment, remainder = self._split_from_maxlen(p, maxlen + length, s)
293-
if not fragment:
292+
fragment, remainder = self._split_from_maxlen(p, sep_len, s)
293+
if (
294+
# -- no available split with this separator --
295+
not fragment
296+
# -- split did not progress, consuming part of the string --
297+
or len(remainder) >= len(s)
298+
):
294299
continue
295300
return fragment.rstrip(), remainder.lstrip()
296301

297302
# -- the terminal "" pattern is not actually executed via regex since its implementation is
298-
# -- trivial and provides a hard back-stop here in this method.
299-
return s[:maxlen].rstrip(), s[maxlen:].lstrip()
303+
# -- trivial and provides a hard back-stop here in this method. No separator is used between
304+
# -- tail and remainder on arb-char split.
305+
return s[:maxlen].rstrip(), s[maxlen - self._opts.overlap :].lstrip()
300306

301307
@lazyproperty
302308
def _patterns(self) -> Tuple[Tuple[regex.Pattern[str], int], ...]:
@@ -312,21 +318,47 @@ def _patterns(self) -> Tuple[Tuple[regex.Pattern[str], int], ...]:
312318
separators = self._opts.text_splitting_separators
313319
return tuple((regex.compile(f"(?r){sep}"), len(sep)) for sep in separators)
314320

315-
@staticmethod
316-
def _split_from_maxlen(pattern: regex.Pattern[str], maxlen: int, s: str) -> Tuple[str, str]:
321+
def _split_from_maxlen(
322+
self, pattern: regex.Pattern[str], sep_len: int, s: str
323+
) -> Tuple[str, str]:
317324
"""Return (split, remainder) pair split from `s` on the right-most match before `maxlen`.
318325
319-
Returns `"", s` if no suitable match was found. The first string in the pair will never be
320-
longer than `maxlen` and there is no longer split available using `pattern`.
326+
Returns `"", s` if no suitable match was found. Also returns `"", s` if splitting on this
327+
separator produces a split no longer than the required overlap (which would produce an
328+
infinite loop).
329+
330+
`split` will never be longer than `maxlen` and is the longest such split available using
331+
`pattern`.
321332
322333
The separator is removed and does not appear in either the split or remainder.
323334
"""
324-
match = pattern.search(s[:maxlen])
335+
maxlen, overlap = self._opts.hard_max, self._opts.overlap
336+
337+
# -- A split not longer than overlap will not progress (infinite loop). On the right side,
338+
# -- need to extend search range to include a separator located exactly at maxlen.
339+
match = pattern.search(s, pos=overlap + 1, endpos=maxlen + sep_len)
325340
if match is None:
326341
return "", s
327-
start: int = match.start()
328-
end: int = match.end()
329-
return s[:start], s[end:]
342+
343+
# -- characterize match location
344+
match_start, match_end = match.span()
345+
# -- matched separator is replaced by single-space in overlap string --
346+
separator = " "
347+
348+
# -- in multi-space situation, fragment may have trailing whitespace because match is from
349+
# -- right to left
350+
fragment = s[:match_start].rstrip()
351+
# -- remainder can have leading space when match is on "\n" followed by spaces --
352+
raw_remainder = s[match_end:].lstrip()
353+
354+
if overlap <= len(separator):
355+
return fragment, raw_remainder
356+
357+
# -- compute overlap --
358+
tail_len = overlap - len(separator)
359+
tail = fragment[-tail_len:].lstrip()
360+
overlapped_remainder = tail + separator + raw_remainder
361+
return fragment, overlapped_remainder
330362

331363

332364
# ================================================================================================

0 commit comments

Comments
 (0)