@@ -149,23 +149,33 @@ class Describe_TextSplitter:
149
149
"""Unit-test suite for `unstructured.chunking.base._TextSplitter` objects."""
150
150
151
151
def it_splits_on_a_preferred_separator_when_it_can (self ):
152
- opts = ChunkingOptions .new (max_characters = 50 , text_splitting_separators = ("\n " , " " ))
152
+ opts = ChunkingOptions .new (
153
+ max_characters = 50 , text_splitting_separators = ("\n " , " " ), overlap = 10
154
+ )
153
155
split = _TextSplitter (opts )
154
156
text = (
155
- "Lorem ipsum dolor amet consectetur adipiscing.\n "
156
- "In rhoncus ipsum sed lectus porta volutpat."
157
+ "Lorem ipsum dolor amet consectetur adipiscing. \n "
158
+ "In rhoncus ipsum sed lectus porta."
157
159
)
158
160
159
161
s , remainder = split (text )
162
+
163
+ # -- trailing whitespace is stripped from split --
160
164
assert s == "Lorem ipsum dolor amet consectetur adipiscing."
161
- assert remainder == "In rhoncus ipsum sed lectus porta volutpat."
165
+ # -- leading whitespace is stripped from remainder
166
+ # -- overlap is separated by single space
167
+ # -- overlap-prefix is computed on arbitrary character boundary
168
+ # -- overlap-prefix len includes space separator (text portion is one less than specified)
169
+ assert remainder == "ipiscing. In rhoncus ipsum sed lectus porta."
162
170
# --
163
171
s , remainder = split (remainder )
164
- assert s == "In rhoncus ipsum sed lectus porta volutpat."
172
+ assert s == "ipiscing. In rhoncus ipsum sed lectus porta."
165
173
assert remainder == ""
166
174
167
175
def and_it_splits_on_the_next_available_separator_when_the_first_is_not_available (self ):
168
- opts = ChunkingOptions .new (max_characters = 40 , text_splitting_separators = ("\n " , " " ))
176
+ opts = ChunkingOptions .new (
177
+ max_characters = 40 , text_splitting_separators = ("\n " , " " ), overlap = 10
178
+ )
169
179
split = _TextSplitter (opts )
170
180
text = (
171
181
"Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus porta"
@@ -174,28 +184,34 @@ def and_it_splits_on_the_next_available_separator_when_the_first_is_not_availabl
174
184
175
185
s , remainder = split (text )
176
186
assert s == "Lorem ipsum dolor amet consectetur"
177
- assert remainder == "adipiscing. In rhoncus ipsum sed lectus porta volutpat."
187
+ assert remainder == "nsectetur adipiscing. In rhoncus ipsum sed lectus porta volutpat."
178
188
# --
179
189
s , remainder = split (remainder )
180
- assert s == "adipiscing. In rhoncus ipsum sed lectus "
181
- assert remainder == "porta volutpat."
190
+ assert s == "nsectetur adipiscing. In rhoncus ipsum"
191
+ assert remainder == "cus ipsum sed lectus porta volutpat."
182
192
# --
183
193
s , remainder = split (remainder )
184
- assert s == "porta volutpat."
194
+ assert s == "cus ipsum sed lectus porta volutpat."
185
195
assert remainder == ""
186
196
187
197
def and_it_splits_on_an_arbitrary_character_as_a_last_resort (self ):
188
- opts = ChunkingOptions .new (max_characters = 40 , text_splitting_separators = ("\n " , " " ))
198
+ opts = ChunkingOptions .new (
199
+ max_characters = 30 , text_splitting_separators = ("\n " , " " ), overlap = 10
200
+ )
189
201
split = _TextSplitter (opts )
190
202
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
191
203
192
204
s , remainder = split (text )
193
- assert s == "Loremipsumdolorametconsecteturadipiscing"
194
- assert remainder == "elit. In rhoncus ipsum sed lectus porta."
205
+ assert s == "Loremipsumdolorametconsectetur"
206
+ assert remainder == "onsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
195
207
# --
196
208
s , remainder = split (remainder )
197
- assert s == "elit. In rhoncus ipsum sed lectus porta."
198
- assert remainder == ""
209
+ assert s == "onsecteturadipiscingelit. In"
210
+ assert remainder == "gelit. In rhoncus ipsum sed lectus porta."
211
+ # --
212
+ s , remainder = split (remainder )
213
+ assert s == "gelit. In rhoncus ipsum sed"
214
+ assert remainder == "ipsum sed lectus porta."
199
215
200
216
@pytest .mark .parametrize (
201
217
"text" ,
@@ -205,7 +221,7 @@ def and_it_splits_on_an_arbitrary_character_as_a_last_resort(self):
205
221
],
206
222
)
207
223
def it_does_not_split_a_string_that_is_not_longer_than_maxlen (self , text : str ):
208
- opts = ChunkingOptions .new (max_characters = 46 )
224
+ opts = ChunkingOptions .new (max_characters = 46 , overlap = 10 )
209
225
split = _TextSplitter (opts )
210
226
211
227
s , remainder = split (text )
@@ -214,7 +230,7 @@ def it_does_not_split_a_string_that_is_not_longer_than_maxlen(self, text: str):
214
230
assert remainder == ""
215
231
216
232
def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split (self ):
217
- opts = ChunkingOptions .new (max_characters = 38 )
233
+ opts = ChunkingOptions .new (max_characters = 38 , overlap = 10 )
218
234
split = _TextSplitter (opts )
219
235
text = "Loremipsumdolorametconsecteturadipiscingelit. In rhoncus ipsum sed lectus porta."
220
236
@@ -223,17 +239,19 @@ def it_fills_the_window_when_falling_back_to_an_arbitrary_character_split(self):
223
239
assert s == "Loremipsumdolorametconsecteturadipisci"
224
240
assert len (s ) == 38
225
241
226
- @pytest .mark .parametrize ("separators" , [("\n " , " " ), ()])
242
+ @pytest .mark .parametrize ("separators" , [("\n " , " " ), (" " , )])
227
243
def it_strips_whitespace_around_the_split (self , separators : Sequence [str ]):
228
- opts = ChunkingOptions .new (max_characters = 50 , text_splitting_separators = separators )
244
+ opts = ChunkingOptions .new (
245
+ max_characters = 50 , text_splitting_separators = separators , overlap = 10
246
+ )
229
247
split = _TextSplitter (opts )
230
- text = "Lorem ipsum dolor amet consectetur adipiscing. In rhoncus ipsum sed lectus."
231
- # |------------------------------------------------^ 50-chars
248
+ text = "Lorem ipsum dolor amet consectetur adipiscing. \n \n In rhoncus ipsum sed lectus."
249
+ # |------------------------------------------------- ^ 50-chars
232
250
233
251
s , remainder = split (text )
234
252
235
253
assert s == "Lorem ipsum dolor amet consectetur adipiscing."
236
- assert remainder == "In rhoncus ipsum sed lectus."
254
+ assert remainder == "ipiscing. In rhoncus ipsum sed lectus."
237
255
238
256
239
257
# ================================================================================================
0 commit comments