@@ -176,6 +176,70 @@ struct PreTokenizerTests {
176176 )
177177 }
178178
/// Exercises `split(by:options:behavior:)` with `.mergedWithPrevious`.
///
/// The cases below expect every "-" delimiter to be attached to the end of the piece that
/// precedes it. A delimiter that has no text before it (at the very start of the string, or
/// immediately after another delimiter) is expected to be emitted as a lone "-" piece.
@Test("Split behavior merged with previous")
func splitBehaviorMergedWithPrevious() {
    // No leading or trailing delimiter; the doubled "--" gives "final-" followed by a lone "-".
    #expect(
        "the-final--countdown".split(by: "-", options: .caseInsensitive, behavior: .mergedWithPrevious) ==
            ["the-", "final-", "-", "countdown"]
    )

    // A single trailing delimiter is attached to the last word.
    #expect(
        "the-final--countdown-".split(by: "-", options: .caseInsensitive, behavior: .mergedWithPrevious) ==
            ["the-", "final-", "-", "countdown-"]
    )

    // Two trailing delimiters: the first joins the last word, the second stands alone.
    #expect(
        "the-final--countdown--".split(by: "-", options: .caseInsensitive, behavior: .mergedWithPrevious) ==
            ["the-", "final-", "-", "countdown-", "-"]
    )

    // A leading delimiter has nothing to merge into, so it is its own piece.
    #expect(
        "-the-final--countdown--".split(by: "-", options: .caseInsensitive, behavior: .mergedWithPrevious) ==
            ["-", "the-", "final-", "-", "countdown-", "-"]
    )

    // Two leading delimiters each become their own piece.
    #expect(
        "--the-final--countdown--".split(by: "-", options: .caseInsensitive, behavior: .mergedWithPrevious) ==
            ["-", "-", "the-", "final-", "-", "countdown-", "-"]
    )
}
206+
/// Exercises `split(by:options:behavior:)` with `.mergedWithNext`.
///
/// The cases below expect every "-" delimiter to be attached to the start of the piece that
/// follows it. A delimiter that has no text after it (at the very end of the string, or
/// immediately before another delimiter) is expected to be emitted as a lone "-" piece.
@Test("Split behavior merged with next")
func splitBehaviorMergedWithNext() {
    // No leading or trailing delimiter; the doubled "--" gives a lone "-" followed by "-countdown".
    #expect(
        "the-final--countdown".split(by: "-", options: .caseInsensitive, behavior: .mergedWithNext) ==
            ["the", "-final", "-", "-countdown"]
    )

    // A single leading delimiter is attached to the first word.
    #expect(
        "-the-final--countdown".split(by: "-", options: .caseInsensitive, behavior: .mergedWithNext) ==
            ["-the", "-final", "-", "-countdown"]
    )

    // Two leading delimiters: the first stands alone, the second joins the first word.
    #expect(
        "--the-final--countdown".split(by: "-", options: .caseInsensitive, behavior: .mergedWithNext) ==
            ["-", "-the", "-final", "-", "-countdown"]
    )

    // A trailing delimiter has nothing to merge into, so it is its own piece.
    #expect(
        "--the-final--countdown-".split(by: "-", options: .caseInsensitive, behavior: .mergedWithNext) ==
            ["-", "-the", "-final", "-", "-countdown", "-"]
    )
}
229+
/// Exercises `split(by:options:behavior:)` with the `.isolated` and `.removed` behaviors.
@Test("Split behavior other")
func splitBehaviorOther() {
    // `.isolated`: each "-" is expected as its own element, so the doubled "--" yields two
    // consecutive "-" entries between the words.
    #expect(
        "the-final--countdown".split(by: "-", options: .caseInsensitive, behavior: .isolated) ==
            ["the", "-", "final", "-", "-", "countdown"]
    )

    // `.removed`: delimiters are expected to be dropped, and no empty string is expected
    // between the two adjacent delimiters.
    #expect(
        "the-final--countdown".split(by: "-", options: .caseInsensitive, behavior: .removed) ==
            ["the", "final", "countdown"]
    )
}
242+
179243 /// https://github.com/huggingface/tokenizers/pull/1357
180244 @Test ( " Metaspace pre-tokenizer with prefix space handling " )
181245 func metaspacePreTokenizer( ) {
0 commit comments