@@ -238,158 +238,3 @@ class SplitPreTokenizer: PreTokenizer {
238238 return pattern. split ( text, invert: invert)
239239 }
240240}
241-
/// A pattern used to cut text into pieces: either a regular expression or a literal substring.
enum StringSplitPattern {
    case regexp(regexp: String)
    case string(pattern: String)

    /// Splits `text` using this pattern.
    ///
    /// - Parameters:
    ///   - text: The text to split.
    ///   - invert: Only consulted for `.string` patterns, where the matched separators are
    ///     included in the output when `invert` is `false`. `.regexp` patterns always
    ///     include the separators, regardless of this flag.
    /// - Returns: The resulting pieces, in their original order.
    func split(_ text: String, invert: Bool = true) -> [String] {
        switch self {
        case .regexp(let expression):
            return text.split(by: expression, includeSeparators: true)
        case .string(let literal):
            let keepSeparators = !invert
            return text.split(by: literal, options: [], includeSeparators: keepSeparators)
        }
    }

    /// Builds a pattern from `config`, preferring `pattern.String` over `pattern.Regex`.
    ///
    /// - Returns: `nil` when neither entry yields a string.
    static func from(config: Config) -> StringSplitPattern? {
        // The literal form is looked up first, so it wins if both entries are present.
        guard let literal = config.pattern.String.string() else {
            guard let expression = config.pattern.Regex.string() else {
                return nil
            }
            return .regexp(regexp: expression)
        }
        return .string(pattern: literal)
    }
}
265-
/// What happens to a matched delimiter when a string is split.
///
/// - `removed`: delimiters are dropped from the output.
/// - `isolated`: each delimiter becomes its own output element.
/// - `mergedWithPrevious`: each delimiter is attached to the end of the piece before it.
/// - `mergedWithNext`: each delimiter is attached to the start of the piece after it.
enum SplitDelimiterBehavior {
    case removed, isolated
    case mergedWithPrevious, mergedWithNext
}
272-
extension String {
    /// Returns the range of every occurrence of `string` in the receiver, in order of appearance.
    ///
    /// The search resumes at the end of each match. A zero-length match (possible with regular
    /// expressions) moves the search forward by one position instead, and a zero-length match
    /// at `endIndex` ends the search, so the loop always terminates.
    ///
    /// - Parameters:
    ///   - string: The text or pattern to look for.
    ///   - options: Comparison options; by default `string` is treated as a regular expression.
    /// - Returns: The matched ranges, possibly empty.
    func ranges(of string: String, options: CompareOptions = .regularExpression) -> [Range<Index>] {
        var found: [Range<Index>] = []
        var searchStart = startIndex
        while let match = range(of: string, options: options, range: searchStart..<endIndex) {
            found.append(match)
            if !match.isEmpty {
                searchStart = match.upperBound
            } else if match.lowerBound < endIndex {
                // Zero-length match: step past it, otherwise the same match is found again.
                searchStart = index(match.lowerBound, offsetBy: 1, limitedBy: endIndex) ?? endIndex
            } else {
                // Zero-length match at the very end: the cursor cannot advance any further.
                break
            }
        }
        return found
    }

    /// Splits the receiver at every occurrence of `string`.
    ///
    /// - Parameters:
    ///   - string: The separator text or pattern.
    ///   - options: Comparison options; by default `string` is treated as a regular expression.
    ///   - includeSeparators: When `true`, each matched separator is emitted as its own element
    ///     between the pieces it separates.
    ///   - omittingEmptySubsequences: When `true` (the default), empty pieces and zero-length
    ///     separators are left out. When `false`, empty pieces between adjacent separators and
    ///     at either end of the string are kept.
    /// - Returns: The pieces (and, optionally, separators) in their original order.
    func split(by string: String, options: CompareOptions = .regularExpression, includeSeparators: Bool = false, omittingEmptySubsequences: Bool = true) -> [String] {
        var pieces: [String] = []
        var pieceStart = startIndex
        // Iterating `ranges(of:)` keeps this method safe against zero-length matches.
        for match in ranges(of: string, options: options) {
            if !omittingEmptySubsequences || pieceStart < match.lowerBound {
                pieces.append(String(self[pieceStart..<match.lowerBound]))
            }
            if includeSeparators, !(omittingEmptySubsequences && match.isEmpty) {
                pieces.append(String(self[match]))
            }
            pieceStart = match.upperBound
        }

        if !omittingEmptySubsequences || pieceStart < endIndex {
            pieces.append(String(self[pieceStart...]))
        }
        return pieces
    }

    /// Splits the receiver using a compiled regular expression.
    ///
    /// This version supports capture groups, whereas the string-based overload doesn't.
    /// For each match, the non-empty text preceding it is emitted, followed by one separator:
    /// the highest-numbered range of the match (capture group, or the whole match as a
    /// fallback) that maps onto a valid range of the receiver.
    ///
    /// - Parameter captureRegex: The expression whose matches delimit the pieces.
    /// - Returns: Pieces and separators in order, or `[self]` when nothing matches.
    func split(by captureRegex: NSRegularExpression) -> [String] {
        let fullRange = NSRange(startIndex..<endIndex, in: self)
        let matches = captureRegex.matches(in: self, options: [], range: fullRange)
        guard !matches.isEmpty else { return [self] }

        var pieces: [String] = []
        var pieceStart = startIndex

        for match in matches {
            // IMPORTANT: NSRange counts UTF-16 units, so it must be converted to
            // Range<String.Index> before it can be used to slice the string.
            guard let matchRange = Range(match.range, in: self) else { continue }

            // Text between the previous match and this one.
            if pieceStart < matchRange.lowerBound {
                pieces.append(String(self[pieceStart..<matchRange.lowerBound]))
            }
            pieceStart = matchRange.upperBound

            // Separator: walk the ranges from last to first and keep the first usable one.
            for groupIndex in (0..<match.numberOfRanges).reversed() {
                if let separatorRange = Range(match.range(at: groupIndex), in: self) {
                    pieces.append(String(self[separatorRange]))
                    break
                }
            }
        }

        // Whatever remains after the last match.
        if pieceStart < endIndex {
            pieces.append(String(self[pieceStart...]))
        }

        return pieces
    }

    /// Splits the receiver at every occurrence of `string`, handling the delimiters as
    /// dictated by `behavior`.
    ///
    /// Example with `"the-final--countdown"` split by `"-"`:
    /// - `.mergedWithNext`     gives `["the", "-final", "-", "-countdown"]`
    /// - `.mergedWithPrevious` gives `["the-", "final-", "-", "countdown"]`
    ///
    /// - Parameters:
    ///   - string: The separator text or pattern.
    ///   - options: Comparison options; by default `string` is treated as a regular expression.
    ///   - behavior: What to do with each matched delimiter.
    /// - Returns: The resulting pieces in their original order.
    func split(by string: String, options: CompareOptions = .regularExpression, behavior: SplitDelimiterBehavior) -> [String] {
        /// Cuts the string right before each delimiter, so a delimiter starts the next piece.
        func mergedWithNext(ranges: [Range<String.Index>]) -> [Range<String.Index>] {
            var merged: [Range<String.Index>] = []
            var currentStart = startIndex
            // A delimiter at the very start has nothing before it to cut off.
            for range in ranges where range.lowerBound != startIndex {
                merged.append(currentStart..<range.lowerBound)
                currentStart = range.lowerBound
            }
            if currentStart < endIndex {
                merged.append(currentStart..<endIndex)
            }
            return merged
        }

        /// Cuts the string right after each delimiter, so a delimiter ends the current piece.
        func mergedWithPrevious(ranges: [Range<String.Index>]) -> [Range<String.Index>] {
            var merged: [Range<String.Index>] = []
            var currentStart = startIndex
            for range in ranges {
                merged.append(currentStart..<range.upperBound)
                currentStart = range.upperBound
            }
            if currentStart < endIndex {
                merged.append(currentStart..<endIndex)
            }
            return merged
        }

        switch behavior {
        case .removed:
            return split(by: string, options: options, includeSeparators: false)
        case .isolated:
            return split(by: string, options: options, includeSeparators: true)
        case .mergedWithNext:
            let delimiters = ranges(of: string, options: options)
            return mergedWithNext(ranges: delimiters).map { String(self[$0]) }
        case .mergedWithPrevious:
            let delimiters = ranges(of: string, options: options)
            return mergedWithPrevious(ranges: delimiters).map { String(self[$0]) }
        }
    }
}
0 commit comments