huggingface · mattt · Sep 24, 2025 · Sep 19, 2025 · Sep 19, 2025 · Sep 19, 2025
diff --git a/Sources/Generation/MLMultiArray+Utils.swift → Sources/Generation/CoreML+Extensions.swift b/Sources/Generation/MLMultiArray+Utils.swift → Sources/Generation/CoreML+Extensions.swift
@@ -1,5 +1,5 @@
 //
-//  MLMultiArray+Utils.swift
+//  CoreML+Extensions.swift
 //  CoreMLBert
 //
 //  Created by Julien Chaumond on 27/06/2019.
@@ -10,7 +10,7 @@
 import CoreML
 import Foundation
 
-public extension MLMultiArray {
+extension MLMultiArray {
     /// All values will be stored in the last dimension of the MLMultiArray (default is dims=1)
     static func from(_ arr: [Int], dims: Int = 1) -> MLMultiArray {
         var shape = Array(repeating: 1, count: dims)
@@ -88,7 +88,7 @@ public extension MLMultiArray {
     }
 }
 
-public extension MLMultiArray {
+extension MLMultiArray {
     /// Provides a way to index n-dimensionals arrays a la numpy.
     enum Indexing: Equatable {
         case select(Int)
@@ -197,4 +197,48 @@ extension MLMultiArray {
         return s + "]"
     }
 }
+
+extension MLShapedArray<Float> {
+    /// The scalars as a flat `[Float]`, bulk-copied when the array is 1-D with unit stride.
+    var floats: [Float] {
+        if strides == [1] {
+            // Fast path: memcpy via MLMultiArray; `scalars` covers a nil result.
+            return MLMultiArray(self).floats ?? scalars
+        }
+
+        // This path was observed to be slow; for other strides a Metal kernel
+        // could copy the values properly.
+        return scalars
+    }
+}
+
+extension MLShapedArraySlice<Float> {
+    /// The slice's scalars as a flat `[Float]`, bulk-copied when it is 1-D with unit stride.
+    var floats: [Float] {
+        if strides == [1] {
+            // Fast path: memcpy via MLMultiArray; `scalars` covers a nil result.
+            return MLMultiArray(self).floats ?? scalars
+        }
+
+        // This path was observed to be slow; for other strides a Metal kernel
+        // could copy the values properly.
+        return scalars
+    }
+}
+
+extension MLMultiArray {
+    /// A `[Float]` copy of the first `count` buffer elements (contiguous layout assumed — confirm);
+    /// `nil` unless `dataType` is `.float32` and the buffer is non-nil and holds at least that many bytes.
+    var floats: [Float]? {
+        guard dataType == .float32 else { return nil }
+        let elementCount = count, byteCount = count * MemoryLayout<Float>.stride
+        return withUnsafeBytes { source -> [Float]? in
+            guard source.baseAddress != nil, source.count >= byteCount else { return nil }
+            return [Float](unsafeUninitializedCapacity: elementCount) { buffer, initialized in
+                UnsafeMutableRawBufferPointer(buffer).copyMemory(from: .init(rebasing: source.prefix(byteCount)))
+                initialized = elementCount
+            }
+        }
+    }
+}
 #endif // canImport(CoreML)
diff --git a/Sources/Generation/MLShapedArray+Utils.swift b/Sources/Generation/MLShapedArray+Utils.swift
diff --git a/Sources/Hub/BOMDoubling.swift → ...ub/Extensions/JSONSerialization+BOM.swift b/Sources/Hub/BOMDoubling.swift → ...ub/Extensions/JSONSerialization+BOM.swift
@@ -1,13 +1,19 @@
 //
-//  BOMDoubling.swift
+//  JSONSerialization+BOM.swift
 //  swift-transformers
 //
 //  Created by Pedro Cuenca on 20250912
 //
 
 import Foundation
 
-extension Data {
+extension JSONSerialization {
+    class func bomPreservingJsonObject(with data: Data, options: JSONSerialization.ReadingOptions = []) throws -> Any {
+        try JSONSerialization.jsonObject(with: data.duplicatingBOMsAfterQuotes, options: options)
+    }
+}
+
+private extension Data {
     /// Workaround for https://github.com/huggingface/swift-transformers/issues/116
     /// Duplicate a BOM sequence that follows a quote. The first BOM is swallowed by JSONSerialization.jsonObject
     /// because it thinks it marks the encoding.
@@ -40,9 +46,3 @@ extension Data {
         }
     }
 }
-
-extension JSONSerialization {
-    class func bomPreservingJsonObject(with data: Data, options: JSONSerialization.ReadingOptions = []) throws -> Any {
-        try JSONSerialization.jsonObject(with: data.duplicatingBOMsAfterQuotes, options: options)
-    }
-}
diff --git a/Sources/Hub/HubApi.swift b/Sources/Hub/HubApi.swift
@@ -891,13 +891,13 @@ public extension Hub {
     }
 }
 
-public extension [String] {
+private extension [String] {
     func matching(glob: String) -> [String] {
         filter { fnmatch(glob, $0, 0) == 0 }
     }
 }
 
-public extension FileManager {
+private extension FileManager {
     func getFileUrls(at directoryUrl: URL) throws -> [URL] {
         var fileUrls = [URL]()
 

diff --git a/Sources/Tokenizers/BertTokenizer.swift b/Sources/Tokenizers/BertTokenizer.swift
@@ -226,7 +226,7 @@ class BasicTokenizer {
     }
 }
 
-extension Character {
+private extension Character {
     /// https://github.com/huggingface/transformers/blob/8c1b5d37827a6691fef4b2d926f2d04fb6f5a9e3/src/transformers/tokenization_utils.py#L367
     var isExtendedPunctuation: Bool {
         if isPunctuation { return true }

diff --git a/Sources/Tokenizers/Decoder.swift b/Sources/Tokenizers/Decoder.swift
@@ -236,7 +236,7 @@ class MetaspaceDecoder: Decoder {
 }
 
 /// We could use firstIndex(where:), lastIndex(where:) for possibly better efficiency (and do both ends at once)
-public extension String {
+private extension String {
     func trimmingFromStart(character: Character = " ", upto: Int) -> String {
         var result = self
         var trimmed = 0

diff --git a/Sources/Tokenizers/PreTokenizer.swift b/Sources/Tokenizers/PreTokenizer.swift
@@ -238,164 +238,3 @@ class SplitPreTokenizer: PreTokenizer {
         return pattern.split(text, invert: invert)
     }
 }
-
-enum StringSplitPattern {
-    case regexp(regexp: String)
-    case string(pattern: String)
-}
-
-extension StringSplitPattern {
-    func split(_ text: String, invert: Bool = true) -> [String] {
-        switch self {
-        case let .regexp(regexp):
-            text.split(by: regexp, includeSeparators: true)
-        case let .string(substring):
-            text.split(by: substring, options: [], includeSeparators: !invert)
-        }
-    }
-}
-
-extension StringSplitPattern {
-    static func from(config: Config) -> StringSplitPattern? {
-        if let pattern = config.pattern.String.string() {
-            return StringSplitPattern.string(pattern: pattern)
-        }
-        if let pattern = config.pattern.Regex.string() {
-            return StringSplitPattern.regexp(regexp: pattern)
-        }
-        return nil
-    }
-}
-
-public extension String {
-    func ranges(of string: String, options: CompareOptions = .regularExpression) -> [Range<Index>] {
-        var result: [Range<Index>] = []
-        var start = startIndex
-        while let range = range(of: string, options: options, range: start..<endIndex) {
-            result.append(range)
-            start = range.lowerBound < range.upperBound ? range.upperBound : index(range.lowerBound, offsetBy: 1, limitedBy: endIndex) ?? endIndex
-        }
-        return result
-    }
-
-    func split(by string: String, options: CompareOptions = .regularExpression, includeSeparators: Bool = false, omittingEmptySubsequences: Bool = true) -> [String] {
-        var result: [String] = []
-        var start = startIndex
-        while let range = range(of: string, options: options, range: start..<endIndex) {
-            // Prevent empty strings
-            if omittingEmptySubsequences, start < range.lowerBound {
-                result.append(String(self[start..<range.lowerBound]))
-            }
-            if includeSeparators {
-                result.append(String(self[range]))
-            }
-            start = range.upperBound
-        }
-
-        if omittingEmptySubsequences, start < endIndex {
-            result.append(String(self[start...]))
-        }
-        return result
-    }
-
-    /// This version supports capture groups, wheres the one above doesn't
-    func split(by captureRegex: NSRegularExpression) -> [String] {
-        // Find the matching capture groups
-        let selfRange = NSRange(startIndex..<endIndex, in: self)
-        let matches = captureRegex.matches(in: self, options: [], range: selfRange)
-
-        if matches.isEmpty { return [self] }
-
-        var result: [String] = []
-        var start = startIndex
-
-        for match in matches {
-            // IMPORTANT: convert from NSRange to Range<String.Index>
-            // https://stackoverflow.com/questions/75543272/convert-a-given-utf8-nsrange-in-a-string-to-a-utf16-nsrange
-            guard let matchRange = Range(match.range, in: self) else { continue }
-
-            // Add text before the match
-            if start < matchRange.lowerBound {
-                result.append(String(self[start..<matchRange.lowerBound]))
-            }
-
-            // Move start to after the match
-            start = matchRange.upperBound
-
-            // Append separator, supporting capture groups
-            for r in (0..<match.numberOfRanges).reversed() {
-                let nsRange = match.range(at: r)
-                if let sepRange = Range(nsRange, in: self) {
-                    result.append(String(self[sepRange]))
-                    break
-                }
-            }
-        }
-
-        // Append remaining suffix
-        if start < endIndex {
-            result.append(String(self[start...]))
-        }
-
-        return result
-    }
-}
-
-public enum SplitDelimiterBehavior {
-    case removed
-    case isolated
-    case mergedWithPrevious
-    case mergedWithNext
-}
-
-public extension String {
-    func split(by string: String, options: CompareOptions = .regularExpression, behavior: SplitDelimiterBehavior) -> [String] {
-        func mergedWithNext(ranges: [Range<String.Index>]) -> [Range<String.Index>] {
-            var merged: [Range<String.Index>] = []
-            var currentStart = startIndex
-            for range in ranges {
-                if range.lowerBound == startIndex { continue }
-                let mergedRange = currentStart..<range.lowerBound
-                currentStart = range.lowerBound
-                merged.append(mergedRange)
-            }
-            if currentStart < endIndex {
-                merged.append(currentStart..<endIndex)
-            }
-            return merged
-        }
-
-        func mergedWithPrevious(ranges: [Range<String.Index>]) -> [Range<String.Index>] {
-            var merged: [Range<String.Index>] = []
-            var currentStart = startIndex
-            for range in ranges {
-                let mergedRange = currentStart..<range.upperBound
-                currentStart = range.upperBound
-                merged.append(mergedRange)
-            }
-            if currentStart < endIndex {
-                merged.append(currentStart..<endIndex)
-            }
-            return merged
-        }
-
-        switch behavior {
-        case .removed:
-            return split(by: string, options: options, includeSeparators: false)
-        case .isolated:
-            return split(by: string, options: options, includeSeparators: true)
-        case .mergedWithNext:
-            // Obtain ranges and merge them
-            // "the-final--countdown" -> (3, 4), (9, 10), (10, 11) -> (start, 2), (3, 8), (9, 9), (10, end)
-            let ranges = ranges(of: string, options: options)
-            let merged = mergedWithNext(ranges: ranges)
-            return merged.map { String(self[$0]) }
-        case .mergedWithPrevious:
-            // Obtain ranges and merge them
-            // "the-final--countdown" -> (3, 4), (9, 10), (10, 11) -> (start, 3), (4, 9), (10, 10), (11, end)
-            let ranges = ranges(of: string, options: options)
-            let merged = mergedWithPrevious(ranges: ranges)
-            return merged.map { String(self[$0]) }
-        }
-    }
-}