
Commit bbd06f7

Adopt Sendable protocol for tokenizer protocols and conforming types
1 parent 52a6f59 commit bbd06f7


4 files changed, +8 -8 lines changed


Sources/Tokenizers/BPETokenizer.swift

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@ import Foundation
 import Hub
 
 /// A pair of byte/token strings used in Byte-Pair Encoding (BPE) merge operations.
-struct BytePair: Hashable {
+struct BytePair: Hashable, Sendable {
     let a: String
     let b: String
     init(_ a: String, _ b: String) {
@@ -38,7 +38,7 @@ struct BytePair: Hashable {
 /// BPE tokenizers learn to merge the most frequently occurring pairs of characters
 /// or character sequences. This implementation supports various BPE-based models
 /// including GPT-2, RoBERTa, and other transformer models.
-class BPETokenizer: PreTrainedTokenizerModel {
+class BPETokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
     let bpeRanks: [BytePair: Int]
     private let tokensToIds: [NSString: Int]
     private let idsToTokens: [Int: NSString]
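
For context on the pattern used throughout this commit: the classes touched here store only immutable `let` properties, so instances are safe to share across concurrency domains, but a non-final class cannot adopt checked `Sendable`, so the conformance is asserted with `@unchecked Sendable`. A minimal sketch of the same shape, using a hypothetical `VocabLookup` type rather than anything from this package:

    // Hypothetical type, not part of this commit: all stored properties are
    // immutable `let` values of Sendable types, so instances are safe to share,
    // but a non-final class cannot use checked Sendable, hence @unchecked.
    class VocabLookup: @unchecked Sendable {
        private let tokensToIds: [String: Int]

        init(tokensToIds: [String: Int]) {
            self.tokensToIds = tokensToIds
        }

        func id(for token: String) -> Int? {
            tokensToIds[token]
        }
    }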

Sources/Tokenizers/BertTokenizer.swift

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ import Hub
 /// This tokenizer performs basic tokenization (whitespace and punctuation splitting)
 /// followed by WordPiece subword tokenization, which is the approach used by BERT
 /// and related models.
-public class BertTokenizer {
+public class BertTokenizer: @unchecked Sendable {
     private let basicTokenizer: BasicTokenizer
     private let wordpieceTokenizer: WordpieceTokenizer
     private let maxLen = 512

Sources/Tokenizers/Tokenizer.swift

Lines changed: 4 additions & 4 deletions
@@ -215,7 +215,7 @@ public enum ChatTemplateArgument
 ///
 /// This is the main protocol that defines all tokenizer operations, including text processing,
 /// chat template application, and special token handling.
-public protocol Tokenizer {
+public protocol Tokenizer: Sendable {
     /// Tokenizes the input text into a sequence of tokens.
     ///
     /// - Parameter text: The input text to tokenize
@@ -451,7 +451,7 @@ let specialTokenAttributes: [String] = [
 /// This class provides a complete tokenizer implementation that can be initialized from
 /// Hugging Face Hub configuration files and supports all standard tokenization operations
 /// including chat template application, normalization, pre-tokenization, and post-processing.
-public class PreTrainedTokenizer: Tokenizer {
+public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer {
     let model: TokenizingModel
 
     public var bosToken: String? { model.bosToken }
@@ -905,7 +905,7 @@ public extension AutoTokenizer {
 
 // MARK: - Tokenizer model classes
 
-class T5Tokenizer: UnigramTokenizer {}
+class T5Tokenizer: UnigramTokenizer, @unchecked Sendable {}
 
 // MARK: - PreTrainedTokenizer classes
 
@@ -954,7 +954,7 @@ func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?)
 }
 
 /// See https://github.com/xenova/transformers.js/blob/1a9964fb09b8f54fcbeac46dc6aae8d76795809d/src/tokenizers.js#L3203 for these exceptions
-class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
+class LlamaPreTrainedTokenizer: PreTrainedTokenizer, @unchecked Sendable {
     let isLegacy: Bool
 
     required init(tokenizerConfig: Config, tokenizerData: Config, strict: Bool = true) throws {
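
A hedged sketch of what the new `Sendable` requirement on `Tokenizer` enables under strict concurrency checking: an existential tokenizer can now be captured by a detached task, or sent to another actor, without data-race diagnostics. The helper function below is illustrative and not part of this diff; it assumes `tokenize(text:)` returns `[String]`, as the protocol's doc comments above suggest.

    import Tokenizers

    // Illustrative only: assumes a tokenizer value obtained elsewhere
    // (for example via AutoTokenizer in this package).
    func encodeInBackground(_ tokenizer: any Tokenizer, text: String) async -> [String] {
        // Because Tokenizer now refines Sendable, capturing the existential in a
        // @Sendable closure compiles cleanly under strict concurrency checking.
        let task = Task.detached {
            tokenizer.tokenize(text: text)
        }
        return await task.value
    }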

Sources/Tokenizers/UnigramTokenizer.swift

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ import Hub
 /// Unigram tokenizers use a probabilistic approach where each token has a score,
 /// and the tokenization process finds the most probable segmentation of the input text.
 /// This is commonly used in models like T5 and XLM-RoBERTa.
-class UnigramTokenizer: PreTrainedTokenizerModel {
+class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
     /// A token with its associated score in the Unigram model.
     struct SentencePieceToken {
         var token: String
