diff --git a/Sources/Tokenizers/BPETokenizer.swift b/Sources/Tokenizers/BPETokenizer.swift
index c71269c8..a2cc00d6 100644
--- a/Sources/Tokenizers/BPETokenizer.swift
+++ b/Sources/Tokenizers/BPETokenizer.swift
@@ -10,7 +10,7 @@ import Foundation
 import Hub
 
 /// A pair of byte/token strings used in Byte-Pair Encoding (BPE) merge operations.
-struct BytePair: Hashable {
+struct BytePair: Hashable, Sendable {
     let a: String
     let b: String
     init(_ a: String, _ b: String) {
@@ -38,7 +38,7 @@ struct BytePair: Hashable {
 /// BPE tokenizers learn to merge the most frequently occurring pairs of characters
 /// or character sequences. This implementation supports various BPE-based models
 /// including GPT-2, RoBERTa, and other transformer models.
-class BPETokenizer: PreTrainedTokenizerModel {
+class BPETokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
     let bpeRanks: [BytePair: Int]
     private let tokensToIds: [NSString: Int]
     private let idsToTokens: [Int: NSString]
diff --git a/Sources/Tokenizers/BertTokenizer.swift b/Sources/Tokenizers/BertTokenizer.swift
index b7b3dea2..ec4396c1 100644
--- a/Sources/Tokenizers/BertTokenizer.swift
+++ b/Sources/Tokenizers/BertTokenizer.swift
@@ -14,7 +14,7 @@ import Hub
 /// This tokenizer performs basic tokenization (whitespace and punctuation splitting)
 /// followed by WordPiece subword tokenization, which is the approach used by BERT
 /// and related models.
-public class BertTokenizer {
+public final class BertTokenizer: Sendable {
     private let basicTokenizer: BasicTokenizer
     private let wordpieceTokenizer: WordpieceTokenizer
     private let maxLen = 512
@@ -27,16 +27,16 @@ public class BertTokenizer {
     private let ids_to_tokens: [Int: String]
 
     /// The beginning-of-sequence token string, if defined.
-    public var bosToken: String?
+    public let bosToken: String?
 
     /// The numeric ID of the beginning-of-sequence token, if defined.
-    public var bosTokenId: Int?
+    public let bosTokenId: Int?
 
     /// The end-of-sequence token string, if defined.
-    public var eosToken: String?
+    public let eosToken: String?
 
     /// The numeric ID of the end-of-sequence token, if defined.
-    public var eosTokenId: Int?
+    public let eosTokenId: Int?
 
     /// Whether consecutive unknown tokens should be fused together.
     public let fuseUnknownTokens: Bool
@@ -225,7 +225,7 @@ extension BertTokenizer: PreTrainedTokenizerModel {
     }
 }
 
-class BasicTokenizer {
+final class BasicTokenizer: Sendable {
     let doLowerCase: Bool
 
     init(doLowerCase: Bool = true) {
@@ -291,7 +291,7 @@ private extension Character {
     }
 }
 
-class WordpieceTokenizer {
+final class WordpieceTokenizer: Sendable {
     let unkToken = "[UNK]"
     private let maxInputCharsPerWord = 100
     private let vocab: [String: Int]
diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift
index 1f084c63..6a1a410a 100644
--- a/Sources/Tokenizers/Tokenizer.swift
+++ b/Sources/Tokenizers/Tokenizer.swift
@@ -215,7 +215,7 @@ public enum ChatTemplateArgument {
 ///
 /// This is the main protocol that defines all tokenizer operations, including text processing,
 /// chat template application, and special token handling.
-public protocol Tokenizer {
+public protocol Tokenizer: Sendable {
     /// Tokenizes the input text into a sequence of tokens.
     ///
     /// - Parameter text: The input text to tokenize
@@ -451,7 +451,7 @@ let specialTokenAttributes: [String] = [
 /// This class provides a complete tokenizer implementation that can be initialized from
 /// Hugging Face Hub configuration files and supports all standard tokenization operations
 /// including chat template application, normalization, pre-tokenization, and post-processing.
-public class PreTrainedTokenizer: Tokenizer {
+public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer {
     let model: TokenizingModel
 
     public var bosToken: String? { model.bosToken }
@@ -477,6 +477,9 @@ public class PreTrainedTokenizer: Tokenizer {
     /// Cache for compiled Jinja templates keyed by their literal template string
     private var compiledChatTemplateCache: [String: Template] = [:]
 
+    /// Lock to protect the compiled chat template cache from concurrent access
+    private let cacheLock = NSLock()
+
     /// Initializes a tokenizer from Hugging Face configuration files.
     ///
     /// - Parameters:
@@ -531,10 +534,26 @@
     }
 
     private func compiledTemplate(for templateString: String) throws -> Template {
+        // Fast path: check cache under lock
+        cacheLock.lock()
         if let cached = compiledChatTemplateCache[templateString] {
+            cacheLock.unlock()
             return cached
         }
+        cacheLock.unlock()
+
+        // Compile template outside of lock to avoid holding lock during expensive operation
         let compiled = try Template(templateString)
+
+        // Insert into cache under lock (using double-checked locking pattern)
+        cacheLock.lock()
+        defer { cacheLock.unlock() }
+
+        // Check again in case another thread compiled the same template
+        if let cached = compiledChatTemplateCache[templateString] {
+            return cached
+        }
+
         compiledChatTemplateCache[templateString] = compiled
         return compiled
     }
@@ -905,7 +924,7 @@ public extension AutoTokenizer {
 
 // MARK: - Tokenizer model classes
 
-class T5Tokenizer: UnigramTokenizer {}
+class T5Tokenizer: UnigramTokenizer, @unchecked Sendable {}
 
 // MARK: - PreTrainedTokenizer classes
 
@@ -954,7 +973,7 @@ func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?)
 }
 
 /// See https://github.com/xenova/transformers.js/blob/1a9964fb09b8f54fcbeac46dc6aae8d76795809d/src/tokenizers.js#L3203 for these exceptions
-class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
+class LlamaPreTrainedTokenizer: PreTrainedTokenizer, @unchecked Sendable {
     let isLegacy: Bool
 
     required init(tokenizerConfig: Config, tokenizerData: Config, strict: Bool = true) throws {
diff --git a/Sources/Tokenizers/UnigramTokenizer.swift b/Sources/Tokenizers/UnigramTokenizer.swift
index 5f5fe720..7ca8cc4c 100644
--- a/Sources/Tokenizers/UnigramTokenizer.swift
+++ b/Sources/Tokenizers/UnigramTokenizer.swift
@@ -14,7 +14,7 @@ import Hub
 /// Unigram tokenizers use a probabilistic approach where each token has a score,
 /// and the tokenization process finds the most probable segmentation of the input text.
 /// This is commonly used in models like T5 and XLM-RoBERTa.
-class UnigramTokenizer: PreTrainedTokenizerModel {
+class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
     /// A token with its associated score in the Unigram model.
     struct SentencePieceToken {
         var token: String
diff --git a/Tests/TokenizersTests/BertTokenizerTests.swift b/Tests/TokenizersTests/BertTokenizerTests.swift
index 7cad3e34..e6c269a5 100644
--- a/Tests/TokenizersTests/BertTokenizerTests.swift
+++ b/Tests/TokenizersTests/BertTokenizerTests.swift
@@ -83,6 +83,7 @@ private enum Squad {
 
 // MARK: -
 
+@MainActor
 private let bertTokenizer: BertTokenizer = {
     let vocab = {
         let url = Bundle.module.url(forResource: "bert-vocab", withExtension: "txt")!
@@ -101,6 +102,7 @@ private let bertTokenizer: BertTokenizer = {
 // MARK: -
 
 @Suite("BERT Tokenizer Tests")
+@MainActor
 struct BertTokenizerTests {
     @Test("Basic tokenizer correctly tokenizes text")
     func testBasicTokenizer() {
diff --git a/Tests/TokenizersTests/ChatTemplateTests.swift b/Tests/TokenizersTests/ChatTemplateTests.swift
index c9245af1..c6eddba8 100644
--- a/Tests/TokenizersTests/ChatTemplateTests.swift
+++ b/Tests/TokenizersTests/ChatTemplateTests.swift
@@ -19,6 +19,7 @@ struct ChatTemplateTests {
         ]
     ]
 
+    @MainActor
     static let phiTokenizerTask = Task {
         try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct")
     }
@@ -27,6 +28,7 @@ struct ChatTemplateTests {
         try await phiTokenizerTask.value
     }
 
+    @MainActor
     static let tokenizerWithTemplateArrayTask = Task {
         try await AutoTokenizer.from(pretrained: "mlx-community/Mistral-7B-Instruct-v0.3-4bit")
     }
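
The net effect of this diff is that a single tokenizer instance can now cross task boundaries: every stored property is immutable except the template cache, which is guarded by cacheLock. Below is a minimal usage sketch of what this enables, not code from the diff; the encodeConcurrently helper and the message contents are hypothetical, the model id is the one already used in ChatTemplateTests, and the applyChatTemplate(messages:) call is the protocol method rendered by the cached templates.

    import Tokenizers

    // Hypothetical sketch: share one Sendable tokenizer across concurrent tasks.
    // The NSLock-guarded cache in compiledTemplate(for:) means the chat template
    // is compiled at most once even when several tasks render it at the same time.
    func encodeConcurrently() async throws {
        let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct")

        try await withThrowingTaskGroup(of: [Int].self) { group in
            for i in 0..<4 {
                // Capturing `tokenizer` in a child task requires Sendable,
                // which is exactly the conformance this diff adds.
                group.addTask {
                    try tokenizer.applyChatTemplate(messages: [
                        ["role": "user", "content": "Request #\(i)"]
                    ])
                }
            }
            for try await tokenIds in group {
                print("Encoded \(tokenIds.count) tokens")
            }
        }
    }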