4 changes: 2 additions & 2 deletions Sources/Tokenizers/BPETokenizer.swift
@@ -10,7 +10,7 @@ import Foundation
 import Hub
 
 /// A pair of byte/token strings used in Byte-Pair Encoding (BPE) merge operations.
-struct BytePair: Hashable {
+struct BytePair: Hashable, Sendable {
     let a: String
     let b: String
     init(_ a: String, _ b: String) {
@@ -38,7 +38,7 @@ struct BytePair {
 /// BPE tokenizers learn to merge the most frequently occurring pairs of characters
 /// or character sequences. This implementation supports various BPE-based models
 /// including GPT-2, RoBERTa, and other transformer models.
-class BPETokenizer: PreTrainedTokenizerModel {
+class BPETokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
     let bpeRanks: [BytePair: Int]
     private let tokensToIds: [NSString: Int]
     private let idsToTokens: [Int: NSString]
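Note on the `@unchecked Sendable` here: Swift only grants a *checked* `Sendable` conformance to a class that is `final` with immutable, `Sendable` stored state. `BPETokenizer` stays non-final, so the conformance must be `@unchecked`, and thread safety rests on the fact that `bpeRanks`, `tokensToIds`, and `idsToTokens` are all `let` constants set in `init`. A minimal sketch of the same pattern, using a hypothetical type that is not part of this PR:

```swift
import Foundation

// A non-final class cannot declare a checked Sendable conformance,
// even when every stored property is an immutable `let`. Marking it
// `@unchecked Sendable` asserts thread safety manually; that holds
// here because the dictionary is never mutated after init.
class MergeRanks: @unchecked Sendable {
    private let ranks: [String: Int]

    init(ranks: [String: Int]) {
        self.ranks = ranks
    }

    func rank(of pair: String) -> Int? {
        ranks[pair]
    }
}
```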
14 changes: 7 additions & 7 deletions Sources/Tokenizers/BertTokenizer.swift
@@ -14,7 +14,7 @@ import Hub
 /// This tokenizer performs basic tokenization (whitespace and punctuation splitting)
 /// followed by WordPiece subword tokenization, which is the approach used by BERT
 /// and related models.
-public class BertTokenizer {
+public final class BertTokenizer: Sendable {
     private let basicTokenizer: BasicTokenizer
     private let wordpieceTokenizer: WordpieceTokenizer
     private let maxLen = 512
@@ -27,16 +27,16 @@ public class BertTokenizer {
     private let ids_to_tokens: [Int: String]
 
     /// The beginning-of-sequence token string, if defined.
-    public var bosToken: String?
+    public let bosToken: String?
 
     /// The numeric ID of the beginning-of-sequence token, if defined.
-    public var bosTokenId: Int?
+    public let bosTokenId: Int?
 
     /// The end-of-sequence token string, if defined.
-    public var eosToken: String?
+    public let eosToken: String?
 
     /// The numeric ID of the end-of-sequence token, if defined.
-    public var eosTokenId: Int?
+    public let eosTokenId: Int?
 
     /// Whether consecutive unknown tokens should be fused together.
     public let fuseUnknownTokens: Bool
@@ -225,7 +225,7 @@ extension BertTokenizer: PreTrainedTokenizerModel {
     }
 }
 
-class BasicTokenizer {
+final class BasicTokenizer: Sendable {
     let doLowerCase: Bool
 
     init(doLowerCase: Bool = true) {
@@ -291,7 +291,7 @@ private extension Character {
     }
 }
 
-class WordpieceTokenizer {
+final class WordpieceTokenizer: Sendable {
    let unkToken = "[UNK]"
    private let maxInputCharsPerWord = 100
    private let vocab: [String: Int]
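In contrast to `BPETokenizer`, this conformance is compiler-checked: the class becomes `final`, and the `var` special-token properties become `let`. That combination is exactly what the checked rule requires, as the sketch below illustrates (hypothetical type, not code from this PR):

```swift
// A final class whose stored properties are all immutable `let`s of
// Sendable types gets a compiler-verified Sendable conformance.
// Reintroducing a `var` stored property would make this an error.
final class SpecialTokens: Sendable {
    let bosToken: String?
    let eosToken: String?

    init(bosToken: String?, eosToken: String?) {
        self.bosToken = bosToken
        self.eosToken = eosToken
    }
}
```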
27 changes: 23 additions & 4 deletions Sources/Tokenizers/Tokenizer.swift
@@ -215,7 +215,7 @@ public enum ChatTemplateArgument {
 ///
 /// This is the main protocol that defines all tokenizer operations, including text processing,
 /// chat template application, and special token handling.
-public protocol Tokenizer {
+public protocol Tokenizer: Sendable {
     /// Tokenizes the input text into a sequence of tokens.
     ///
     /// - Parameter text: The input text to tokenize
@@ -451,7 +451,7 @@ let specialTokenAttributes: [String] = [
 /// This class provides a complete tokenizer implementation that can be initialized from
 /// Hugging Face Hub configuration files and supports all standard tokenization operations
 /// including chat template application, normalization, pre-tokenization, and post-processing.
-public class PreTrainedTokenizer: Tokenizer {
+public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer {
     let model: TokenizingModel
 
     public var bosToken: String? { model.bosToken }
@@ -477,6 +477,9 @@ public class PreTrainedTokenizer: Tokenizer {
     /// Cache for compiled Jinja templates keyed by their literal template string
     private var compiledChatTemplateCache: [String: Template] = [:]
 
+    /// Lock to protect the compiled chat template cache from concurrent access
+    private let cacheLock = NSLock()
+
     /// Initializes a tokenizer from Hugging Face configuration files.
     ///
     /// - Parameters:
@@ -531,10 +534,26 @@
     }
 
     private func compiledTemplate(for templateString: String) throws -> Template {
+        // Fast path: check cache under lock
+        cacheLock.lock()
         if let cached = compiledChatTemplateCache[templateString] {
+            cacheLock.unlock()
             return cached
         }
+        cacheLock.unlock()
+
+        // Compile template outside of lock to avoid holding lock during expensive operation
         let compiled = try Template(templateString)
+
+        // Insert into cache under lock (using double-checked locking pattern)
+        cacheLock.lock()
+        defer { cacheLock.unlock() }
+
+        // Check again in case another thread compiled the same template
+        if let cached = compiledChatTemplateCache[templateString] {
+            return cached
+        }
+
         compiledChatTemplateCache[templateString] = compiled
         return compiled
     }
@@ -905,7 +924,7 @@ public extension AutoTokenizer {
 
 // MARK: - Tokenizer model classes
 
-class T5Tokenizer: UnigramTokenizer {}
+class T5Tokenizer: UnigramTokenizer, @unchecked Sendable {}
 
 // MARK: - PreTrainedTokenizer classes
 
@@ -954,7 +973,7 @@ func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?)
 }
 
 /// See https://github.com/xenova/transformers.js/blob/1a9964fb09b8f54fcbeac46dc6aae8d76795809d/src/tokenizers.js#L3203 for these exceptions
-class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
+class LlamaPreTrainedTokenizer: PreTrainedTokenizer, @unchecked Sendable {
     let isLegacy: Bool
 
     required init(tokenizerConfig: Config, tokenizerData: Config, strict: Bool = true) throws {
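The `compiledTemplate(for:)` rewrite is a double-checked locking cache: look up under the lock, release the lock for the expensive `Template` compilation, then re-check before inserting so that a race at worst compiles the same template twice and consistently keeps one result. A self-contained sketch of the pattern with `NSLock`, using a generic cache type that is not part of this PR:

```swift
import Foundation

/// A minimal double-checked-locking cache, assuming `compute` is
/// expensive but pure (safe to run twice under a race).
final class ComputeCache<Value>: @unchecked Sendable {
    private var storage: [String: Value] = [:]
    private let lock = NSLock()

    func value(for key: String, compute: (String) throws -> Value) rethrows -> Value {
        // Fast path: return a cached value if one exists.
        lock.lock()
        if let cached = storage[key] {
            lock.unlock()
            return cached
        }
        lock.unlock()

        // Slow path: compute without holding the lock.
        let fresh = try compute(key)

        // Re-check before inserting; another thread may have won the race.
        lock.lock()
        defer { lock.unlock() }
        if let cached = storage[key] {
            return cached
        }
        storage[key] = fresh
        return fresh
    }
}
```

The trade-off is deliberate: a duplicate compilation under contention is cheap and rare, while holding the lock across compilation would serialize every unrelated cache lookup.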
2 changes: 1 addition & 1 deletion Sources/Tokenizers/UnigramTokenizer.swift
@@ -14,7 +14,7 @@ import Hub
 /// Unigram tokenizers use a probabilistic approach where each token has a score,
 /// and the tokenization process finds the most probable segmentation of the input text.
 /// This is commonly used in models like T5 and XLM-RoBERTa.
-class UnigramTokenizer: PreTrainedTokenizerModel {
+class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
     /// A token with its associated score in the Unigram model.
     struct SentencePieceToken {
         var token: String
2 changes: 2 additions & 0 deletions Tests/TokenizersTests/BertTokenizerTests.swift
@@ -83,6 +83,7 @@ private enum Squad {
 
 // MARK: -
 
+@MainActor
 private let bertTokenizer: BertTokenizer = {
     let vocab = {
         let url = Bundle.module.url(forResource: "bert-vocab", withExtension: "txt")!
@@ -101,6 +102,7 @@ private let bertTokenizer: BertTokenizer = {
 // MARK: -
 
 @Suite("BERT Tokenizer Tests")
+@MainActor
 struct BertTokenizerTests {
     @Test("Basic tokenizer correctly tokenizes text")
     func testBasicTokenizer() {
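The test-side changes track Swift 6 strict concurrency: a lazily initialized global like `bertTokenizer` draws diagnostics unless it is isolated or provably safe, and one way to satisfy the checker is to pin both the global and every suite that touches it to `@MainActor`, which is what these annotations do. A minimal illustration with hypothetical names:

```swift
// Isolating a lazily initialized global and its consumers to the main
// actor keeps every access on a single executor, so no further
// Sendable reasoning about the fixture is needed.
@MainActor
private let fixture: [String: Int] = {
    ["[UNK]": 0, "[CLS]": 1, "[SEP]": 2] // stand-in for loading a vocab file
}()

@MainActor
struct FixtureConsumer {
    func id(for token: String) -> Int? {
        fixture[token]
    }
}
```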
2 changes: 2 additions & 0 deletions Tests/TokenizersTests/ChatTemplateTests.swift
@@ -19,6 +19,7 @@ struct ChatTemplateTests {
         ]
     ]
 
+    @MainActor
     static let phiTokenizerTask = Task {
         try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct")
     }
@@ -27,6 +28,7 @@
         try await phiTokenizerTask.value
     }
 
+    @MainActor
     static let tokenizerWithTemplateArrayTask = Task {
         try await AutoTokenizer.from(pretrained: "mlx-community/Mistral-7B-Instruct-v0.3-4bit")
     }
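Caching each remote tokenizer in a `static let Task` means the Hub download starts once and every test awaits the same in-flight value, while `@MainActor` isolates the stored tasks so the statics pass strict-concurrency checking. A sketch of the pattern (the fixture type is hypothetical; the model id is the one used above):

```swift
import Tokenizers

@MainActor
enum TokenizerFixtures {
    // Created on first access; concurrent awaiters share one download.
    static let phi = Task {
        try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct")
    }
}

// In a test:
// let tokenizer = try await TokenizerFixtures.phi.value
```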