
Commit bbd06f7

Adopt Sendable protocol for tokenizer protocols and conforming types
1 parent 52a6f59 commit bbd06f7


4 files changed, +8 -8 lines changed


Sources/Tokenizers/BPETokenizer.swift

Lines changed: 2 additions & 2 deletions
@@ -10,7 +10,7 @@ import Foundation
 import Hub
 
 /// A pair of byte/token strings used in Byte-Pair Encoding (BPE) merge operations.
-struct BytePair: Hashable {
+struct BytePair: Hashable, Sendable {
     let a: String
     let b: String
     init(_ a: String, _ b: String) {
@@ -38,7 +38,7 @@ struct BytePair: Hashable {
 /// BPE tokenizers learn to merge the most frequently occurring pairs of characters
 /// or character sequences. This implementation supports various BPE-based models
 /// including GPT-2, RoBERTa, and other transformer models.
-class BPETokenizer: PreTrainedTokenizerModel {
+class BPETokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
     let bpeRanks: [BytePair: Int]
     private let tokensToIds: [NSString: Int]
     private let idsToTokens: [Int: NSString]
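
For context on the pattern used throughout this commit: the classes touched here store only immutable `let` properties, so instances are safe to share across concurrency domains, but a non-final class cannot adopt checked `Sendable`, so the conformance is asserted with `@unchecked Sendable`. A minimal sketch of the same shape, using a hypothetical `VocabLookup` type rather than anything from this package:

    // Hypothetical type, not part of this commit: all stored properties are
    // immutable `let` values of Sendable types, so instances are safe to share,
    // but a non-final class cannot use checked Sendable, hence @unchecked.
    class VocabLookup: @unchecked Sendable {
        private let tokensToIds: [String: Int]

        init(tokensToIds: [String: Int]) {
            self.tokensToIds = tokensToIds
        }

        func id(for token: String) -> Int? {
            tokensToIds[token]
        }
    }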

Sources/Tokenizers/BertTokenizer.swift

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ import Hub
 /// This tokenizer performs basic tokenization (whitespace and punctuation splitting)
 /// followed by WordPiece subword tokenization, which is the approach used by BERT
 /// and related models.
-public class BertTokenizer {
+public class BertTokenizer: @unchecked Sendable {
     private let basicTokenizer: BasicTokenizer
     private let wordpieceTokenizer: WordpieceTokenizer
     private let maxLen = 512

Sources/Tokenizers/Tokenizer.swift

Lines changed: 4 additions & 4 deletions
@@ -215,7 +215,7 @@ public enum ChatTemplateArgument
 ///
 /// This is the main protocol that defines all tokenizer operations, including text processing,
 /// chat template application, and special token handling.
-public protocol Tokenizer {
+public protocol Tokenizer: Sendable {
     /// Tokenizes the input text into a sequence of tokens.
     ///
     /// - Parameter text: The input text to tokenize
@@ -451,7 +451,7 @@ let specialTokenAttributes: [String] = [
 /// This class provides a complete tokenizer implementation that can be initialized from
 /// Hugging Face Hub configuration files and supports all standard tokenization operations
 /// including chat template application, normalization, pre-tokenization, and post-processing.
-public class PreTrainedTokenizer: Tokenizer {
+public class PreTrainedTokenizer: @unchecked Sendable, Tokenizer {
     let model: TokenizingModel
 
     public var bosToken: String? { model.bosToken }
@@ -905,7 +905,7 @@ public extension AutoTokenizer {
 
 // MARK: - Tokenizer model classes
 
-class T5Tokenizer: UnigramTokenizer {}
+class T5Tokenizer: UnigramTokenizer, @unchecked Sendable {}
 
 // MARK: - PreTrainedTokenizer classes
 
@@ -954,7 +954,7 @@ func maybeUpdatePostProcessor(tokenizerConfig: Config, processorConfig: Config?)
 }
 
 /// See https://github.com/xenova/transformers.js/blob/1a9964fb09b8f54fcbeac46dc6aae8d76795809d/src/tokenizers.js#L3203 for these exceptions
-class LlamaPreTrainedTokenizer: PreTrainedTokenizer {
+class LlamaPreTrainedTokenizer: PreTrainedTokenizer, @unchecked Sendable {
     let isLegacy: Bool
 
     required init(tokenizerConfig: Config, tokenizerData: Config, strict: Bool = true) throws {
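
A hedged sketch of what the new `Sendable` requirement on `Tokenizer` enables under strict concurrency checking: an existential tokenizer can now be captured by a detached task, or sent to another actor, without data-race diagnostics. The helper function below is illustrative and not part of this diff; it assumes `tokenize(text:)` returns `[String]`, as the protocol's doc comments above suggest.

    import Tokenizers

    // Illustrative only: assumes a tokenizer value obtained elsewhere
    // (for example via AutoTokenizer in this package).
    func encodeInBackground(_ tokenizer: any Tokenizer, text: String) async -> [String] {
        // Because Tokenizer now refines Sendable, capturing the existential in a
        // @Sendable closure compiles cleanly under strict concurrency checking.
        let task = Task.detached {
            tokenizer.tokenize(text: text)
        }
        return await task.value
    }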

Sources/Tokenizers/UnigramTokenizer.swift

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@ import Hub
 /// Unigram tokenizers use a probabilistic approach where each token has a score,
 /// and the tokenization process finds the most probable segmentation of the input text.
 /// This is commonly used in models like T5 and XLM-RoBERTa.
-class UnigramTokenizer: PreTrainedTokenizerModel {
+class UnigramTokenizer: PreTrainedTokenizerModel, @unchecked Sendable {
     /// A token with its associated score in the Unigram model.
     struct SentencePieceToken {
         var token: String
