Skip to content
Merged
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ struct TransformersCLI: AsyncParsableCommand {
@Option(help: "Repetition penalty to discourage repeating tokens (typical: 1.0-2.0, 1.0 = no penalty)")
var repetitionPenalty: Float?

@Option(help: "Path to a local folder containing tokenizer_config.json and tokenizer.json")
var tokenizerFolder: String?

func generate(
model: LanguageModel,
config: GenerationConfig,
Expand Down Expand Up @@ -104,7 +107,17 @@ struct TransformersCLI: AsyncParsableCommand {
let url = URL(filePath: modelPath)
let compiledURL = try compile(at: url)
print("Loading model \(compiledURL)")
let model = try LanguageModel.loadCompiled(url: compiledURL, computeUnits: computeUnits.asMLComputeUnits)
let model: LanguageModel
if let tokenizerFolder {
let tokenizerURL = URL(filePath: tokenizerFolder, directoryHint: .isDirectory)
model = try LanguageModel.loadCompiled(
url: compiledURL,
tokenizerFolder: tokenizerURL,
computeUnits: computeUnits.asMLComputeUnits
)
} else {
model = try LanguageModel.loadCompiled(url: compiledURL, computeUnits: computeUnits.asMLComputeUnits)
}

var config = model.defaultGenerationConfig
config.doSample = doSample
Expand Down
37 changes: 35 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,41 @@ example converting and running Mistral 7B using CoreML [here](https://github.com

The [modernization of Core ML](https://github.com/huggingface/swift-transformers/pull/257) and corresponding examples were primarily contributed by @joshnewnham, @1duo, @alejandro-isaza, @aseemw. Thank you 🙏

### Offline CoreML tokenizers

When you bundle a compiled CoreML model and tokenizer files with your app, you can skip any network requests by injecting
the tokenizer (or a local configuration) when constructing `LanguageModel`:

```swift
let compiledURL: URL = ... // path to .mlmodelc
let tokenizerFolder: URL = ... // folder containing tokenizer_config.json and tokenizer.json

let model = try LanguageModel.loadCompiled(
url: compiledURL,
tokenizerFolder: tokenizerFolder
)

// Or construct the tokenizer yourself (inside an async context)
let tokenizer = try await AutoTokenizer.from(modelFolder: tokenizerFolder)
let modelWithTokenizer = try LanguageModel.loadCompiled(
url: compiledURL,
tokenizer: tokenizer
)
```

Make sure the tokenizer assets come from the same Hugging Face repo as the original checkpoint. For the
Mistral example in `Examples/Mistral7B/`, you can fetch the tokenizer like this:

```bash
huggingface-cli download \
mistralai/Mistral-7B-Instruct-v0.3 \
tokenizer.json tokenizer_config.json \
--local-dir Examples/Mistral7B/local-tokenizer
```

If the repo is gated, authenticate with `huggingface-cli login` first. Both loading variants resolve tokenizer assets locally — one from the folder you point at, the other from the tokenizer instance you pass in — and never reach out to the Hugging Face Hub.

## Usage via SwiftPM

To use `swift-transformers` with SwiftPM, you can add this to your `Package.swift`:
Expand Down Expand Up @@ -139,5 +174,3 @@ To format your code, run `swift format -i --recursive .`.
## License

[Apache 2](LICENSE).


96 changes: 85 additions & 11 deletions Sources/Models/LanguageModel.swift
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,30 @@ public class LanguageModel {

/// Creates a new language model instance from a CoreML model.
///
/// - Parameters:
///   - model: The CoreML model to wrap
///   - configuration: Optional Hub configuration already resolved on disk
///   - tokenizer: Optional preconstructed tokenizer to reuse
/// - Important: Triggers a fatal error if the model doesn't have the expected input shape information
public required init(
    model: MLModel,
    configuration: LanguageModelConfigurationFromHub? = nil,
    tokenizer: Tokenizer? = nil
) {
    self.model = model
    _tokenizer = tokenizer
    (minContextLength, maxContextLength) = Self.contextRange(from: model)
    if let configuration {
        // The caller already resolved a configuration (e.g. from a local folder); reuse it.
        self.configuration = configuration
    } else if tokenizer == nil {
        // Neither a configuration nor a tokenizer was injected:
        // fall back to resolving the configuration from the Hub by model name.
        self.configuration = LanguageModelConfigurationFromHub(modelName: modelName)
    } else {
        // A tokenizer was injected directly, so no Hub configuration is required (offline use).
        self.configuration = nil
    }
}

/// Creates a language model from a CoreML model, resolving configuration lazily by model name.
///
/// - Parameter model: The CoreML model to wrap
public convenience required init(model: MLModel) {
    // Delegates to the designated initializer; `tokenizer` keeps its nil default.
    self.init(model: model, configuration: nil)
}

public func resetState() async {}
Expand Down Expand Up @@ -142,17 +160,60 @@ public extension LanguageModel {
/// - Parameters:
/// - url: The URL of the compiled CoreML model file (.mlmodelc)
/// - computeUnits: The compute units to use for model inference
/// - configuration: Optional Hub configuration describing tokenizer/model metadata
/// - tokenizer: Optional tokenizer instance to reuse instead of loading from disk
/// - Returns: A configured `LanguageModel` instance
/// - Throws: An error if the model cannot be loaded from the specified URL
static func loadCompiled(
    url: URL,
    computeUnits: MLComputeUnits = .cpuAndGPU,
    configuration: LanguageModelConfigurationFromHub? = nil,
    tokenizer: Tokenizer? = nil
) throws -> LanguageModel {
    let config = MLModelConfiguration()
    config.computeUnits = computeUnits
    let model = try MLModel(contentsOf: url, configuration: config)
    // Choose the wrapper class that matches the model's KV-cache capabilities.
    return switch kvCacheAvailability(for: model) {
    case .statefulKVCache:
        LanguageModelWithStatefulKVCache(
            model: model,
            configuration: configuration,
            tokenizer: tokenizer
        )
    default:
        LanguageModel(
            model: model,
            configuration: configuration,
            tokenizer: tokenizer
        )
    }
}

/// Loads a compiled CoreML model using tokenizer assets from a local folder.
///
/// - Parameters:
///   - url: The URL of the compiled CoreML model file (.mlmodelc)
///   - tokenizerFolder: Local directory containing tokenizer_config.json and tokenizer.json
///   - computeUnits: The compute units to use for model inference
/// - Returns: A configured `LanguageModel` instance
/// - Throws: An error if the model cannot be loaded from the specified URL
static func loadCompiled(
    url: URL,
    tokenizerFolder: URL,
    computeUnits: MLComputeUnits = .cpuAndGPU
) throws -> LanguageModel {
    // Back the configuration with local files instead of a Hub lookup.
    let localConfiguration = LanguageModelConfigurationFromHub(modelFolder: tokenizerFolder)
    return try loadCompiled(
        url: url,
        computeUnits: computeUnits,
        configuration: localConfiguration,
        tokenizer: nil
    )
}

/// Loads a compiled CoreML model, reusing a tokenizer the caller has already constructed.
///
/// - Parameters:
///   - url: The URL of the compiled CoreML model file (.mlmodelc)
///   - tokenizer: Tokenizer instance to reuse instead of loading one from disk or the Hub
///   - computeUnits: The compute units to use for model inference
/// - Returns: A configured `LanguageModel` instance
/// - Throws: An error if the model cannot be loaded from the specified URL
static func loadCompiled(
    url: URL,
    tokenizer: Tokenizer,
    computeUnits: MLComputeUnits = .cpuAndGPU
) throws -> LanguageModel {
    // No Hub configuration is needed when the tokenizer is injected directly.
    return try loadCompiled(
        url: url,
        computeUnits: computeUnits,
        configuration: nil,
        tokenizer: tokenizer
    )
}
}

@available(macOS 15.0, iOS 18.0, *)
Expand Down Expand Up @@ -304,7 +365,8 @@ public extension LanguageModel {
/// - Throws: An error if the configuration cannot be loaded
var modelConfig: Config? {
    get async throws {
        // No configuration (e.g. a tokenizer was injected directly) means no model config to load.
        guard let configuration else { return nil }
        return try await configuration.modelConfig
    }
}

Expand All @@ -314,7 +376,8 @@ public extension LanguageModel {
/// - Throws: An error if the configuration cannot be loaded
var tokenizerConfig: Config? {
    get async throws {
        // No configuration (e.g. a tokenizer was injected directly) means no tokenizer config to load.
        guard let configuration else { return nil }
        return try await configuration.tokenizerConfig
    }
}

Expand All @@ -324,7 +387,10 @@ public extension LanguageModel {
/// - Throws: An error if the tokenizer data cannot be loaded
var tokenizerData: Config {
    get async throws {
        // Tokenizer data is non-optional; surface a typed error when no configuration source exists.
        guard let configuration else {
            throw TokenizerError.missingConfig
        }
        return try await configuration.tokenizerData
    }
}

Expand Down Expand Up @@ -434,8 +500,12 @@ public class LanguageModelWithStatefulKVCache: LanguageModel {

var state: MLState?

public required init(model: MLModel) {
super.init(model: model)
public required init(
model: MLModel,
configuration: LanguageModelConfigurationFromHub? = nil,
tokenizer: Tokenizer? = nil
) {
super.init(model: model, configuration: configuration, tokenizer: tokenizer)
// To support pre-filling and extend, the input must support
// flexible shapes.
guard maxContextLength - minContextLength > 1 else {
Expand Down Expand Up @@ -506,11 +576,15 @@ public class LanguageModelWithStatefulKVCache: LanguageModel {
/// Errors raised while resolving tokenizer assets for a language model.
public enum TokenizerError: LocalizedError {
    /// The tokenizer configuration file could not be found.
    case tokenizerConfigNotFound
    /// The language model configuration required to load tokenizer data is missing.
    case missingConfig

    /// A user-presentable description of the failure.
    public var errorDescription: String? {
        switch self {
        case .tokenizerConfigNotFound:
            return String(localized: "Tokenizer configuration could not be found. The model may be missing required tokenizer files.", comment: "Error when tokenizer configuration is missing")
        case .missingConfig:
            return String(localized: "Language model configuration was not set, tokenizer assets could not be loaded.", comment: "Error when configuration needed for tokenizer data is missing")
        }
    }
}
Expand Down