From 48f9167e7f0df347e22de6a4afb389f5fd87a820 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Tue, 1 Oct 2024 16:22:16 +0200 Subject: [PATCH 1/9] Improve chat template parsing --- Sources/Tokenizers/Tokenizer.swift | 32 ++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index e026d71..0ac0de6 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -125,7 +125,8 @@ public protocol Tokenizer { chatTemplate: String?, addGenerationPrompt: Bool, truncation: Bool, - maxLength: Int? + maxLength: Int?, + tools: [[String: Any]]? ) throws -> [Int] } @@ -323,12 +324,35 @@ public class PreTrainedTokenizer: Tokenizer { public func applyChatTemplate( messages: [[String: String]], - chatTemplate: String?, + chatTemplate: String? = nil, addGenerationPrompt: Bool = false, truncation: Bool = false, - maxLength: Int? + maxLength: Int? = nil, + tools: [[String: Any]]? = nil ) throws -> [Int] { - let template = try Template(chatTemplate ?? tokenizerConfig.chatTemplate?.stringValue ?? defaultChatTemplate) + var chatTemplateFromConfig: String? + if let chatTemplateValue = tokenizerConfig.chatTemplate { + if let chatTemplateStringValue = chatTemplateValue.stringValue { + chatTemplateFromConfig = chatTemplateStringValue + } else if let chatTemplateArrayValue = chatTemplateValue.arrayValue { + // If a list of chat templates is specified, convert them to a dict + let templateDict = Dictionary(uniqueKeysWithValues: chatTemplateArrayValue.compactMap { template in + guard let name = template[dynamicMember: "name"]?.stringValue, + let templateString = template[dynamicMember: "template"]?.stringValue else { + return nil + } + return (name, templateString) + }) + // Choose the appropriate template + if let tools = tools, !tools.isEmpty, let toolUseTemplate = templateDict["tool_use"] { + chatTemplateFromConfig = toolUseTemplate + } else { + chatTemplateFromConfig = templateDict["default"] + } + } + } + + let template = try Template(chatTemplate ?? chatTemplateFromConfig ?? defaultChatTemplate) var context: [String: Any] = [ "messages": messages, "add_generation_prompt": addGenerationPrompt From ac91113058da2ba7f3336f097eb7f16fcd9de16d Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Wed, 2 Oct 2024 18:26:50 +0200 Subject: [PATCH 2/9] Clean up --- Sources/Tokenizers/Tokenizer.swift | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index 0ac0de6..577014e 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -337,8 +337,8 @@ public class PreTrainedTokenizer: Tokenizer { } else if let chatTemplateArrayValue = chatTemplateValue.arrayValue { // If a list of chat templates is specified, convert them to a dict let templateDict = Dictionary(uniqueKeysWithValues: chatTemplateArrayValue.compactMap { template in - guard let name = template[dynamicMember: "name"]?.stringValue, - let templateString = template[dynamicMember: "template"]?.stringValue else { + guard let name = template.name?.stringValue, + let templateString = template.template?.stringValue else { return nil } return (name, templateString) From 852ea261704c14f263696454ad38f9562c8c9ccc Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Wed, 2 Oct 2024 19:05:26 +0200 Subject: [PATCH 3/9] Improve chat template selection --- Sources/Tokenizers/Tokenizer.swift | 56 ++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 19 deletions(-) diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index 577014e..90eb97f 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -9,12 +9,13 @@ import Hub import Foundation import Jinja -enum TokenizerError : Error { +enum TokenizerError: Error { case missingConfig case missingTokenizerClassInConfig case unsupportedTokenizer(String) case missingVocab case malformedVocab + case noChatTemplateSpecified case tooLong(String) } @@ -177,8 +178,6 @@ public class PreTrainedTokenizer: Tokenizer { private let tokenizerConfig: Config private let cleanUpTokenizationSpaces: Bool - - private let defaultChatTemplate: String = "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" required public init(tokenizerConfig: Config, tokenizerData: Config) throws { var addedTokens: [String : Int] = [:] @@ -324,35 +323,54 @@ public class PreTrainedTokenizer: Tokenizer { public func applyChatTemplate( messages: [[String: String]], + /// A Jinja template or the name of a template to use for this conversion. + /// It is usually not necessary to pass anything to this argument, + /// as the model's template will be used by default. chatTemplate: String? = nil, addGenerationPrompt: Bool = false, truncation: Bool = false, maxLength: Int? = nil, + /// A list of tools (callable functions) that will be accessible to the model. If the template does not + /// support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema, + /// giving the name, description and argument types for the tool. See the + /// [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use) + /// for more information. tools: [[String: Any]]? = nil ) throws -> [Int] { - var chatTemplateFromConfig: String? - if let chatTemplateValue = tokenizerConfig.chatTemplate { - if let chatTemplateStringValue = chatTemplateValue.stringValue { - chatTemplateFromConfig = chatTemplateStringValue - } else if let chatTemplateArrayValue = chatTemplateValue.arrayValue { - // If a list of chat templates is specified, convert them to a dict - let templateDict = Dictionary(uniqueKeysWithValues: chatTemplateArrayValue.compactMap { template in - guard let name = template.name?.stringValue, - let templateString = template.template?.stringValue else { + var selectedChatTemplate: String? + if let valueFromConfig = tokenizerConfig.chatTemplate { + if let arrayValue = valueFromConfig.arrayValue { + // If the config specifies a list of chat templates, convert them to a dictionary + let templateDict = Dictionary(uniqueKeysWithValues: arrayValue.compactMap { item in + guard let name = item.name?.stringValue, let template = item.template?.stringValue else { return nil } - return (name, templateString) + return (name, template) }) - // Choose the appropriate template - if let tools = tools, !tools.isEmpty, let toolUseTemplate = templateDict["tool_use"] { - chatTemplateFromConfig = toolUseTemplate - } else { - chatTemplateFromConfig = templateDict["default"] + if let chatTemplateArgument = chatTemplate, let matchingDictEntry = templateDict[chatTemplateArgument] { + // Use chat template from config that matches the name specified in the `chatTemplate` argument + selectedChatTemplate = matchingDictEntry + } else if let tools, !tools.isEmpty, let toolUseTemplate = templateDict["tool_use"] { + // Use tool use chat template from config + selectedChatTemplate = toolUseTemplate + } else if let defaultChatTemplate = templateDict["default"] { + // Use default chat template from config + selectedChatTemplate = defaultChatTemplate } + } else if let chatTemplateArgument = chatTemplate { + // Use chat template from argument + selectedChatTemplate = chatTemplateArgument + } else if let stringValue = valueFromConfig.stringValue { + // Use chat template from config + selectedChatTemplate = stringValue } } - let template = try Template(chatTemplate ?? chatTemplateFromConfig ?? defaultChatTemplate) + guard let selectedChatTemplate else { + throw TokenizerError.noChatTemplateSpecified + } + + let template = try Template(selectedChatTemplate) var context: [String: Any] = [ "messages": messages, "add_generation_prompt": addGenerationPrompt From b916247f48feb6090129a509982e6b1b4fa922cd Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Wed, 2 Oct 2024 20:31:43 +0200 Subject: [PATCH 4/9] Add tests for chat templates --- Tests/TokenizersTests/ChatTemplateTests.swift | 61 +++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 Tests/TokenizersTests/ChatTemplateTests.swift diff --git a/Tests/TokenizersTests/ChatTemplateTests.swift b/Tests/TokenizersTests/ChatTemplateTests.swift new file mode 100644 index 0000000..450b319 --- /dev/null +++ b/Tests/TokenizersTests/ChatTemplateTests.swift @@ -0,0 +1,61 @@ +// +// ChatTemplateTests.swift +// swift-transformers +// +// Created by Anthony DePasquale on 2/10/24. +// + +import XCTest +import Tokenizers + +class ChatTemplateTests: XCTestCase { + let messages = [[ + "role": "user", + "content": "Describe the Swift programming language.", + ]] + + func testTemplateFromConfig() async throws { + let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct") + let encoded = try tokenizer.applyChatTemplate(messages: messages) + let encodedTarget = [32010, 4002, 29581, 278, 14156, 8720, 4086, 29889, 32007, 32001] + let decoded = tokenizer.decode(tokens: encoded) + let decodedTarget = "<|user|>Describe the Swift programming language.<|end|><|assistant|>" + XCTAssertEqual(encoded, encodedTarget) + XCTAssertEqual(decoded, decodedTarget) + } + + func testDefaultTemplateFromArrayInConfig() async throws { + let tokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Mistral-7B-Instruct-v0.3-4bit") + let encoded = try tokenizer.applyChatTemplate(messages: messages) + let encodedTarget = [1, 29473, 3, 28752, 1040, 4672, 2563, 17060, 4610, 29491, 29473, 4] + let decoded = tokenizer.decode(tokens: encoded) + let decodedTarget = " [INST] Describe the Swift programming language. [/INST]" + XCTAssertEqual(encoded, encodedTarget) + XCTAssertEqual(decoded, decodedTarget) + } + + func testTemplateFromArgument() async throws { + let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct") + // Purposely not using the correct template for this model to verify that the template from the config is not being used + let mistral7BDefaultTemplate = "{{bos_token}}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: mistral7BDefaultTemplate, addGenerationPrompt: false, truncation: false, maxLength: nil, tools: nil) + let encodedTarget = [1, 518, 25580, 29962, 20355, 915, 278, 14156, 8720, 4086, 29889, 518, 29914, 25580, 29962] + let decoded = tokenizer.decode(tokens: encoded) + let decodedTarget = " [INST] Describe the Swift programming language. [/INST]" + XCTAssertEqual(encoded, encodedTarget) + XCTAssertEqual(decoded, decodedTarget) + } + + func testNamedTemplateFromArgument() async throws { + let tokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Mistral-7B-Instruct-v0.3-4bit") + // Normally it is not necessary to specify the name `default`, but I'm not aware of models with lists of templates in the config that are not `default` or `tool_use` + let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: "default", addGenerationPrompt: false, truncation: false, maxLength: nil, tools: nil) + let encodedTarget = [1, 29473, 3, 28752, 1040, 4672, 2563, 17060, 4610, 29491, 29473, 4] + let decoded = tokenizer.decode(tokens: encoded) + let decodedTarget = " [INST] Describe the Swift programming language. [/INST]" + XCTAssertEqual(encoded, encodedTarget) + XCTAssertEqual(decoded, decodedTarget) + } + + // TODO: Add tests for tool use template +} From cb47ba4064057cec9eda5c1332fe78f52c96a3f8 Mon Sep 17 00:00:00 2001 From: Anthony Date: Thu, 3 Oct 2024 00:06:39 +0200 Subject: [PATCH 5/9] Update Sources/Tokenizers/Tokenizer.swift Co-authored-by: Pedro Cuenca --- Sources/Tokenizers/Tokenizer.swift | 1 + 1 file changed, 1 insertion(+) diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index 90eb97f..ae680d4 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -335,6 +335,7 @@ public class PreTrainedTokenizer: Tokenizer { /// giving the name, description and argument types for the tool. See the /// [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use) /// for more information. + /// Note: tool calling is not supported yet, it will be available in a future update. tools: [[String: Any]]? = nil ) throws -> [Int] { var selectedChatTemplate: String? From 19a5da7bbf7f9e93ef8d9d96ebe803a6740d8e61 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Thu, 3 Oct 2024 01:18:47 +0200 Subject: [PATCH 6/9] Improve template selection --- Sources/Tokenizers/Tokenizer.swift | 50 +++++++++++++------ Tests/TokenizersTests/ChatTemplateTests.swift | 4 +- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index ae680d4..dfb226f 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -15,8 +15,7 @@ enum TokenizerError: Error { case unsupportedTokenizer(String) case missingVocab case malformedVocab - case noChatTemplateSpecified - + case chatTemplate(String) case tooLong(String) } @@ -120,10 +119,15 @@ public protocol Tokenizer { var unknownTokenId: Int? { get } func applyChatTemplate(messages: [[String: String]]) throws -> [Int] - + + func applyChatTemplate(messages: [[String: String]], chatTemplate: String) throws -> [Int] + + func applyChatTemplate(messages: [[String: String]], chatTemplateName: String) throws -> [Int] + func applyChatTemplate( messages: [[String: String]], chatTemplate: String?, + chatTemplateName: String?, addGenerationPrompt: Bool, truncation: Bool, maxLength: Int?, @@ -318,15 +322,23 @@ public class PreTrainedTokenizer: Tokenizer { } public func applyChatTemplate(messages: [[String: String]]) throws -> [Int] { - try applyChatTemplate(messages: messages, chatTemplate: nil, addGenerationPrompt: true, maxLength: nil) + try applyChatTemplate(messages: messages, addGenerationPrompt: true) } - + + public func applyChatTemplate(messages: [[String: String]], chatTemplate: String) throws -> [Int] { + try applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: true) + } + + public func applyChatTemplate(messages: [[String: String]], chatTemplateName: String) throws -> [Int] { + try applyChatTemplate(messages: messages, chatTemplateName: chatTemplateName, addGenerationPrompt: true) + } + public func applyChatTemplate( messages: [[String: String]], - /// A Jinja template or the name of a template to use for this conversion. - /// It is usually not necessary to pass anything to this argument, - /// as the model's template will be used by default. + /// A Jinja template to use for this conversion. Normally it is not necessary to provide a template, since it will be read from the tokenizer config file. chatTemplate: String? = nil, + /// For models whose tokenizer config file includes multiple chat templates, the template can be specified by name. Normally this is not necessary. + chatTemplateName: String? = nil, addGenerationPrompt: Bool = false, truncation: Bool = false, maxLength: Int? = nil, @@ -339,7 +351,10 @@ public class PreTrainedTokenizer: Tokenizer { tools: [[String: Any]]? = nil ) throws -> [Int] { var selectedChatTemplate: String? - if let valueFromConfig = tokenizerConfig.chatTemplate { + if let chatTemplate { + // Use chat template from argument + selectedChatTemplate = chatTemplate + } else if let valueFromConfig = tokenizerConfig.chatTemplate { if let arrayValue = valueFromConfig.arrayValue { // If the config specifies a list of chat templates, convert them to a dictionary let templateDict = Dictionary(uniqueKeysWithValues: arrayValue.compactMap { item in @@ -348,9 +363,13 @@ public class PreTrainedTokenizer: Tokenizer { } return (name, template) }) - if let chatTemplateArgument = chatTemplate, let matchingDictEntry = templateDict[chatTemplateArgument] { - // Use chat template from config that matches the name specified in the `chatTemplate` argument - selectedChatTemplate = matchingDictEntry + if let chatTemplateName { + // Select chat template from config by name + if let matchingDictEntry = templateDict[chatTemplateName] { + selectedChatTemplate = matchingDictEntry + } else { + throw TokenizerError.chatTemplate("No chat template named \"\(chatTemplateName)\" was found in the tokenizer config file") + } } else if let tools, !tools.isEmpty, let toolUseTemplate = templateDict["tool_use"] { // Use tool use chat template from config selectedChatTemplate = toolUseTemplate @@ -358,9 +377,6 @@ public class PreTrainedTokenizer: Tokenizer { // Use default chat template from config selectedChatTemplate = defaultChatTemplate } - } else if let chatTemplateArgument = chatTemplate { - // Use chat template from argument - selectedChatTemplate = chatTemplateArgument } else if let stringValue = valueFromConfig.stringValue { // Use chat template from config selectedChatTemplate = stringValue @@ -368,13 +384,15 @@ public class PreTrainedTokenizer: Tokenizer { } guard let selectedChatTemplate else { - throw TokenizerError.noChatTemplateSpecified + throw TokenizerError.chatTemplate("No chat template was specified") } let template = try Template(selectedChatTemplate) var context: [String: Any] = [ "messages": messages, "add_generation_prompt": addGenerationPrompt + // TODO: Add `tools` entry when support is added in Jinja + // "tools": tools ] // TODO: maybe keep NSString here diff --git a/Tests/TokenizersTests/ChatTemplateTests.swift b/Tests/TokenizersTests/ChatTemplateTests.swift index 450b319..b085d0b 100644 --- a/Tests/TokenizersTests/ChatTemplateTests.swift +++ b/Tests/TokenizersTests/ChatTemplateTests.swift @@ -38,7 +38,7 @@ class ChatTemplateTests: XCTestCase { let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct") // Purposely not using the correct template for this model to verify that the template from the config is not being used let mistral7BDefaultTemplate = "{{bos_token}}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" - let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: mistral7BDefaultTemplate, addGenerationPrompt: false, truncation: false, maxLength: nil, tools: nil) + let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: mistral7BDefaultTemplate) let encodedTarget = [1, 518, 25580, 29962, 20355, 915, 278, 14156, 8720, 4086, 29889, 518, 29914, 25580, 29962] let decoded = tokenizer.decode(tokens: encoded) let decodedTarget = " [INST] Describe the Swift programming language. [/INST]" @@ -49,7 +49,7 @@ class ChatTemplateTests: XCTestCase { func testNamedTemplateFromArgument() async throws { let tokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Mistral-7B-Instruct-v0.3-4bit") // Normally it is not necessary to specify the name `default`, but I'm not aware of models with lists of templates in the config that are not `default` or `tool_use` - let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: "default", addGenerationPrompt: false, truncation: false, maxLength: nil, tools: nil) + let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplateName: "default") let encodedTarget = [1, 29473, 3, 28752, 1040, 4672, 2563, 17060, 4610, 29491, 29473, 4] let decoded = tokenizer.decode(tokens: encoded) let decodedTarget = " [INST] Describe the Swift programming language. [/INST]" From de740f8d4fef7664c896aedeec1ef48c20df4a2f Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Thu, 3 Oct 2024 12:04:12 +0200 Subject: [PATCH 7/9] More elegant solution for chatTemplate argument --- Sources/Tokenizers/Tokenizer.swift | 46 +++++++++---------- Tests/TokenizersTests/ChatTemplateTests.swift | 4 +- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index dfb226f..1cb0b36 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -94,6 +94,13 @@ struct TokenizerModel { } } +public enum ChatTemplateArgument { + /// A Jinja template to use for the conversion. Normally it is not necessary to provide a template, since it will be read from the tokenizer config file. + case literal(String) + /// For models whose tokenizer config file includes multiple chat templates, the template can be specified by name. Normally this is not necessary. + case name(String) +} + public protocol Tokenizer { func tokenize(text: String) -> [String] @@ -117,17 +124,15 @@ public protocol Tokenizer { var eosTokenId: Int? { get } var unknownToken: String? { get } var unknownTokenId: Int? { get } - - func applyChatTemplate(messages: [[String: String]]) throws -> [Int] - func applyChatTemplate(messages: [[String: String]], chatTemplate: String) throws -> [Int] + func applyChatTemplate(messages: [[String: String]]) throws -> [Int] - func applyChatTemplate(messages: [[String: String]], chatTemplateName: String) throws -> [Int] + func applyChatTemplate(messages: [[String: String]], chatTemplate: ChatTemplateArgument) throws -> [Int] func applyChatTemplate( messages: [[String: String]], - chatTemplate: String?, - chatTemplateName: String?, + /// A chat template can optionally be provided or specified by name when several templates are included in the tokenizer config file. Normally this is not necessary. + chatTemplate: ChatTemplateArgument?, addGenerationPrompt: Bool, truncation: Bool, maxLength: Int?, @@ -226,7 +231,7 @@ public class PreTrainedTokenizer: Tokenizer { self.decoder = DecoderFactory.fromConfig(config: tokenizerData.decoder) self.cleanUpTokenizationSpaces = tokenizerConfig.cleanUpTokenizationSpaces?.boolValue ?? true self.tokenizerConfig = tokenizerConfig - + model = try TokenizerModel.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData, addedTokens: addedTokens) } @@ -320,25 +325,18 @@ public class PreTrainedTokenizer: Tokenizer { public func convertIdToToken(_ id: Int) -> String? { model.convertIdToToken(id) } - + public func applyChatTemplate(messages: [[String: String]]) throws -> [Int] { try applyChatTemplate(messages: messages, addGenerationPrompt: true) } - public func applyChatTemplate(messages: [[String: String]], chatTemplate: String) throws -> [Int] { + public func applyChatTemplate(messages: [[String: String]], chatTemplate: ChatTemplateArgument) throws -> [Int] { try applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: true) } - public func applyChatTemplate(messages: [[String: String]], chatTemplateName: String) throws -> [Int] { - try applyChatTemplate(messages: messages, chatTemplateName: chatTemplateName, addGenerationPrompt: true) - } - public func applyChatTemplate( messages: [[String: String]], - /// A Jinja template to use for this conversion. Normally it is not necessary to provide a template, since it will be read from the tokenizer config file. - chatTemplate: String? = nil, - /// For models whose tokenizer config file includes multiple chat templates, the template can be specified by name. Normally this is not necessary. - chatTemplateName: String? = nil, + chatTemplate: ChatTemplateArgument? = nil, addGenerationPrompt: Bool = false, truncation: Bool = false, maxLength: Int? = nil, @@ -351,9 +349,9 @@ public class PreTrainedTokenizer: Tokenizer { tools: [[String: Any]]? = nil ) throws -> [Int] { var selectedChatTemplate: String? - if let chatTemplate { + if let chatTemplate, case .literal(let template) = chatTemplate { // Use chat template from argument - selectedChatTemplate = chatTemplate + selectedChatTemplate = template } else if let valueFromConfig = tokenizerConfig.chatTemplate { if let arrayValue = valueFromConfig.arrayValue { // If the config specifies a list of chat templates, convert them to a dictionary @@ -363,12 +361,12 @@ public class PreTrainedTokenizer: Tokenizer { } return (name, template) }) - if let chatTemplateName { + if let chatTemplate, case .name(let name) = chatTemplate { // Select chat template from config by name - if let matchingDictEntry = templateDict[chatTemplateName] { + if let matchingDictEntry = templateDict[name] { selectedChatTemplate = matchingDictEntry } else { - throw TokenizerError.chatTemplate("No chat template named \"\(chatTemplateName)\" was found in the tokenizer config file") + throw TokenizerError.chatTemplate("No chat template named \"\(name)\" was found in the tokenizer config file") } } else if let tools, !tools.isEmpty, let toolUseTemplate = templateDict["tool_use"] { // Use tool use chat template from config @@ -458,7 +456,7 @@ extension AutoTokenizer { return try AutoTokenizer.from(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) } - + public static func from( modelFolder: URL, hubApi: HubApi = .shared @@ -466,7 +464,7 @@ extension AutoTokenizer { let config = LanguageModelConfigurationFromHub(modelFolder: modelFolder, hubApi: hubApi) guard let tokenizerConfig = try await config.tokenizerConfig else { throw TokenizerError.missingConfig } let tokenizerData = try await config.tokenizerData - + return try PreTrainedTokenizer(tokenizerConfig: tokenizerConfig, tokenizerData: tokenizerData) } } diff --git a/Tests/TokenizersTests/ChatTemplateTests.swift b/Tests/TokenizersTests/ChatTemplateTests.swift index b085d0b..d8c2306 100644 --- a/Tests/TokenizersTests/ChatTemplateTests.swift +++ b/Tests/TokenizersTests/ChatTemplateTests.swift @@ -38,7 +38,7 @@ class ChatTemplateTests: XCTestCase { let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct") // Purposely not using the correct template for this model to verify that the template from the config is not being used let mistral7BDefaultTemplate = "{{bos_token}}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" - let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: mistral7BDefaultTemplate) + let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: .literal(mistral7BDefaultTemplate)) let encodedTarget = [1, 518, 25580, 29962, 20355, 915, 278, 14156, 8720, 4086, 29889, 518, 29914, 25580, 29962] let decoded = tokenizer.decode(tokens: encoded) let decodedTarget = " [INST] Describe the Swift programming language. [/INST]" @@ -49,7 +49,7 @@ class ChatTemplateTests: XCTestCase { func testNamedTemplateFromArgument() async throws { let tokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Mistral-7B-Instruct-v0.3-4bit") // Normally it is not necessary to specify the name `default`, but I'm not aware of models with lists of templates in the config that are not `default` or `tool_use` - let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplateName: "default") + let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: .name("default")) let encodedTarget = [1, 29473, 3, 28752, 1040, 4672, 2563, 17060, 4610, 29491, 29473, 4] let decoded = tokenizer.decode(tokens: encoded) let decodedTarget = " [INST] Describe the Swift programming language. [/INST]" From 62ccb4135e7b6d1607f283511bfd56fdfdb121c0 Mon Sep 17 00:00:00 2001 From: Anthony Date: Thu, 3 Oct 2024 15:21:51 +0200 Subject: [PATCH 8/9] Update Sources/Tokenizers/Tokenizer.swift Co-authored-by: Pedro Cuenca --- Sources/Tokenizers/Tokenizer.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index 1cb0b36..34b7215 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -95,7 +95,7 @@ struct TokenizerModel { } public enum ChatTemplateArgument { - /// A Jinja template to use for the conversion. Normally it is not necessary to provide a template, since it will be read from the tokenizer config file. + /// A Jinja template to use for the conversation. Normally it is not necessary to provide a template, since it will be read from the tokenizer config file. case literal(String) /// For models whose tokenizer config file includes multiple chat templates, the template can be specified by name. Normally this is not necessary. case name(String) From c0355ddaba33e8f7f4f77d7ede9b631470105137 Mon Sep 17 00:00:00 2001 From: Anthony DePasquale Date: Thu, 3 Oct 2024 15:43:39 +0200 Subject: [PATCH 9/9] Add overload with `chatTemplate` argument of type `String` --- Sources/Tokenizers/Tokenizer.swift | 17 +++++++++++++---- Tests/TokenizersTests/ChatTemplateTests.swift | 14 +++++++++++++- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/Sources/Tokenizers/Tokenizer.swift b/Sources/Tokenizers/Tokenizer.swift index 34b7215..9c8c381 100644 --- a/Sources/Tokenizers/Tokenizer.swift +++ b/Sources/Tokenizers/Tokenizer.swift @@ -95,9 +95,9 @@ struct TokenizerModel { } public enum ChatTemplateArgument { - /// A Jinja template to use for the conversation. Normally it is not necessary to provide a template, since it will be read from the tokenizer config file. + /// A Jinja template to use for the conversation. Normally it is not necessary to provide a template, since it will be read from the tokenizer config. case literal(String) - /// For models whose tokenizer config file includes multiple chat templates, the template can be specified by name. Normally this is not necessary. + /// For models whose tokenizer config includes multiple chat templates, the template can be specified by name. Normally this is not necessary. case name(String) } @@ -125,13 +125,18 @@ public protocol Tokenizer { var unknownToken: String? { get } var unknownTokenId: Int? { get } + /// The appropriate chat template is selected from the tokenizer config func applyChatTemplate(messages: [[String: String]]) throws -> [Int] + /// The chat template is provided as a string literal or specified by name func applyChatTemplate(messages: [[String: String]], chatTemplate: ChatTemplateArgument) throws -> [Int] + /// The chat template is provided as a string literal + func applyChatTemplate(messages: [[String: String]], chatTemplate: String) throws -> [Int] + func applyChatTemplate( messages: [[String: String]], - /// A chat template can optionally be provided or specified by name when several templates are included in the tokenizer config file. Normally this is not necessary. + /// A chat template can optionally be provided or specified by name when several templates are included in the tokenizer config. Normally this is not necessary. chatTemplate: ChatTemplateArgument?, addGenerationPrompt: Bool, truncation: Bool, @@ -334,6 +339,10 @@ public class PreTrainedTokenizer: Tokenizer { try applyChatTemplate(messages: messages, chatTemplate: chatTemplate, addGenerationPrompt: true) } + public func applyChatTemplate(messages: [[String: String]], chatTemplate: String) throws -> [Int] { + try applyChatTemplate(messages: messages, chatTemplate: .literal(chatTemplate), addGenerationPrompt: true) + } + public func applyChatTemplate( messages: [[String: String]], chatTemplate: ChatTemplateArgument? = nil, @@ -366,7 +375,7 @@ public class PreTrainedTokenizer: Tokenizer { if let matchingDictEntry = templateDict[name] { selectedChatTemplate = matchingDictEntry } else { - throw TokenizerError.chatTemplate("No chat template named \"\(name)\" was found in the tokenizer config file") + throw TokenizerError.chatTemplate("No chat template named \"\(name)\" was found in the tokenizer config") } } else if let tools, !tools.isEmpty, let toolUseTemplate = templateDict["tool_use"] { // Use tool use chat template from config diff --git a/Tests/TokenizersTests/ChatTemplateTests.swift b/Tests/TokenizersTests/ChatTemplateTests.swift index d8c2306..3ee7aa1 100644 --- a/Tests/TokenizersTests/ChatTemplateTests.swift +++ b/Tests/TokenizersTests/ChatTemplateTests.swift @@ -34,7 +34,7 @@ class ChatTemplateTests: XCTestCase { XCTAssertEqual(decoded, decodedTarget) } - func testTemplateFromArgument() async throws { + func testTemplateFromArgumentWithEnum() async throws { let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct") // Purposely not using the correct template for this model to verify that the template from the config is not being used let mistral7BDefaultTemplate = "{{bos_token}}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" @@ -46,6 +46,18 @@ class ChatTemplateTests: XCTestCase { XCTAssertEqual(decoded, decodedTarget) } + func testTemplateFromArgumentWithString() async throws { + let tokenizer = try await AutoTokenizer.from(pretrained: "microsoft/Phi-3-mini-128k-instruct") + // Purposely not using the correct template for this model to verify that the template from the config is not being used + let mistral7BDefaultTemplate = "{{bos_token}}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + ' ' + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}" + let encoded = try tokenizer.applyChatTemplate(messages: messages, chatTemplate: mistral7BDefaultTemplate) + let encodedTarget = [1, 518, 25580, 29962, 20355, 915, 278, 14156, 8720, 4086, 29889, 518, 29914, 25580, 29962] + let decoded = tokenizer.decode(tokens: encoded) + let decodedTarget = " [INST] Describe the Swift programming language. [/INST]" + XCTAssertEqual(encoded, encodedTarget) + XCTAssertEqual(decoded, decodedTarget) + } + func testNamedTemplateFromArgument() async throws { let tokenizer = try await AutoTokenizer.from(pretrained: "mlx-community/Mistral-7B-Instruct-v0.3-4bit") // Normally it is not necessary to specify the name `default`, but I'm not aware of models with lists of templates in the config that are not `default` or `tool_use`