
Commit 9d9750a

New tokenizer for gpt4o (#714)
* Add new tokenizer for GPT-4o and 4o mini
* Fix service bootstrap warnings caused by unselected tokenizer. Default to GPT4 tokenizer.
1 parent f0d2ee9 commit 9d9750a
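
For readers wiring this up themselves, here is a minimal sketch of selecting the new tokenizer explicitly when registering OpenAI text generation. It reuses the AddOpenAITextGeneration(config, textTokenizer) call this commit invokes in ServiceConfiguration.cs; the ServiceCollection setup, the OpenAIConfig property values, and the using directives are illustrative assumptions, not part of the commit.

// Sketch only: explicitly pass the new GPT-4o tokenizer instead of relying on
// the GPT4Tokenizer default that this commit wires into the service bootstrap.
// Config values below are placeholders.
using Microsoft.Extensions.DependencyInjection;
using Microsoft.KernelMemory;
using Microsoft.KernelMemory.AI.OpenAI;

var services = new ServiceCollection();

services.AddOpenAITextGeneration(
    config: new OpenAIConfig { APIKey = "...", TextModel = "gpt-4o" }, // placeholder config
    textTokenizer: new GPT4oTokenizer());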


5 files changed: +52 -14 lines


extensions/OpenAI/OpenAI/Tokenizers/GPT2Tokenizer.cs

Lines changed: 0 additions & 2 deletions
@@ -1,7 +1,6 @@
 // Copyright (c) Microsoft. All rights reserved.
 
 using System.Collections.Generic;
-using System.Diagnostics.CodeAnalysis;
 using System.Linq;
 using Microsoft.ML.Tokenizers;
 
@@ -11,7 +10,6 @@ namespace Microsoft.KernelMemory.AI.OpenAI;
 /// <summary>
 /// TikToken GPT2 tokenizer (gpt2.tiktoken)
 /// </summary>
-[Experimental("KMEXP01")]
 public sealed class GPT2Tokenizer : ITextTokenizer
 {
     private static readonly Tokenizer s_tokenizer = Tokenizer.CreateTiktokenForModel("gpt2");

extensions/OpenAI/OpenAI/Tokenizers/GPT3Tokenizer.cs

Lines changed: 0 additions & 2 deletions
@@ -1,7 +1,6 @@
 // Copyright (c) Microsoft. All rights reserved.
 
 using System.Collections.Generic;
-using System.Diagnostics.CodeAnalysis;
 using System.Linq;
 using Microsoft.ML.Tokenizers;
 
@@ -11,7 +10,6 @@ namespace Microsoft.KernelMemory.AI.OpenAI;
 /// <summary>
 /// TikToken GPT3 tokenizer (p50k_base.tiktoken)
 /// </summary>
-[Experimental("KMEXP01")]
 public sealed class GPT3Tokenizer : ITextTokenizer
 {
     private static readonly Tokenizer s_tokenizer = Tokenizer.CreateTiktokenForModel("text-davinci-003");

extensions/OpenAI/OpenAI/Tokenizers/GPT4Tokenizer.cs

Lines changed: 3 additions & 4 deletions
@@ -1,20 +1,19 @@
 // Copyright (c) Microsoft. All rights reserved.
 
 using System.Collections.Generic;
-using System.Diagnostics.CodeAnalysis;
 using System.Linq;
 using Microsoft.ML.Tokenizers;
 
 // ReSharper disable once CheckNamespace
 namespace Microsoft.KernelMemory.AI.OpenAI;
 
 /// <summary>
-/// GPT 3.5 and GPT 4+ tokenizer (cl100k_base.tiktoken + special tokens)
+/// GPT 3.5 and GPT 4 tokenizer (cl100k_base.tiktoken + special tokens)
 /// </summary>
-[Experimental("KMEXP01")]
 public sealed class GPT4Tokenizer : ITextTokenizer
 {
-    private static readonly Tokenizer s_tokenizer = Tokenizer.CreateTiktokenForModel("gpt-4", new Dictionary<string, int> { { "<|im_start|>", 100264 }, { "<|im_end|>", 100265 } });
+    private static readonly Tokenizer s_tokenizer = Tokenizer.CreateTiktokenForModel("gpt-4",
+        new Dictionary<string, int> { { "<|im_start|>", 100264 }, { "<|im_end|>", 100265 } });
 
     /// <inheritdoc />
     public int CountTokens(string text)
extensions/OpenAI/OpenAI/Tokenizers/GPT4oTokenizer.cs

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+// Copyright (c) Microsoft. All rights reserved.
+
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.ML.Tokenizers;
+
+// ReSharper disable once CheckNamespace
+namespace Microsoft.KernelMemory.AI.OpenAI;
+
+/// <summary>
+/// GPT 4o / 4o mini tokenizer (cl200k_base.tiktoken + special tokens)
+/// </summary>
+// ReSharper disable once InconsistentNaming
+public sealed class GPT4oTokenizer : ITextTokenizer
+{
+    private static readonly Tokenizer s_tokenizer = Tokenizer.CreateTiktokenForModel("gpt-4o",
+        new Dictionary<string, int> { { "<|im_start|>", 100264 }, { "<|im_end|>", 100265 } });
+
+    /// <inheritdoc />
+    public int CountTokens(string text)
+    {
+        return s_tokenizer.CountTokens(text);
+    }
+
+    /// <inheritdoc />
+    public IReadOnlyList<string> GetTokens(string text)
+    {
+        return s_tokenizer.Encode(text, out string? _).Select(t => t.Value).ToList();
+    }
+}
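
As a quick usage illustration (not part of the diff), the new class is consumed through the ITextTokenizer interface it implements; the sample text and the ITextTokenizer namespace (assumed to be Microsoft.KernelMemory.AI) are illustrative.

// Illustrative usage of the new tokenizer; the input string is arbitrary.
using Microsoft.KernelMemory.AI;
using Microsoft.KernelMemory.AI.OpenAI;

ITextTokenizer tokenizer = new GPT4oTokenizer();
int size = tokenizer.CountTokens("Kernel Memory now supports GPT-4o");   // number of tokens
var tokens = tokenizer.GetTokens("Kernel Memory now supports GPT-4o");   // token strings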

service/Service/ServiceConfiguration.cs

Lines changed: 19 additions & 6 deletions
@@ -6,6 +6,7 @@
 using Microsoft.Extensions.DependencyInjection;
 using Microsoft.KernelMemory.AI;
 using Microsoft.KernelMemory.AI.Anthropic;
+using Microsoft.KernelMemory.AI.OpenAI;
 using Microsoft.KernelMemory.DocumentStorage.DevTools;
 using Microsoft.KernelMemory.MemoryDb.SQLServer;
 using Microsoft.KernelMemory.MemoryStorage;
@@ -212,15 +213,19 @@ private void ConfigureIngestionEmbeddingGenerators(IKernelMemoryBuilder builder)
             case string y when y.Equals("AzureOpenAIEmbedding", StringComparison.OrdinalIgnoreCase):
             {
                 var instance = this.GetServiceInstance<ITextEmbeddingGenerator>(builder,
-                    s => s.AddAzureOpenAIEmbeddingGeneration(this.GetServiceConfig<AzureOpenAIConfig>("AzureOpenAIEmbedding")));
+                    s => s.AddAzureOpenAIEmbeddingGeneration(
+                        config: this.GetServiceConfig<AzureOpenAIConfig>("AzureOpenAIEmbedding"),
+                        textTokenizer: new GPT4Tokenizer()));
                 builder.AddIngestionEmbeddingGenerator(instance);
                 break;
             }
 
             case string x when x.Equals("OpenAI", StringComparison.OrdinalIgnoreCase):
             {
                 var instance = this.GetServiceInstance<ITextEmbeddingGenerator>(builder,
-                    s => s.AddOpenAITextEmbeddingGeneration(this.GetServiceConfig<OpenAIConfig>("OpenAI")));
+                    s => s.AddOpenAITextEmbeddingGeneration(
+                        config: this.GetServiceConfig<OpenAIConfig>("OpenAI"),
+                        textTokenizer: new GPT4Tokenizer()));
                 builder.AddIngestionEmbeddingGenerator(instance);
                 break;
             }
@@ -345,11 +350,15 @@ private void ConfigureRetrievalEmbeddingGenerator(IKernelMemoryBuilder builder)
         {
             case string x when x.Equals("AzureOpenAI", StringComparison.OrdinalIgnoreCase):
             case string y when y.Equals("AzureOpenAIEmbedding", StringComparison.OrdinalIgnoreCase):
-                builder.Services.AddAzureOpenAIEmbeddingGeneration(this.GetServiceConfig<AzureOpenAIConfig>("AzureOpenAIEmbedding"));
+                builder.Services.AddAzureOpenAIEmbeddingGeneration(
+                    config: this.GetServiceConfig<AzureOpenAIConfig>("AzureOpenAIEmbedding"),
+                    textTokenizer: new GPT4Tokenizer());
                 break;
 
             case string x when x.Equals("OpenAI", StringComparison.OrdinalIgnoreCase):
-                builder.Services.AddOpenAITextEmbeddingGeneration(this.GetServiceConfig<OpenAIConfig>("OpenAI"));
+                builder.Services.AddOpenAITextEmbeddingGeneration(
+                    config: this.GetServiceConfig<OpenAIConfig>("OpenAI"),
+                    textTokenizer: new GPT4Tokenizer());
                 break;
 
             default:
@@ -412,11 +421,15 @@ private void ConfigureTextGenerator(IKernelMemoryBuilder builder)
         {
             case string x when x.Equals("AzureOpenAI", StringComparison.OrdinalIgnoreCase):
             case string y when y.Equals("AzureOpenAIText", StringComparison.OrdinalIgnoreCase):
-                builder.Services.AddAzureOpenAITextGeneration(this.GetServiceConfig<AzureOpenAIConfig>("AzureOpenAIText"));
+                builder.Services.AddAzureOpenAITextGeneration(
+                    config: this.GetServiceConfig<AzureOpenAIConfig>("AzureOpenAIText"),
+                    textTokenizer: new GPT4Tokenizer());
                 break;
 
             case string x when x.Equals("OpenAI", StringComparison.OrdinalIgnoreCase):
-                builder.Services.AddOpenAITextGeneration(this.GetServiceConfig<OpenAIConfig>("OpenAI"));
+                builder.Services.AddOpenAITextGeneration(
+                    config: this.GetServiceConfig<OpenAIConfig>("OpenAI"),
+                    textTokenizer: new GPT4Tokenizer());
                 break;
 
             case string x when x.Equals("Anthropic", StringComparison.OrdinalIgnoreCase):

0 commit comments
