Skip to content

Commit 5c1f8c9

Browse files
authored
Merge pull request #51387 from dotnet/main
Merge main into live
2 parents c70a2cf + 771f7b8 commit 5c1f8c9

116 files changed

Lines changed: 729 additions & 6436 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.openpublishing.redirection.core.json

Lines changed: 380 additions & 0 deletions
Large diffs are not rendered by default.

docs/ai/conceptual/understanding-tokens.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ Generative AI services might also be limited regarding the maximum number of tok
103103

104104
## Related content
105105

106+
- [Use Microsoft.ML.Tokenizers for text tokenization](../how-to/use-tokenizers.md)
106107
- [How generative AI and LLMs work](how-genai-and-llms-work.md)
107108
- [Understand embeddings](embeddings.md)
108109
- [Work with vector databases](vector-databases.md)
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Net.Http;
5+
using System.Threading.Tasks;
6+
using Microsoft.ML.Tokenizers;
7+
8+
internal class BpeExample
{
    /// <summary>
    /// Entry point for the BPE tokenizer examples.
    /// </summary>
    public static async Task RunAsync()
    {
        await BasicUsageAsync();
    }

    /// <summary>
    /// Demonstrates creating a BPE tokenizer from remote vocabulary and merges
    /// files, then encoding, counting, inspecting, and decoding tokens.
    /// Requires network access to huggingface.co.
    /// </summary>
    private static async Task BasicUsageAsync()
    {
        // <BpeBasic>
        // BPE (Byte Pair Encoding) tokenizer can be created from vocabulary and merges files.
        // Download the GPT-2 tokenizer files from Hugging Face.
        using HttpClient httpClient = new();
        const string vocabUrl = @"https://huggingface.co/openai-community/gpt2/raw/main/vocab.json";
        const string mergesUrl = @"https://huggingface.co/openai-community/gpt2/raw/main/merges.txt";

        using Stream vocabStream = await httpClient.GetStreamAsync(vocabUrl);
        using Stream mergesStream = await httpClient.GetStreamAsync(mergesUrl);

        // Create the BPE tokenizer using the vocabulary and merges streams.
        Tokenizer bpeTokenizer = BpeTokenizer.Create(vocabStream, mergesStream);

        string text = "Hello, how are you doing today?";

        // Encode text to token IDs.
        IReadOnlyList<int> ids = bpeTokenizer.EncodeToIds(text);
        Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");

        // Count tokens.
        int tokenCount = bpeTokenizer.CountTokens(text);
        Console.WriteLine($"Token count: {tokenCount}");

        // Get detailed token information.
        // The normalized form of the input isn't used here, so discard it.
        IReadOnlyList<EncodedToken> tokens = bpeTokenizer.EncodeToTokens(text, out _);
        Console.WriteLine("Tokens:");
        foreach (EncodedToken token in tokens)
        {
            Console.WriteLine($"  ID: {token.Id}, Value: '{token.Value}'");
        }

        // Decode tokens back to text.
        string? decoded = bpeTokenizer.Decode(ids);
        Console.WriteLine($"Decoded: {decoded}");

        // Note: BpeTokenizer might not always decode IDs to the exact original text
        // as it can remove spaces during tokenization depending on the model configuration.
        // </BpeBasic>
    }
}
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.IO;
4+
using System.Net.Http;
5+
using System.Threading.Tasks;
6+
using Microsoft.ML.Tokenizers;
7+
8+
internal class LlamaExample
{
    /// <summary>
    /// Entry point for the Llama tokenizer examples.
    /// </summary>
    public static async Task RunAsync()
    {
        await BasicUsageAndAdvancedOptionsAsync();
    }

    /// <summary>
    /// Shows basic encode/count/decode operations with a Llama tokenizer,
    /// followed by advanced options that bypass normalization and
    /// pretokenization. Requires network access to huggingface.co.
    /// </summary>
    private static async Task BasicUsageAndAdvancedOptionsAsync()
    {
        // <LlamaBasic>
        // Open a stream to the remote Llama tokenizer model data file.
        using HttpClient client = new();
        const string modelUrl = @"https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model";
        using Stream modelStream = await client.GetStreamAsync(modelUrl);

        // Create the Llama tokenizer using the remote stream.
        Tokenizer llamaTokenizer = LlamaTokenizer.Create(modelStream);

        string text = "Hello, world!";

        // Encode text to token IDs.
        IReadOnlyList<int> ids = llamaTokenizer.EncodeToIds(text);
        Console.WriteLine($"Token IDs: {string.Join(", ", ids)}");
        // Output: Token IDs: 1, 15043, 29892, 3186, 29991

        // Count the tokens.
        Console.WriteLine($"Tokens: {llamaTokenizer.CountTokens(text)}");
        // Output: Tokens: 5

        // Decode token IDs back to text.
        string? decodedText = llamaTokenizer.Decode(ids);
        Console.WriteLine($"Decoded: {decodedText}");
        // Output: Decoded: Hello, world!
        // </LlamaBasic>

        // <LlamaAdvanced>
        ReadOnlySpan<char> span = "Hello World".AsSpan();

        // Bypass normalization during encoding.
        ids = llamaTokenizer.EncodeToIds(span, considerNormalization: false);

        // Bypass pretokenization during encoding.
        ids = llamaTokenizer.EncodeToIds(span, considerPreTokenization: false);

        // Bypass both normalization and pretokenization.
        ids = llamaTokenizer.EncodeToIds(span, considerNormalization: false, considerPreTokenization: false);
        // </LlamaAdvanced>
    }
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
using System;
2+
using System.Threading.Tasks;
3+
4+
// Run each tokenizer example in turn. The Tiktoken example is fully offline;
// the Llama and BPE examples download model data and are wrapped so a missing
// network connection is reported rather than crashing the app.
Console.WriteLine("=== Tiktoken Examples ===");
TiktokenExample.Run();

Console.WriteLine("\n=== Llama Examples ===");
await RunNetworkExampleAsync(
    LlamaExample.RunAsync,
    "Note: Llama example requires network access to download model files");

Console.WriteLine("\n=== BPE Examples ===");
await RunNetworkExampleAsync(
    BpeExample.RunAsync,
    "Note: BPE example requires network access to download tokenizer files");

// Runs a network-dependent example, printing the supplied note and the
// exception message when the example fails (for example, offline).
static async Task RunNetworkExampleAsync(Func<Task> example, string note)
{
    try
    {
        await example();
    }
    catch (Exception ex)
    {
        Console.WriteLine($"{note}: {ex.Message}");
    }
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Tokenizers;
4+
5+
internal class TiktokenExample
{
    /// <summary>
    /// Entry point for the Tiktoken tokenizer examples.
    /// </summary>
    public static void Run()
    {
        BasicUsage();
        TrimText();
    }

    /// <summary>
    /// Demonstrates counting, encoding, and decoding text with a Tiktoken
    /// tokenizer configured for the gpt-4o model.
    /// </summary>
    private static void BasicUsage()
    {
        // <TiktokenBasic>
        // Initialize the tokenizer for the gpt-4o model.
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");

        string text = "Text tokenization is the process of splitting a string into a list of tokens.";

        // Count the tokens in the text.
        Console.WriteLine($"Tokens: {tokenizer.CountTokens(text)}");
        // Output: Tokens: 16

        // Encode text to token IDs.
        IReadOnlyList<int> tokenIds = tokenizer.EncodeToIds(text);
        Console.WriteLine($"Token IDs: {string.Join(", ", tokenIds)}");
        // Output: Token IDs: 1279, 6602, 2860, 382, 290, 2273, 328, 87130, 261, 1621, 1511, 261, 1562, 328, 20290, 13

        // Decode token IDs back to text.
        string? decodedText = tokenizer.Decode(tokenIds);
        Console.WriteLine($"Decoded: {decodedText}");
        // Output: Decoded: Text tokenization is the process of splitting a string into a list of tokens.
        // </TiktokenBasic>
    }

    /// <summary>
    /// Demonstrates trimming text to a token budget from either end
    /// using the index-by-token-count APIs.
    /// </summary>
    private static void TrimText()
    {
        // <TiktokenTrim>
        Tokenizer tokenizer = TiktokenTokenizer.CreateForModel("gpt-4o");

        string text = "Text tokenization is the process of splitting a string into a list of tokens.";

        // Get the last 5 tokens from the text.
        int index = tokenizer.GetIndexByTokenCountFromEnd(text, 5, out string? processedText, out _);
        processedText ??= text;
        Console.WriteLine($"Last 5 tokens: {processedText[index..]}");
        // Output: Last 5 tokens: a list of tokens.

        // Get the first 5 tokens from the text.
        index = tokenizer.GetIndexByTokenCount(text, 5, out processedText, out _);
        processedText ??= text;
        Console.WriteLine($"First 5 tokens: {processedText[..index]}");
        // Output: First 5 tokens: Text tokenization is the
        // </TiktokenTrim>
    }
}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <!-- Console app targeting .NET 10 with implicit usings and nullable reference types enabled. -->
    <OutputType>Exe</OutputType>
    <TargetFramework>net10.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <!-- Core tokenizers library plus the o200k_base data package required by the gpt-4o Tiktoken example. -->
    <PackageReference Include="Microsoft.ML.Tokenizers" Version="2.0.0" />
    <PackageReference Include="Microsoft.ML.Tokenizers.Data.O200kBase" Version="2.0.0" />
  </ItemGroup>

</Project>

docs/ai/how-to/use-tokenizers.md

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
---
2+
title: Use Microsoft.ML.Tokenizers for text tokenization
3+
description: Learn how to use the Microsoft.ML.Tokenizers library to tokenize text for AI models, manage token counts, and work with various tokenization algorithms.
4+
ms.topic: how-to
5+
ms.date: 10/29/2025
6+
ai-usage: ai-assisted
7+
---
8+
# Use Microsoft.ML.Tokenizers for text tokenization
9+
10+
The [Microsoft.ML.Tokenizers](https://www.nuget.org/packages/Microsoft.ML.Tokenizers) library provides a comprehensive set of tools for tokenizing text in .NET applications. Tokenization is essential when you work with large language models (LLMs), as it allows you to manage token counts, estimate costs, and preprocess text for AI models.
11+
12+
This article shows you how to use the library's key features and work with different tokenizer models.
13+
14+
## Prerequisites
15+
16+
- [.NET 8 SDK](https://dotnet.microsoft.com/download/dotnet/8.0) or later
17+
18+
> [!NOTE]
19+
> The Microsoft.ML.Tokenizers library also supports .NET Standard 2.0, making it compatible with .NET Framework 4.6.1 and later.
20+
21+
## Install the package
22+
23+
Install the Microsoft.ML.Tokenizers NuGet package:
24+
25+
```dotnetcli
26+
dotnet add package Microsoft.ML.Tokenizers
27+
```
28+
29+
For Tiktoken models that use the `o200k_base` encoding (like GPT-4o, which the examples in this article use), you also need to install the corresponding tokenizer data package:
30+
31+
```dotnetcli
32+
dotnet add package Microsoft.ML.Tokenizers.Data.O200kBase
33+
```
34+
35+
## Key features
36+
37+
The Microsoft.ML.Tokenizers library provides:
38+
39+
- **Extensible tokenizer architecture**: Allows specialization of Normalizer, PreTokenizer, Model/Encoder, and Decoder components.
40+
- **Multiple tokenization algorithms**: Supports BPE (byte-pair encoding), Tiktoken, Llama, CodeGen, and more.
41+
- **Token counting and estimation**: Helps manage costs and context limits when working with AI services.
42+
- **Flexible encoding options**: Provides methods to encode text to token IDs, count tokens, and decode tokens back to text.
43+
44+
## Use Tiktoken tokenizer
45+
46+
The Tiktoken tokenizer is commonly used with OpenAI models like GPT-4. The following example shows how to initialize a Tiktoken tokenizer and perform common operations:
47+
48+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenBasic":::
49+
50+
For better performance, you should cache and reuse the tokenizer instance throughout your app.
51+
52+
When you work with LLMs, you often need to manage text within token limits. The following example shows how to trim text to a specific token count:
53+
54+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/TiktokenExample.cs" id="TiktokenTrim":::
55+
56+
## Use Llama tokenizer
57+
58+
The Llama tokenizer is designed for the Llama family of models. It requires a tokenizer model file, which you can download from model repositories like Hugging Face:
59+
60+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaBasic":::
61+
62+
All tokenizers support advanced encoding options, such as controlling normalization and pretokenization:
63+
64+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/LlamaExample.cs" id="LlamaAdvanced":::
65+
66+
## Use BPE tokenizer
67+
68+
*Byte-pair encoding* (BPE) is the underlying algorithm used by many tokenizers, including Tiktoken. BPE was originally developed as a text-compression algorithm and was later adopted by OpenAI for tokenization when pretraining the GPT model. The following example demonstrates BPE tokenization:
69+
70+
:::code language="csharp" source="./snippets/use-tokenizers/csharp/TokenizersExamples/BpeExample.cs" id="BpeBasic":::
71+
72+
The library also provides specialized tokenizers like <xref:Microsoft.ML.Tokenizers.BpeTokenizer> and <xref:Microsoft.ML.Tokenizers.EnglishRobertaTokenizer> that you can configure with custom vocabularies for specific models.
73+
74+
For more information about BPE, see [Byte-pair encoding tokenization](https://huggingface.co/learn/llm-course/chapter6/5).
75+
76+
## Common tokenizer operations
77+
78+
All tokenizers in the library implement the <xref:Microsoft.ML.Tokenizers.Tokenizer> base class. The following table shows the available methods.
79+
80+
| Method | Description |
81+
|-------------------------------------------------------|--------------------------------------|
82+
| <xref:Microsoft.ML.Tokenizers.Tokenizer.EncodeToIds*> | Converts text to a list of token IDs. |
83+
| <xref:Microsoft.ML.Tokenizers.Tokenizer.Decode*> | Converts token IDs back to text. |
84+
| <xref:Microsoft.ML.Tokenizers.Tokenizer.CountTokens*> | Returns the number of tokens in a text string. |
85+
| <xref:Microsoft.ML.Tokenizers.Tokenizer.EncodeToTokens*> | Returns detailed token information including values and IDs. |
86+
| <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCount*> | Finds the character index for a specific token count from the start. |
87+
| <xref:Microsoft.ML.Tokenizers.Tokenizer.GetIndexByTokenCountFromEnd*> | Finds the character index for a specific token count from the end. |
88+
89+
## Migrate from other libraries
90+
91+
If you're currently using `DeepDev.TokenizerLib` or `SharpToken`, consider migrating to Microsoft.ML.Tokenizers. The library has been enhanced to cover scenarios from those libraries and provides better performance and support. For migration guidance, see the [migration guide](https://github.com/dotnet/machinelearning/blob/main/docs/code/microsoft-ml-tokenizers-migration-guide.md).
92+
93+
## Related content
94+
95+
- [Understanding tokens](../conceptual/understanding-tokens.md)
96+
- [Microsoft.ML.Tokenizers API reference](/dotnet/api/microsoft.ml.tokenizers)
97+
- [Microsoft.ML.Tokenizers NuGet package](https://www.nuget.org/packages/Microsoft.ML.Tokenizers)

docs/ai/toc.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@ items:
9090
href: quickstarts/build-mcp-server.md
9191
- name: Publish to the Official MCP Registry
9292
href: quickstarts/publish-mcp-registry.md
93+
- name: Tokenization
94+
items:
95+
- name: Use Microsoft.ML.Tokenizers
96+
href: how-to/use-tokenizers.md
9397
- name: Security and content safety
9498
items:
9599
- name: Authentication for Azure-hosted apps and services

docs/azure/includes/dotnet-all.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@
8181
| OpenAI Assistants | NuGet [1.0.0-beta.4](https://www.nuget.org/packages/Azure.AI.OpenAI.Assistants/1.0.0-beta.4) | [docs](/dotnet/api/overview/azure/AI.OpenAI.Assistants-readme?view=azure-dotnet-preview&amp;preserve-view=true) | GitHub [1.0.0-beta.4](https://github.com/Azure/azure-sdk-for-net/tree/Azure.AI.OpenAI.Assistants_1.0.0-beta.4/sdk/openai/Azure.AI.OpenAI.Assistants/) |
8282
| OpenAI Inference | NuGet [2.1.0](https://www.nuget.org/packages/Azure.AI.OpenAI/2.1.0)<br>NuGet [2.8.0-beta.1](https://www.nuget.org/packages/Azure.AI.OpenAI/2.8.0-beta.1) | [docs](/dotnet/api/overview/azure/AI.OpenAI-readme) | GitHub [2.1.0](https://github.com/Azure/azure-sdk-for-net/tree/Azure.AI.OpenAI_2.1.0/sdk/openai/Azure.AI.OpenAI/)<br>GitHub [2.8.0-beta.1](https://github.com/Azure/azure-sdk-for-net/tree/Azure.AI.OpenAI_2.8.0-beta.1/sdk/openai/Azure.AI.OpenAI/) |
8383
| OpenTelemetry AspNetCore | NuGet [1.4.0](https://www.nuget.org/packages/Azure.Monitor.OpenTelemetry.AspNetCore/1.4.0) | [docs](/dotnet/api/overview/azure/Monitor.OpenTelemetry.AspNetCore-readme) | GitHub [1.4.0](https://github.com/Azure/azure-sdk-for-net/tree/Azure.Monitor.OpenTelemetry.AspNetCore_1.4.0/sdk/monitor/Azure.Monitor.OpenTelemetry.AspNetCore/) |
84-
| OpenTelemetry Exporter | NuGet [1.5.0](https://www.nuget.org/packages/Azure.Monitor.OpenTelemetry.Exporter/1.5.0)<br>NuGet [1.6.0-beta.2](https://www.nuget.org/packages/Azure.Monitor.OpenTelemetry.Exporter/1.6.0-beta.2) | [docs](/dotnet/api/overview/azure/Monitor.OpenTelemetry.Exporter-readme) | GitHub [1.5.0](https://github.com/Azure/azure-sdk-for-net/tree/Azure.Monitor.OpenTelemetry.Exporter_1.5.0/sdk/monitor/Azure.Monitor.OpenTelemetry.Exporter/)<br>GitHub [1.6.0-beta.2](https://github.com/Azure/azure-sdk-for-net/tree/Azure.Monitor.OpenTelemetry.Exporter_1.6.0-beta.2/sdk/monitor/Azure.Monitor.OpenTelemetry.Exporter/) |
84+
| OpenTelemetry Exporter | NuGet [1.6.0](https://www.nuget.org/packages/Azure.Monitor.OpenTelemetry.Exporter/1.6.0) | [docs](/dotnet/api/overview/azure/Monitor.OpenTelemetry.Exporter-readme) | GitHub [1.6.0](https://github.com/Azure/azure-sdk-for-net/tree/Azure.Monitor.OpenTelemetry.Exporter_1.6.0/sdk/monitor/Azure.Monitor.OpenTelemetry.Exporter/) |
8585
| Personalizer | NuGet [2.0.0-beta.2](https://www.nuget.org/packages/Azure.AI.Personalizer/2.0.0-beta.2) | [docs](/dotnet/api/overview/azure/AI.Personalizer-readme?view=azure-dotnet-preview&amp;preserve-view=true) | GitHub [2.0.0-beta.2](https://github.com/Azure/azure-sdk-for-net/tree/Azure.AI.Personalizer_2.0.0-beta.2/sdk/personalizer/Azure.AI.Personalizer/) |
8686
| Playwright | NuGet [1.0.0](https://www.nuget.org/packages/Azure.Developer.Playwright/1.0.0) | [docs](/dotnet/api/overview/azure/Developer.Playwright-readme) | GitHub [1.0.0](https://github.com/Azure/azure-sdk-for-net/tree/Azure.Developer.Playwright_1.0.0/sdk/loadtestservice/Azure.Developer.Playwright/) |
8787
| Playwright NUnit | NuGet [1.0.0](https://www.nuget.org/packages/Azure.Developer.Playwright.NUnit/1.0.0) | [docs](/dotnet/api/overview/azure/Developer.Playwright.NUnit-readme) | GitHub [1.0.0](https://github.com/Azure/azure-sdk-for-net/tree/Azure.Developer.Playwright.NUnit_1.0.0/sdk/loadtestservice/Azure.Developer.Playwright.NUnit/) |

0 commit comments

Comments
 (0)