@@ -30,28 +30,37 @@ Run today's most used tokenizers directly in your browser or Node.js application
 npm install @huggingface/tokenizers
 ```
 
+Alternatively, you can use it via a CDN as follows:
+
+```html
+<script type="module">
+  import { Tokenizer } from "https://cdn.jsdelivr.net/npm/@huggingface/tokenizers";
+</script>
+```
+
 ## Usage
 
 ```javascript
-import { Tokenizer } from '@huggingface/tokenizers';
+import { Tokenizer } from "@huggingface/tokenizers";
 
-// Load files from the Hugging Face Hub
+// Load files from the Hugging Face Hub
 const modelId = "HuggingFaceTB/SmolLM3-3B";
-const tokenizerJson = await fetch(`https://huggingface.co/${modelId}/resolve/main/tokenizer.json`).then(res => res.json());
-const tokenizerConfig = await fetch(`https://huggingface.co/${modelId}/resolve/main/tokenizer_config.json`).then(res => res.json());
+const tokenizerJson = await fetch(`https://huggingface.co/${modelId}/resolve/main/tokenizer.json`).then((res) => res.json());
+const tokenizerConfig = await fetch(`https://huggingface.co/${modelId}/resolve/main/tokenizer_config.json`).then((res) => res.json());
 
 // Create tokenizer
 const tokenizer = new Tokenizer(tokenizerJson, tokenizerConfig);
 
 // Tokenize text
-const tokens = tokenizer.tokenize('Hello World'); // ['Hello', 'ĠWorld']
-const encoded = tokenizer.encode('Hello World'); // { ids: [9906, 4435], tokens: ['Hello', 'ĠWorld'], attention_mask: [1, 1] }
-const decoded = tokenizer.decode(encoded.ids); // 'Hello World'
+const tokens = tokenizer.tokenize("Hello World"); // ['Hello', 'ĠWorld']
+const encoded = tokenizer.encode("Hello World"); // { ids: [9906, 4435], tokens: ['Hello', 'ĠWorld'], attention_mask: [1, 1] }
+const decoded = tokenizer.decode(encoded.ids); // 'Hello World'
 ```
5159
 ## Requirements
 
 This library expects two files from Hugging Face models:
+
 - `tokenizer.json` - Contains the tokenizer configuration
 - `tokenizer_config.json` - Contains additional metadata
 
@@ -60,6 +69,7 @@ This library expects two files from Hugging Face models:
 Tokenizers.js supports [Hugging Face tokenizer components](https://huggingface.co/docs/tokenizers/components):
6170
 ### Normalizers
+
 - NFD
 - NFKC
 - NFC
@@ -73,6 +83,7 @@ Tokenizers.js supports [Hugging Face tokenizer components](https://huggingface.c
 - Sequence
 
 ### Pre-tokenizers
+
 - BERT
 - ByteLevel
 - Whitespace
@@ -84,19 +95,22 @@ Tokenizers.js supports [Hugging Face tokenizer components](https://huggingface.c
 - Digits
 
 ### Models
+
 - BPE (Byte-Pair Encoding)
 - WordPiece
 - Unigram
 - Legacy
 
 ### Post-processors
+
 - ByteLevel
 - TemplateProcessing
 - RobertaProcessing
 - BertProcessing
 - Sequence
 
 ### Decoders
+
 - ByteLevel
 - WordPiece
 - Metaspace