Skip to content

Commit 42110bd

Browse files
committed
resolved conflicts with main
2 parents 324fb03 + 4fa7912 commit 42110bd

File tree

10 files changed

+105 -17 lines changed

.github/workflows/publish.yml

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
name: Publish to NPM
2+
3+
permissions:
4+
contents: write
5+
id-token: write
6+
packages: write
7+
8+
on:
9+
workflow_dispatch:
10+
inputs:
11+
version:
12+
description: "Version bump type"
13+
required: true
14+
type: choice
15+
options:
16+
- patch
17+
- minor
18+
- major
19+
default: patch
20+
21+
jobs:
22+
publish:
23+
runs-on: ubuntu-latest
24+
25+
steps:
26+
- name: Checkout code
27+
uses: actions/checkout@v5
28+
with:
29+
token: ${{ secrets.GITHUB_TOKEN }}
30+
31+
- name: Setup Node.js
32+
uses: actions/setup-node@v6
33+
with:
34+
node-version: 24.x
35+
36+
- name: Log NPM version
37+
run: npm -v
38+
39+
- name: Configure git
40+
run: |
41+
git config user.name "github-actions[bot]"
42+
git config user.email "github-actions[bot]@users.noreply.github.com"
43+
44+
- name: Install dependencies
45+
run: npm ci
46+
47+
- name: Run format check
48+
run: npm run format:check
49+
50+
- name: Run linter
51+
run: npm run lint
52+
53+
- name: Build application
54+
run: npm run build
55+
56+
- name: Run tests
57+
run: npm test
58+
59+
- name: Bump version
60+
run: npm version ${{ inputs.version }} -m "🔖 @huggingface/tokenizers@%s"
61+
62+
- name: Get new version
63+
id: package-version
64+
run: echo "version=$(node -p "require('./package.json').version")" >> $GITHUB_OUTPUT
65+
66+
- name: Push changes
67+
run: |
68+
git push
69+
git push --tags
70+
71+
- name: Publish to NPM
72+
run: npm publish

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ dist
1313
dist-ssr
1414
*.local
1515
types
16+
tests/data
1617

1718
# Editor directories and files
1819
.vscode/*

.prettierrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"overrides": [
33
{
4-
"files": ["tests/**/*.ts"],
4+
"files": ["README.md", "tests/**/*.ts"],
55
"options": {
66
"printWidth": 10000000
77
}

README.md

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,28 +30,37 @@ Run today's most used tokenizers directly in your browser or Node.js application
3030
npm install @huggingface/tokenizers
3131
```
3232

33+
Alternatively, you can use it via a CDN as follows:
34+
35+
```html
36+
<script type="module">
37+
import { Tokenizer } from "https://cdn.jsdelivr.net/npm/@huggingface/tokenizers";
38+
</script>
39+
```
40+
3341
## Usage
3442

3543
```javascript
36-
import { Tokenizer } from '@huggingface/tokenizers';
44+
import { Tokenizer } from "@huggingface/tokenizers";
3745

38-
// Load files from the Hugging Face Hub
46+
// Load files from the Hugging Face Hub
3947
const modelId = "HuggingFaceTB/SmolLM3-3B";
40-
const tokenizerJson = await fetch(`https://huggingface.co/${modelId}/resolve/main/tokenizer.json`).then(res => res.json());
41-
const tokenizerConfig = await fetch(`https://huggingface.co/${modelId}/resolve/main/tokenizer_config.json`).then(res => res.json());
48+
const tokenizerJson = await fetch(`https://huggingface.co/${modelId}/resolve/main/tokenizer.json`).then((res) => res.json());
49+
const tokenizerConfig = await fetch(`https://huggingface.co/${modelId}/resolve/main/tokenizer_config.json`).then((res) => res.json());
4250

4351
// Create tokenizer
4452
const tokenizer = new Tokenizer(tokenizerJson, tokenizerConfig);
4553

4654
// Tokenize text
47-
const tokens = tokenizer.tokenize('Hello World'); // ['Hello', 'ĠWorld']
48-
const encoded = tokenizer.encode('Hello World'); // { ids: [9906, 4435], tokens: ['Hello', 'ĠWorld'], attention_mask: [1, 1] }
49-
const decoded = tokenizer.decode(encoded.ids); // 'Hello World'
55+
const tokens = tokenizer.tokenize("Hello World"); // ['Hello', 'ĠWorld']
56+
const encoded = tokenizer.encode("Hello World"); // { ids: [9906, 4435], tokens: ['Hello', 'ĠWorld'], attention_mask: [1, 1] }
57+
const decoded = tokenizer.decode(encoded.ids); // 'Hello World'
5058
```
5159

5260
## Requirements
5361

5462
This library expects two files from Hugging Face models:
63+
5564
- `tokenizer.json` - Contains the tokenizer configuration
5665
- `tokenizer_config.json` - Contains additional metadata
5766

@@ -60,6 +69,7 @@ This library expects two files from Hugging Face models:
6069
Tokenizers.js supports [Hugging Face tokenizer components](https://huggingface.co/docs/tokenizers/components):
6170

6271
### Normalizers
72+
6373
- NFD
6474
- NFKC
6575
- NFC
@@ -73,6 +83,7 @@ Tokenizers.js supports [Hugging Face tokenizer components](https://huggingface.c
7383
- Sequence
7484

7585
### Pre-tokenizers
86+
7687
- BERT
7788
- ByteLevel
7889
- Whitespace
@@ -84,19 +95,22 @@ Tokenizers.js supports [Hugging Face tokenizer components](https://huggingface.c
8495
- Digits
8596

8697
### Models
98+
8799
- BPE (Byte-Pair Encoding)
88100
- WordPiece
89101
- Unigram
90102
- Legacy
91103

92104
### Post-processors
105+
93106
- ByteLevel
94107
- TemplateProcessing
95108
- RobertaProcessing
96109
- BertProcessing
97110
- Sequence
98111

99112
### Decoders
113+
100114
- ByteLevel
101115
- WordPiece
102116
- Metaspace

package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@huggingface/tokenizers",
3-
"version": "0.0.1",
3+
"version": "0.0.2",
44
"description": "🤗 Tokenizers.js: A pure JS/TS implementation of today's most used tokenizers",
55
"type": "module",
66
"main": "dist/tokenizers.min.mjs",
@@ -32,7 +32,11 @@
3232
"dev": "npm run clean && node scripts/dev.mjs",
3333
"lint": "eslint src --ext .ts,.tsx",
3434
"test": "node --experimental-vm-modules --expose-gc --max-old-space-size=4096 node_modules/jest/bin/jest.js --verbose --logHeapUsage --maxWorkers=10%",
35-
"format": "prettier --write \"src/**/*.ts\" \"tests/**/*.ts\""
35+
"format": "prettier --write .",
36+
"format:check": "prettier --check ."
37+
},
38+
"publishConfig": {
39+
"access": "public"
3640
},
3741
"devDependencies": {
3842
"@types/jest": "^30.0.0",

scripts/build.mjs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ const build = async (outfile) => {
4343
...minifyOptions,
4444
});
4545
reportSize(outfile);
46-
}
46+
};
4747

4848
await build("dist/tokenizers.mjs");
4949
await build("dist/tokenizers.cjs");

src/static/tokenizer.d.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ export interface TokenizerJSON {
7979
added_tokens?: AddedToken[];
8080
normalizer?: TokenizerConfigNormalizer;
8181
pre_tokenizer?: TokenizerConfigPreTokenizer;
82-
post_processor?: TokenConfigPostProcessor;
82+
post_processor?: TokenizerConfigPostProcessor;
8383
decoder?: TokenizerConfigDecoder;
8484
model: TokenizerModelConfig;
8585
}

tests/data/.gitignore

Lines changed: 0 additions & 2 deletions
This file was deleted.

tsconfig.json

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
"jsx": "react-jsx",
77
"moduleResolution": "bundler",
88
"esModuleInterop": true,
9-
"skipLibCheck": true,
109
"declaration": true,
1110
"declarationDir": "types",
1211
"outDir": "types",

0 commit comments

Comments
 (0)