Skip to content

Commit 0f7e5c2

Browse files
committed
Add web binding Tokenizer.tokenToId()
1 parent 4bb7533 commit 0f7e5c2

File tree

4 files changed

+32
-2
lines changed

4 files changed

+32
-2
lines changed

src/sentencepiece_tokenizer.cc

+7-1
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,13 @@ class SentencePieceTokenizer : public Tokenizer {
3737

3838
std::string IdToToken(int32_t id) final { return sentence_piece_.IdToPiece(id); }  // Returns the piece string for `id` by forwarding to sentence_piece_.IdToPiece.
3939

40-
int32_t TokenToId(const std::string& token) final { return sentence_piece_.PieceToId(token); }
40+
int32_t TokenToId(const std::string& token) final {  // Returns the id of `token`, or -1 when the lookup resolves to the unknown-piece id.
41+
int32_t id = sentence_piece_.PieceToId(token);
42+
if (id == sentence_piece_.unk_id()) {  // Report "not found" as -1 instead of exposing the unknown-piece id to callers.
43+
return -1;  // NOTE(review): a token that is itself the unknown piece presumably also lands here -- confirm this is intended.
44+
}
45+
return id;
46+
}
4147

4248
private:
4349
// the tokenizer

web/src/tokenizers.ts

+11
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,17 @@ export class Tokenizer {
7575
return res;
7676
}
7777

78+
/**
79+
* Convert the given token to its corresponding id by delegating to the underlying tokenizer handle. If the token does not exist, return -1.
80+
*
81+
* @param token the input token string.
82+
* @returns The id of the token, or -1 if the token does not exist.
83+
*/
84+
tokenToId(token: string): number {
85+
const id = this.handle.TokenToId(token.slice()); // NOTE(review): slice() with no arguments yields the same string contents; presumably a defensive copy before crossing the binding -- confirm it is needed.
86+
return id;
87+
}
88+
7889
/**
7990
* Create a tokenizer from jsonArrayBuffer
8091
*

web/src/tokenizers_binding.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,6 @@ EMSCRIPTEN_BINDINGS(tokenizers) {
2323
.function("Encode", &tokenizers::Tokenizer::Encode)
2424
.function("Decode", &tokenizers::Tokenizer::Decode)
2525
.function("GetVocabSize", &tokenizers::Tokenizer::GetVocabSize)
26-
.function("IdToToken", &tokenizers::Tokenizer::IdToToken);
26+
.function("IdToToken", &tokenizers::Tokenizer::IdToToken)
27+
.function("TokenToId", &tokenizers::Tokenizer::TokenToId);
2728
}

web/tests/src/index.ts

+12
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,18 @@ async function testJSONTokenizer() {
2727
if (tok49407 !== "<|endoftext|>") {
2828
throw Error("Expect token 49407 to be <|endoftext|>");
2929
}
30+
31+
const id0 = tok.tokenToId("!");
32+
console.log("id0=" + id0);
33+
if (id0 !== 0) {
34+
throw Error("Expect token 0 to be !");
35+
}
36+
37+
const id49407 = tok.tokenToId("<|endoftext|>");
38+
console.log("id49407=" + id49407);
39+
if (id49407 !== 49407) {
40+
throw Error("Expect token 49407 to be <|endoftext|>");
41+
}
3042
}
3143

3244
async function testLlamaTokenizer() {

0 commit comments

Comments
 (0)