File tree 4 files changed +32
-2
lines changed
4 files changed +32
-2
lines changed Original file line number Diff line number Diff line change @@ -37,7 +37,13 @@ class SentencePieceTokenizer : public Tokenizer {
37
37
38
38
// Returns the piece string that the wrapped SentencePiece processor
// associates with `id`.
std::string IdToToken(int32_t id) final {
  std::string piece = sentence_piece_.IdToPiece(id);
  return piece;
}
39
39
40
- int32_t TokenToId (const std::string& token) final { return sentence_piece_.PieceToId (token); }
40
+ int32_t TokenToId (const std::string& token) final {
41
+ int32_t id = sentence_piece_.PieceToId (token);
42
+ if (id == sentence_piece_.unk_id ()) {
43
+ return -1 ;
44
+ }
45
+ return id;
46
+ }
41
47
42
48
private:
43
49
// the tokenizer
Original file line number Diff line number Diff line change @@ -75,6 +75,17 @@ export class Tokenizer {
75
75
return res ;
76
76
}
77
77
78
/**
 * Look up the id of a single token string.
 *
 * @param token the token string to look up.
 * @returns The id reported by the underlying tokenizer handle; by contract
 *          this is -1 when the token does not exist.
 */
tokenToId(token: string): number {
  const piece = token.slice();
  return this.handle.TokenToId(piece);
}
88
+
78
89
/**
79
90
* Create a tokenizer from jsonArrayBuffer
80
91
*
Original file line number Diff line number Diff line change @@ -23,5 +23,6 @@ EMSCRIPTEN_BINDINGS(tokenizers) {
23
23
.function (" Encode" , &tokenizers::Tokenizer::Encode)
24
24
.function (" Decode" , &tokenizers::Tokenizer::Decode)
25
25
.function (" GetVocabSize" , &tokenizers::Tokenizer::GetVocabSize)
26
- .function (" IdToToken" , &tokenizers::Tokenizer::IdToToken);
26
+ .function (" IdToToken" , &tokenizers::Tokenizer::IdToToken)
27
+ .function (" TokenToId" , &tokenizers::Tokenizer::TokenToId);
27
28
}
Original file line number Diff line number Diff line change @@ -27,6 +27,18 @@ async function testJSONTokenizer() {
27
27
if ( tok49407 !== "<|endoftext|>" ) {
28
28
throw Error ( "Expect token 49407 to be <|endoftext|>" ) ;
29
29
}
30
+
31
+ const id0 = tok . tokenToId ( "!" ) ;
32
+ console . log ( "id0=" + id0 ) ;
33
+ if ( id0 !== 0 ) {
34
+ throw Error ( "Expect token 0 to be !" ) ;
35
+ }
36
+
37
+ const id49407 = tok . tokenToId ( "<|endoftext|>" ) ;
38
+ console . log ( "id49407=" + id49407 ) ;
39
+ if ( id49407 !== 49407 ) {
40
+ throw Error ( "Expect token 49407 to be <|endoftext|>" ) ;
41
+ }
30
42
}
31
43
32
44
async function testLlamaTokenizer ( ) {
You can’t perform that action at this time.
0 commit comments