@@ -3503,8 +3503,14 @@ void llama_vocab_plamo2::build(const std::vector<vocab_entry> & vocab) {
3503
3503
3504
3504
// Add token and all its suffixes to suffix_to_score
3505
3505
suffix_to_score[entry.text ] = entry.score ;
3506
- for (size_t i = 1 ; i < entry.text .length (); ++i) {
3507
- std::string suffix = entry.text .substr (i);
3506
+
3507
+ // Extract suffixes character by character (UTF-8 aware)
3508
+ std::vector<uint32_t > cpts = unicode_cpts_from_utf8 (entry.text );
3509
+ for (size_t i = 1 ; i < cpts.size (); ++i) {
3510
+ std::string suffix;
3511
+ for (size_t j = i; j < cpts.size (); ++j) {
3512
+ suffix += unicode_cpt_to_utf8 (cpts[j]);
3513
+ }
3508
3514
if (suffix_to_score.find (suffix) == suffix_to_score.end ()) {
3509
3515
suffix_to_score[suffix] = std::numeric_limits<float >::quiet_NaN ();
3510
3516
}
@@ -3535,26 +3541,34 @@ void llama_vocab_plamo2::build(const std::vector<vocab_entry> & vocab) {
3535
3541
std::unordered_map<std::string, int32_t > suffix_to_id;
3536
3542
int32_t num_pieces = 0 ;
3537
3543
3538
- for (const auto & s : suffixes) {
3539
- suffix_to_id[s ] = num_pieces;
3540
- if (!s .empty ()) {
3541
- // Convert first character to Unicode code point
3542
- std::vector< int32_t > unicode_chars = utf8_to_unicode (s);
3543
- if (!unicode_chars. empty ()) {
3544
- int64_t piece_code = ( static_cast < int64_t >(unicode_chars[ 0 ]) << 32 ) | suffix_to_id[s. substr ( 1 )];
3545
- to_suffix_id_[piece_code] = num_pieces ;
3544
+ for (const auto & suffix : suffixes) {
3545
+ suffix_to_id[suffix ] = num_pieces;
3546
+ if (!suffix .empty ()) {
3547
+ std::vector< uint32_t > cpts = unicode_cpts_from_utf8 (suffix);
3548
+
3549
+ std::string remaining;
3550
+ for ( size_t i = 1 ; i < cpts. size (); ++i) {
3551
+ remaining += unicode_cpt_to_utf8 (cpts[i]) ;
3546
3552
}
3547
- }
3548
3553
3549
- // Count number of pieces for this suffix
3550
- int32_t pieces_for_suffix = 1 ; // sentinel row
3551
- for (size_t i = 1 ; i <= s.length (); ++i) {
3552
- std::string prefix = s.substr (0 , i);
3553
- if (suffix_to_score.find (prefix) != suffix_to_score.end ()) {
3554
- pieces_for_suffix++;
3554
+ int64_t piece_code = (static_cast <int64_t >(cpts[0 ]) << 32 ) | suffix_to_id[remaining];
3555
+ to_suffix_id_[piece_code] = num_pieces;
3556
+
3557
+ // Count number of pieces for this suffix
3558
+ int32_t pieces_for_suffix = 1 ; // sentinel row
3559
+ for (int32_t piece_length = static_cast <int32_t >(cpts.size ()); piece_length > 0 ; --piece_length) {
3560
+ std::string piece;
3561
+ for (int32_t i = 0 ; i < piece_length; ++i) {
3562
+ piece += unicode_cpt_to_utf8 (cpts[i]);
3563
+ }
3564
+ if (suffix_to_score.find (piece) != suffix_to_score.end ()) {
3565
+ pieces_for_suffix++;
3566
+ }
3555
3567
}
3568
+ num_pieces += pieces_for_suffix;
3569
+ } else {
3570
+ num_pieces++; // Empty suffix contributes one piece (sentinel row)
3556
3571
}
3557
- num_pieces += pieces_for_suffix;
3558
3572
}
3559
3573
3560
3574
// Build flattened table
@@ -3563,8 +3577,13 @@ void llama_vocab_plamo2::build(const std::vector<vocab_entry> & vocab) {
3563
3577
3564
3578
for (const auto & suffix : suffixes) {
3565
3579
// Add all prefixes of the suffix to the table (in decreasing order of length)
3566
- for (int32_t piece_length = static_cast <int32_t >(suffix.length ()); piece_length > 0 ; --piece_length) {
3567
- std::string piece = suffix.substr (0 , piece_length);
3580
+ std::vector<uint32_t > cpts = unicode_cpts_from_utf8 (suffix);
3581
+ for (int32_t piece_length = static_cast <int32_t >(cpts.size ()); piece_length > 0 ; --piece_length) {
3582
+ std::string piece;
3583
+ for (int32_t i = 0 ; i < piece_length; ++i) {
3584
+ piece += unicode_cpt_to_utf8 (cpts[i]);
3585
+ }
3586
+
3568
3587
auto score_it = suffix_to_score.find (piece);
3569
3588
if (score_it == suffix_to_score.end ()) {
3570
3589
continue ;
@@ -3590,51 +3609,7 @@ void llama_vocab_plamo2::build(const std::vector<vocab_entry> & vocab) {
3590
3609
}
3591
3610
}
3592
3611
3593
// Removed by this change: hand-written UTF-8 -> code point decoder for the PLaMo-2 vocab.
// The `+` lines elsewhere in this diff call unicode_cpts_from_utf8 where this was used.
// What the removed code did:
//   - picks the sequence length from the lead byte's high bits only;
//   - reads continuation bytes at ptr + 1..3 without comparing against `end`
//     and without checking that they have the 10xxxxxx form;
//   - silently skips any byte that matches none of the lead-byte patterns.
// Returns the decoded code points as int32_t values, in input order.
- std::vector<int32_t > llama_vocab_plamo2::utf8_to_unicode (const std::string & text) const {
3594
- std::vector<int32_t > result;
3595
- const char * ptr = text.c_str ();
3596
- const char * end = ptr + text.length ();
3597
-
3598
- while (ptr < end) {
3599
- int32_t codepoint = 0 ;
3600
- int bytes_read = 0 ;
3601
-
3602
- if ((*ptr & 0x80 ) == 0 ) {
3603
- // ASCII: single byte with the high bit clear (0xxxxxxx)
3604
- codepoint = *ptr;
3605
- bytes_read = 1 ;
3606
- } else if ((*ptr & 0xE0 ) == 0xC0 ) {
3607
- // 2-byte UTF-8: lead 110xxxxx contributes 5 bits, next byte contributes its low 6 bits
3608
- codepoint = (*ptr & 0x1F ) << 6 ;
3609
- codepoint |= (*(ptr + 1 ) & 0x3F );
3610
- bytes_read = 2 ;
3611
- } else if ((*ptr & 0xF0 ) == 0xE0 ) {
3612
- // 3-byte UTF-8: lead 1110xxxx contributes 4 bits, then 6 bits from each of the next two bytes
3613
- codepoint = (*ptr & 0x0F ) << 12 ;
3614
- codepoint |= (*(ptr + 1 ) & 0x3F ) << 6 ;
3615
- codepoint |= (*(ptr + 2 ) & 0x3F );
3616
- bytes_read = 3 ;
3617
- } else if ((*ptr & 0xF8 ) == 0xF0 ) {
3618
- // 4-byte UTF-8: lead 11110xxx contributes 3 bits, then 6 bits from each of the next three bytes
3619
- codepoint = (*ptr & 0x07 ) << 18 ;
3620
- codepoint |= (*(ptr + 1 ) & 0x3F ) << 12 ;
3621
- codepoint |= (*(ptr + 2 ) & 0x3F ) << 6 ;
3622
- codepoint |= (*(ptr + 3 ) & 0x3F );
3623
- bytes_read = 4 ;
3624
- } else {
3625
- // Invalid UTF-8 lead (10xxxxxx or 11111xxx): skip this byte without emitting a code point
3626
- ptr++;
3627
- continue ;
3628
- }
3629
-
3630
- result.push_back (codepoint);
3631
- ptr += bytes_read;
3632
- }
3633
-
3634
- return result;
3635
- }
3636
-
3637
- std::vector<llama_token> llama_vocab_plamo2::encode_unicode (const std::vector<int32_t > & unicode_data) const {
3612
+ std::vector<llama_token> llama_vocab_plamo2::encode_unicode (const std::vector<uint32_t > & unicode_data) const {
3638
3613
if (unicode_data.empty ()) {
3639
3614
return {};
3640
3615
}
@@ -3652,7 +3627,7 @@ std::vector<llama_token> llama_vocab_plamo2::encode_unicode(const std::vector<in
3652
3627
3653
3628
// Process from end to beginning
3654
3629
for (int i = static_cast <int >(data_len) - 1 ; i >= 0 ; --i) {
3655
- int32_t c = unicode_data[i];
3630
+ uint32_t c = unicode_data[i];
3656
3631
3657
3632
// Find next suffix ID
3658
3633
for (size_t p = suffix_id; p < table_.size (); ++p) {
@@ -3701,40 +3676,38 @@ std::vector<llama_token> llama_vocab_plamo2::encode_unicode(const std::vector<in
3701
3676
token_ids.push_back (path[pos][PATH_TOKEN_ID]);
3702
3677
} else {
3703
3678
// Fall back to byte tokens
3704
- int32_t c = unicode_data[pos];
3679
+ uint32_t c = unicode_data[pos];
3705
3680
int s = 1 + (c >= 0x80 ) + (c >= 0x800 ) + (c >= 0x10000 );
3706
3681
3707
- for (int j = 0 ; j < s; ++j) {
3708
- uint8_t b = (s == 1 ) ? c :
3709
- (j == 0 ) ? (0xF00 >> s) & 0xFF :
3710
- 0x80 | ((c >> ((s - j - 1 ) * 6 )) & 0x3F );
3711
- token_ids.push_back (bytes_[b]);
3682
+ for (int i = 0 ; i < s; ++i) {
3683
+ uint8_t b;
3684
+ if (s == 1 ) {
3685
+ b = c;
3686
+ } else {
3687
+ if (i == 0 ) {
3688
+ b = (0xF00 >> s) & 0xFF ;
3689
+ } else {
3690
+ b = 0x80 ;
3691
+ }
3692
+ }
3693
+ token_ids.push_back (bytes_[b | ((c >> ((s - i - 1 ) * 6 )) & 0x3F )]);
3712
3694
}
3713
3695
}
3714
3696
3697
+ assert (path[pos][PATH_TOKEN_LENGTH] > 0 );
3715
3698
pos += path[pos][PATH_TOKEN_LENGTH];
3716
3699
}
3717
3700
3718
3701
return token_ids;
3719
3702
}
3720
3703
3721
3704
// Diff view of llama_vocab_plamo2::encode (signature unchanged by this hunk).
//   `+` lines (new body): decode `text` with unicode_cpts_from_utf8, erase the first
//       code point when it equals 0xFEFF, and return encode_unicode(...) of the rest.
//   `-` lines (removed): the previous two-line body built on utf8_to_unicode, and the
//       whole encode_as_tokens helper, which mapped each id from encode() to tokens_[id]
//       and dropped ids outside [0, tokens_.size()).
std::vector<llama_token> llama_vocab_plamo2::encode (const std::string & text) const {
3722
- std::vector<int32_t > unicode_data = utf8_to_unicode (text);
3723
- return encode_unicode (unicode_data);
3724
- }
3725
-
3726
- std::vector<std::string> llama_vocab_plamo2::encode_as_tokens (const std::string & text) const {
3727
- std::vector<llama_token> token_ids = encode (text);
3728
- std::vector<std::string> result;
3729
- result.reserve (token_ids.size ());
3730
-
3731
- for (llama_token id : token_ids) {
3732
- if (id >= 0 && id < static_cast <llama_token>(tokens_.size ())) {
3733
- result.push_back (tokens_[id]);
3734
- }
3705
+ std::vector<uint32_t > unicode_data = unicode_cpts_from_utf8 (text);
3706
+ // Drop a leading U+FEFF (byte order mark) so it is not handed to encode_unicode as a code point; only position 0 is checked
3707
+ if (!unicode_data.empty () && unicode_data[0 ] == 0xFEFF ) {
3708
+ unicode_data.erase (unicode_data.begin ());
3735
3709
}
3736
-
3737
- return result;
3710
+ return encode_unicode (unicode_data);
3738
3711
}
3739
3712
3740
3713
const std::string & llama_vocab_plamo2::get_token_text (llama_token id) const {
0 commit comments