llama : support raw NUL bytes in tokens #8992

Open · wants to merge 1 commit into master
10 changes: 2 additions & 8 deletions convert_hf_to_gguf.py
```diff
@@ -2224,9 +2224,8 @@ class InternLM2Model(Model):
 
     def set_vocab(self):
-        # (TODO): Is there a better way?
-        # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character
-        # \x00 specially and convert it into an emoji character to prevent it from being mistakenly
-        # recognized as an empty string in C++.
+        # Copy from _set_vocab_sentencepiece, The only difference is that we find mislabeled UNUSED tokens,
+        # and that we set '<|im_end|>' as the eos token for chat models.
         from sentencepiece import SentencePieceProcessor
         from sentencepiece import sentencepiece_model_pb2 as model
 
@@ -2253,11 +2252,6 @@ def set_vocab(self):
             piece = tokenizer.IdToPiece(token_id)
             text = piece.encode("utf-8")
             score = tokenizer.GetScore(token_id)
-            if text == b"\x00":
-                # (TODO): fixme
-                # Hack here and replace the \x00 characters.
-                logger.warning(f"InternLM2 convert token '{text}' to '🐉'!")
-                text = "🐉".encode("utf-8")
 
             toktype = SentencePieceTokenTypes.NORMAL
             if tokenizer.IsUnknown(token_id):
```
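The emoji workaround deleted above existed because the C++ side built `std::string` values from bare `char` pointers, and that constructor stops at the first NUL byte, so a token whose content is exactly `b"\x00"` came back as an empty string. A minimal sketch of the failure mode and of the pointer-plus-length pattern the rest of this PR applies (the byte values here are invented for illustration):

```cpp
#include <cassert>
#include <string>

int main() {
    // three token bytes with an embedded NUL
    const char buf[] = {'a', '\0', 'b'};

    std::string lossy(buf);     // const char * ctor stops at the NUL -> "a"
    std::string exact(buf, 3);  // pointer + explicit length keeps all 3 bytes

    assert(lossy.size() == 1);
    assert(exact.size() == 3);
    return 0;
}
```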
```diff
@@ -561,7 +561,7 @@ static void load_vocab(const char * filename, const Config * config, struct llam
     vocab->id_to_token.resize(n_vocab);
 
     for (uint32_t i = 0; i < n_vocab; i++) {
-        std::string word = gguf_get_arr_str(ctx, token_idx, i);
+        std::string word(gguf_get_arr_str(ctx, token_idx, i), gguf_get_arr_str_n(ctx, token_idx, i));
 
         vocab->token_to_id[word] = i;
 
```
2 changes: 1 addition & 1 deletion examples/export-lora/export-lora.cpp
```diff
@@ -12,7 +12,7 @@ static bool g_verbose = false;
 
 static std::string get_kv_str(struct gguf_context * ctx_gguf, const std::string & key){
     int id = gguf_find_key(ctx_gguf, key.c_str());
-    return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
+    return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id), gguf_get_val_str_n(ctx_gguf, id));
 }
 
 static float get_kv_f32(struct gguf_context * ctx_gguf, const std::string & key) {
```
4 changes: 2 additions & 2 deletions examples/llava/clip.cpp
```diff
@@ -225,7 +225,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
 
     switch (type) {
         case GGUF_TYPE_STRING:
-            return gguf_get_val_str(ctx_gguf, i);
+            return std::string(gguf_get_val_str(ctx_gguf, i), gguf_get_val_str_n(ctx_gguf, i));
         case GGUF_TYPE_ARRAY:
             {
                 const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
@@ -235,7 +235,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
                 ss << "[";
                 for (int j = 0; j < arr_n; j++) {
                     if (arr_type == GGUF_TYPE_STRING) {
-                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        std::string val(gguf_get_arr_str(ctx_gguf, i, j), gguf_get_arr_str_n(ctx_gguf, i, j));
                         // escape quotes
                         replace_all(val, "\\", "\\\\");
                         replace_all(val, "\"", "\\\"");
```
2 changes: 2 additions & 0 deletions ggml/include/ggml.h
```diff
@@ -2313,10 +2313,12 @@ extern "C" {
     GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
     GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+    GGML_API int          gguf_get_val_str_n(const struct gguf_context * ctx, int key_id);
     GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
     GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
     GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
+    GGML_API int          gguf_get_arr_str_n(const struct gguf_context * ctx, int key_id, int i);
 
     GGML_API int          gguf_get_n_tensors    (const struct gguf_context * ctx);
     GGML_API int          gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
```
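With the two new getters, callers can pair each string pointer with its byte length instead of relying on NUL termination. A usage sketch under stated assumptions — the wrapper function, file name parameter, and choice of key are illustrative, not part of this PR:

```cpp
#include <string>

#include "ggml.h"

// read a string-typed KV pair without truncating at embedded NUL bytes
static std::string read_str_kv(const char * fname, const char * key) {
    struct gguf_init_params params = { /*no_alloc =*/ true, /*ctx =*/ NULL };
    struct gguf_context * ctx = gguf_init_from_file(fname, params);
    if (!ctx) {
        return "";
    }
    std::string val;
    const int kid = gguf_find_key(ctx, key);
    if (kid >= 0 && gguf_get_kv_type(ctx, kid) == GGUF_TYPE_STRING) {
        // pointer + length, so embedded NULs survive the copy
        val = std::string(gguf_get_val_str(ctx, kid), gguf_get_val_str_n(ctx, kid));
    }
    gguf_free(ctx);
    return val;
}
```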
14 changes: 14 additions & 0 deletions ggml/src/ggml.c
```diff
@@ -21335,6 +21335,14 @@ const char * gguf_get_arr_str(const struct gguf_context * ctx, int key_id, int i
     return str->data;
 }
 
+int gguf_get_arr_str_n(const struct gguf_context * ctx, int key_id, int i) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
+    struct gguf_kv * kv = &ctx->kv[key_id];
+    struct gguf_str * str = &((struct gguf_str *) kv->value.arr.data)[i];
+    return str->n;
+}
+
 int gguf_get_arr_n(const struct gguf_context * ctx, int key_id) {
     GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_ARRAY);
@@ -21413,6 +21421,12 @@ const char * gguf_get_val_str(const struct gguf_context * ctx, int key_id) {
     return ctx->kv[key_id].value.str.data;
 }
 
+int gguf_get_val_str_n(const struct gguf_context * ctx, int key_id) {
+    GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
+    GGML_ASSERT(ctx->kv[key_id].type == GGUF_TYPE_STRING);
+    return ctx->kv[key_id].value.str.n;
+}
+
 const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id) {
     GGML_ASSERT(key_id >= 0 && key_id < gguf_get_n_kv(ctx));
     GGML_ASSERT(ctx->kv[key_id].type != GGUF_TYPE_ARRAY);
```
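Both implementations simply expose the length field that GGUF strings already carry internally; paraphrasing the struct in ggml.c, the representation is roughly:

```cpp
// sketch of ggml's internal GGUF string (paraphrased from ggml.c): the byte
// count travels next to the data pointer, so `data` may legally contain raw
// NUL bytes anywhere
struct gguf_str {
    uint64_t n;    // length in bytes, excluding any terminator
    char   * data;
};
```

Note that the new accessors return `int` while `n` is stored as a 64-bit count, so the value is implicitly narrowed; for token and metadata strings the length stays far below `INT_MAX` in practice.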
12 changes: 6 additions & 6 deletions src/llama.cpp
```diff
@@ -1406,7 +1406,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
 
     switch (type) {
         case GGUF_TYPE_STRING:
-            return gguf_get_val_str(ctx_gguf, i);
+            return std::string(gguf_get_val_str(ctx_gguf, i), gguf_get_val_str_n(ctx_gguf, i));
         case GGUF_TYPE_ARRAY:
             {
                 const enum gguf_type arr_type = gguf_get_arr_type(ctx_gguf, i);
@@ -1416,7 +1416,7 @@ static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
                 ss << "[";
                 for (int j = 0; j < arr_n; j++) {
                     if (arr_type == GGUF_TYPE_STRING) {
-                        std::string val = gguf_get_arr_str(ctx_gguf, i, j);
+                        std::string val(gguf_get_arr_str(ctx_gguf, i, j), gguf_get_arr_str_n(ctx_gguf, i, j));
                         // escape quotes
                         replace_all(val, "\\", "\\\\");
                         replace_all(val, "\"", "\\\"");
@@ -3436,7 +3436,7 @@ namespace GGUFMeta {
         static constexpr gguf_type gt = GGUF_TYPE_STRING;
 
         static std::string getter(const gguf_context * ctx, const int kid) {
-            return gguf_get_val_str(ctx, kid);
+            return std::string(gguf_get_val_str(ctx, kid), gguf_get_val_str_n(ctx, kid));
         }
     };
 
@@ -5316,7 +5316,7 @@ static void llm_load_vocab(
 
         const int n_merges = gguf_get_arr_n(ctx, merges_keyidx);
         for (int i = 0; i < n_merges; i++) {
-            const std::string word = gguf_get_arr_str(ctx, merges_keyidx, i);
+            const std::string word(gguf_get_arr_str(ctx, merges_keyidx, i), gguf_get_arr_str_n(ctx, merges_keyidx, i));
             GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
             std::string first;
@@ -5521,7 +5521,7 @@ static void llm_load_vocab(
     vocab.id_to_token.resize(n_vocab);
 
     for (uint32_t i = 0; i < n_vocab; i++) {
-        std::string word = gguf_get_arr_str(ctx, token_idx, i);
+        std::string word(gguf_get_arr_str(ctx, token_idx, i), gguf_get_arr_str_n(ctx, token_idx, i));
         GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
 
         vocab.token_to_id[word] = i;
@@ -16207,7 +16207,7 @@ static void llama_lora_adapter_init_internal(struct llama_model * model, const c
     {
        auto get_kv_str = [&](const std::string & key) -> std::string {
            int id = gguf_find_key(ctx_gguf, key.c_str());
-            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id));
+            return id < 0 ? "" : std::string(gguf_get_val_str(ctx_gguf, id), gguf_get_val_str_n(ctx_gguf, id));
        };
        auto get_kv_f32 = [&](const std::string & key) -> float {
            int id = gguf_find_key(ctx_gguf, key.c_str());
```
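The vocab lookups patched above are keyed by `std::string`, whose equality is byte-wise and length-aware, so once the full length is preserved, tokens that differ only by an embedded NUL no longer collide with their truncated prefixes. A self-contained sketch (the token bytes are invented):

```cpp
#include <cassert>
#include <map>
#include <string>

int main() {
    std::map<std::string, int> token_to_id;

    // explicit lengths keep the embedded NUL: a 3-byte key and a 2-byte key
    token_to_id[std::string("a\0b", 3)] = 1;
    token_to_id[std::string("ab",   2)] = 2;

    // distinct entries, and the NUL-containing key round-trips
    assert(token_to_id.size() == 2);
    assert(token_to_id.at(std::string("a\0b", 3)) == 1);
    return 0;
}
```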