use llama tokenize
hkvision committed Jan 27, 2025
1 parent 8ac2c89 commit bd4da2e
Showing 1 changed file with 55 additions and 8 deletions.
@@ -37,13 +37,36 @@
#endif


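// Tokenizer state built from a vocab-only llama.cpp model: the context plus BOS/EOS ids and special-token handling.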
struct gguf_tokenizer_params {
llama_context * ctx;
int32_t bos_token_id;
int32_t eos_token_id;
bool add_bos;
bool parse_special;
};


static void print_usage(int, char ** argv) {
printf("\nexample usage:\n");
printf("\n %s -m npu_model_dir [-cnv] [-n n_predict] [prompt]\n", argv[0]);
printf("\n");
}


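// Tokenize the prompt with the GGUF vocabulary via llama_tokenize, optionally prepending BOS and parsing special tokens.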
std::vector<int32_t> gguf_tokenize(std::string prompt,
gguf_tokenizer_params tok_params) {
std::vector<int32_t> ids = llama_tokenize(tok_params.ctx, prompt,
tok_params.add_bos, tok_params.parse_special);
return ids;
}

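// Detokenize generated token ids back to text with llama_detokenize.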
std::string gguf_decode(std::vector<int32_t> tokens, gguf_tokenizer_params tok_params) {
std::string output = llama_detokenize(tok_params.ctx, tokens, tok_params.parse_special);
return output;
}


const std::string llama2_template = "<s>[INST] <<SYS>>\n\n<</SYS>>\n\n%s [/INST]";
const std::string llama3_template = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n%s<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n";
const std::string minicpm_template = "<用户>%s<AI>";
@@ -113,7 +136,7 @@ std::string add_chat_history(npu_model_params model_params,
}

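// Prefill the prompt, then decode tokens one at a time until EOS or the token budget is reached.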
std::string run_generate(void* void_model, int32_t* embd_inp_ptr, int32_t embd_inp_size,
npu_model_params model_params, tokenizer_params tok_params, npu_generation_params generation_params){
npu_model_params model_params, gguf_tokenizer_params tok_params, npu_generation_params generation_params){
float* logits = run_prefill(void_model, embd_inp_ptr, embd_inp_size,
generation_params.repetition_penalty);
int32_t token = llm_sample_token(logits, true, model_params.vocab_size);
@@ -126,15 +149,15 @@ std::string run_generate(void* void_model, int32_t* embd_inp_ptr, int32_t embd_i
auto logits = run_decode(void_model, embd[i-1],
generation_params.repetition_penalty);
int32_t token = llm_sample_token(logits, true, model_params.vocab_size);
if (std::find(tok_params.eos_token_id.begin(), tok_params.eos_token_id.end(), token) == tok_params.eos_token_id.end()){
if (tok_params.eos_token_id != token){
embd.push_back(token);
token_nums ++;
} else {
break;
}
}

std::string output = llm_decode(embd);
std::string output = gguf_decode(embd, tok_params);

return output;
}
@@ -233,8 +256,27 @@ int main(int argc, char ** argv) {
npu_model_params model_params_npu;
load_config_from_file(model_params_npu, npu_common_params.model);

tokenizer_params tok_params;
load_tokenizer(tok_params, npu_common_params.model);
// load gguf model again with vocab_only for tokenizer
llama_backend_init();
llama_model_params gguf_model_params = llama_model_default_params();
gguf_model_params.vocab_only = true;
llama_model * gguf_model = llama_load_model_from_file(params.model.c_str(), gguf_model_params);
if (!gguf_model) {
fprintf(stderr, "Error: could not load model from file '%s'.\n", params.model.c_str());
return 1;
}
auto cparams = llama_context_default_params();
llama_context * vocab_ctx = llama_new_context_with_model(gguf_model, cparams);

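// Fill tokenizer params from the GGUF vocab: BOS/EOS ids and whether a BOS token should be prepended.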
gguf_tokenizer_params tok_params;
tok_params.ctx = vocab_ctx;
tok_params.bos_token_id = llama_token_bos(gguf_model);
tok_params.eos_token_id = llama_token_eos(gguf_model);
tok_params.add_bos = llama_add_bos_token(gguf_model);
tok_params.parse_special = true;

// tokenizer_params tok_params;
// load_tokenizer(tok_params, npu_common_params.model);

npu_generation_params generation_params;
load_generation_config_from_file(generation_params, npu_common_params.model);
@@ -255,11 +297,11 @@ int main(int argc, char ** argv) {
std::string full_prompt = add_chat_history(model_params_npu, prompt, history, true);

// tokenize input
std::vector<int32_t> embd_inp = llm_tokenize(full_prompt, false);
std::vector<int32_t> embd_inp = gguf_tokenize(full_prompt, tok_params);
if (embd_inp.size() > model_params_npu.max_prompt_len){
// empty chat history
full_prompt = add_chat_history(model_params_npu, prompt, "", true);
embd_inp = llm_tokenize(full_prompt, false);
embd_inp = gguf_tokenize(full_prompt, tok_params);
}

generation_params.max_new_token = model_params_npu.kv_len - embd_inp.size();
@@ -280,7 +322,7 @@ int main(int argc, char ** argv) {
std::string full_prompt = add_chat_template(model_params_npu, params.prompt);

// tokenize input
std::vector<int32_t> embd_inp = llm_tokenize(full_prompt, false);
std::vector<int32_t> embd_inp = gguf_tokenize(full_prompt, tok_params);

// single text generation
std::string output = run_generate(npu_model, embd_inp.data(), embd_inp.size(),
Expand All @@ -289,5 +331,10 @@ int main(int argc, char ** argv) {
std::cout << output << std::endl << std::endl;
llm_perf_print(npu_model);
}

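// Free the vocab-only tokenizer context and model, then shut down the llama backend.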
llama_free(vocab_ctx);
llama_free_model(gguf_model);

llama_backend_free();
return 0;
}
