diff --git a/common/arg.cpp b/common/arg.cpp
index 9f87e9910b540..3b752848dc1f5 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -5,6 +5,7 @@
 #include "log.h"
 #include "sampling.h"
 #include "chat.h"
+#include "catalog.h"
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -608,7 +609,10 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
  *
  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
-static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+static struct common_hf_file_res common_get_hf_file(
+        const std::string & hf_repo_with_tag,
+        const std::string & bearer_token,
+        const std::string & model_endpoint) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
     std::string tag = parts.size() > 1 ? parts.back() : "latest";
     std::string hf_repo = parts[0];
@@ -616,7 +620,7 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
         throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
     }
 
-    std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
+    std::string url = model_endpoint + "v2/" + hf_repo + "/manifests/" + tag;
 
     // headers
     std::vector<std::string> headers;
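For context, the endpoint is now always supplied by the caller and simply prefixed onto both request URLs. Below is a minimal standalone sketch of that URL scheme; the repo, tag, and file names are hypothetical and not part of this change:

```cpp
#include <iostream>
#include <string>

// Illustrates the two URL shapes built from the model endpoint: the registry
// manifest lookup used by common_get_hf_file() and the direct file download
// used by common_params_handle_model(). The endpoint must end with '/'.
int main() {
    const std::string endpoint = "https://huggingface.co/";      // or e.g. "https://hf-mirror.com/"
    const std::string repo     = "ggml-org/example-model-GGUF";  // hypothetical repo
    const std::string tag      = "latest";
    const std::string file     = "example-model-q8_0.gguf";      // hypothetical file

    std::cout << endpoint + "v2/" + repo + "/manifests/" + tag << "\n";   // manifest API
    std::cout << endpoint + repo + "/resolve/main/" + file << "\n";       // file download
}
```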
@@ -715,7 +719,7 @@ static bool common_download_model(
     return false;
 }
 
-static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, const std::string &) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return {};
 }
@@ -742,7 +746,7 @@ struct handle_model_result {
 static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default) {
+        const std::string & model_endpoint) {
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
@@ -750,7 +754,7 @@ static handle_model_result common_params_handle_model(
         // short-hand to avoid specifying --hf-file -> default it to --model
         if (model.hf_file.empty()) {
            if (model.path.empty()) {
-                auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, model_endpoint);
                 if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                     exit(1); // built without CURL, error message already printed
                 }
@@ -766,7 +770,6 @@ static handle_model_result common_params_handle_model(
             }
         }
 
-        std::string model_endpoint = get_model_endpoint();
         model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
         // make sure model path is present (for caching purposes)
         if (model.path.empty()) {
@@ -784,8 +787,6 @@ static handle_model_result common_params_handle_model(
             model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
         }
 
-    } else if (model.path.empty()) {
-        model.path = model_path_default;
     }
 
 }
@@ -835,7 +836,6 @@ static std::string get_all_kv_cache_types() {
 //
 
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
-    std::string arg;
     const std::string arg_prefix = "--";
     common_params & params = ctx_arg.params;
 
@@ -875,16 +875,91 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     };
 
+    // collect one optional positional argument and the remaining optional args
+    std::string input_pos_arg;
+    std::vector<std::string> input_opt_args;
+    input_opt_args.reserve(argc - 1);
     for (int i = 1; i < argc; i++) {
         const std::string arg_prefix = "--";
 
         std::string arg = argv[i];
+        if (arg_to_options.find(arg) == arg_to_options.end()) {
+            // if we don't have a match, check if this can be a positional argument
+            if (input_pos_arg.empty()) {
+                input_pos_arg = std::move(arg);
+                continue;
+            } else {
+                // the positional argument is already set, we cannot have another one
+                throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+            }
+        }
+
+        // normalize the argument (only applied to optional args)
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
-        if (arg_to_options.find(arg) == arg_to_options.end()) {
-            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+        input_opt_args.emplace_back(arg);
+    }
+
+    // handle the positional argument (we only support one positional argument)
+    // the logic is as follows:
+    // 1. we try to find the model name in the catalog
+    // 2. if not found, we check for a protocol:// prefix
+    // 3. if no protocol is found, we assume it is a local file
+    {
+        bool is_handled = false;
+        // check catalog
+        for (auto & entry : model_catalog) {
+            if (input_pos_arg == entry.name) {
+                is_handled = true;
+                // check if the model supports the current example
+                bool is_supported = false;
+                for (auto & ex : entry.examples) {
+                    if (ctx_arg.ex == ex) {
+                        is_supported = true;
+                        break;
+                    }
+                }
+                if (is_supported) {
+                    entry.handler(params);
+                } else {
+                    LOG_ERR("error: model '%s' is not supported by this tool\n", entry.name);
+                    exit(1);
+                }
+                break;
+            }
+        }
+        // check protocol
+        // for contributors: if you want to add a new protocol,
+        // please make sure it supports either the /resolve/main download path or the registry manifest API
+        // see common_params_handle_model() to understand how it is handled
+        // note: we don't support ollama because its registry usually serves proprietary model formats (incompatible with llama.cpp)
+        if (!is_handled) {
+            const std::string & arg = input_pos_arg;
+            // check if it is a URL
+            if (string_starts_with(arg, "http://") || string_starts_with(arg, "https://")) {
+                params.model.url = arg;
+            } else if (string_starts_with(arg, "hf://")) {
+                // hugging face repo
+                params.model.hf_repo = arg.substr(5);
+            } else if (string_starts_with(arg, "hf-mirror://")) {
+                // hugging face mirror (the endpoint must carry the scheme and a trailing slash, see get_model_endpoint())
+                params.custom_model_endpoint = "https://hf-mirror.com/";
+                params.model.hf_repo = arg.substr(12);
+            } else if (string_starts_with(arg, "ms://")) {
+                // modelscope
+                params.custom_model_endpoint = "https://modelscope.cn/";
+                params.model.hf_repo = arg.substr(5);
+            } else {
+                // assume it is a local file
+                params.model.path = arg;
+            }
         }
+    }
+
+    // handle optional args
+    for (size_t i = 0; i < input_opt_args.size(); i++) {
+        const std::string & arg = input_opt_args[i];
         auto opt = *arg_to_options[arg];
         if (opt.has_value_from_env()) {
             fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
@@ -934,7 +1009,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // handle model and download
     {
-        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        std::string model_endpoint = params.get_model_endpoint();
+        auto res = common_params_handle_model(params.model, params.hf_token, model_endpoint);
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -944,12 +1020,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         // only download mmproj if the current example is using it
         for (auto & ex : mmproj_examples) {
             if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj, params.hf_token, "");
+                common_params_handle_model(params.mmproj, params.hf_token, model_endpoint);
                 break;
             }
         }
-        common_params_handle_model(params.speculative.model, params.hf_token, "");
-        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+        common_params_handle_model(params.speculative.model, params.hf_token, model_endpoint);
+        common_params_handle_model(params.vocoder.model, params.hf_token, model_endpoint);
     }
 
     if (params.escape) {
@@ -985,6 +1061,13 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         ));
     }
 
+    if (params.model.path.empty()) {
+        throw std::invalid_argument(
+            "model path is empty\n"
+            "please specify a model file or use one from the catalog\n"
+            "use --catalog to see the list of available models\n");
+    }
+
     return true;
 }
 
@@ -3178,145 +3261,5 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));
 
-    // model-specific
-    add_opt(common_arg(
-        {"--tts-oute-default"},
-        string_format("use default OuteTTS models (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
-            params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
-            params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
-            params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
-        }
-    ).set_examples({LLAMA_EXAMPLE_TTS}));
-
-    add_opt(common_arg(
-        {"--embd-bge-small-en-default"},
-        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
-            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
-            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--embd-e5-small-en-default"},
-        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
-            params.model.hf_file = "e5-small-v2-q8_0.gguf";
-            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--embd-gte-small-default"},
-        string_format("use default gte-small model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
-            params.model.hf_file = "gte-small-q8_0.gguf";
-            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-1.5b-default"},
-        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
-            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-3b-default"},
-        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
-            params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-7b-default"},
-        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-7b-spec"},
-        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
-            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-14b-spec"},
-        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
-            params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
-            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
     return ctx_arg;
 }
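The positional-argument handling added above resolves a single model argument in three steps: catalog name first, then a protocol prefix (`hf://`, `hf-mirror://`, `ms://`, or a plain `http(s)://` URL), and finally a local file path. Here is a small self-contained sketch of the prefix dispatch only; this is an illustration, not the llama.cpp implementation, and the repos in `main()` are placeholders:

```cpp
#include <iostream>
#include <string>

static bool starts_with(const std::string & s, const std::string & prefix) {
    return s.rfind(prefix, 0) == 0;
}

// Mirrors the protocol dispatch in common_params_parse_ex(): report how a
// positional model argument would be interpreted once the catalog lookup fails.
static std::string describe(const std::string & arg) {
    if (starts_with(arg, "http://") || starts_with(arg, "https://")) {
        return "direct URL";
    }
    if (starts_with(arg, "hf://")) {
        return "Hugging Face repo: " + arg.substr(5);
    }
    if (starts_with(arg, "hf-mirror://")) {
        return "Hugging Face mirror repo: " + arg.substr(12);
    }
    if (starts_with(arg, "ms://")) {
        return "ModelScope repo: " + arg.substr(5);
    }
    return "local file: " + arg;
}

int main() {
    for (const std::string arg : {"hf://ggml-org/example-model-GGUF",   // placeholder repo
                                  "ms://example-org/example-GGUF",      // placeholder repo
                                  "models/local-model.gguf"}) {
        std::cout << arg << " -> " << describe(arg) << "\n";
    }
}
```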
diff --git a/common/catalog.h b/common/catalog.h
new file mode 100644
index 0000000000000..3cb264dac672e
--- /dev/null
+++ b/common/catalog.h
@@ -0,0 +1,169 @@
+#pragma once
+
+#include <initializer_list>
+
+#include "common.h"
+
+struct common_catalog_entry {
+    const char * name;
+    const char * description;
+    std::initializer_list<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    void (*handler)(common_params & params);
+};
+
+// This is a list of models that are available in the catalog
+// The rule for naming is: [<prefix>]-<model_name>
+// The <prefix> is optional, for example: "fim" or "embd"
+// The <model_name> is the name of the model, for example: "qwen-7b"
+
+// For contributors:
+// - Models MUST be hosted on hf.co/ggml-org
+//   - If you want to add your model to the catalog, please open an issue and we will consider copying it to ggml-org
+// - For a better user experience, we don't add models that are:
+//   - NSFW or lacking an NSFW safeguard
+//   - Missing an open-source license
+//   - Too old (more than 1 year old)
+//   - Having too many issues or poor quality (for example: no chat template, overly sensitive to system prompts)
+//   - Or having too little usage (fewer than 1000 downloads per month)
+
+const std::initializer_list<common_catalog_entry> model_catalog = {
+    {
+        "tts-oute",
+        "OuteTTS text-to-speech model",
+        {LLAMA_EXAMPLE_TTS},
+        [](common_params & params) {
+            params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+            params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+            params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
+            params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
+        }
+    },
+    {
+        "embd-bge-small-en",
+        "bge-small-en-v1.5 text embedding model",
+        {LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    },
+    {
+        "embd-e5-small-en",
+        "e5-small-v2 text embedding model",
+        {LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.model.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    },
+    {
+        "embd-gte-small",
+        "gte-small text embedding model",
+        {LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.model.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    },
+    {
+        "fim-qwen-1.5b",
+        "Qwen 2.5 Coder 1.5B (supports fill-in-the-middle)",
+        {LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    },
+    {
+        "fim-qwen-3b",
+        "Qwen 2.5 Coder 3B (supports fill-in-the-middle)",
+        {LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    },
+    {
+        "fim-qwen-7b",
+        "Qwen 2.5 Coder 7B (supports fill-in-the-middle)",
+        {LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    },
+    {
+        "fim-qwen-7b-spec",
+        "Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (supports fill-in-the-middle)",
+        {LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    },
+    {
+        "fim-qwen-14b-spec",
+        "Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (supports fill-in-the-middle)",
+        {LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    },
+};
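To show the shape of a catalog entry without pulling in the llama.cpp headers, here is a deliberately simplified, self-contained model of the pattern; `fake_params`, `fake_catalog_entry`, and every name/repo/file string below are placeholders, not part of this change:

```cpp
#include <cstdio>
#include <initializer_list>

// Simplified stand-ins for common_params and common_catalog_entry.
struct fake_params {
    const char * hf_repo = nullptr;
    const char * hf_file = nullptr;
};

struct fake_catalog_entry {
    const char * name;
    const char * description;
    void (*handler)(fake_params & params); // captureless lambdas convert to this
};

// A hypothetical entry following the [<prefix>]-<model_name> naming rule.
const std::initializer_list<fake_catalog_entry> fake_catalog = {
    {
        "embd-example-small",              // placeholder name
        "example text embedding model",    // placeholder description
        [](fake_params & params) {
            params.hf_repo = "ggml-org/example-GGUF"; // placeholder repo
            params.hf_file = "example-q8_0.gguf";     // placeholder file
        }
    },
};

int main() {
    for (const auto & entry : fake_catalog) {
        fake_params p;
        entry.handler(p);
        std::printf("%s -> %s/%s\n", entry.name, p.hf_repo, p.hf_file);
    }
}
```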
diff --git a/common/common.cpp b/common/common.cpp
index bd20af233695c..2e03bb2fd378c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1029,19 +1029,6 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
-std::string get_model_endpoint() {
-    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
-    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
-    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
-    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
-    std::string model_endpoint = "https://huggingface.co/";
-    if (endpoint_env) {
-        model_endpoint = endpoint_env;
-        if (model_endpoint.back() != '/') model_endpoint += '/';
-    }
-    return model_endpoint;
-}
-
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
     llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
diff --git a/common/common.h b/common/common.h
index 90702245463cb..769676ebf629a 100644
--- a/common/common.h
+++ b/common/common.h
@@ -424,6 +424,25 @@ struct common_params {
 
     // common params
     std::string out_file; // output filename for all example programs
+
+    // set internally by the positional-argument handler; must include the scheme and a trailing slash
+    std::string custom_model_endpoint = ""; // custom model endpoint (e.g. for HF mirrors)
+    std::string get_model_endpoint() {
+        if (!custom_model_endpoint.empty()) {
+            return custom_model_endpoint;
+        }
+        // otherwise, read it from the environment
+        const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
+        // We still respect the environment variable "HF_ENDPOINT" for backward compatibility.
+        const char * hf_endpoint_env = getenv("HF_ENDPOINT");
+        const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
+        std::string model_endpoint = "https://huggingface.co/";
+        if (endpoint_env) {
+            model_endpoint = endpoint_env;
+            if (model_endpoint.back() != '/') model_endpoint += '/';
+        }
+        return model_endpoint;
+    }
 };
 
 // call once at the start of a program if it uses libcommon
@@ -545,8 +564,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
-std::string get_model_endpoint();
-
 //
 // Batch utils
 //
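Finally, the endpoint resolution order implemented by `common_params::get_model_endpoint()` above is: custom endpoint (set by the `hf-mirror://` or `ms://` prefixes) first, then `MODEL_ENDPOINT`, then the legacy `HF_ENDPOINT`, then the Hugging Face default. A standalone restatement of that precedence, assuming a custom value already carries its scheme and trailing slash:

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

// Same precedence as common_params::get_model_endpoint(), written as a free
// function so it can be tried outside of llama.cpp.
static std::string resolve_endpoint(const std::string & custom) {
    if (!custom.empty()) {
        return custom; // e.g. set by the hf-mirror:// or ms:// positional prefixes
    }
    const char * env = std::getenv("MODEL_ENDPOINT");
    if (!env) {
        env = std::getenv("HF_ENDPOINT"); // legacy variable, still honored
    }
    std::string endpoint = env ? env : "https://huggingface.co/";
    if (!endpoint.empty() && endpoint.back() != '/') {
        endpoint += '/'; // downstream code appends "v2/..." and "<repo>/resolve/main/..."
    }
    return endpoint;
}

int main() {
    std::cout << resolve_endpoint("") << "\n";                        // env vars or default
    std::cout << resolve_endpoint("https://hf-mirror.com/") << "\n";  // custom endpoint wins
}
```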