diff --git a/common/arg.cpp b/common/arg.cpp
index 9f87e9910b540..3b752848dc1f5 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -5,6 +5,7 @@
 #include "log.h"
 #include "sampling.h"
 #include "chat.h"
+#include "catalog.h"
 
 // fix problem with std::min and std::max
 #if defined(_WIN32)
@@ -608,7 +609,10 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
  *
  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
-static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+static struct common_hf_file_res common_get_hf_file(
+        const std::string & hf_repo_with_tag,
+        const std::string & bearer_token,
+        const std::string & model_endpoint) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
     std::string tag = parts.size() > 1 ? parts.back() : "latest";
     std::string hf_repo = parts[0];
@@ -616,7 +620,7 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_
         throw std::invalid_argument("error: invalid HF repo format, expected <user>/<model>[:quant]\n");
     }
 
-    std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag;
+    std::string url = model_endpoint + "v2/" + hf_repo + "/manifests/" + tag;
 
     // headers
     std::vector<std::string> headers;
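For context, the endpoint is now always supplied by the caller and simply prefixed onto both request URLs. Below is a minimal standalone sketch of that URL scheme; the repo, tag, and file names are hypothetical and not part of this change:

```cpp
#include <iostream>
#include <string>

// Illustrates the two URL shapes built from the model endpoint: the registry
// manifest lookup used by common_get_hf_file() and the direct file download
// used by common_params_handle_model(). The endpoint must end with '/'.
int main() {
    const std::string endpoint = "https://huggingface.co/";      // or e.g. "https://hf-mirror.com/"
    const std::string repo     = "ggml-org/example-model-GGUF";  // hypothetical repo
    const std::string tag      = "latest";
    const std::string file     = "example-model-q8_0.gguf";      // hypothetical file

    std::cout << endpoint + "v2/" + repo + "/manifests/" + tag << "\n";   // manifest API
    std::cout << endpoint + repo + "/resolve/main/" + file << "\n";       // file download
}
```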
@@ -715,7 +719,7 @@ static bool common_download_model(
     return false;
 }
 
-static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, const std::string &) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return {};
 }
@@ -742,7 +746,7 @@ struct handle_model_result {
 static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default) {
+        const std::string & model_endpoint) {
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
@@ -750,7 +754,7 @@ static handle_model_result common_params_handle_model(
         // short-hand to avoid specifying --hf-file -> default it to --model
         if (model.hf_file.empty()) {
            if (model.path.empty()) {
-                auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, model_endpoint);
                 if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                     exit(1); // built without CURL, error message already printed
                 }
@@ -766,7 +770,6 @@ static handle_model_result common_params_handle_model(
             }
         }
 
-        std::string model_endpoint = get_model_endpoint();
         model.url = model_endpoint + model.hf_repo + "/resolve/main/" + model.hf_file;
         // make sure model path is present (for caching purposes)
         if (model.path.empty()) {
@@ -784,8 +787,6 @@ static handle_model_result common_params_handle_model(
             model.path = fs_get_cache_file(string_split<std::string>(f, '/').back());
         }
 
-    } else if (model.path.empty()) {
-        model.path = model_path_default;
     }
 
 }
@@ -835,7 +836,6 @@ static std::string get_all_kv_cache_types() {
 //
 
 static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
-    std::string arg;
     const std::string arg_prefix = "--";
     common_params & params = ctx_arg.params;
 
@@ -875,16 +875,91 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         }
     };
 
+    // collect one optional positional argument and the remaining optional args
+    std::string input_pos_arg;
+    std::vector<std::string> input_opt_args;
+    input_opt_args.reserve(argc - 1);
     for (int i = 1; i < argc; i++) {
         const std::string arg_prefix = "--";
 
         std::string arg = argv[i];
+        if (arg_to_options.find(arg) == arg_to_options.end()) {
+            // if we don't have a match, check if this can be a positional argument
+            if (input_pos_arg.empty()) {
+                input_pos_arg = std::move(arg);
+                continue;
+            } else {
+                // the positional argument is already set, we cannot have another one
+                throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+            }
+        }
+
+        // normalize the argument (only applied to optional args)
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
-        if (arg_to_options.find(arg) == arg_to_options.end()) {
-            throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
+        input_opt_args.emplace_back(arg);
+    }
+
+    // handle the positional argument (we only support one positional argument)
+    // the logic is as follows:
+    // 1. we try to find the model name in the catalog
+    // 2. if not found, we check for a protocol:// prefix
+    // 3. if no protocol is found, we assume it is a local file
+    {
+        bool is_handled = false;
+        // check catalog
+        for (auto & entry : model_catalog) {
+            if (input_pos_arg == entry.name) {
+                is_handled = true;
+                // check if the model supports the current example
+                bool is_supported = false;
+                for (auto & ex : entry.examples) {
+                    if (ctx_arg.ex == ex) {
+                        is_supported = true;
+                        break;
+                    }
+                }
+                if (is_supported) {
+                    entry.handler(params);
+                } else {
+                    LOG_ERR("error: model '%s' is not supported by this tool\n", entry.name);
+                    exit(1);
+                }
+                break;
+            }
+        }
+        // check protocol
+        // for contributors: if you want to add a new protocol,
+        // please make sure it supports either the /resolve/main download path or the registry manifest API
+        // see common_params_handle_model() to understand how it is handled
+        // note: we don't support ollama because its registry usually serves proprietary model formats (incompatible with llama.cpp)
+        if (!is_handled) {
+            const std::string & arg = input_pos_arg;
+            // check if it is a URL
+            if (string_starts_with(arg, "http://") || string_starts_with(arg, "https://")) {
+                params.model.url = arg;
+            } else if (string_starts_with(arg, "hf://")) {
+                // hugging face repo
+                params.model.hf_repo = arg.substr(5);
+            } else if (string_starts_with(arg, "hf-mirror://")) {
+                // hugging face mirror (the endpoint must carry the scheme and a trailing slash, see get_model_endpoint())
+                params.custom_model_endpoint = "https://hf-mirror.com/";
+                params.model.hf_repo = arg.substr(12);
+            } else if (string_starts_with(arg, "ms://")) {
+                // modelscope
+                params.custom_model_endpoint = "https://modelscope.cn/";
+                params.model.hf_repo = arg.substr(5);
+            } else {
+                // assume it is a local file
+                params.model.path = arg;
+            }
         }
+    }
+
+    // handle optional args
+    for (size_t i = 0; i < input_opt_args.size(); i++) {
+        const std::string & arg = input_opt_args[i];
         auto opt = *arg_to_options[arg];
         if (opt.has_value_from_env()) {
             fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
@@ -934,7 +1009,8 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
 
     // handle model and download
     {
-        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        std::string model_endpoint = params.get_model_endpoint();
+        auto res = common_params_handle_model(params.model, params.hf_token, model_endpoint);
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {
@@ -944,12 +1020,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         // only download mmproj if the current example is using it
         for (auto & ex : mmproj_examples) {
             if (ctx_arg.ex == ex) {
-                common_params_handle_model(params.mmproj, params.hf_token, "");
+                common_params_handle_model(params.mmproj, params.hf_token, model_endpoint);
                 break;
             }
         }
-        common_params_handle_model(params.speculative.model, params.hf_token, "");
-        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+        common_params_handle_model(params.speculative.model, params.hf_token, model_endpoint);
+        common_params_handle_model(params.vocoder.model, params.hf_token, model_endpoint);
     }
 
     if (params.escape) {
@@ -985,6 +1061,13 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context
         ));
     }
 
+    if (params.model.path.empty()) {
+        throw std::invalid_argument(
+            "model path is empty\n"
+            "please specify a model file or use one from the catalog\n"
+            "use --catalog to see the list of available models\n");
+    }
+
     return true;
 }
 
@@ -3178,145 +3261,5 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_TTS}));
 
-    // model-specific
-    add_opt(common_arg(
-        {"--tts-oute-default"},
-        string_format("use default OuteTTS models (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
-            params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
-            params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
-            params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
-        }
-    ).set_examples({LLAMA_EXAMPLE_TTS}));
-
-    add_opt(common_arg(
-        {"--embd-bge-small-en-default"},
-        string_format("use default bge-small-en-v1.5 model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
-            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
-            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--embd-e5-small-en-default"},
-        string_format("use default e5-small-v2 model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
-            params.model.hf_file = "e5-small-v2-q8_0.gguf";
-            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--embd-gte-small-default"},
-        string_format("use default gte-small model (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
-            params.model.hf_file = "gte-small-q8_0.gguf";
-            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
-            params.embd_normalize = 2;
-            params.n_ctx = 512;
-            params.verbose_prompt = true;
-            params.embedding = true;
-        }
-    ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-1.5b-default"},
-        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
-            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-3b-default"},
-        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
-            params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-7b-default"},
-        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-7b-spec"},
-        string_format("use Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
-            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
-            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
-    add_opt(common_arg(
-        {"--fim-qwen-14b-spec"},
-        string_format("use Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (note: can download weights from the internet)"),
-        [](common_params & params) {
-            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
-            params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
-            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
-            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
-            params.speculative.n_gpu_layers = 99;
-            params.port = 8012;
-            params.n_gpu_layers = 99;
-            params.flash_attn = true;
-            params.n_ubatch = 1024;
-            params.n_batch = 1024;
-            params.n_ctx = 0;
-            params.n_cache_reuse = 256;
-        }
-    ).set_examples({LLAMA_EXAMPLE_SERVER}));
-
     return ctx_arg;
 }
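The positional-argument handling added above resolves a single model argument in three steps: catalog name first, then a protocol prefix (`hf://`, `hf-mirror://`, `ms://`, or a plain `http(s)://` URL), and finally a local file path. Here is a small self-contained sketch of the prefix dispatch only; this is an illustration, not the llama.cpp implementation, and the repos in `main()` are placeholders:

```cpp
#include <iostream>
#include <string>

static bool starts_with(const std::string & s, const std::string & prefix) {
    return s.rfind(prefix, 0) == 0;
}

// Mirrors the protocol dispatch in common_params_parse_ex(): report how a
// positional model argument would be interpreted once the catalog lookup fails.
static std::string describe(const std::string & arg) {
    if (starts_with(arg, "http://") || starts_with(arg, "https://")) {
        return "direct URL";
    }
    if (starts_with(arg, "hf://")) {
        return "Hugging Face repo: " + arg.substr(5);
    }
    if (starts_with(arg, "hf-mirror://")) {
        return "Hugging Face mirror repo: " + arg.substr(12);
    }
    if (starts_with(arg, "ms://")) {
        return "ModelScope repo: " + arg.substr(5);
    }
    return "local file: " + arg;
}

int main() {
    for (const std::string arg : {"hf://ggml-org/example-model-GGUF",   // placeholder repo
                                  "ms://example-org/example-GGUF",      // placeholder repo
                                  "models/local-model.gguf"}) {
        std::cout << arg << " -> " << describe(arg) << "\n";
    }
}
```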
diff --git a/common/catalog.h b/common/catalog.h
new file mode 100644
index 0000000000000..3cb264dac672e
--- /dev/null
+++ b/common/catalog.h
@@ -0,0 +1,169 @@
+#pragma once
+
+#include <initializer_list>
+
+#include "common.h"
+
+struct common_catalog_entry {
+    const char * name;
+    const char * description;
+    std::initializer_list<enum llama_example> examples = {LLAMA_EXAMPLE_COMMON};
+    void (*handler)(common_params & params);
+};
+
+// This is a list of models that are available in the catalog
+// The rule for naming is: [<prefix>]-<model_name>
+// The <prefix> is optional, for example: "fim" or "embd"
+// The <model_name> is the name of the model, for example: "qwen-7b"
+
+// For contributors:
+// - Models MUST be hosted on hf.co/ggml-org
+//   - If you want to add your model to the catalog, please open an issue and we will consider copying it to ggml-org
+// - For a better user experience, we don't add models that are:
+//   - NSFW or lacking an NSFW safeguard
+//   - Missing an open-source license
+//   - Too old (more than 1 year old)
+//   - Having too many issues or poor quality (for example: no chat template, overly sensitive to system prompts)
+//   - Or having too little usage (fewer than 1000 downloads per month)
+
+const std::initializer_list<common_catalog_entry> model_catalog = {
+    {
+        "tts-oute",
+        "OuteTTS text-to-speech model",
+        {LLAMA_EXAMPLE_TTS},
+        [](common_params & params) {
+            params.model.hf_repo = "OuteAI/OuteTTS-0.2-500M-GGUF";
+            params.model.hf_file = "OuteTTS-0.2-500M-Q8_0.gguf";
+            params.vocoder.model.hf_repo = "ggml-org/WavTokenizer";
+            params.vocoder.model.hf_file = "WavTokenizer-Large-75-F16.gguf";
+        }
+    },
+    {
+        "embd-bge-small-en",
+        "bge-small-en-v1.5 text embedding model",
+        {LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/bge-small-en-v1.5-Q8_0-GGUF";
+            params.model.hf_file = "bge-small-en-v1.5-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    },
+    {
+        "embd-e5-small-en",
+        "e5-small-v2 text embedding model",
+        {LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/e5-small-v2-Q8_0-GGUF";
+            params.model.hf_file = "e5-small-v2-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    },
+    {
+        "embd-gte-small",
+        "gte-small text embedding model",
+        {LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/gte-small-Q8_0-GGUF";
+            params.model.hf_file = "gte-small-q8_0.gguf";
+            params.pooling_type = LLAMA_POOLING_TYPE_NONE;
+            params.embd_normalize = 2;
+            params.n_ctx = 512;
+            params.verbose_prompt = true;
+            params.embedding = true;
+        }
+    },
+    {
+        "fim-qwen-1.5b",
+        "Qwen 2.5 Coder 1.5B (supports fill-in-the-middle)",
+        {LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    },
+    {
+        "fim-qwen-3b",
+        "Qwen 2.5 Coder 3B (supports fill-in-the-middle)",
+        {LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    },
+    {
+        "fim-qwen-7b",
+        "Qwen 2.5 Coder 7B (supports fill-in-the-middle)",
+        {LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    },
+    {
+        "fim-qwen-7b-spec",
+        "Qwen 2.5 Coder 7B + 0.5B draft for speculative decoding (supports fill-in-the-middle)",
+        {LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    },
+    {
+        "fim-qwen-14b-spec",
+        "Qwen 2.5 Coder 14B + 0.5B draft for speculative decoding (supports fill-in-the-middle)",
+        {LLAMA_EXAMPLE_SERVER},
+        [](common_params & params) {
+            params.model.hf_repo = "ggml-org/Qwen2.5-Coder-14B-Q8_0-GGUF";
+            params.model.hf_file = "qwen2.5-coder-14b-q8_0.gguf";
+            params.speculative.model.hf_repo = "ggml-org/Qwen2.5-Coder-0.5B-Q8_0-GGUF";
+            params.speculative.model.hf_file = "qwen2.5-coder-0.5b-q8_0.gguf";
+            params.speculative.n_gpu_layers = 99;
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    },
+};
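To show the shape of a catalog entry without pulling in the llama.cpp headers, here is a deliberately simplified, self-contained model of the pattern; `fake_params`, `fake_catalog_entry`, and every name/repo/file string below are placeholders, not part of this change:

```cpp
#include <cstdio>
#include <initializer_list>

// Simplified stand-ins for common_params and common_catalog_entry.
struct fake_params {
    const char * hf_repo = nullptr;
    const char * hf_file = nullptr;
};

struct fake_catalog_entry {
    const char * name;
    const char * description;
    void (*handler)(fake_params & params); // captureless lambdas convert to this
};

// A hypothetical entry following the [<prefix>]-<model_name> naming rule.
const std::initializer_list<fake_catalog_entry> fake_catalog = {
    {
        "embd-example-small",              // placeholder name
        "example text embedding model",    // placeholder description
        [](fake_params & params) {
            params.hf_repo = "ggml-org/example-GGUF"; // placeholder repo
            params.hf_file = "example-q8_0.gguf";     // placeholder file
        }
    },
};

int main() {
    for (const auto & entry : fake_catalog) {
        fake_params p;
        entry.handler(p);
        std::printf("%s -> %s/%s\n", entry.name, p.hf_repo, p.hf_file);
    }
}
```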
diff --git a/common/common.cpp b/common/common.cpp
index bd20af233695c..2e03bb2fd378c 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1029,19 +1029,6 @@ struct common_init_result common_init_from_params(common_params & params) {
     return iparams;
 }
 
-std::string get_model_endpoint() {
-    const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
-    // We still respect the use of environment-variable "HF_ENDPOINT" for backward-compatibility.
-    const char * hf_endpoint_env = getenv("HF_ENDPOINT");
-    const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
-    std::string model_endpoint = "https://huggingface.co/";
-    if (endpoint_env) {
-        model_endpoint = endpoint_env;
-        if (model_endpoint.back() != '/') model_endpoint += '/';
-    }
-    return model_endpoint;
-}
-
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora) {
     llama_clear_adapter_lora(ctx);
     for (auto & la : lora) {
diff --git a/common/common.h b/common/common.h
index 90702245463cb..769676ebf629a 100644
--- a/common/common.h
+++ b/common/common.h
@@ -424,6 +424,25 @@ struct common_params {
 
     // common params
     std::string out_file; // output filename for all example programs
+
+    // set internally by the positional-argument handler; must include the scheme and a trailing slash
+    std::string custom_model_endpoint = ""; // custom model endpoint (e.g. for HF mirrors)
+    std::string get_model_endpoint() {
+        if (!custom_model_endpoint.empty()) {
+            return custom_model_endpoint;
+        }
+        // otherwise, read it from the environment
+        const char * model_endpoint_env = getenv("MODEL_ENDPOINT");
+        // We still respect the environment variable "HF_ENDPOINT" for backward compatibility.
+        const char * hf_endpoint_env = getenv("HF_ENDPOINT");
+        const char * endpoint_env = model_endpoint_env ? model_endpoint_env : hf_endpoint_env;
+        std::string model_endpoint = "https://huggingface.co/";
+        if (endpoint_env) {
+            model_endpoint = endpoint_env;
+            if (model_endpoint.back() != '/') model_endpoint += '/';
+        }
+        return model_endpoint;
+    }
 };
 
 // call once at the start of a program if it uses libcommon
@@ -545,8 +564,6 @@ struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_p
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
-std::string get_model_endpoint();
-
 //
 // Batch utils
 //
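Finally, the endpoint resolution order implemented by `common_params::get_model_endpoint()` above is: custom endpoint (set by the `hf-mirror://` or `ms://` prefixes) first, then `MODEL_ENDPOINT`, then the legacy `HF_ENDPOINT`, then the Hugging Face default. A standalone restatement of that precedence, assuming a custom value already carries its scheme and trailing slash:

```cpp
#include <cstdlib>
#include <iostream>
#include <string>

// Same precedence as common_params::get_model_endpoint(), written as a free
// function so it can be tried outside of llama.cpp.
static std::string resolve_endpoint(const std::string & custom) {
    if (!custom.empty()) {
        return custom; // e.g. set by the hf-mirror:// or ms:// positional prefixes
    }
    const char * env = std::getenv("MODEL_ENDPOINT");
    if (!env) {
        env = std::getenv("HF_ENDPOINT"); // legacy variable, still honored
    }
    std::string endpoint = env ? env : "https://huggingface.co/";
    if (!endpoint.empty() && endpoint.back() != '/') {
        endpoint += '/'; // downstream code appends "v2/..." and "<repo>/resolve/main/..."
    }
    return endpoint;
}

int main() {
    std::cout << resolve_endpoint("") << "\n";                        // env vars or default
    std::cout << resolve_endpoint("https://hf-mirror.com/") << "\n";  // custom endpoint wins
}
```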