
Commit 9236a4a

update with rename
1 parent 4543517 commit 9236a4a

12 files changed: +40 -41 lines changed

demo/intel_device_demo/itrex/itrex_cli_demo.py (1 addition, 1 deletion)

@@ -5,7 +5,7 @@
 import os


-MODEL_PATH = os.environ.get("MODEL_PATH", "THUDM/GLM-4-9B-Chat-0414")
+MODEL_PATH = os.environ.get("MODEL_PATH", "THUDM/GLM-4-9B-0414")


 from threading import Thread
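
For context, a minimal sketch (illustrative only, not part of this commit) of how the renamed default interacts with the `MODEL_PATH` environment variable the demo reads:

```python
import os

# Falls back to the renamed hub id when MODEL_PATH is not set in the environment;
# exporting MODEL_PATH (e.g. a local checkout of GLM-4-9B-0414) overrides the default.
MODEL_PATH = os.environ.get("MODEL_PATH", "THUDM/GLM-4-9B-0414")
print(MODEL_PATH)
```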

demo/intel_device_demo/openvino/convert.py (1 addition, 1 deletion)

@@ -16,7 +16,7 @@
 parser = argparse.ArgumentParser(add_help=False)
 parser.add_argument("-h", "--help", action="help", help="Show this help message and exit.")
 parser.add_argument(
-    "-m", "--model_id", default="THUDM/GLM-4-9B-Chat-0414", required=False, type=str, help="orignal model path"
+    "-m", "--model_id", default="THUDM/GLM-4-9B-0414", required=False, type=str, help="orignal model path"
 )
 parser.add_argument(
     "-p",

finetune/README.md (10 additions, 10 deletions)

@@ -20,12 +20,12 @@ All fine-tuning tests were performed in the following environment:

 + Fine-tuning based on Llama-Factory

-| Fine-tuning Model         | Fine-tuning solution | GPU memory usage             |
-|---------------------------|----------------------|------------------------------|
-| GLM-4-9B-Chat-0414        | lora                 | 22G (Each GPU, Need 1 GPU)   |
-| GLM-4-9B-Chat-0414        | SFT (Zero3 method)   | 55G (Each GPU, Need 4 GPUs)  |
-| GLM-4-9B-Chat-0414        | lora                 | 80G (Each GPU, Need 8 GPUs)  |
-| GLM-4-32B-Chat-0414       | SFT (Zero3 method)   | 80G (Each GPU, Need 16 GPUs) |
+| Fine-tuning Model     | Fine-tuning solution | GPU memory usage             |
+|-----------------------|----------------------|------------------------------|
+| GLM-4-9B-0414         | lora                 | 22G (Each GPU, Need 1 GPU)   |
+| GLM-4-9B-0414         | SFT (Zero3 method)   | 55G (Each GPU, Need 4 GPUs)  |
+| GLM-4-9B-0414         | lora                 | 80G (Each GPU, Need 8 GPUs)  |
+| GLM-4-32B-0414        | SFT (Zero3 method)   | 80G (Each GPU, Need 16 GPUs) |

 + Fine-tuning based on this repository

@@ -38,7 +38,7 @@ All fine-tuning tests were performed in the following environment:

 ## Preparation

-Before starting fine-tuning, please install the dependencies in `basic_demo`, ensure you have cloned the latest version of the model repository, and install the dependencies in this directory:
+Before starting fine-tuning, please install the dependencies in `inference`, ensure you have cloned the latest version of the model repository, and install the dependencies in this directory:

 ```bash
 pip install -r requirements.txt

@@ -261,14 +261,14 @@ Execute **single machine multi-card/multi-machine multi-card** run through the f
 the acceleration solution, and you need to install `deepspeed`.

 ```shell
-OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc_per_node=8 finetune.py data/AdvertiseGen/ THUDM/GLM-4-9b-Chat-0414 configs/lora.yaml # For Chat Fine-tune
+OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc_per_node=8 finetune.py data/AdvertiseGen/ THUDM/GLM-4-9b-0414 configs/lora.yaml # For Chat Fine-tune
 OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc_per_node=8 finetune_vision.py data/CogVLM-311K/ THUDM/glm-4v-9b configs/lora.yaml # For VQA Fine-tune
 ```

 Execute **single machine single card** run through the following code.

 ```shell
-python finetune.py data/AdvertiseGen/ THUDM/GLM-4-9B-Chat-0414 configs/lora.yaml # For Chat Fine-tune
+python finetune.py data/AdvertiseGen/ THUDM/GLM-4-9B-0414 configs/lora.yaml # For Chat Fine-tune
 python finetune_vision.py data/CogVLM-311K/ THUDM/glm-4v-9b configs/lora.yaml # For VQA Fine-tune
 ```

@@ -284,7 +284,7 @@ half-trained model, you can add a fourth parameter, which can be passed in two w
 For example, this is an example code to continue fine-tuning from the last saved point

 ```shell
-python finetune.py data/AdvertiseGen/ THUDM/GLM-4-9B-Chat-0414 configs/lora.yaml yes
+python finetune.py data/AdvertiseGen/ THUDM/GLM-4-9B-0414 configs/lora.yaml yes
 ```

 ## Use the fine-tuned model

finetune/README_zh.md (9 additions, 9 deletions)

@@ -21,12 +21,12 @@ Read this in [English](README)

 + 基于 Llama-Factory 进行微调

-| Fine-tuning Model         | Fine-tuning solution | GPU memory usage             |
-|---------------------------|----------------------|------------------------------|
-| GLM-4-9B-Chat-0414        | lora                 | 22G (Each GPU, Need 1 GPU)   |
-| GLM-4-9B-Chat-0414        | SFT (Zero3 method)   | 55G (Each GPU, Need 4 GPUs)  |
-| GLM-4-9B-Chat-0414        | lora                 | 80G (Each GPU, Need 8 GPUs)  |
-| GLM-4-32B-Chat-0414       | SFT (Zero3 method)   | 80G (Each GPU, Need 16 GPUs) |
+| Fine-tuning Model     | Fine-tuning solution | GPU memory usage             |
+|-----------------------|----------------------|------------------------------|
+| GLM-4-9B-0414         | lora                 | 22G (Each GPU, Need 1 GPU)   |
+| GLM-4-9B-0414         | SFT (Zero3 method)   | 55G (Each GPU, Need 4 GPUs)  |
+| GLM-4-9B-0414         | lora                 | 80G (Each GPU, Need 8 GPUs)  |
+| GLM-4-32B-0414        | SFT (Zero3 method)   | 80G (Each GPU, Need 16 GPUs) |

 + 基于本仓库代码微调

@@ -261,14 +261,14 @@ pip install -r requirements.txt
 通过以下代码执行 **单机多卡/多机多卡** 运行,这是使用 `deepspeed` 作为加速方案的,您需要安装 `deepspeed`。接着,按照此命令运行:

 ```shell
-OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc_per_node=8 finetune.py data/AdvertiseGen/ THUDM/GLM-4-9B-Chat-0414 configs/lora.yaml # For Chat Fine-tune
+OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc_per_node=8 finetune.py data/AdvertiseGen/ THUDM/GLM-4-9B-0414 configs/lora.yaml # For Chat Fine-tune
 OMP_NUM_THREADS=1 torchrun --standalone --nnodes=1 --nproc_per_node=8 finetune_vision.py data/CogVLM-311K/ THUDM/glm-4v-9b configs/lora.yaml # For VQA Fine-tune
 ```

 通过以下代码执行 **单机单卡** 运行。

 ```shell
-python finetune.py data/AdvertiseGen/ THUDM/GLM-4-9B-Chat-0414 configs/lora.yaml # For Chat Fine-tune
+python finetune.py data/AdvertiseGen/ THUDM/GLM-4-9B-0414 configs/lora.yaml # For Chat Fine-tune
 python finetune_vision.py data/CogVLM-311K/ THUDM/glm-4v-9b configs/lora.yaml # For VQA Fine-tune
 ```

@@ -282,7 +282,7 @@ python finetune_vision.py data/CogVLM-311K/ THUDM/glm-4v-9b configs/lora.yaml
 例如,这就是一个从最后一个保存点继续微调的示例代码

 ```shell
-python finetune.py data/AdvertiseGen/ THUDM/GLM-4-9B-Chat-0414 configs/lora.yaml yes
+python finetune.py data/AdvertiseGen/ THUDM/GLM-4-9B-0414 configs/lora.yaml yes
 ```

 ## 使用微调后的模型

inference/README.md (7 additions, 7 deletions)

@@ -28,7 +28,7 @@ Test Hardware:

 The following stress test results show memory usage and latency during inference. If multiple GPUs are used, "Memory Usage" refers to the maximum usage on a single GPU.

-#### GLM-4-32B-Chat-0414
+#### GLM-4-32B-0414

 | Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
 |-------------|-------|---------------|---------------------|-------------------|--------------|
@@ -37,7 +37,7 @@ The following stress test results show memory usage and latency during inference
 | BF16 | 2 | 50 GB | 6.75s | 8.1 tokens/s | 32000 |
 | BF16 | 4 | 55 GB | 37.83s | 3.0 tokens/s | 100000 |

-#### GLM-4-9B-Chat-0414
+#### GLM-4-9B-0414

 | Precision | #GPUs | Memory Usage | First Token Latency | Token Output Speed | Input Tokens |
 |-----------|-------|---------------|----------------------|---------------------|---------------|
@@ -71,35 +71,35 @@ The following stress test results show memory usage and latency during inference
 + Use the command line to communicate with the GLM-4-9B model.

 ```shell
-python trans_cli_demo.py # LLM Such as GLM-4-9B-Chat-0414
+python trans_cli_demo.py # LLM Such as GLM-4-9B-0414
 python trans_cli_vision_demo.py # GLM-4V-9B
 ```

 + Use the Gradio web client to communicate with the GLM-4-9B model.

 ```shell
-python trans_web_demo.py # LLM Such as GLM-4-9B-Chat-0414
+python trans_web_demo.py # LLM Such as GLM-4-9B-0414
 python trans_web_vision_demo.py # GLM-4V-9B
 ```

 + Use Batch inference.

 ```shell
-python trans_batch_demo.py # LLM Such as GLM-4-9B-Chat-0414
+python trans_batch_demo.py # LLM Such as GLM-4-9B-0414
 ```

 ### Use vLLM backend code

 + Use the command line to communicate with the GLM-4-9B-Chat model.

 ```shell
-python vllm_cli_demo.py # LLM Such as GLM-4-9B-Chat-0414
+python vllm_cli_demo.py # LLM Such as GLM-4-9B-0414
 ```

 + Launch an OpenAI-compatible API service.

 ```shell
-vllm serve THUDM/GLM-4-9B-Chat-0414 --tensor_parallel_size 2
+vllm serve THUDM/GLM-4-9B-0414 --tensor_parallel_size 2
 ```

 ### Use glm-4v to build an OpenAI-compatible service

inference/README_zh.md (6 additions, 6 deletions)

@@ -28,7 +28,7 @@ pip install -r requirements.txt

 推理的压力测试数据如下,如有多张显卡,则显存占用代表显存占用最大一张显卡的显存消耗。

-#### GLM-4-32B-Chat-0414
+#### GLM-4-32B-0414

 | 精度 | 显卡数量 | 显存占用 | 首 Token 延迟 | Token 输出速度 | 输入token数 |
 |------|------|-------|------------|---------------|----------|
@@ -37,7 +37,7 @@ pip install -r requirements.txt
 | BF16 | 2 | 50 GB | 6.75s | 8.1 tokens/s | 32000 |
 | BF16 | 4 | 55 GB | 37.83s | 3.0 tokens/s | 100000 |

-#### GLM-4-9B-Chat-0414
+#### GLM-4-9B-0414

 | 精度 | 显卡数量 | 显存占用 | 首 Token 延迟 | Token 输出速度 | 输入token数 |
 |------|------|-------|------------|---------------|---------|
@@ -72,14 +72,14 @@ pip install -r requirements.txt
 + 使用命令行与 GLM-4-9B 模型进行对话。

 ```shell
-python trans_cli_demo.py # LLM Such as GLM-4-9B-Chat-0414
+python trans_cli_demo.py # LLM Such as GLM-4-9B-0414
 python trans_cli_vision_demo.py # GLM-4V-9B
 ```

 + 使用 Gradio 网页端与 GLM-4-9B 模型进行对话。

 ```shell
-python trans_web_demo.py # LLM Such as GLM-4-9B-Chat-0414
+python trans_web_demo.py # LLM Such as GLM-4-9B-0414
 python trans_web_vision_demo.py # GLM-4V-9B
 ```

@@ -94,12 +94,12 @@ python trans_batch_demo.py
 + 使用命令行与 GLM-4-9B-Chat 模型进行对话。

 ```shell
-python vllm_cli_demo.py # LLM Such as GLM-4-9B-Chat-0414
+python vllm_cli_demo.py # LLM Such as GLM-4-9B-0414
 ```

 + 构建 OpenAI 类 API 服务。
 ```shell
-vllm serve THUDM/GLM-4-9B-Chat-0414 --tensor_parallel_size 2
+vllm serve THUDM/GLM-4-9B-0414 --tensor_parallel_size 2
 ```

 ### 使用 glm-4v 构建 OpenAI 服务

inference/glm4v_api_request.py (1 addition, 1 deletion)

@@ -2,7 +2,7 @@
 This script creates a OpenAI Request demo for the glm-4v-9b model, just Use OpenAI API to interact with the model.
 For LLM such as GLM-4-9B-0414, using with vLLM OpenAI Server.

-vllm serve THUDM/GLM-4-32B-Chat-0414 --tensor_parallel_size 4
+vllm serve THUDM/GLM-4-32B-0414 --tensor_parallel_size 4

 """
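
The docstring above describes starting a vLLM OpenAI-compatible server and then talking to it through the OpenAI API. A minimal client sketch follows (illustrative only, not part of this commit); it assumes the server was launched with the `vllm serve` command shown above and is listening on vLLM's default local port 8000:

```python
from openai import OpenAI

# Point the OpenAI client at the local vLLM server; vLLM does not check the API key,
# so any placeholder value works.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="THUDM/GLM-4-32B-0414",  # must match the id passed to `vllm serve`
    messages=[{"role": "user", "content": "Hello, who are you?"}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```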

inference/trans_batch_demo.py (1 addition, 1 deletion)

@@ -11,7 +11,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList


-MODEL_PATH = "THUDM/GLM-4-9B-Chat-0414"
+MODEL_PATH = "THUDM/GLM-4-9B-0414"

 tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
 model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto").eval()

inference/trans_cli_demo.py (1 addition, 1 deletion)

@@ -25,7 +25,7 @@
 )


-MODEL_PATH = "THUDM/GLM-4-9B-Chat-0414"
+MODEL_PATH = "THUDM/GLM-4-9B-0414"

 tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, trust_remote_code=True)

inference/trans_stress_test.py (1 addition, 1 deletion)

@@ -6,7 +6,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer


-MODEL_PATH = "THUDM/GLM-4-9B-Chat-0414"
+MODEL_PATH = "THUDM/GLM-4-9B-0414"


 def stress_test(input_token_len, n, output_token_len):
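
The three transformers demos above all load the checkpoint the same way. A minimal end-to-end sketch of that pattern with the renamed id (illustrative only, not part of this commit; the prompt and generation settings are arbitrary):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_PATH = "THUDM/GLM-4-9B-0414"  # renamed hub id introduced by this commit

# Same loading pattern used by trans_batch_demo.py, trans_cli_demo.py and trans_stress_test.py.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map="auto").eval()

inputs = tokenizer("Hello, who are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```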
