test_llama.sh (forked from dusty-nv/jetson-containers)
#!/usr/bin/env bash
set -ex

LLAMA_EXAMPLES="/opt/tensorrt_llm/examples/llama"
TRT_LLM_MODELS="/data/models/tensorrt_llm"

# FORCE_BUILD=on re-runs checkpoint conversion and engine builds even when
# cached outputs already exist (defaults to off).
: "${FORCE_BUILD:=off}"
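# Example usage (a sketch; the paths above assume the jetson-containers
# tensorrt_llm container layout):
#   ./test_llama.sh                 # convert/build only if outputs are missing
#   FORCE_BUILD=on ./test_llama.sh  # force re-conversion and engine rebuild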
llama_fp16()
{
    output_dir="$TRT_LLM_MODELS/llama-2-7b-chat-fp16"

    # Convert the HF checkpoint to TensorRT-LLM format (skip if already done).
    # Note: [ -f $dir/*.safetensors ] fails with "too many arguments" when the
    # glob matches multiple files, so test the glob with compgen instead.
    if ! compgen -G "$output_dir/*.safetensors" > /dev/null; then
        python3 $LLAMA_EXAMPLES/convert_checkpoint.py \
            --model_dir "$(huggingface-downloader meta-llama/Llama-2-7b-chat-hf)" \
            --output_dir "$output_dir" \
            --dtype float16
    fi

    # Build the FP16 TensorRT engine.
    trtllm-build \
        --checkpoint_dir "$output_dir" \
        --output_dir "$output_dir/engines" \
        --gemm_plugin float16
}
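# A minimal sketch of smoke-testing the FP16 engine, mirroring the disabled
# run.py invocation in llama_gptq() below (not part of the original flow):
#   python3 $LLAMA_EXAMPLES/../run.py \
#       --max_output_len=50 \
#       --tokenizer_dir "$(huggingface-downloader meta-llama/Llama-2-7b-chat-hf)" \
#       --engine_dir "$TRT_LLM_MODELS/llama-2-7b-chat-fp16/engines"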
llama_gptq()
{
    output_dir="$TRT_LLM_MODELS/llama-2-7b-chat-gptq"
    engine_dir="$output_dir/engines"

    # Convert the HF checkpoint, applying the pre-quantized GPTQ weights
    # from TheBloke/Llama-2-7B-Chat-GPTQ (INT4, group size 128).
    # Optionally enable INT8 KV cache during conversion:
    # --int8_kv_cache \
    if ! compgen -G "$output_dir/*.safetensors" > /dev/null || [ "$FORCE_BUILD" = "on" ]; then
        python3 $LLAMA_EXAMPLES/convert_checkpoint.py \
            --model_dir "$(huggingface-downloader meta-llama/Llama-2-7b-chat-hf)" \
            --output_dir "$output_dir" \
            --dtype float16 \
            --modelopt_quant_ckpt_path "$(huggingface-downloader TheBloke/Llama-2-7B-Chat-GPTQ/model.safetensors)" \
            --use_weight_only \
            --weight_only_precision int4_gptq \
            --group_size 128 \
            --per_group
    fi
    # Build the INT4-GPTQ TensorRT engine (skip if an engine already exists).
    if ! compgen -G "$engine_dir/*.engine" > /dev/null || [ "$FORCE_BUILD" = "on" ]; then
        trtllm-build \
            --checkpoint_dir "$output_dir" \
            --output_dir "$engine_dir" \
            --gemm_plugin float16 \
            --log_level verbose \
            --max_batch_size 1 \
            --max_num_tokens 4096 \
            --max_input_len 4096 \
            --max_output_len 128
    fi
    # Disabled: single-prompt smoke test of the built engine.
    if false; then
        python3 $LLAMA_EXAMPLES/../run.py \
            --max_output_len=50 \
            --tokenizer_dir "$(huggingface-downloader meta-llama/Llama-2-7b-chat-hf)" \
            --engine_dir "$engine_dir"
    fi
    # Benchmark the engine across a sweep of input/output lengths.
    python3 /opt/tensorrt_llm/benchmarks/python/benchmark.py \
        -m llama_7b \
        --mode plugin \
        --batch_size 1 \
        --input_output_len "16,128;32,128;64,128;128,128;256,128;512,128;1024,128;2048,128;3072,128;3968,128" \
        --log_level verbose \
        --enable_cuda_graph \
        --warm_up 2 \
        --num_runs 3 \
        --duration 10 \
        --strongly_typed \
        --quantization int4_weight_only_gptq \
        --engine_dir "$engine_dir"
}
#llama_fp16   # uncomment to also convert/build the FP16 variant
llama_gptq
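# To rebuild everything from scratch (a hypothetical cleanup step, not part of
# the original script), clear the cached outputs and force a rebuild:
#   rm -rf "$TRT_LLM_MODELS/llama-2-7b-chat-gptq"
#   FORCE_BUILD=on ./test_llama.sh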