From e8a98db4922348fca5a59e4eac01c045dbb09448 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 26 Sep 2025 00:20:23 +0200 Subject: [PATCH 01/33] init --- ci/perf_linux.groovy | 307 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 307 insertions(+) create mode 100644 ci/perf_linux.groovy diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy new file mode 100644 index 0000000000..452435b0d7 --- /dev/null +++ b/ci/perf_linux.groovy @@ -0,0 +1,307 @@ +#!groovy +import org.jenkinsci.plugins.pipeline.modeldefinition.Utils +@Library(value='mainlib@master', changelog=false) _ + + +pipeline { + options { + timeout(time: 2, unit: 'HOURS') + } + parameters { + string ( + name: "DOCKER_IMAGE_NAME", + defaultValue: "registry.toolbox.iotg.sclab.intel.com/openvino/model_server-gpu:ubuntu24_main", + description: "Name of the image to be scanned. Can't be empty. Registry/image/tag format." + ) + string ( + name: "MODEL", + defaultValue: "OpenVINO/Qwen3-4B-int4-ov", + description: "Model to use in tests" + ) + string ( + name: "TARGET_ENV", + defaultValue: "ov-spr-19", + description: "Worker label to run tests on" + ) + string ( + name: "DEVICE", + defaultValue: "CPU", + description: "Device to use in tests" + ) + booleanParam( + defaultValue: false, + description: 'Run latency test', + name: 'LATENCY' + ) + booleanParam( + defaultValue: false, + description: 'Run throughput test', + name: 'THROUGHPUT' + ) + booleanParam( + defaultValue: false, + description: 'Run agentic latency test', + name: 'AGENTIC_LATENCY' + ) + booleanParam( + name: "AGENTIC_ACCURACY", + defaultValue: false, + description: "Agentic accuracy" + ) + string ( + name: "MODELS_REPOSITORY_PATH", + defaultValue: "", + description: "Path to models repository" + ) + booleanParam( + name: "SAVE_REFERENCE", + defaultValue: false, + description: "Save reference results" + ) + + } + + agent { + label "${params.TARGET_ENV}" + } + + stages { + stage('Latency') { + when { + expression { params.LATENCY == true } + } + steps { + script { + def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" + def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" + sh "echo Start docker container && \ + mkdir -p ${modelsPath} && \ + docker pull ${params.DOCKER_IMAGE_NAME} && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO && \ + echo wait for model server to be ready && \ + while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${params.MODEL}\" ] ; do echo waiting for LLM model; sleep 1; done" + } + sh "echo Running latency test && \ + mkdir -p results && touch results/results.json && \ + docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result && \ + cat results/results.json | jq ." 
+ script { + def mean_tpot_ms_reference = { + if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_tpot_ms.txt")) { + return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_tpot_ms.txt", returnStdout: true).trim().toFloat() + } else { + return 100000.0 + } + }() + def mean_ttft_ms_reference = { + if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_ttft_ms.txt")) { + return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_ttft_ms.txt", returnStdout: true).trim().toFloat() + } else { + return 100000.0 + } + }() + echo "mean_tpot_ms_reference: ${mean_tpot_ms_reference}, mean_ttft_ms_reference: ${mean_ttft_ms_reference}" + // Allow 5% increase in latency + def hasWarnings = sh(returnStdout: true, script: """jq -r '.mean_tpot_ms > ${mean_tpot_ms_reference * 1.05} or .mean_ttft_ms > ${mean_ttft_ms_reference * 1.05}' results/results.json""").trim() == "true" + if (hasWarnings) { + unstable('Performance threshold not met in throughput test') + } + sh '''if [ $(echo "$(cat results/results.json | jq -r '.completed') != $(cat results/results.json | jq -r '.num_prompts')" | bc) -ne 0 ] ; then exit 1; fi''' + } + sh "echo Stop docker container && \ + docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" + script { + if (params.SAVE_REFERENCE) { + sh "mkdir -p ${env.WORKSPACE}/reference/${params.MODEL} && jq -r '.mean_tpot_ms' results/results.json > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_tpot_ms.txt && \ + jq -r '.mean_ttft_ms' results/results.json > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_ttft_ms.txt" + } + } + } + } + stage('Throughput') { + when { + expression { params.THROUGHPUT == true } + } + steps { + + script { + def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" + def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" + sh "echo Start docker container && \ + mkdir -p ${modelsPath} && \ + docker pull ${params.DOCKER_IMAGE_NAME} && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO && \ + echo wait for model server to be ready && \ + while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${params.MODEL}\" ] ; do echo waiting for LLM model; sleep 1; done" + } + sh "echo Running latency test && \ + mkdir -p results && touch results/results.json && \ + docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result && \ + cat results/results.json | jq ." 
+ script { + def total_token_throughput_reference = { + if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_total_token_throughput.txt")) { + try { + return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_total_token_throughput.txt", returnStdout: true).trim().toFloat() + } catch (Exception e) { + echo "Error reading total_token_throughput reference: ${e.getMessage()}" + return 0.0 + } + } else { + return 0.0 + } + }() + def output_throughput_reference = { + if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_output_throughput.txt")) { + try { + return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_output_throughput.txt", returnStdout: true).trim().toFloat() + } catch (Exception e) { + echo "Error reading output_throughput reference: ${e.getMessage()}" + return 0.0 + } + } else { + return 0.0 + } + }() + echo "total_token_throughput_reference: ${total_token_throughput_reference}, output_throughput_reference: ${output_throughput_reference}" + // Allow 5% decrease in throughput + def hasWarnings = sh(returnStdout: true, script: """jq -r '.total_token_throughput < ${total_token_throughput_reference * 0.95} or .output_throughput < ${output_throughput_reference * 0.95}' results/results.json""").trim() == "true" + if (hasWarnings) { + unstable('Performance threshold not met in throughput test') + } + sh '''if [ $(echo "$(cat results/results.json | jq -r '.completed') != $(cat results/results.json | jq -r '.num_prompts')" | bc) -ne 0 ] ; then exit 1; fi''' + } + sh "echo Stop docker container && \ + docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" + script { + if (params.SAVE_REFERENCE) { + sh "mkdir -p ${env.WORKSPACE}/reference/${params.MODEL} && \ + jq -r '.total_token_throughput' results/results.json > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_total_token_throughput.txt && \ + jq -r '.output_throughput' results/results.json > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_output_throughput.txt" + } + } + } + } + stage('Agentic Latency') { + when { + expression { params.AGENTIC_LATENCY == true } + } + steps { + sh "echo Start docker container" + script { + def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" + def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" + sh "mkdir -p ${modelsPath} && \ + docker pull ${params.DOCKER_IMAGE_NAME} && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_prefix_caching true --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO && \ + echo wait for model server to be ready && \ + while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${params.MODEL}\" ] ; do echo waiting for LLM model; sleep 1; done" + } + sh "echo Running agentic latency test && \ + test -d .venv || python3 -m venv .venv && \ + test -d vllm || git clone -b v0.10.2 https://github.com/vllm-project/vllm && \ + sed -i -e 's/if not os.path.exists(args.model)/if 1 == 0/g' vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py && \ + test -f pg1184.txt || curl https://www.gutenberg.org/ebooks/1184.txt.utf-8 -o pg1184.txt" + sh ". 
.venv/bin/activate && pip install -r vllm/benchmarks/multi_turn/requirements.txt && \ + python vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py -m ${params.MODEL} --url http://localhost:9000/v3 -i vllm/benchmarks/multi_turn/generate_multi_turn.json --served-model-name ${params.MODEL} --num-clients 1 -n 20 > results_agentic_latency.txt && \ + cat results_agentic_latency.txt" + script { + // Check if requests_per_sec is above threshold + def requests_per_sec = sh(script: '''cat results_agentic_latency.txt | grep requests_per_sec | cut -d= -f2''', returnStdout: true).trim() + def requests_per_sec_reference = { + if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_requests_per_sec.txt")) { + try{ + return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_requests_per_sec.txt", returnStdout: true).trim().toFloat() + } catch (Exception e) { + echo "Error reading requests_per_sec reference: ${e.getMessage()}" + return 0.0 + } + } else { + return 0.0 + } + }() + echo "requests_per_sec: ${requests_per_sec}, requests_per_sec_reference: ${requests_per_sec_reference}" + // Require at least 95% of reference throughput + if (requests_per_sec.toFloat() < requests_per_sec_reference * 0.95) { + echo "WARNING: Requests per second is below threshold" + unstable('Performance threshold not met, requests_per_sec: ' + requests_per_sec) + } + } + sh "echo Stop docker container && \ + docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" + script { + if (params.SAVE_REFERENCE) { + sh "mkdir -p ${env.WORKSPACE}/reference/${params.MODEL} && \ + cat results_agentic_latency.txt | grep requests_per_sec | cut -d= -f2 > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_requests_per_sec.txt" + } + } + } + } + stage('Agentic Accuracy') { + when { + expression { params.AGENTIC_ACCURACY == true } + } + steps { + sh "echo Start docker container" + script { + def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" + def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" + sh "docker pull ${params.DOCKER_IMAGE_NAME} && \ + mkdir -p ${modelsPath} && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_tool_guided_generation true --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --log_level INFO && \ + echo wait for model server to be ready && \ + while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"ovms-model\" ] ; do echo waiting for LLM model; sleep 1; done" + } + sh "echo Install BFCL && \ + test -d gorilla || git clone https://github.com/ShishirPatil/gorilla && \ + cd gorilla/berkeley-function-call-leaderboard && git checkout cd9429ccf3d4d04156affe883c495b3b047e6b64 -f && curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/continuous_batching/accuracy/gorilla.patch | git apply -v" + sh "test -d .venv || python3 -m venv .venv && \ + . 
.venv/bin/activate && pip install -e ./gorilla/berkeley-function-call-leaderboard && \ + echo Running agentic accuracy test && \ + export OPENAI_BASE_URL=http://localhost:9000/v3 && \ + bfcl generate --model ovms-model --test-category simple --temperature 0.0 --num-threads 100 -o --result-dir bfcl_results && bfcl evaluate --model ovms-model --result-dir bfcl_results --score-dir bfcl_scores && \ + cat gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json | head -1 | jq ." + script { + def accuracy = sh(script: "cat gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json | head -1 | jq -r '.accuracy'", returnStdout: true).trim() + def accuracy_reference = { + if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_accuracy.txt")) { + try { + return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_accuracy.txt", returnStdout: true).trim().toFloat() + } catch (Exception e) { + echo "Error reading accuracy reference: ${e.getMessage()}" + return 0.0 + } + } else { + return 0.0 + } + }() + echo "accuracy: ${accuracy}, accuracy_reference: ${accuracy_reference}" + // Require at least 98% of reference accuracy + if (accuracy.toFloat() < accuracy_reference * 0.98) { + echo "WARNING: Accuracy ${accuracy} is below threshold" + unstable('Accuracy threshold not met') + } + } + sh "echo Stop docker container && \ + docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" + script { + if (params.SAVE_REFERENCE) { + sh "mkdir -p ${env.WORKSPACE}/reference/${params.MODEL} && \ + cat gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json | head -1 | jq -r '.accuracy' > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_accuracy.txt" + } + } + } + } + } + post { + always { + sh "docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" + } + success { + echo 'Pipeline completed successfully!' + } + failure { + echo 'Pipeline failed!' 
+ } + } +} \ No newline at end of file From cbc17a67421190fec38143954ace9211c5abea6b Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 23 Oct 2025 13:35:22 +0200 Subject: [PATCH 02/33] cache size fix and artefacts saving --- ci/perf_linux.groovy | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 452435b0d7..dff9963c73 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -77,7 +77,7 @@ pipeline { sh "echo Start docker container && \ mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ - docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 1 --log_level INFO && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${params.MODEL}\" ] ; do echo waiting for LLM model; sleep 1; done" } @@ -117,6 +117,10 @@ pipeline { } } } + finally { + sh "mv results/results.json results/latency_results.json" + archiveArtifacts allowEmptyArchive: true, artifacts: "results/latency_results.json" + } } stage('Throughput') { when { @@ -130,7 +134,7 @@ pipeline { sh "echo Start docker container && \ mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ - docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${params.MODEL}\" ] ; do echo waiting for LLM model; sleep 1; done" } @@ -181,6 +185,10 @@ pipeline { } } } + finally { + sh "mv results/results.json results/throughput_results.json" + archiveArtifacts allowEmptyArchive: true, artifacts: "results/throughput_results.json" + } } stage('Agentic Latency') { when { @@ -193,7 +201,7 @@ pipeline { def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ - docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation 
--enable_prefix_caching true --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_prefix_caching true --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO --cache_size 3 && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${params.MODEL}\" ] ; do echo waiting for LLM model; sleep 1; done" } @@ -236,6 +244,9 @@ pipeline { } } } + finally { + archiveArtifacts allowEmptyArchive: true, artifacts: "results_agentic_latency.txt" + } } stage('Agentic Accuracy') { when { @@ -248,7 +259,7 @@ pipeline { def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "docker pull ${params.DOCKER_IMAGE_NAME} && \ mkdir -p ${modelsPath} && \ - docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_tool_guided_generation true --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --log_level INFO && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_tool_guided_generation true --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"ovms-model\" ] ; do echo waiting for LLM model; sleep 1; done" } @@ -291,6 +302,9 @@ pipeline { } } } + finally { + archiveArtifacts allowEmptyArchive: true, artifacts: "gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json" + } } } post { From d0c724d71a11e946da5a5aee11e6f2ea30412499 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 23 Oct 2025 13:38:30 +0200 Subject: [PATCH 03/33] fix --- ci/perf_linux.groovy | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index dff9963c73..bded0f79a4 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -117,9 +117,12 @@ pipeline { } } } - finally { - sh "mv results/results.json results/latency_results.json" - archiveArtifacts allowEmptyArchive: true, artifacts: "results/latency_results.json" + } + post { + always { + sh "mv results/results.json results/latency_results.json" + archiveArtifacts allowEmptyArchive: true, artifacts: "results/latency_results.json" + } } } stage('Throughput') { @@ -185,9 +188,11 @@ pipeline { } } } - finally { - sh "mv results/results.json results/throughput_results.json" - archiveArtifacts allowEmptyArchive: true, artifacts: "results/throughput_results.json" + post { + always { + sh "mv results/results.json results/throughput_results.json" + archiveArtifacts 
allowEmptyArchive: true, artifacts: "results/throughput_results.json" + } } } stage('Agentic Latency') { @@ -244,8 +249,10 @@ pipeline { } } } - finally { - archiveArtifacts allowEmptyArchive: true, artifacts: "results_agentic_latency.txt" + post { + always { + archiveArtifacts allowEmptyArchive: true, artifacts: "results_agentic_latency.txt" + } } } stage('Agentic Accuracy') { @@ -302,8 +309,10 @@ pipeline { } } } - finally { - archiveArtifacts allowEmptyArchive: true, artifacts: "gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json" + post { + always { + archiveArtifacts allowEmptyArchive: true, artifacts: "gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json" + } } } } From c8749f481f9d3e23c7723935c35440a6bf9fd552 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 23 Oct 2025 13:40:14 +0200 Subject: [PATCH 04/33] fix --- ci/perf_linux.groovy | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index bded0f79a4..ba42644e31 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -117,7 +117,6 @@ pipeline { } } } - } post { always { sh "mv results/results.json results/latency_results.json" From 7be9aef4fc48d561f91897f896a09ddc17c82ebf Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Thu, 23 Oct 2025 15:01:38 +0200 Subject: [PATCH 05/33] default --- ci/perf_linux.groovy | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index ba42644e31..20e5f9f8f6 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -29,23 +29,23 @@ pipeline { description: "Device to use in tests" ) booleanParam( - defaultValue: false, + defaultValue: true, description: 'Run latency test', name: 'LATENCY' ) booleanParam( - defaultValue: false, + defaultValue: true, description: 'Run throughput test', name: 'THROUGHPUT' ) booleanParam( - defaultValue: false, + defaultValue: true, description: 'Run agentic latency test', name: 'AGENTIC_LATENCY' ) booleanParam( name: "AGENTIC_ACCURACY", - defaultValue: false, + defaultValue: true, description: "Agentic accuracy" ) string ( @@ -119,7 +119,7 @@ pipeline { } post { always { - sh "mv results/results.json results/latency_results.json" + sh "cat results/results.json | jq . > results/latency_results.json" archiveArtifacts allowEmptyArchive: true, artifacts: "results/latency_results.json" } } @@ -189,7 +189,7 @@ pipeline { } post { always { - sh "mv results/results.json results/throughput_results.json" + sh "cat results/results.json | jq . > results/throughput_results.json" archiveArtifacts allowEmptyArchive: true, artifacts: "results/throughput_results.json" } } From 373bf2916ea9668b9fe5b749fc81757a3ce4b2e0 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 09:44:13 +0200 Subject: [PATCH 06/33] local models --- ci/perf_linux.groovy | 62 +++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 20e5f9f8f6..cf19725ff4 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -2,7 +2,7 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils @Library(value='mainlib@master', changelog=false) _ - +def model_need_copy = true pipeline { options { timeout(time: 2, unit: 'HOURS') @@ -74,12 +74,14 @@ pipeline { script { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? 
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" + def model_name = params.MODEL sh "echo Start docker container && \ mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ - docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 1 --log_level INFO && \ + if [ -d ${params.MODEL} && $model_need_copy ]; then rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL}); model_name = $(basename ${params.MODEL}); model_need_copy = false; fi && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 1 --log_level INFO && \ echo wait for model server to be ready && \ - while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${params.MODEL}\" ] ; do echo waiting for LLM model; sleep 1; done" + while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done" } sh "echo Running latency test && \ mkdir -p results && touch results/results.json && \ @@ -87,15 +89,15 @@ pipeline { cat results/results.json | jq ." script { def mean_tpot_ms_reference = { - if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_tpot_ms.txt")) { - return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_tpot_ms.txt", returnStdout: true).trim().toFloat() + if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_tpot_ms.txt")) { + return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_tpot_ms.txt", returnStdout: true).trim().toFloat() } else { return 100000.0 } }() def mean_ttft_ms_reference = { - if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_ttft_ms.txt")) { - return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_ttft_ms.txt", returnStdout: true).trim().toFloat() + if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_ttft_ms.txt")) { + return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_ttft_ms.txt", returnStdout: true).trim().toFloat() } else { return 100000.0 } @@ -112,8 +114,8 @@ pipeline { docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" script { if (params.SAVE_REFERENCE) { - sh "mkdir -p ${env.WORKSPACE}/reference/${params.MODEL} && jq -r '.mean_tpot_ms' results/results.json > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_tpot_ms.txt && \ - jq -r '.mean_ttft_ms' results/results.json > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_mean_ttft_ms.txt" + sh "mkdir -p ${env.WORKSPACE}/reference/${model_name} && jq -r '.mean_tpot_ms' results/results.json > ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_tpot_ms.txt && \ + jq -r '.mean_ttft_ms' results/results.json 
> ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_ttft_ms.txt" } } } @@ -133,12 +135,14 @@ pipeline { script { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" + def model_name = params.MODEL sh "echo Start docker container && \ mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ + if [ -d ${params.MODEL} && $model_need_copy ]; then rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL}); model_name = $(basename ${params.MODEL}); model_need_copy = false; fi && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ echo wait for model server to be ready && \ - while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${params.MODEL}\" ] ; do echo waiting for LLM model; sleep 1; done" + while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done" } sh "echo Running latency test && \ mkdir -p results && touch results/results.json && \ @@ -146,9 +150,9 @@ pipeline { cat results/results.json | jq ." script { def total_token_throughput_reference = { - if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_total_token_throughput.txt")) { + if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_total_token_throughput.txt")) { try { - return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_total_token_throughput.txt", returnStdout: true).trim().toFloat() + return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_total_token_throughput.txt", returnStdout: true).trim().toFloat() } catch (Exception e) { echo "Error reading total_token_throughput reference: ${e.getMessage()}" return 0.0 @@ -158,9 +162,9 @@ pipeline { } }() def output_throughput_reference = { - if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_output_throughput.txt")) { + if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_output_throughput.txt")) { try { - return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_output_throughput.txt", returnStdout: true).trim().toFloat() + return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_output_throughput.txt", returnStdout: true).trim().toFloat() } catch (Exception e) { echo "Error reading output_throughput reference: ${e.getMessage()}" return 0.0 @@ -181,9 +185,9 @@ pipeline { docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" script { if (params.SAVE_REFERENCE) { - sh "mkdir -p ${env.WORKSPACE}/reference/${params.MODEL} && \ - jq -r '.total_token_throughput' results/results.json > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_total_token_throughput.txt && \ - jq -r '.output_throughput' results/results.json > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_output_throughput.txt" + sh "mkdir -p ${env.WORKSPACE}/reference/${model_name} && \ + jq -r '.total_token_throughput' 
results/results.json > ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_total_token_throughput.txt && \ + jq -r '.output_throughput' results/results.json > ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_output_throughput.txt" } } } @@ -203,11 +207,13 @@ pipeline { script { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" + def model_name = params.MODEL sh "mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ + if [ -d ${params.MODEL} && $model_need_copy ]; then rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL}); model_name = $(basename ${params.MODEL}); model_need_copy = false; fi && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_prefix_caching true --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO --cache_size 3 && \ echo wait for model server to be ready && \ - while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${params.MODEL}\" ] ; do echo waiting for LLM model; sleep 1; done" + while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done" } sh "echo Running agentic latency test && \ test -d .venv || python3 -m venv .venv && \ @@ -215,15 +221,15 @@ pipeline { sed -i -e 's/if not os.path.exists(args.model)/if 1 == 0/g' vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py && \ test -f pg1184.txt || curl https://www.gutenberg.org/ebooks/1184.txt.utf-8 -o pg1184.txt" sh ". 
.venv/bin/activate && pip install -r vllm/benchmarks/multi_turn/requirements.txt && \ - python vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py -m ${params.MODEL} --url http://localhost:9000/v3 -i vllm/benchmarks/multi_turn/generate_multi_turn.json --served-model-name ${params.MODEL} --num-clients 1 -n 20 > results_agentic_latency.txt && \ + python vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py -m ${model_name} --url http://localhost:9000/v3 -i vllm/benchmarks/multi_turn/generate_multi_turn.json --served-model-name ${model_name} --num-clients 1 -n 20 > results_agentic_latency.txt && \ cat results_agentic_latency.txt" script { // Check if requests_per_sec is above threshold def requests_per_sec = sh(script: '''cat results_agentic_latency.txt | grep requests_per_sec | cut -d= -f2''', returnStdout: true).trim() def requests_per_sec_reference = { - if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_requests_per_sec.txt")) { + if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_requests_per_sec.txt")) { try{ - return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_requests_per_sec.txt", returnStdout: true).trim().toFloat() + return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_requests_per_sec.txt", returnStdout: true).trim().toFloat() } catch (Exception e) { echo "Error reading requests_per_sec reference: ${e.getMessage()}" return 0.0 @@ -243,8 +249,8 @@ pipeline { docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" script { if (params.SAVE_REFERENCE) { - sh "mkdir -p ${env.WORKSPACE}/reference/${params.MODEL} && \ - cat results_agentic_latency.txt | grep requests_per_sec | cut -d= -f2 > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_requests_per_sec.txt" + sh "mkdir -p ${env.WORKSPACE}/reference/${model_name} && \ + cat results_agentic_latency.txt | grep requests_per_sec | cut -d= -f2 > ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_requests_per_sec.txt" } } } @@ -263,7 +269,9 @@ pipeline { script { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? 
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" + def model_name = params.MODEL sh "docker pull ${params.DOCKER_IMAGE_NAME} && \ + if [ -d ${params.MODEL} && $model_need_copy ]; then rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL}); model_name = $(basename ${params.MODEL}); model_need_copy = false; fi && \ mkdir -p ${modelsPath} && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_tool_guided_generation true --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ echo wait for model server to be ready && \ @@ -281,9 +289,9 @@ pipeline { script { def accuracy = sh(script: "cat gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json | head -1 | jq -r '.accuracy'", returnStdout: true).trim() def accuracy_reference = { - if (fileExists("${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_accuracy.txt")) { + if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_accuracy.txt")) { try { - return sh(script: "cat ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_accuracy.txt", returnStdout: true).trim().toFloat() + return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_accuracy.txt", returnStdout: true).trim().toFloat() } catch (Exception e) { echo "Error reading accuracy reference: ${e.getMessage()}" return 0.0 @@ -303,8 +311,8 @@ pipeline { docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" script { if (params.SAVE_REFERENCE) { - sh "mkdir -p ${env.WORKSPACE}/reference/${params.MODEL} && \ - cat gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json | head -1 | jq -r '.accuracy' > ${env.WORKSPACE}/reference/${params.MODEL}/${params.DEVICE}_agentic_accuracy.txt" + sh "mkdir -p ${env.WORKSPACE}/reference/${model_name} && \ + cat gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json | head -1 | jq -r '.accuracy' > ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_accuracy.txt" } } } From e9d2ab25bb7c249db4b94d5d67db3cb5152d974f Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 09:48:52 +0200 Subject: [PATCH 07/33] local models --- ci/perf_linux.groovy | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index cf19725ff4..2becae4d75 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -75,10 +75,14 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? 
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL + if (fileExists(params.MODEL) && model_need_copy) { + sh "rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})" + model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() + model_need_copy = false + } sh "echo Start docker container && \ mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ - if [ -d ${params.MODEL} && $model_need_copy ]; then rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL}); model_name = $(basename ${params.MODEL}); model_need_copy = false; fi && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 1 --log_level INFO && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done" @@ -136,10 +140,14 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL + if (fileExists(params.MODEL) && model_need_copy) { + sh "rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})" + model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() + model_need_copy = false + } sh "echo Start docker container && \ mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ - if [ -d ${params.MODEL} && $model_need_copy ]; then rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL}); model_name = $(basename ${params.MODEL}); model_need_copy = false; fi && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done" @@ -208,9 +216,13 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? 
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL + if (fileExists(params.MODEL) && model_need_copy) { + sh "rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})" + model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() + model_need_copy = false + } sh "mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ - if [ -d ${params.MODEL} && $model_need_copy ]; then rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL}); model_name = $(basename ${params.MODEL}); model_need_copy = false; fi && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_prefix_caching true --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO --cache_size 3 && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done" @@ -270,8 +282,12 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL + if (fileExists(params.MODEL) && model_need_copy) { + sh "rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})" + model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() + model_need_copy = false + } sh "docker pull ${params.DOCKER_IMAGE_NAME} && \ - if [ -d ${params.MODEL} && $model_need_copy ]; then rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL}); model_name = $(basename ${params.MODEL}); model_need_copy = false; fi && \ mkdir -p ${modelsPath} && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_tool_guided_generation true --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ echo wait for model server to be ready && \ From 6d2cb1e43bae0b89bc2ec220235cba7b586ea7ec Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 09:51:14 +0200 Subject: [PATCH 08/33] local models --- ci/perf_linux.groovy | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 2becae4d75..827c816d26 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -76,7 +76,7 @@ pipeline { def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL if (fileExists(params.MODEL) && model_need_copy) { - sh "rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})" + sh '''rm -Rf ${modelsPath}/$(basename 
${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})''' model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false } @@ -141,7 +141,7 @@ pipeline { def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL if (fileExists(params.MODEL) && model_need_copy) { - sh "rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})" + sh '''rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})''' model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false } @@ -217,7 +217,7 @@ pipeline { def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL if (fileExists(params.MODEL) && model_need_copy) { - sh "rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})" + sh '''rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})''' model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false } @@ -283,7 +283,7 @@ pipeline { def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL if (fileExists(params.MODEL) && model_need_copy) { - sh "rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})" + sh '''rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})''' model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false } From f582d3e541580b765e06d07ac0393f6b75cef2b9 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 10:06:49 +0200 Subject: [PATCH 09/33] local models --- ci/perf_linux.groovy | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 827c816d26..661bc7a2e3 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -75,13 +75,13 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL + sh '''mkdir -p ${modelsPath}''' if (fileExists(params.MODEL) && model_need_copy) { - sh '''rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})''' + sh '''cp -R ${params.MODEL} ${modelsPath}''' model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false } sh "echo Start docker container && \ - mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 1 --log_level INFO && \ echo wait for model server to be ready && \ @@ -140,8 +140,9 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? 
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL + sh '''mkdir -p ${modelsPath}''' if (fileExists(params.MODEL) && model_need_copy) { - sh '''rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})''' + sh '''cp -R ${params.MODEL} ${modelsPath}''' model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false } @@ -216,8 +217,9 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL + sh '''mkdir -p ${modelsPath}''' if (fileExists(params.MODEL) && model_need_copy) { - sh '''rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})''' + sh '''cp -R ${params.MODEL} ${modelsPath}''' model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false } @@ -282,13 +284,13 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL + sh '''mkdir -p ${modelsPath}''' if (fileExists(params.MODEL) && model_need_copy) { - sh '''rm -Rf ${modelsPath}/$(basename ${params.MODEL}) && cp -R ${params.MODEL} ${modelsPath}/$(basename ${params.MODEL})''' + sh '''cp -R ${params.MODEL} ${modelsPath}''' model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false - } + } sh "docker pull ${params.DOCKER_IMAGE_NAME} && \ - mkdir -p ${modelsPath} && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_tool_guided_generation true --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"ovms-model\" ] ; do echo waiting for LLM model; sleep 1; done" From faea32e7680ff27570ade3cfcde065b850f71d48 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 10:11:35 +0200 Subject: [PATCH 10/33] local models --- ci/perf_linux.groovy | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 661bc7a2e3..b024fa087f 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -75,9 +75,9 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? 
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL - sh '''mkdir -p ${modelsPath}''' + sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { - sh '''cp -R ${params.MODEL} ${modelsPath}''' + sh "cp -R ${params.MODEL} ${modelsPath}" model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false } @@ -140,9 +140,9 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL - sh '''mkdir -p ${modelsPath}''' + sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { - sh '''cp -R ${params.MODEL} ${modelsPath}''' + sh "cp -R ${params.MODEL} ${modelsPath}" model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false } @@ -217,9 +217,9 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL - sh '''mkdir -p ${modelsPath}''' + sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { - sh '''cp -R ${params.MODEL} ${modelsPath}''' + sh "cp -R ${params.MODEL} ${modelsPath}" model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false } @@ -284,9 +284,9 @@ pipeline { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" def model_name = params.MODEL - sh '''mkdir -p ${modelsPath}''' + sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { - sh '''cp -R ${params.MODEL} ${modelsPath}''' + sh "cp -R ${params.MODEL} ${modelsPath}" model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() model_need_copy = false } From 88ba68be612a5696bf8a050c3308d8661cfafedc Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 10:17:45 +0200 Subject: [PATCH 11/33] local models --- ci/perf_linux.groovy | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index b024fa087f..519593b53d 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -89,7 +89,7 @@ pipeline { } sh "echo Running latency test && \ mkdir -p results && touch results/results.json && \ - docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result && \ + docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type 
openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ cat results/results.json | jq ." script { def mean_tpot_ms_reference = { @@ -155,7 +155,7 @@ pipeline { } sh "echo Running latency test && \ mkdir -p results && touch results/results.json && \ - docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result && \ + docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ cat results/results.json | jq ." script { def total_token_throughput_reference = { From 0a47e8f00c13e5ff9e8076d053407aea16395fd5 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 10:27:10 +0200 Subject: [PATCH 12/33] local models --- ci/perf_linux.groovy | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 519593b53d..91b408f5bb 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -155,6 +155,7 @@ pipeline { } sh "echo Running latency test && \ mkdir -p results && touch results/results.json && \ + echo docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ cat results/results.json | jq ." 
script { From 10521db7faa5c787fb93e26649e98ba14b901cc5 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 10:33:59 +0200 Subject: [PATCH 13/33] local models --- ci/perf_linux.groovy | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 91b408f5bb..354e1fd265 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -85,10 +85,12 @@ pipeline { docker pull ${params.DOCKER_IMAGE_NAME} && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 1 --log_level INFO && \ echo wait for model server to be ready && \ - while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done" + while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done && \ + echo Server is ready" } sh "echo Running latency test && \ mkdir -p results && touch results/results.json && \ + echo docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ cat results/results.json | jq ." 
script { @@ -155,7 +157,7 @@ pipeline { } sh "echo Running latency test && \ mkdir -p results && touch results/results.json && \ - echo docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ + docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ cat results/results.json | jq ." script { From 8f6dbf53515061c4abcf6a1dd602ea32bb0578b3 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 10:40:57 +0200 Subject: [PATCH 14/33] local models --- ci/perf_linux.groovy | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 354e1fd265..b39bf2200b 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -3,6 +3,8 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils @Library(value='mainlib@master', changelog=false) _ def model_need_copy = true +def model_name = params.MODEL + pipeline { options { timeout(time: 2, unit: 'HOURS') @@ -74,7 +76,6 @@ pipeline { script { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" - def model_name = params.MODEL sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { sh "cp -R ${params.MODEL} ${modelsPath}" @@ -141,7 +142,6 @@ pipeline { script { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" - def model_name = params.MODEL sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { sh "cp -R ${params.MODEL} ${modelsPath}" @@ -219,7 +219,6 @@ pipeline { script { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? 
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" - def model_name = params.MODEL sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { sh "cp -R ${params.MODEL} ${modelsPath}" @@ -286,7 +285,6 @@ pipeline { script { def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" - def model_name = params.MODEL sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { sh "cp -R ${params.MODEL} ${modelsPath}" From 3e1c91cdcd97a424b7db0038f0c8423b8be0900a Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 10:44:06 +0200 Subject: [PATCH 15/33] local models --- ci/perf_linux.groovy | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index b39bf2200b..e50e2a0423 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -4,6 +4,7 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils def model_need_copy = true def model_name = params.MODEL +def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" pipeline { options { @@ -74,7 +75,6 @@ pipeline { } steps { script { - def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { @@ -140,7 +140,6 @@ pipeline { steps { script { - def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { @@ -217,7 +216,6 @@ pipeline { steps { sh "echo Start docker container" script { - def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { @@ -283,7 +281,6 @@ pipeline { steps { sh "echo Start docker container" script { - def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { From 44cac0b6187625947e76c5847ebcbcc740a8574f Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 10:47:11 +0200 Subject: [PATCH 16/33] local models --- ci/perf_linux.groovy | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index e50e2a0423..4b82e7b55b 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -3,8 +3,8 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils @Library(value='mainlib@master', changelog=false) _ def model_need_copy = true -def model_name = params.MODEL -def modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? 
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" +def model_name = "" +def modelsPath = "" pipeline { options { @@ -76,6 +76,8 @@ pipeline { steps { script { def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" + modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" + model_name = params.MODEL sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { sh "cp -R ${params.MODEL} ${modelsPath}" @@ -140,6 +142,8 @@ pipeline { steps { script { + modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" + model_name = params.MODEL def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { @@ -216,6 +220,8 @@ pipeline { steps { sh "echo Start docker container" script { + modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" + model_name = params.MODEL def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { @@ -281,6 +287,8 @@ pipeline { steps { sh "echo Start docker container" script { + modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" + model_name = params.MODEL def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) && model_need_copy) { From 0550dde125ad3b1f9606e3328b71b040061f4f65 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 10:51:25 +0200 Subject: [PATCH 17/33] local models --- ci/perf_linux.groovy | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 4b82e7b55b..13232b8e18 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -8,7 +8,7 @@ def modelsPath = "" pipeline { options { - timeout(time: 2, unit: 'HOURS') + timeout(time: 20, unit: 'MINUTES') } parameters { string ( @@ -93,8 +93,8 @@ pipeline { } sh "echo Running latency test && \ mkdir -p results && touch results/results.json && \ - echo docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ - docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ + echo docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} 
-e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${model_name} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ + docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${model_name} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ cat results/results.json | jq ." script { def mean_tpot_ms_reference = { @@ -160,8 +160,8 @@ pipeline { } sh "echo Running latency test && \ mkdir -p results && touch results/results.json && \ - docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ - docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${params.MODEL} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ + echo docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${model_name} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ + docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${model_name} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ cat results/results.json | jq ." 
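The copy logic introduced in these patches lets MODEL be either a Hugging Face hub id or a local directory: a local path is copied once into the models repository and served under its basename, while a hub id is passed to the server unchanged. A rough shell equivalent of that decision (the copy-once guard is expressed here as a directory check rather than the pipeline's model_need_copy flag):

    MODEL="OpenVINO/Qwen3-4B-int4-ov"                # hub id, or e.g. /data/models/Qwen3-4B-int4-ov
    MODELS_PATH="${MODELS_REPOSITORY_PATH:-$WORKSPACE/models}"
    mkdir -p "$MODELS_PATH"
    if [ -e "$MODEL" ]; then
      MODEL_NAME="$(basename "$MODEL")"              # local model is served under its directory name
      [ -d "$MODELS_PATH/$MODEL_NAME" ] || cp -R "$MODEL" "$MODELS_PATH"
    else
      MODEL_NAME="$MODEL"                            # hub id: the server pulls it into /models itself
    fi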
script { def total_token_throughput_reference = { From 6d8f0c3293b258949a5e2cb5d4dac345d4958a53 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 10:57:29 +0200 Subject: [PATCH 18/33] local models --- ci/perf_linux.groovy | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 13232b8e18..77d6e38af8 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -93,7 +93,6 @@ pipeline { } sh "echo Running latency test && \ mkdir -p results && touch results/results.json && \ - echo docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${model_name} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${model_name} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ cat results/results.json | jq ." script { @@ -154,13 +153,12 @@ pipeline { sh "echo Start docker container && \ mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ - docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done" } sh "echo Running latency test && \ mkdir -p results && touch results/results.json && \ - echo docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${model_name} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e 
no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${model_name} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \ cat results/results.json | jq ." script { @@ -231,7 +229,7 @@ pipeline { } sh "mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ - docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_prefix_caching true --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO --cache_size 3 && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --enable_prefix_caching true --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO --cache_size 3 && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done" } @@ -297,7 +295,7 @@ pipeline { model_need_copy = false } sh "docker pull ${params.DOCKER_IMAGE_NAME} && \ - docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${params.MODEL} --rest_port 9000 --task text_generation --enable_tool_guided_generation true --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --enable_tool_guided_generation true --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"ovms-model\" ] ; do echo waiting for LLM model; sleep 1; done" } From 59268a33d4ceaee6a3aaa02db41aefe49a2c910c Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 11:03:27 +0200 Subject: [PATCH 19/33] local models --- ci/perf_linux.groovy | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 77d6e38af8..9dad557746 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -79,11 +79,13 @@ pipeline { modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? 
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" model_name = params.MODEL sh "mkdir -p ${modelsPath}" - if (fileExists(params.MODEL) && model_need_copy) { - sh "cp -R ${params.MODEL} ${modelsPath}" + if (fileExists(params.MODEL) ) { model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() - model_need_copy = false - } + if (model_need_copy) { + sh "cp -R ${params.MODEL} ${modelsPath}" + model_need_copy = false + } + } sh "echo Start docker container && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 1 --log_level INFO && \ @@ -145,11 +147,13 @@ pipeline { model_name = params.MODEL def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "mkdir -p ${modelsPath}" - if (fileExists(params.MODEL) && model_need_copy) { - sh "cp -R ${params.MODEL} ${modelsPath}" + if (fileExists(params.MODEL) ) { model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() - model_need_copy = false - } + if (model_need_copy) { + sh "cp -R ${params.MODEL} ${modelsPath}" + model_need_copy = false + } + } sh "echo Start docker container && \ mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ @@ -222,11 +226,13 @@ pipeline { model_name = params.MODEL def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "mkdir -p ${modelsPath}" - if (fileExists(params.MODEL) && model_need_copy) { - sh "cp -R ${params.MODEL} ${modelsPath}" + if (fileExists(params.MODEL) ) { model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() - model_need_copy = false - } + if (model_need_copy) { + sh "cp -R ${params.MODEL} ${modelsPath}" + model_need_copy = false + } + } sh "mkdir -p ${modelsPath} && \ docker pull ${params.DOCKER_IMAGE_NAME} && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --enable_prefix_caching true --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO --cache_size 3 && \ @@ -289,10 +295,12 @@ pipeline { model_name = params.MODEL def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" sh "mkdir -p ${modelsPath}" - if (fileExists(params.MODEL) && model_need_copy) { - sh "cp -R ${params.MODEL} ${modelsPath}" + if (fileExists(params.MODEL) ) { model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() - model_need_copy = false + if (model_need_copy) { + sh "cp -R ${params.MODEL} ${modelsPath}" + model_need_copy = false + } } sh "docker pull ${params.DOCKER_IMAGE_NAME} && \ docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --enable_tool_guided_generation true --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 
--log_level INFO && \ From 5ba83d345e3db46d432ddab04e65a1d110ee05ee Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 11:12:57 +0200 Subject: [PATCH 20/33] local models --- ci/perf_linux.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 9dad557746..5fff3cf3ce 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -245,7 +245,7 @@ pipeline { sed -i -e 's/if not os.path.exists(args.model)/if 1 == 0/g' vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py && \ test -f pg1184.txt || curl https://www.gutenberg.org/ebooks/1184.txt.utf-8 -o pg1184.txt" sh ". .venv/bin/activate && pip install -r vllm/benchmarks/multi_turn/requirements.txt && \ - python vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py -m ${model_name} --url http://localhost:9000/v3 -i vllm/benchmarks/multi_turn/generate_multi_turn.json --served-model-name ${model_name} --num-clients 1 -n 20 > results_agentic_latency.txt && \ + python vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py -m ${modelPath}/${model_name} --url http://localhost:9000/v3 -i vllm/benchmarks/multi_turn/generate_multi_turn.json --served-model-name ${model_name} --num-clients 1 -n 20 > results_agentic_latency.txt && \ cat results_agentic_latency.txt" script { // Check if requests_per_sec is above threshold From 3e967d7ff86f848c5a2fafc47ebda89829889ebc Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 11:17:54 +0200 Subject: [PATCH 21/33] local models --- ci/perf_linux.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 5fff3cf3ce..9262d70257 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -245,7 +245,7 @@ pipeline { sed -i -e 's/if not os.path.exists(args.model)/if 1 == 0/g' vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py && \ test -f pg1184.txt || curl https://www.gutenberg.org/ebooks/1184.txt.utf-8 -o pg1184.txt" sh ". 
.venv/bin/activate && pip install -r vllm/benchmarks/multi_turn/requirements.txt && \ - python vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py -m ${modelPath}/${model_name} --url http://localhost:9000/v3 -i vllm/benchmarks/multi_turn/generate_multi_turn.json --served-model-name ${model_name} --num-clients 1 -n 20 > results_agentic_latency.txt && \ + python vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py -m ${modelsPath}/${model_name} --url http://localhost:9000/v3 -i vllm/benchmarks/multi_turn/generate_multi_turn.json --served-model-name ${model_name} --num-clients 1 -n 20 > results_agentic_latency.txt && \ cat results_agentic_latency.txt" script { // Check if requests_per_sec is above threshold From 700f406998f6a913f76e3ccb95f7fb56e765c519 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 11:31:47 +0200 Subject: [PATCH 22/33] local models --- ci/perf_linux.groovy | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 9262d70257..17ee56a060 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -46,6 +46,11 @@ pipeline { description: 'Run agentic latency test', name: 'AGENTIC_LATENCY' ) + booleanParam( + defaultValue: true, + description: 'Use tool guided generation in agentic accuracy test', + name: 'USE_TOOL_GUIDED_GENERATION' + ) booleanParam( name: "AGENTIC_ACCURACY", defaultValue: true, @@ -303,7 +308,7 @@ pipeline { } } sh "docker pull ${params.DOCKER_IMAGE_NAME} && \ - docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --enable_tool_guided_generation true --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --enable_tool_guided_generation ${params.USE_TOOL_GUIDED_GENERATION} --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"ovms-model\" ] ; do echo waiting for LLM model; sleep 1; done" } From 393219e3435de2b6ba32a998619a9549557acf6a Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 12:12:47 +0200 Subject: [PATCH 23/33] local models --- ci/perf_linux.groovy | 16 +++++++++++----- demos/continuous_batching/accuracy/gorilla.patch | 5 +++-- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 17ee56a060..bcdfbcf602 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -46,16 +46,21 @@ pipeline { description: 'Run agentic latency test', name: 'AGENTIC_LATENCY' ) + booleanParam( + name: "AGENTIC_ACCURACY", + defaultValue: true, + description: "Agentic accuracy" + ) booleanParam( defaultValue: true, description: 'Use tool guided generation in agentic accuracy test', name: 'USE_TOOL_GUIDED_GENERATION' ) booleanParam( - name: "AGENTIC_ACCURACY", - 
defaultValue: true, - description: "Agentic accuracy" - ) + defaultValue: true, + description: 'Use thinking in agentic accuracy test', + name: 'USE_THINKING' + ) string ( name: "MODELS_REPOSITORY_PATH", defaultValue: "", @@ -314,11 +319,12 @@ pipeline { } sh "echo Install BFCL && \ test -d gorilla || git clone https://github.com/ShishirPatil/gorilla && \ - cd gorilla/berkeley-function-call-leaderboard && git checkout cd9429ccf3d4d04156affe883c495b3b047e6b64 -f && curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/main/demos/continuous_batching/accuracy/gorilla.patch | git apply -v" + cd gorilla/berkeley-function-call-leaderboard && git checkout cd9429ccf3d4d04156affe883c495b3b047e6b64 -f && curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/perf-test/demos/continuous_batching/accuracy/gorilla.patch | git apply -v" sh "test -d .venv || python3 -m venv .venv && \ . .venv/bin/activate && pip install -e ./gorilla/berkeley-function-call-leaderboard && \ echo Running agentic accuracy test && \ export OPENAI_BASE_URL=http://localhost:9000/v3 && \ + export ENABLE_THINKING=${params.USE_THINKING} && \ bfcl generate --model ovms-model --test-category simple --temperature 0.0 --num-threads 100 -o --result-dir bfcl_results && bfcl evaluate --model ovms-model --result-dir bfcl_results --score-dir bfcl_scores && \ cat gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json | head -1 | jq ." script { diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index c49fab2c1e..8639304527 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -51,7 +51,8 @@ index 8665234..9e85e59 100644 "model": self.model_name.replace("-FC", ""), "temperature": self.temperature, + "tool_choice": os.getenv("TOOL_CHOICE", "auto"), -+ "max_completion_tokens": 2048, ++ "extra_body": {"chat_template_kwargs": {"enable_thinking": os.getenv("ENABLE_THINKING", False)}}, + "max_completion_tokens": 2048, "store": False, } @@ -78,7 +79,7 @@ index 9ce4e7d..076e706 100644 - "enable_thinking": True - }, + extra_body={ "chat_template_kwargs": { -+ "enable_thinking": False ++ "enable_thinking": os.getenv("ENABLE_THINKING", False) + }}, + temperature=self.temperature, stream=True, From 289c28c405807b8783e946a66d1bce9fefb979c0 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 13:01:17 +0200 Subject: [PATCH 24/33] fix patch --- .../accuracy/gorilla.patch | 27 ++++++++++++------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index 8639304527..3cc72f16bf 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -1,5 +1,5 @@ diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -index 73731c0..0d966ed 100644 +index 73731c0..b6bbf48 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py @@ -2060,6 +2060,30 @@ third_party_inference_model_map = { @@ -31,10 +31,10 @@ index 73731c0..0d966ed 100644 + underscore_to_dot=True, + ), } - - + + diff --git 
a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py -index 8665234..9e85e59 100644 +index 8665234..9fc1fba 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py @@ -23,7 +23,7 @@ class OpenAICompletionsHandler(BaseHandler): @@ -43,21 +43,21 @@ index 8665234..9e85e59 100644 self.model_style = ModelStyle.OpenAI_Completions - self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + self.client = OpenAI(base_url=os.getenv("OPENAI_BASE_URL"), api_key=os.getenv("OPENAI_API_KEY", "not_used"),timeout=os.getenv("OPENAI_TIMEOUT", 1600)) - + def decode_ast(self, result, language="Python"): if "FC" in self.model_name or self.is_fc_model: -@@ -61,6 +61,8 @@ class OpenAICompletionsHandler(BaseHandler): +@@ -61,6 +61,9 @@ class OpenAICompletionsHandler(BaseHandler): "messages": message, "model": self.model_name.replace("-FC", ""), "temperature": self.temperature, + "tool_choice": os.getenv("TOOL_CHOICE", "auto"), + "extra_body": {"chat_template_kwargs": {"enable_thinking": os.getenv("ENABLE_THINKING", False)}}, - "max_completion_tokens": 2048, ++ "max_completion_tokens": 4096, "store": False, } - + diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py -index 9ce4e7d..076e706 100644 +index 9ce4e7d..d3aa379 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py @@ -21,8 +21,8 @@ class QwenAPIHandler(OpenAICompletionsHandler): @@ -69,7 +69,7 @@ index 9ce4e7d..076e706 100644 + base_url=os.getenv("OPENAI_BASE_URL", "https://localhost:8000/v3"), + api_key=os.getenv("QWEN_API_KEY", "not_used"), ) - + #### FC methods #### @@ -38,9 +38,10 @@ class QwenAPIHandler(OpenAICompletionsHandler): model=self.model_name.replace("-FC", ""), @@ -85,3 +85,10 @@ index 9ce4e7d..076e706 100644 stream=True, stream_options={ "include_usage": True +@@ -338,4 +339,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler): + 'timeout': 1000, + 'max_tokens': 16384 + } +- }) +\ No newline at end of file ++ }) \ No newline at end of file From 5f7cdb90f56cfab74a2123c735ae606809c64abd Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 13:02:30 +0200 Subject: [PATCH 25/33] fix patch --- demos/continuous_batching/accuracy/gorilla.patch | 2 -- 1 file changed, 2 deletions(-) diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index 3cc72f16bf..3223ce74f7 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -90,5 +90,3 @@ index 9ce4e7d..d3aa379 100644 'max_tokens': 16384 } - }) -\ No newline at end of file -+ }) \ No newline at end of file From a02be9b5dda198abdf2a6ac61f5594943413bc53 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Fri, 24 Oct 2025 13:08:50 +0200 Subject: [PATCH 26/33] fix patch --- demos/continuous_batching/accuracy/gorilla.patch | 2 ++ 1 file changed, 2 insertions(+) diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index 3223ce74f7..d34aaa9de0 100644 --- 
a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -90,3 +90,5 @@ index 9ce4e7d..d3aa379 100644 'max_tokens': 16384 } - }) +\ No newline at end of file ++ }) From 74553593d55d557ae2db95160018b6d08e6e1cb3 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 3 Nov 2025 00:08:15 +0100 Subject: [PATCH 27/33] test --- ci/perf_linux.groovy | 2 +- .../accuracy/gorilla.patch | 33 +++++++++---------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index bcdfbcf602..c543230fe4 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -319,7 +319,7 @@ pipeline { } sh "echo Install BFCL && \ test -d gorilla || git clone https://github.com/ShishirPatil/gorilla && \ - cd gorilla/berkeley-function-call-leaderboard && git checkout cd9429ccf3d4d04156affe883c495b3b047e6b64 -f && curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/perf-test/demos/continuous_batching/accuracy/gorilla.patch | git apply -v" + cd gorilla/berkeley-function-call-leaderboard && git checkout cd9429ccf3d4d04156affe883c495b3b047e6b64 -f && curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/perf-111/demos/continuous_batching/accuracy/gorilla.patch | git apply -v" sh "test -d .venv || python3 -m venv .venv && \ . .venv/bin/activate && pip install -e ./gorilla/berkeley-function-call-leaderboard && \ echo Running agentic accuracy test && \ diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index d34aaa9de0..3322c9b873 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -1,5 +1,5 @@ diff --git a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py -index 73731c0..b6bbf48 100644 +index 73731c0..0d966ed 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py @@ -2060,6 +2060,30 @@ third_party_inference_model_map = { @@ -18,9 +18,9 @@ index 73731c0..b6bbf48 100644 + is_fc_model=True, + underscore_to_dot=True, + ), -+ "ovms-model-stream": ModelConfig( ++ "ovms_model_stream": ModelConfig( + model_name="ovms-model-stream", -+ display_name="ovms-model-stream", ++ display_name="ovms_model_stream", + url="http://localhost:8000/v3", + org="ovms", + license="apache-2.0", @@ -31,10 +31,10 @@ index 73731c0..b6bbf48 100644 + underscore_to_dot=True, + ), } - - + + diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py -index 8665234..9fc1fba 100644 +index 8665234..c224681 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py @@ -23,7 +23,7 @@ class OpenAICompletionsHandler(BaseHandler): @@ -42,8 +42,8 @@ index 8665234..9fc1fba 100644 super().__init__(model_name, temperature) self.model_style = ModelStyle.OpenAI_Completions - self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) -+ self.client = OpenAI(base_url=os.getenv("OPENAI_BASE_URL"), api_key=os.getenv("OPENAI_API_KEY", "not_used"),timeout=os.getenv("OPENAI_TIMEOUT", 1600)) - ++ self.client = 
OpenAI(base_url=os.getenv("OPENAI_BASE_URL","http://localhost:8000/v3"), api_key=os.getenv("OPENAI_API_KEY", "not_used"),timeout=os.getenv("OPENAI_TIMEOUT", 3600)) + def decode_ast(self, result, language="Python"): if "FC" in self.model_name or self.is_fc_model: @@ -61,6 +61,9 @@ class OpenAICompletionsHandler(BaseHandler): @@ -51,13 +51,13 @@ index 8665234..9fc1fba 100644 "model": self.model_name.replace("-FC", ""), "temperature": self.temperature, + "tool_choice": os.getenv("TOOL_CHOICE", "auto"), -+ "extra_body": {"chat_template_kwargs": {"enable_thinking": os.getenv("ENABLE_THINKING", False)}}, -+ "max_completion_tokens": 4096, ++ "extra_body": {"chat_template_kwargs": {"enable_thinking": bool(os.getenv("ENABLE_THINKING", ""))}}, ++ "max_completion_tokens": 2048, "store": False, } - + diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py -index 9ce4e7d..d3aa379 100644 +index 9ce4e7d..06ec74e 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py @@ -21,8 +21,8 @@ class QwenAPIHandler(OpenAICompletionsHandler): @@ -69,9 +69,9 @@ index 9ce4e7d..d3aa379 100644 + base_url=os.getenv("OPENAI_BASE_URL", "https://localhost:8000/v3"), + api_key=os.getenv("QWEN_API_KEY", "not_used"), ) - + #### FC methods #### -@@ -38,9 +38,10 @@ class QwenAPIHandler(OpenAICompletionsHandler): +@@ -38,9 +38,9 @@ class QwenAPIHandler(OpenAICompletionsHandler): model=self.model_name.replace("-FC", ""), tools=tools, parallel_tool_calls=True, @@ -79,13 +79,12 @@ index 9ce4e7d..d3aa379 100644 - "enable_thinking": True - }, + extra_body={ "chat_template_kwargs": { -+ "enable_thinking": os.getenv("ENABLE_THINKING", False) ++ "enable_thinking": bool(os.getenv("ENABLE_THINKING", "")) + }}, -+ temperature=self.temperature, stream=True, stream_options={ "include_usage": True -@@ -338,4 +339,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler): +@@ -338,4 +338,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler): 'timeout': 1000, 'max_tokens': 16384 } From 1731e182e096678a27583f749a2e22e542db21e3 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Mon, 3 Nov 2025 00:25:31 +0100 Subject: [PATCH 28/33] test --- ci/perf_linux.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index c543230fe4..0ba85a3a5d 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -324,7 +324,7 @@ pipeline { . .venv/bin/activate && pip install -e ./gorilla/berkeley-function-call-leaderboard && \ echo Running agentic accuracy test && \ export OPENAI_BASE_URL=http://localhost:9000/v3 && \ - export ENABLE_THINKING=${params.USE_THINKING} && \ + ${params.USE_THINKING ? 'export ENABLE_THINKING=true && \\' : ''} \ bfcl generate --model ovms-model --test-category simple --temperature 0.0 --num-threads 100 -o --result-dir bfcl_results && bfcl evaluate --model ovms-model --result-dir bfcl_results --score-dir bfcl_scores && \ cat gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json | head -1 | jq ." 
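The switch from exporting ENABLE_THINKING=${params.USE_THINKING} to a conditional export matters because the patched BFCL handlers read the flag with bool(os.getenv("ENABLE_THINKING", "")): any non-empty string, including "false", evaluates to True, so the variable has to stay unset when thinking is disabled. A sketch of the resulting shell behaviour (USE_THINKING stands in for the pipeline parameter):

    if [ "$USE_THINKING" = "true" ]; then
      export ENABLE_THINKING=true                    # picked up by the patched BFCL handlers
    fi                                               # otherwise leave it unset, so bool("") is False
    bfcl generate --model ovms-model --test-category simple --temperature 0.0 \
         --num-threads 100 -o --result-dir bfcl_results
    bfcl evaluate --model ovms-model --result-dir bfcl_results --score-dir bfcl_scores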
script { From 2a02eb98acd0e1e6e977f8dbd0457616a289ef25 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 4 Nov 2025 11:59:19 +0100 Subject: [PATCH 29/33] chat template and parsers --- ci/perf_linux.groovy | 53 +++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 0ba85a3a5d..829eac3e06 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -14,12 +14,12 @@ pipeline { string ( name: "DOCKER_IMAGE_NAME", defaultValue: "registry.toolbox.iotg.sclab.intel.com/openvino/model_server-gpu:ubuntu24_main", - description: "Name of the image to be scanned. Can't be empty. Registry/image/tag format." + description: "Name of the image to be scanned. Can't be empty. Registry/image:tag format." ) string ( name: "MODEL", defaultValue: "OpenVINO/Qwen3-4B-int4-ov", - description: "Model to use in tests" + description: "Model to use in tests. Can be a local path or a model name from HF hub." ) string ( name: "TARGET_ENV", @@ -29,7 +29,7 @@ pipeline { string ( name: "DEVICE", defaultValue: "CPU", - description: "Device to use in tests" + description: "Device to use in tests. GPU or CPU" ) booleanParam( defaultValue: true, @@ -51,20 +51,35 @@ pipeline { defaultValue: true, description: "Agentic accuracy" ) - booleanParam( - defaultValue: true, - description: 'Use tool guided generation in agentic accuracy test', - name: 'USE_TOOL_GUIDED_GENERATION' + group( + title: 'Agentic Accuracy', + contents: [ + booleanParam( + defaultValue: true, + description: 'Use tool guided generation in agentic accuracy test', + name: 'USE_TOOL_GUIDED_GENERATION' + ), + booleanParam( + defaultValue: true, + description: 'Use thinking in agentic accuracy test', + name: 'USE_THINKING' + ), + string( + defaultValue: '--tool_parser hermes3 --reasoning_parser qwen3', + description: 'parsers to be applied in agentic accuracy test', + name: 'PARSERS' + ), + string( + defaultValue: '', + description: 'Optional chat template URL for agentic tests', + name: 'CHAT_TEMPLATE_URL' + ) + ] ) - booleanParam( - defaultValue: true, - description: 'Use thinking in agentic accuracy test', - name: 'USE_THINKING' - ) string ( name: "MODELS_REPOSITORY_PATH", defaultValue: "", - description: "Path to models repository" + description: "Path to models repository. Defines where to copy the model for load execution. By default in jenkins workspace/models" ) booleanParam( name: "SAVE_REFERENCE", @@ -86,7 +101,9 @@ pipeline { steps { script { def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" - modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" + modelsPath = params.M + + ?.trim() ? 
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" model_name = params.MODEL sh "mkdir -p ${modelsPath}" if (fileExists(params.MODEL) ) { @@ -311,9 +328,13 @@ pipeline { sh "cp -R ${params.MODEL} ${modelsPath}" model_need_copy = false } - } + } + if (params.CHAT_TEMPLATE_URL?.trim()) { + def chatTemplateFile = "${modelsPath}/${model_name}/chat_template.json" + sh "curl -sSL '${params.CHAT_TEMPLATE_URL}' -o '${chatTemplateFile}'" + } sh "docker pull ${params.DOCKER_IMAGE_NAME} && \ - docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --enable_tool_guided_generation ${params.USE_TOOL_GUIDED_GENERATION} --tool_parser hermes3 --reasoning_parser qwen3 --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --enable_tool_guided_generation ${params.USE_TOOL_GUIDED_GENERATION} ${params.PARSERS} --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ echo wait for model server to be ready && \ while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"ovms-model\" ] ; do echo waiting for LLM model; sleep 1; done" } From c25a991bf4270e93d3cec3d972bf68eae4105cc1 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 4 Nov 2025 16:30:03 +0100 Subject: [PATCH 30/33] test --- ci/perf_linux.groovy | 46 ++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 829eac3e06..e644cd8c8a 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -51,36 +51,32 @@ pipeline { defaultValue: true, description: "Agentic accuracy" ) - group( - title: 'Agentic Accuracy', - contents: [ - booleanParam( - defaultValue: true, - description: 'Use tool guided generation in agentic accuracy test', - name: 'USE_TOOL_GUIDED_GENERATION' - ), - booleanParam( - defaultValue: true, - description: 'Use thinking in agentic accuracy test', - name: 'USE_THINKING' - ), - string( - defaultValue: '--tool_parser hermes3 --reasoning_parser qwen3', - description: 'parsers to be applied in agentic accuracy test', - name: 'PARSERS' - ), - string( - defaultValue: '', - description: 'Optional chat template URL for agentic tests', - name: 'CHAT_TEMPLATE_URL' - ) - ] + booleanParam( + defaultValue: true, + description: 'Use tool guided generation in agentic accuracy test', + name: 'USE_TOOL_GUIDED_GENERATION' + ) + booleanParam( + defaultValue: true, + description: 'Use thinking in agentic accuracy test', + name: 'USE_THINKING', + + ) + string( + defaultValue: '--tool_parser hermes3 --reasoning_parser qwen3', + description: 'parsers to be applied in agentic accuracy test', + name: 'PARSERS' + ) + string( + defaultValue: '', + description: 'Optional chat template URL for agentic tests', + name: 'CHAT_TEMPLATE_URL' ) string ( name: "MODELS_REPOSITORY_PATH", defaultValue: "", description: "Path to models repository. Defines where to copy the model for load execution. 
By default in jenkins workspace/models" - ) + ) booleanParam( name: "SAVE_REFERENCE", defaultValue: false, From 40222055354a3077e8d008c3ea564b30e1c2b846 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 4 Nov 2025 23:43:56 +0100 Subject: [PATCH 31/33] fix mistral --- ci/perf_linux.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index e644cd8c8a..acd546f732 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -267,7 +267,7 @@ pipeline { test -d vllm || git clone -b v0.10.2 https://github.com/vllm-project/vllm && \ sed -i -e 's/if not os.path.exists(args.model)/if 1 == 0/g' vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py && \ test -f pg1184.txt || curl https://www.gutenberg.org/ebooks/1184.txt.utf-8 -o pg1184.txt" - sh ". .venv/bin/activate && pip install -r vllm/benchmarks/multi_turn/requirements.txt && \ + sh ". .venv/bin/activate && pip install -r vllm/benchmarks/multi_turn/requirements.txt sentencepiece && \ python vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py -m ${modelsPath}/${model_name} --url http://localhost:9000/v3 -i vllm/benchmarks/multi_turn/generate_multi_turn.json --served-model-name ${model_name} --num-clients 1 -n 20 > results_agentic_latency.txt && \ cat results_agentic_latency.txt" script { From 4deb77c2427b98a601eeee25eaff59176afed470 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 4 Nov 2025 23:49:21 +0100 Subject: [PATCH 32/33] fix mistral --- ci/perf_linux.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index acd546f732..16478d2382 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -267,7 +267,7 @@ pipeline { test -d vllm || git clone -b v0.10.2 https://github.com/vllm-project/vllm && \ sed -i -e 's/if not os.path.exists(args.model)/if 1 == 0/g' vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py && \ test -f pg1184.txt || curl https://www.gutenberg.org/ebooks/1184.txt.utf-8 -o pg1184.txt" - sh ". .venv/bin/activate && pip install -r vllm/benchmarks/multi_turn/requirements.txt sentencepiece && \ + sh ". .venv/bin/activate && pip install -r vllm/benchmarks/multi_turn/requirements.txt sentencepiece protobuf && \ python vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py -m ${modelsPath}/${model_name} --url http://localhost:9000/v3 -i vllm/benchmarks/multi_turn/generate_multi_turn.json --served-model-name ${model_name} --num-clients 1 -n 20 > results_agentic_latency.txt && \ cat results_agentic_latency.txt" script { From 5f29be1b7baf2ba6a804e5acd72ea97131c9aeb3 Mon Sep 17 00:00:00 2001 From: Dariusz Trawinski Date: Tue, 4 Nov 2025 23:56:42 +0100 Subject: [PATCH 33/33] fix mistral --- ci/perf_linux.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy index 16478d2382..4695223357 100644 --- a/ci/perf_linux.groovy +++ b/ci/perf_linux.groovy @@ -326,7 +326,7 @@ pipeline { } } if (params.CHAT_TEMPLATE_URL?.trim()) { - def chatTemplateFile = "${modelsPath}/${model_name}/chat_template.json" + def chatTemplateFile = "${modelsPath}/${model_name}/chat_template.jinja" sh "curl -sSL '${params.CHAT_TEMPLATE_URL}' -o '${chatTemplateFile}'" } sh "docker pull ${params.DOCKER_IMAGE_NAME} && \
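With the final patch the optional chat template override is written to chat_template.jinja inside the model directory, so the file is already in place when the server container mounts the models repository. A minimal sketch of that step with an illustrative URL (the real value comes from the CHAT_TEMPLATE_URL parameter):

    CHAT_TEMPLATE_URL="https://example.com/qwen3_nonthinking.jinja"   # hypothetical URL
    if [ -n "$CHAT_TEMPLATE_URL" ]; then
      curl -sSL "$CHAT_TEMPLATE_URL" -o "$MODELS_PATH/$MODEL_NAME/chat_template.jinja"
    fi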