diff --git a/ci/perf_linux.groovy b/ci/perf_linux.groovy
new file mode 100644
index 0000000000..4695223357
--- /dev/null
+++ b/ci/perf_linux.groovy
@@ -0,0 +1,395 @@
+#!groovy
+import org.jenkinsci.plugins.pipeline.modeldefinition.Utils
+@Library(value='mainlib@master', changelog=false) _
+
+def model_need_copy = true
+def model_name = ""
+def modelsPath = ""
+
+pipeline {
+    options {
+        timeout(time: 20, unit: 'MINUTES')
+    }
+    parameters {
+        string (
+            name: "DOCKER_IMAGE_NAME",
+            defaultValue: "registry.toolbox.iotg.sclab.intel.com/openvino/model_server-gpu:ubuntu24_main",
+            description: "Name of the image to be tested. Can't be empty. Registry/image:tag format."
+        )
+        string (
+            name: "MODEL",
+            defaultValue: "OpenVINO/Qwen3-4B-int4-ov",
+            description: "Model to use in tests. Can be a local path or a model name from HF hub."
+        )
+        string (
+            name: "TARGET_ENV",
+            defaultValue: "ov-spr-19",
+            description: "Worker label to run tests on"
+        )
+        string (
+            name: "DEVICE",
+            defaultValue: "CPU",
+            description: "Device to use in tests. GPU or CPU"
+        )
+        booleanParam(
+            defaultValue: true,
+            description: 'Run latency test',
+            name: 'LATENCY'
+        )
+        booleanParam(
+            defaultValue: true,
+            description: 'Run throughput test',
+            name: 'THROUGHPUT'
+        )
+        booleanParam(
+            defaultValue: true,
+            description: 'Run agentic latency test',
+            name: 'AGENTIC_LATENCY'
+        )
+        booleanParam(
+            name: "AGENTIC_ACCURACY",
+            defaultValue: true,
+            description: "Run agentic accuracy test"
+        )
+        booleanParam(
+            defaultValue: true,
+            description: 'Use tool guided generation in agentic accuracy test',
+            name: 'USE_TOOL_GUIDED_GENERATION'
+        )
+        booleanParam(
+            defaultValue: true,
+            description: 'Use thinking in agentic accuracy test',
+            name: 'USE_THINKING'
+        )
+        string(
+            defaultValue: '--tool_parser hermes3 --reasoning_parser qwen3',
+            description: 'Parsers to be applied in agentic accuracy test',
+            name: 'PARSERS'
+        )
+        string(
+            defaultValue: '',
+            description: 'Optional chat template URL for agentic tests',
+            name: 'CHAT_TEMPLATE_URL'
+        )
+        string (
+            name: "MODELS_REPOSITORY_PATH",
+            defaultValue: "",
+            description: "Path to models repository. Defines where the model is copied for load execution. By default the Jenkins workspace/models directory is used."
+        )
+        booleanParam(
+            name: "SAVE_REFERENCE",
+            defaultValue: false,
+            description: "Save reference results"
+        )
+    }
+
+    agent {
+        label "${params.TARGET_ENV}"
+    }
+
+    stages {
+        stage('Latency') {
+            when {
+                expression { params.LATENCY == true }
+            }
+            steps {
+                script {
+                    def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)"
+                    modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ?
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models"
+                    model_name = params.MODEL
+                    sh "mkdir -p ${modelsPath}"
+                    if (fileExists(params.MODEL) ) {
+                        model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim()
+                        if (model_need_copy) {
+                            sh "cp -R ${params.MODEL} ${modelsPath}"
+                            model_need_copy = false
+                        }
+                    }
+                    sh "echo Start docker container && \
+                        docker pull ${params.DOCKER_IMAGE_NAME} && \
+                        docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 1 --log_level INFO && \
+                        echo wait for model server to be ready && \
+                        while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done && \
+                        echo Server is ready"
+                }
+                sh "echo Running latency test && \
+                    mkdir -p results && touch results/results.json && \
+                    docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 1024 --random-output-len 128 --max-concurrency 1 --num-prompts 20 --model ${model_name} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \
+                    cat results/results.json | jq ."
+                script {
+                    def mean_tpot_ms_reference = {
+                        if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_tpot_ms.txt")) {
+                            return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_tpot_ms.txt", returnStdout: true).trim().toFloat()
+                        } else {
+                            return 100000.0
+                        }
+                    }()
+                    def mean_ttft_ms_reference = {
+                        if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_ttft_ms.txt")) {
+                            return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_ttft_ms.txt", returnStdout: true).trim().toFloat()
+                        } else {
+                            return 100000.0
+                        }
+                    }()
+                    echo "mean_tpot_ms_reference: ${mean_tpot_ms_reference}, mean_ttft_ms_reference: ${mean_ttft_ms_reference}"
+                    // Allow 5% increase in latency
+                    def hasWarnings = sh(returnStdout: true, script: """jq -r '.mean_tpot_ms > ${mean_tpot_ms_reference * 1.05} or .mean_ttft_ms > ${mean_ttft_ms_reference * 1.05}' results/results.json""").trim() == "true"
+                    if (hasWarnings) {
+                        unstable('Performance threshold not met in latency test')
+                    }
+                    sh '''if [ $(echo "$(cat results/results.json | jq -r '.completed') != $(cat results/results.json | jq -r '.num_prompts')" | bc) -ne 0 ] ; then exit 1; fi'''
+                }
+                sh "echo Stop docker container && \
+                    docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop"
+                script {
+                    if (params.SAVE_REFERENCE) {
+                        sh "mkdir -p ${env.WORKSPACE}/reference/${model_name} && jq -r '.mean_tpot_ms' results/results.json > ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_tpot_ms.txt && \
+                            jq -r '.mean_ttft_ms' results/results.json > ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_mean_ttft_ms.txt"
+                    }
+                }
+            }
+            post {
+                always {
+                    sh "cat results/results.json | jq . 
> results/latency_results.json"
+                    archiveArtifacts allowEmptyArchive: true, artifacts: "results/latency_results.json"
+                }
+            }
+        }
+        stage('Throughput') {
+            when {
+                expression { params.THROUGHPUT == true }
+            }
+            steps {
+                script {
+                    modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models"
+                    model_name = params.MODEL
+                    def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)"
+                    sh "mkdir -p ${modelsPath}"
+                    if (fileExists(params.MODEL) ) {
+                        model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim()
+                        if (model_need_copy) {
+                            sh "cp -R ${params.MODEL} ${modelsPath}"
+                            model_need_copy = false
+                        }
+                    }
+                    sh "echo Start docker container && \
+                        mkdir -p ${modelsPath} && \
+                        docker pull ${params.DOCKER_IMAGE_NAME} && \
+                        docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --model_repository_path /models --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \
+                        echo wait for model server to be ready && \
+                        while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done"
+                }
+                sh "echo Running throughput test && \
+                    mkdir -p results && touch results/results.json && \
+                    docker run -v \$(pwd)/results:/results --rm --network=host -e https_proxy=${env.HTTPS_PROXY} -e no_proxy=localhost -v ${modelsPath}:/models --entrypoint vllm openeuler/vllm-cpu:0.10.1-oe2403lts bench serve --dataset-name random --host localhost --port 9000 --endpoint /v3/chat/completions --endpoint-type openai-chat --random-input-len 256 --random-output-len 128 --random-range-ratio 0.2 --max-concurrency 100 --num-prompts 500 --model ${model_name} --ignore-eos --result-dir /results/ --result-filename results.json --save-result --tokenizer /models/${model_name} && \
+                    cat results/results.json | jq ."
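+                // Note: the check below compares the measured throughput against per-device reference
+                // values written by an earlier run with SAVE_REFERENCE=true; when no reference file
+                // exists yet, the 0.0 fallback means a first (baseline) run cannot fail the threshold.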
+ script { + def total_token_throughput_reference = { + if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_total_token_throughput.txt")) { + try { + return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_total_token_throughput.txt", returnStdout: true).trim().toFloat() + } catch (Exception e) { + echo "Error reading total_token_throughput reference: ${e.getMessage()}" + return 0.0 + } + } else { + return 0.0 + } + }() + def output_throughput_reference = { + if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_output_throughput.txt")) { + try { + return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_output_throughput.txt", returnStdout: true).trim().toFloat() + } catch (Exception e) { + echo "Error reading output_throughput reference: ${e.getMessage()}" + return 0.0 + } + } else { + return 0.0 + } + }() + echo "total_token_throughput_reference: ${total_token_throughput_reference}, output_throughput_reference: ${output_throughput_reference}" + // Allow 5% decrease in throughput + def hasWarnings = sh(returnStdout: true, script: """jq -r '.total_token_throughput < ${total_token_throughput_reference * 0.95} or .output_throughput < ${output_throughput_reference * 0.95}' results/results.json""").trim() == "true" + if (hasWarnings) { + unstable('Performance threshold not met in throughput test') + } + sh '''if [ $(echo "$(cat results/results.json | jq -r '.completed') != $(cat results/results.json | jq -r '.num_prompts')" | bc) -ne 0 ] ; then exit 1; fi''' + } + sh "echo Stop docker container && \ + docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" + script { + if (params.SAVE_REFERENCE) { + sh "mkdir -p ${env.WORKSPACE}/reference/${model_name} && \ + jq -r '.total_token_throughput' results/results.json > ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_total_token_throughput.txt && \ + jq -r '.output_throughput' results/results.json > ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_output_throughput.txt" + } + } + } + post { + always { + sh "cat results/results.json | jq . > results/throughput_results.json" + archiveArtifacts allowEmptyArchive: true, artifacts: "results/throughput_results.json" + } + } + } + stage('Agentic Latency') { + when { + expression { params.AGENTIC_LATENCY == true } + } + steps { + sh "echo Start docker container" + script { + modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? 
params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" + model_name = params.MODEL + def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" + sh "mkdir -p ${modelsPath}" + if (fileExists(params.MODEL) ) { + model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() + if (model_need_copy) { + sh "cp -R ${params.MODEL} ${modelsPath}" + model_need_copy = false + } + } + sh "mkdir -p ${modelsPath} && \ + docker pull ${params.DOCKER_IMAGE_NAME} && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --enable_prefix_caching true --model_repository_path /models --target_device ${params.DEVICE} --log_level INFO --cache_size 3 && \ + echo wait for model server to be ready && \ + while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"${model_name}\" ] ; do echo waiting for LLM model; sleep 1; done" + } + sh "echo Running agentic latency test && \ + test -d .venv || python3 -m venv .venv && \ + test -d vllm || git clone -b v0.10.2 https://github.com/vllm-project/vllm && \ + sed -i -e 's/if not os.path.exists(args.model)/if 1 == 0/g' vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py && \ + test -f pg1184.txt || curl https://www.gutenberg.org/ebooks/1184.txt.utf-8 -o pg1184.txt" + sh ". .venv/bin/activate && pip install -r vllm/benchmarks/multi_turn/requirements.txt sentencepiece protobuf && \ + python vllm/benchmarks/multi_turn/benchmark_serving_multi_turn.py -m ${modelsPath}/${model_name} --url http://localhost:9000/v3 -i vllm/benchmarks/multi_turn/generate_multi_turn.json --served-model-name ${model_name} --num-clients 1 -n 20 > results_agentic_latency.txt && \ + cat results_agentic_latency.txt" + script { + // Check if requests_per_sec is above threshold + def requests_per_sec = sh(script: '''cat results_agentic_latency.txt | grep requests_per_sec | cut -d= -f2''', returnStdout: true).trim() + def requests_per_sec_reference = { + if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_requests_per_sec.txt")) { + try{ + return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_requests_per_sec.txt", returnStdout: true).trim().toFloat() + } catch (Exception e) { + echo "Error reading requests_per_sec reference: ${e.getMessage()}" + return 0.0 + } + } else { + return 0.0 + } + }() + echo "requests_per_sec: ${requests_per_sec}, requests_per_sec_reference: ${requests_per_sec_reference}" + // Require at least 95% of reference throughput + if (requests_per_sec.toFloat() < requests_per_sec_reference * 0.95) { + echo "WARNING: Requests per second is below threshold" + unstable('Performance threshold not met, requests_per_sec: ' + requests_per_sec) + } + } + sh "echo Stop docker container && \ + docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" + script { + if (params.SAVE_REFERENCE) { + sh "mkdir -p ${env.WORKSPACE}/reference/${model_name} && \ + cat results_agentic_latency.txt | grep requests_per_sec | cut -d= -f2 > ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_requests_per_sec.txt" + } + } + } + post { + always { + archiveArtifacts allowEmptyArchive: true, artifacts: "results_agentic_latency.txt" + } + } + } + stage('Agentic Accuracy') { + when { + expression { params.AGENTIC_ACCURACY == true 
} + } + steps { + sh "echo Start docker container" + script { + modelsPath = params.MODELS_REPOSITORY_PATH?.trim() ? params.MODELS_REPOSITORY_PATH : "${env.WORKSPACE}/models" + model_name = params.MODEL + def gpuFlags = "--device /dev/dri --group-add=\$(stat -c \"%g\" /dev/dri/render* | head -n 1)" + sh "mkdir -p ${modelsPath}" + if (fileExists(params.MODEL) ) { + model_name = sh(script: "basename ${params.MODEL}", returnStdout: true).trim() + if (model_need_copy) { + sh "cp -R ${params.MODEL} ${modelsPath}" + model_need_copy = false + } + } + if (params.CHAT_TEMPLATE_URL?.trim()) { + def chatTemplateFile = "${modelsPath}/${model_name}/chat_template.jinja" + sh "curl -sSL '${params.CHAT_TEMPLATE_URL}' -o '${chatTemplateFile}'" + } + sh "docker pull ${params.DOCKER_IMAGE_NAME} && \ + docker run --rm -d --user \$(id -u):\$(id -g) ${gpuFlags} -e https_proxy=${env.HTTPS_PROXY} --name model_server_${BUILD_NUMBER} -p 9000:9000 -v ${modelsPath}:/models ${params.DOCKER_IMAGE_NAME} --source_model ${model_name} --rest_port 9000 --task text_generation --enable_tool_guided_generation ${params.USE_TOOL_GUIDED_GENERATION} ${params.PARSERS} --model_repository_path /models --model_name ovms-model --target_device ${params.DEVICE} --cache_size 3 --log_level INFO && \ + echo wait for model server to be ready && \ + while [ \"\$(curl -s http://localhost:9000/v3/models | jq -r '.data[0].id')\" != \"ovms-model\" ] ; do echo waiting for LLM model; sleep 1; done" + } + sh "echo Install BFCL && \ + test -d gorilla || git clone https://github.com/ShishirPatil/gorilla && \ + cd gorilla/berkeley-function-call-leaderboard && git checkout cd9429ccf3d4d04156affe883c495b3b047e6b64 -f && curl -s https://raw.githubusercontent.com/openvinotoolkit/model_server/refs/heads/perf-111/demos/continuous_batching/accuracy/gorilla.patch | git apply -v" + sh "test -d .venv || python3 -m venv .venv && \ + . .venv/bin/activate && pip install -e ./gorilla/berkeley-function-call-leaderboard && \ + echo Running agentic accuracy test && \ + export OPENAI_BASE_URL=http://localhost:9000/v3 && \ + ${params.USE_THINKING ? 'export ENABLE_THINKING=true && \\' : ''} \ + bfcl generate --model ovms-model --test-category simple --temperature 0.0 --num-threads 100 -o --result-dir bfcl_results && bfcl evaluate --model ovms-model --result-dir bfcl_results --score-dir bfcl_scores && \ + cat gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json | head -1 | jq ." 
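+                // Note: ENABLE_THINKING is exported only when USE_THINKING is true; the patched BFCL
+                // handler forwards it to the server via chat_template_kwargs.enable_thinking.
+                // The accuracy gate below compares against the per-device reference value
+                // (0.0 when no reference has been saved, so a first run always passes).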
+ script { + def accuracy = sh(script: "cat gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json | head -1 | jq -r '.accuracy'", returnStdout: true).trim() + def accuracy_reference = { + if (fileExists("${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_accuracy.txt")) { + try { + return sh(script: "cat ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_accuracy.txt", returnStdout: true).trim().toFloat() + } catch (Exception e) { + echo "Error reading accuracy reference: ${e.getMessage()}" + return 0.0 + } + } else { + return 0.0 + } + }() + echo "accuracy: ${accuracy}, accuracy_reference: ${accuracy_reference}" + // Require at least 98% of reference accuracy + if (accuracy.toFloat() < accuracy_reference * 0.98) { + echo "WARNING: Accuracy ${accuracy} is below threshold" + unstable('Accuracy threshold not met') + } + } + sh "echo Stop docker container && \ + docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" + script { + if (params.SAVE_REFERENCE) { + sh "mkdir -p ${env.WORKSPACE}/reference/${model_name} && \ + cat gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json | head -1 | jq -r '.accuracy' > ${env.WORKSPACE}/reference/${model_name}/${params.DEVICE}_agentic_accuracy.txt" + } + } + } + post { + always { + archiveArtifacts allowEmptyArchive: true, artifacts: "gorilla/berkeley-function-call-leaderboard/bfcl_scores/ovms-model/BFCL_v3_simple_score.json" + } + } + } + } + post { + always { + sh "docker ps -q --filter name=model_server_${BUILD_NUMBER} | xargs -r docker stop" + } + success { + echo 'Pipeline completed successfully!' + } + failure { + echo 'Pipeline failed!' + } + } +} \ No newline at end of file diff --git a/demos/continuous_batching/accuracy/gorilla.patch b/demos/continuous_batching/accuracy/gorilla.patch index c49fab2c1e..3322c9b873 100644 --- a/demos/continuous_batching/accuracy/gorilla.patch +++ b/demos/continuous_batching/accuracy/gorilla.patch @@ -18,9 +18,9 @@ index 73731c0..0d966ed 100644 + is_fc_model=True, + underscore_to_dot=True, + ), -+ "ovms-model-stream": ModelConfig( ++ "ovms_model_stream": ModelConfig( + model_name="ovms-model-stream", -+ display_name="ovms-model-stream", ++ display_name="ovms_model_stream", + url="http://localhost:8000/v3", + org="ovms", + license="apache-2.0", @@ -34,7 +34,7 @@ index 73731c0..0d966ed 100644 diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py -index 8665234..9e85e59 100644 +index 8665234..c224681 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/openai_completion.py @@ -23,7 +23,7 @@ class OpenAICompletionsHandler(BaseHandler): @@ -42,21 +42,22 @@ index 8665234..9e85e59 100644 super().__init__(model_name, temperature) self.model_style = ModelStyle.OpenAI_Completions - self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) -+ self.client = OpenAI(base_url=os.getenv("OPENAI_BASE_URL"), api_key=os.getenv("OPENAI_API_KEY", "not_used"),timeout=os.getenv("OPENAI_TIMEOUT", 1600)) ++ self.client = OpenAI(base_url=os.getenv("OPENAI_BASE_URL","http://localhost:8000/v3"), api_key=os.getenv("OPENAI_API_KEY", "not_used"),timeout=os.getenv("OPENAI_TIMEOUT", 3600)) def decode_ast(self, result, language="Python"): if 
"FC" in self.model_name or self.is_fc_model: -@@ -61,6 +61,8 @@ class OpenAICompletionsHandler(BaseHandler): +@@ -61,6 +61,9 @@ class OpenAICompletionsHandler(BaseHandler): "messages": message, "model": self.model_name.replace("-FC", ""), "temperature": self.temperature, + "tool_choice": os.getenv("TOOL_CHOICE", "auto"), ++ "extra_body": {"chat_template_kwargs": {"enable_thinking": bool(os.getenv("ENABLE_THINKING", ""))}}, + "max_completion_tokens": 2048, "store": False, } diff --git a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py -index 9ce4e7d..076e706 100644 +index 9ce4e7d..06ec74e 100644 --- a/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py +++ b/berkeley-function-call-leaderboard/bfcl_eval/model_handler/api_inference/qwen.py @@ -21,8 +21,8 @@ class QwenAPIHandler(OpenAICompletionsHandler): @@ -70,7 +71,7 @@ index 9ce4e7d..076e706 100644 ) #### FC methods #### -@@ -38,9 +38,10 @@ class QwenAPIHandler(OpenAICompletionsHandler): +@@ -38,9 +38,9 @@ class QwenAPIHandler(OpenAICompletionsHandler): model=self.model_name.replace("-FC", ""), tools=tools, parallel_tool_calls=True, @@ -78,9 +79,15 @@ index 9ce4e7d..076e706 100644 - "enable_thinking": True - }, + extra_body={ "chat_template_kwargs": { -+ "enable_thinking": False ++ "enable_thinking": bool(os.getenv("ENABLE_THINKING", "")) + }}, -+ temperature=self.temperature, stream=True, stream_options={ "include_usage": True +@@ -338,4 +338,4 @@ class QwenAgentNoThinkHandler(QwenAgentThinkHandler): + 'timeout': 1000, + 'max_tokens': 16384 + } +- }) +\ No newline at end of file ++ })