For a discussion of relevant workloads, please consult this [document](https://d
#### Scenario
Pieces of information identifying a particular cluster. This information includes, but is not limited to, the GPU model, the LLM model and llm-d parameters (an environment file and, optionally, a `values.yaml` file for the modelservice helm charts).
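As an illustration, a scenario environment file might look like the sketch below. The variable names here are hypothetical (only the `LLMDBENCH_` prefix appears elsewhere in this README); the actual set of variables is defined by llm-d-benchmark.

```shell
# Hypothetical scenario environment file (a sketch, not the real schema).
# It captures the cluster-identifying pieces named above: GPU model,
# LLM model and llm-d parameters.
export LLMDBENCH_GPU_MODEL="H100"                               # hypothetical variable name
export LLMDBENCH_LLM_MODEL="meta-llama/Llama-3.1-8B-Instruct"   # hypothetical variable name
export LLMDBENCH_VALUES_FILE="values.yaml"                      # hypothetical: optional modelservice helm values
```

Keeping these in a single sourceable file is what lets a scenario be swapped in and out of an experiment wholesale.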
#### Harnesses
Load generator (Python code) which drives the benchmark load. Today, llm-d-benchmark supports [fmperf](https://github.com/fmperf-project/fmperf), [inference-perf](https://github.com/kubernetes-sigs/inference-perf), [guidellm](https://github.com/vllm-project/guidellm.git) and the benchmarks found in the `benchmarks` folder of [vllm](https://github.com/vllm-project/vllm.git). There are ongoing efforts to consolidate these and provide an easier way to support different load generators.
The `nop` harness, combined with environment variables and when used in `standalone` mode, will parse the vLLM log and create reports. The environment variable `LLMDBENCH_VLLM_STANDALONE_VLLM_LOGGING_LEVEL` must be set to `DEBUG` so that the `nop` categories report finds all categories.
The environment variable `LLMDBENCH_VLLM_STANDALONE_PREPROCESS` must be set to the above value for the `nop` harness in order to install load-format dependencies, export additional environment variables and pre-serialize models when using the `tensorizer` load format. The preprocess scripts run in the vLLM standalone pod before the vLLM server starts.
#### Workload
Workload is the actual benchmark load specification, which includes the LLM use case to benchmark, the traffic pattern, the input/output distribution and the dataset. Supported workload profiles can be found under `workload/profiles`.
> [!IMPORTANT]
> The triple `<scenario>`, `<harness>`, `<workload>`, combined with the standup/teardown capabilities provided by [llm-d-infra](https://github.com/llm-d-incubation/llm-d-infra.git) and [llm-d-modelservice](https://github.com/llm-d/llm-d-model-service.git), should provide enough information to allow an experiment to be reproduced.