Skip to content

Commit

Permalink
Kubernetes benchmarks: Refactor kubectl context to allow sets of clus…
Browse files Browse the repository at this point in the history
…ters.

Also add sanity check to make sure each cluster works as part of
initialization, by running a sample pod within.

PiperOrigin-RevId: 706069662
  • Loading branch information
EtiennePerot authored and gvisor-bot committed Dec 14, 2024
1 parent bd0cbf8 commit f876a09
Show file tree
Hide file tree
Showing 47 changed files with 1,302 additions and 664 deletions.
33 changes: 20 additions & 13 deletions images/gpu/ollama/bench/Dockerfile.x86_64
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# https://hub.docker.com/r/ollama/ollama
FROM ollama/ollama:0.1.26
FROM ollama/ollama:0.5.1

ENV PATH=$PATH:/usr/local/nvidia/bin:/bin/nvidia/bin
ENV OLLAMA_ORIGINS=*
Expand All @@ -8,17 +8,24 @@ ENV OLLAMA_HOST=0.0.0.0:11434
COPY pull.sh /tmp

# Pre-install models useful for benchmarking.
# These are huge (total ~120 GiB), but necessary to benchmark
# These are huge (total ~96 GiB), but necessary to benchmark
# models of various sizes. They are in their own image file to
# keep the test-only image lighter by comparison.
RUN /tmp/pull.sh codellama:7b-instruct
RUN /tmp/pull.sh codellama:34b-instruct
RUN /tmp/pull.sh llama2-chinese:7b-chat
RUN /tmp/pull.sh llama2:13b-chat
RUN /tmp/pull.sh llama2:70b-chat
RUN /tmp/pull.sh mistral:7b-instruct
RUN /tmp/pull.sh mixtral:instruct
RUN /tmp/pull.sh gemma:2b-instruct
RUN /tmp/pull.sh gemma:7b-instruct
RUN /tmp/pull.sh llava:7b-v1.6
RUN /tmp/pull.sh llava:34b-v1.6

# Useful as embedding model.
RUN /tmp/pull.sh snowflake-arctic-embed2:568m-l-fp16

# Useful as small model.
RUN /tmp/pull.sh gemma2:2b-instruct-fp16

# Useful as mid-size model.
RUN /tmp/pull.sh sailor2:8b-chat-fp16

# Useful as coding-specific model.
RUN /tmp/pull.sh qwen2.5-coder:7b-instruct-q8_0

# Useful as large model.
RUN /tmp/pull.sh llama2:70b-chat-q4_K_S

# Useful as vision model.
RUN /tmp/pull.sh llama3.2-vision:11b-instruct-fp16
94 changes: 72 additions & 22 deletions images/gpu/pytorch/Dockerfile.x86_64
Original file line number Diff line number Diff line change
@@ -1,29 +1,42 @@
FROM nvidia/cuda:12.2.0-devel-ubuntu22.04

RUN apt-get update && apt-get install --yes \
python3 \
python3-distutils \
python3-pip \
clang \
wget \
vim \
git

RUN python3 -m pip install --ignore-installed \
"clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
torch \
torchvision \
lightning \
numpy \
memory_profiler
FROM nvidia/cuda:12.4.0-devel-ubuntu22.04

# Used for determining the correct pip index URL below.
ENV CUDA_VERSION=12.4

ENV PYTORCH_DATASETS_DIR=/pytorch-data
ENV TORCH_HOME=/pytorch-home
RUN mkdir -p "$TORCH_HOME" && \
mkdir -p "$PYTORCH_DATASETS_DIR"

RUN apt-get update && \
apt-get install --yes \
libgl1-mesa-glx libglib2.0-0 \
pkg-config \
python3 \
python3-distutils \
python3-pip \
clang \
wget \
vim \
git

RUN PIP_INDEX_URL="https://download.pytorch.org/whl/cu$(echo "$CUDA_VERSION" | sed 's~\.~~g')" && \
python3 -m pip install --ignore-installed \
boto3 \
"clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
lightning \
matplotlib \
memory_profiler \
numba && \
python3 -m pip install --ignore-installed \
torch \
torchvision \
torchaudio \
numpy \
--index-url "$PIP_INDEX_URL"

COPY download_pytorch_datasets.py /tmp/
# Some PyTorch examples hardcode the data directory to "data", so
# make a symlink for that too.
RUN mkdir "$PYTORCH_DATASETS_DIR" && \
python3 /tmp/download_pytorch_datasets.py && \
RUN python3 /tmp/download_pytorch_datasets.py && \
rm /tmp/download_pytorch_datasets.py

RUN PYTORCH_EXAMPLES_COMMIT=30b310a977a82dbfc3d8e4a820f3b14d876d3bd2 && \
Expand All @@ -38,3 +51,40 @@ RUN PYTORCH_EXAMPLES_COMMIT=30b310a977a82dbfc3d8e4a820f3b14d876d3bd2 && \

COPY *.py /
RUN rm /download_pytorch_datasets.py

RUN PYTORCH_BENCHMARKS_COMMIT=675fb8f537d302a4fef3ed2a67349209e65046ac && \
mkdir /pytorch-benchmark && \
cd /pytorch-benchmark && \
git init && \
git remote add origin https://github.com/pytorch/benchmark.git && \
git fetch --depth 1 origin "$PYTORCH_BENCHMARKS_COMMIT" && \
git checkout FETCH_HEAD

# Note that mobilenet_v2 does not have a requirements.txt file.
RUN cd /pytorch-benchmark && \
python3 -m pip install --ignore-installed \
-r requirements.txt \
-r torchbenchmark/models/LearningToPaint/requirements.txt \
-r torchbenchmark/models/fastNLP_Bert/requirements.txt \
-r torchbenchmark/models/hf_BigBird/requirements.txt \
-r torchbenchmark/models/speech_transformer/requirements.txt

# These benchmarks are chosen based on diversity of the type of model and their
# profile with respect to using the GPU and moving data. For more context, see
# this paper: https://arxiv.org/pdf/2304.14226.pdf
RUN cd /pytorch-benchmark && \
python3 install.py \
LearningToPaint \
fastNLP_Bert \
hf_BigBird \
speech_transformer \
mobilenet_v2

# Some of these benchmarks download a dataset at runtime.
# Run them once on CPU just to get this predownloaded into the image.
RUN cd /pytorch-benchmark && \
python3 run.py LearningToPaint --device cpu && \
python3 run.py fastNLP_Bert --device cpu && \
python3 run.py hf_BigBird --device cpu && \
python3 run.py speech_transformer --device cpu && \
python3 run.py mobilenet_v2 --device cpu
129 changes: 113 additions & 16 deletions test/benchmarks/tools/parser_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,26 +48,80 @@ func ParametersToName(params ...Parameter) (string, error) {
}

// NameToParameters parses the string created by ParametersToName and returns
// it as a set of Parameters.
// Example: BenchmarkRuby/server_threads.1/doc_size.16KB-6
// The parameter part of this benchmark is:
// "server_threads.1/doc_size.16KB" (BenchmarkRuby is the name, and 6 is GOMAXPROCS)
// This function will return a slice with two parameters ->
// {Name: server_threads, Value: 1}, {Name: doc_size, Value: 16KB}
func NameToParameters(name string) ([]*Parameter, error) {
// the name components and parameters contained within.
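// The separator between a parameter's name and its value may be either '.' or
// '=', but a single benchmark name must not mix both: if both characters
// appear anywhere in the name, an error is returned.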
//
// Example: "BenchmarkRuby/SubTest/LevelTwo/server_threads.1/doc_size.16KB-6"
// The parameter part of this benchmark is "server_threads.1/doc_size.16KB",
// whereas "BenchmarkRuby/SubTest/LevelTwo" is the name, and the "-6" suffix is
// GOMAXPROCS (optional, may be omitted).
// This function will return a slice of the name components of the benchmark:
//
// [
// "BenchmarkRuby",
// "SubTest",
// "LevelTwo",
// ]
//
// and a slice of the parameters:
//
// [
// {Name: "server_threads", Value: "1"},
// {Name: "doc_size", Value: "16KB"},
// {Name: "GOMAXPROCS", Value: "6"},
// ]
//
// (and a nil error).
func NameToParameters(name string) ([]string, []*Parameter, error) {
var params []*Parameter
for _, cond := range strings.Split(name, "/") {
cs := strings.Split(cond, ".")
var separator string
switch {
case strings.IndexRune(name, '.') != -1 && strings.IndexRune(name, '=') != -1:
return nil, nil, fmt.Errorf("ambiguity while parsing parameters from benchmark name %q: multiple types of parameter separators are present", name)
case strings.IndexRune(name, '.') != -1:
separator = "."
case strings.IndexRune(name, '=') != -1:
separator = "="
default:
// No separator; use '=' which we know is not present in the name,
// but we still need to process the name (even if unparameterized) in
// order to possibly extract GOMAXPROCS.
separator = "="
}
var nameComponents []string
var firstParameterCond string
var goMaxProcs *Parameter
split := strings.Split(name, "/")
for i, cond := range split {
if isLast := i == len(split)-1; isLast {
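// On the last component, whatever follows the final dash is taken as the
// GOMAXPROCS value (it is not checked to be numeric) and is stripped from
// the component before the component is parsed further.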
if dashSplit := strings.Split(cond, "-"); len(dashSplit) >= 2 {
goMaxProcs = &Parameter{Name: "GOMAXPROCS", Value: dashSplit[len(dashSplit)-1]}
cond = strings.Join(dashSplit[:len(dashSplit)-1], "-")
}
}
cs := strings.Split(cond, separator)
switch len(cs) {
case 1:
params = append(params, &Parameter{Name: cond, Value: cond})
if firstParameterCond != "" {
return nil, nil, fmt.Errorf("failed to parse params from %q: a non-parametrized component %q was found after a parametrized one %q", name, cond, firstParameterCond)
}
nameComponents = append(nameComponents, cond)
case 2:
if firstParameterCond == "" {
firstParameterCond = cond
}
params = append(params, &Parameter{Name: cs[0], Value: cs[1]})
default:
return nil, fmt.Errorf("failed to parse param: %s", cond)
return nil, nil, fmt.Errorf("failed to parse params from %q: %s", name, cond)
}
}
return params, nil
if goMaxProcs != nil {
// GOMAXPROCS should always be last in order to match the ordering of the
// benchmark name.
params = append(params, goMaxProcs)
}
return nameComponents, params, nil
}

// ReportCustomMetric reports a metric in a set format for parsing.
Expand All @@ -93,9 +147,52 @@ func ParseCustomMetric(value, metric string) (*Metric, error) {
if err != nil {
return nil, fmt.Errorf("failed to parse value: %v", err)
}
nameUnit := strings.Split(metric, ".")
if len(nameUnit) != 2 {
return nil, fmt.Errorf("failed to parse metric: %s", metric)
separators := []rune{'-', '.'}
var separator string
for _, sep := range separators {
if strings.ContainsRune(metric, sep) {
if separator != "" {
return nil, fmt.Errorf("failed to parse metric: ambiguous unit separator: %q (is the separator %q or %q?)", metric, separator, string(sep))
}
separator = string(sep)
}
}
var name, unit string
switch separator {
case "":
unit = metric
default:
components := strings.Split(metric, separator)
name, unit = strings.Join(components[:len(components)-1], ""), components[len(components)-1]
}
// Normalize some unit names to benchstat defaults.
switch unit {
case "":
return nil, fmt.Errorf("failed to parse metric %q: no unit specified", metric)
case "s":
unit = "sec"
case "nanos":
unit = "ns"
case "byte":
unit = "B"
case "bit":
unit = "b"
default:
// Otherwise, leave unit as-is.
}
// If the metric name is unspecified, it can sometimes be inferred from
// the unit.
if name == "" {
switch unit {
case "sec":
name = "duration"
case "req/sec", "tok/sec":
name = "throughput"
case "B/sec":
name = "bandwidth"
default:
return nil, fmt.Errorf("failed to parse metric %q: ambiguous metric name, please format the unit as 'name.unit' or 'name-unit'", metric)
}
}
return &Metric{Name: nameUnit[0], Unit: nameUnit[1], Sample: sample}, nil
return &Metric{Name: name, Unit: unit, Sample: sample}, nil
}
Loading

0 comments on commit f876a09

Please sign in to comment.