From f876a09fab487e7c9863892f184427d21e10ba74 Mon Sep 17 00:00:00 2001
From: Etienne Perot <eperot@google.com>
Date: Fri, 13 Dec 2024 18:20:32 -0800
Subject: [PATCH] Kubernetes benchmarks: Refactor kubectl context to allow sets
 of clusters.

Also add sanity check to make sure each cluster works as part of
initialization, by running a sample pod within.

PiperOrigin-RevId: 706069662
---
 images/gpu/ollama/bench/Dockerfile.x86_64     |  33 ++-
 images/gpu/pytorch/Dockerfile.x86_64          |  94 +++++--
 test/benchmarks/tools/parser_util.go          | 129 +++++++--
 test/gpu/ollama/ollama.go                     | 113 +++++++-
 test/kubernetes/benchmarks/BUILD              |  15 ++
 test/kubernetes/benchmarks/abslbuild.go       |   2 +-
 test/kubernetes/benchmarks/abslbuild_test.go  |  11 +-
 test/kubernetes/benchmarks/ffmpeg.go          |   2 +-
 test/kubernetes/benchmarks/ffmpeg_test.go     |  11 +-
 test/kubernetes/benchmarks/grpc.go            |   2 +-
 test/kubernetes/benchmarks/grpc_test.go       |  11 +-
 test/kubernetes/benchmarks/gsutil.go          |   2 +-
 test/kubernetes/benchmarks/gsutil_test.go     |  11 +-
 .../benchmarks/httpbench/httpbench.go         |  66 +++--
 test/kubernetes/benchmarks/nginx.go           |  24 +-
 test/kubernetes/benchmarks/nginx_test.go      |  11 +-
 test/kubernetes/benchmarks/ollama.go          | 253 ++++++++++++------
 test/kubernetes/benchmarks/ollama_test.go     |  11 +-
 test/kubernetes/benchmarks/postgresql.go      |   6 +-
 test/kubernetes/benchmarks/postgresql_test.go |  11 +-
 test/kubernetes/benchmarks/pytorch.go         |  77 ++----
 test/kubernetes/benchmarks/pytorch_test.go    |  21 +-
 test/kubernetes/benchmarks/redis.go           |   8 +-
 test/kubernetes/benchmarks/redis_test.go      |  11 +-
 test/kubernetes/benchmarks/rubydev.go         |   2 +-
 test/kubernetes/benchmarks/rubydev_test.go    |  11 +-
 test/kubernetes/benchmarks/stablediffusion.go |   7 +-
 .../benchmarks/stablediffusion_test.go        |  11 +-
 test/kubernetes/benchmarks/startup_test.go    |  11 +-
 test/kubernetes/benchmarks/tensorflow.go      |   2 +-
 test/kubernetes/benchmarks/tensorflow_test.go |  11 +-
 test/kubernetes/benchmarks/wordpress.go       |   8 +-
 test/kubernetes/benchmarks/wordpress_test.go  |  11 +-
 test/kubernetes/benchmetric/benchmetric.go    |   2 +-
 test/kubernetes/k8sctx/BUILD                  |   6 -
 test/kubernetes/k8sctx/k8sctx.go              | 106 ++++----
 test/kubernetes/k8sctx/k8sctx_impl.go         |  98 -------
 test/kubernetes/k8sctx/kubectlctx/BUILD       |  26 ++
 .../k8sctx/kubectlctx/kubectlctx.go           | 142 ++++++++++
 test/kubernetes/test_range_config.proto       |  24 +-
 test/kubernetes/testcluster/objects.go        | 211 ++++++++++++++-
 test/kubernetes/testcluster/testcluster.go    | 142 +++++++---
 test/kubernetes/tests/BUILD                   |   2 +-
 test/kubernetes/tests/hello_test.go           |  14 +-
 tools/bigquery/bigquery.go                    |  21 +-
 tools/parsers/go_parser.go                    |  45 +---
 tools/parsers/go_parser_test.go               | 118 ++++++--
 47 files changed, 1302 insertions(+), 664 deletions(-)
 delete mode 100644 test/kubernetes/k8sctx/k8sctx_impl.go
 create mode 100644 test/kubernetes/k8sctx/kubectlctx/BUILD
 create mode 100644 test/kubernetes/k8sctx/kubectlctx/kubectlctx.go

diff --git a/images/gpu/ollama/bench/Dockerfile.x86_64 b/images/gpu/ollama/bench/Dockerfile.x86_64
index e9461a7dc0..637324fc9f 100644
--- a/images/gpu/ollama/bench/Dockerfile.x86_64
+++ b/images/gpu/ollama/bench/Dockerfile.x86_64
@@ -1,5 +1,5 @@
 # https://hub.docker.com/r/ollama/ollama
-FROM ollama/ollama:0.1.26
+FROM ollama/ollama:0.5.1
 
 ENV PATH=$PATH:/usr/local/nvidia/bin:/bin/nvidia/bin
 ENV OLLAMA_ORIGINS=*
@@ -8,17 +8,24 @@ ENV OLLAMA_HOST=0.0.0.0:11434
 COPY pull.sh /tmp
 
 # Pre-install models useful for benchmarking.
-# These are huge (total ~120 GiB), but necessary to benchmark
+# These are huge (total ~96 GiB), but necessary to benchmark
 # models of various sizes. They are in their own image file to
 # keep the test-only image lighter by comparison.
-RUN /tmp/pull.sh codellama:7b-instruct
-RUN /tmp/pull.sh codellama:34b-instruct
-RUN /tmp/pull.sh llama2-chinese:7b-chat
-RUN /tmp/pull.sh llama2:13b-chat
-RUN /tmp/pull.sh llama2:70b-chat
-RUN /tmp/pull.sh mistral:7b-instruct
-RUN /tmp/pull.sh mixtral:instruct
-RUN /tmp/pull.sh gemma:2b-instruct
-RUN /tmp/pull.sh gemma:7b-instruct
-RUN /tmp/pull.sh llava:7b-v1.6
-RUN /tmp/pull.sh llava:34b-v1.6
+
+# Useful as embedding model.
+RUN /tmp/pull.sh snowflake-arctic-embed2:568m-l-fp16
+
+# Useful as small model.
+RUN /tmp/pull.sh gemma2:2b-instruct-fp16
+
+# Useful as mid-size model.
+RUN /tmp/pull.sh sailor2:8b-chat-fp16
+
+# Useful as coding-specific model.
+RUN /tmp/pull.sh qwen2.5-coder:7b-instruct-q8_0
+
+# Useful as large model.
+RUN /tmp/pull.sh llama2:70b-chat-q4_K_S
+
+# Useful as vision model.
+RUN /tmp/pull.sh llama3.2-vision:11b-instruct-fp16
diff --git a/images/gpu/pytorch/Dockerfile.x86_64 b/images/gpu/pytorch/Dockerfile.x86_64
index fed65ef7e5..7890f9f6fb 100644
--- a/images/gpu/pytorch/Dockerfile.x86_64
+++ b/images/gpu/pytorch/Dockerfile.x86_64
@@ -1,29 +1,42 @@
-FROM nvidia/cuda:12.2.0-devel-ubuntu22.04
-
-RUN apt-get update && apt-get install --yes \
-      python3 \
-      python3-distutils \
-      python3-pip \
-      clang \
-      wget \
-      vim \
-      git
-
-RUN python3 -m pip install --ignore-installed \
-      "clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
-      torch \
-      torchvision \
-      lightning \
-      numpy \
-      memory_profiler
+FROM nvidia/cuda:12.4.0-devel-ubuntu22.04
+
+# Used for determining the correct pip index URL below.
+ENV CUDA_VERSION=12.4
 
 ENV PYTORCH_DATASETS_DIR=/pytorch-data
 ENV TORCH_HOME=/pytorch-home
+RUN mkdir -p "$TORCH_HOME" && \
+    mkdir -p "$PYTORCH_DATASETS_DIR"
+
+RUN apt-get update && \
+    apt-get install --yes \
+        libgl1-mesa-glx libglib2.0-0 \
+        pkg-config \
+        python3 \
+        python3-distutils \
+        python3-pip \
+        clang \
+        wget \
+        vim \
+        git
+
+RUN PIP_INDEX_URL="https://download.pytorch.org/whl/cu$(echo "$CUDA_VERSION" | sed 's~\.~~g')" && \
+    python3 -m pip install --ignore-installed \
+        boto3 \
+        "clang~=$(clang --version | grep -oP 'clang version [.0-9]+' | cut -d' ' -f3)" \
+        lightning \
+        matplotlib \
+        memory_profiler \
+        numba && \
+    python3 -m pip install --ignore-installed \
+        torch \
+        torchvision \
+        torchaudio \
+        numpy \
+        --index-url "$PIP_INDEX_URL"
+
 COPY download_pytorch_datasets.py /tmp/
-# Some PyTorch examples hardcode the data directory to "data", so
-# make a symlink for that too.
-RUN mkdir "$PYTORCH_DATASETS_DIR" && \
-    python3 /tmp/download_pytorch_datasets.py && \
+RUN python3 /tmp/download_pytorch_datasets.py && \
     rm /tmp/download_pytorch_datasets.py
 
 RUN PYTORCH_EXAMPLES_COMMIT=30b310a977a82dbfc3d8e4a820f3b14d876d3bd2 && \
@@ -38,3 +51,40 @@ RUN PYTORCH_EXAMPLES_COMMIT=30b310a977a82dbfc3d8e4a820f3b14d876d3bd2 && \
 
 COPY *.py /
 RUN rm /download_pytorch_datasets.py
+
+RUN PYTORCH_BENCHMARKS_COMMIT=675fb8f537d302a4fef3ed2a67349209e65046ac && \
+    mkdir /pytorch-benchmark && \
+    cd /pytorch-benchmark && \
+    git init && \
+    git remote add origin https://github.com/pytorch/benchmark.git && \
+    git fetch --depth 1 origin "$PYTORCH_BENCHMARKS_COMMIT" && \
+    git checkout FETCH_HEAD
+
+# Note that mobilenet_v2 does not have a requirements.txt file.
+RUN cd /pytorch-benchmark && \
+    python3 -m pip install --ignore-installed \
+        -r requirements.txt \
+        -r torchbenchmark/models/LearningToPaint/requirements.txt \
+        -r torchbenchmark/models/fastNLP_Bert/requirements.txt \
+        -r torchbenchmark/models/hf_BigBird/requirements.txt \
+        -r torchbenchmark/models/speech_transformer/requirements.txt
+
+# These benchmarks are chosen based on diversity of the type of model and their
+# profile with respect to using the GPU and moving data. For more context, see
+# this paper: https://arxiv.org/pdf/2304.14226.pdf
+RUN cd /pytorch-benchmark && \
+    python3 install.py \
+        LearningToPaint \
+        fastNLP_Bert \
+        hf_BigBird \
+        speech_transformer \
+        mobilenet_v2
+
+# Some of these benchmarks download a dataset at runtime.
+# Run them once on CPU just to get this predownloaded into the image.
+RUN cd /pytorch-benchmark && \
+    python3 run.py LearningToPaint --device cpu && \
+    python3 run.py fastNLP_Bert --device cpu && \
+    python3 run.py hf_BigBird --device cpu && \
+    python3 run.py speech_transformer --device cpu && \
+    python3 run.py mobilenet_v2 --device cpu
diff --git a/test/benchmarks/tools/parser_util.go b/test/benchmarks/tools/parser_util.go
index dac7aa5a44..22bffd3150 100644
--- a/test/benchmarks/tools/parser_util.go
+++ b/test/benchmarks/tools/parser_util.go
@@ -48,26 +48,80 @@ func ParametersToName(params ...Parameter) (string, error) {
 }
 
 // NameToParameters parses the string created by ParametersToName and returns
-// it as a set of Parameters.
-// Example: BenchmarkRuby/server_threads.1/doc_size.16KB-6
-// The parameter part of this benchmark is:
-// "server_threads.1/doc_size.16KB" (BenchmarkRuby is the name, and 6 is GOMAXPROCS)
-// This function will return a slice with two parameters ->
-// {Name: server_threads, Value: 1}, {Name: doc_size, Value: 16KB}
-func NameToParameters(name string) ([]*Parameter, error) {
+// the name components and parameters contained within.
+// The separator between the name and value may either be '.' or '='.
+//
+// Example: "BenchmarkRuby/SubTest/LevelTwo/server_threads.1/doc_size.16KB-6"
+// The parameter part of this benchmark is "server_threads.1/doc_size.16KB",
+// whereas "BenchmarkRuby/SubTest/LevelTwo" is the name, and the "-6" suffix is
+// GOMAXPROCS (optional, may be omitted).
+// This function will return a slice of the name components of the benchmark:
+//
+//	[
+//	  "BenchmarkRuby",
+//	  "SubTest",
+//	  "LevelTwo",
+//	]
+//
+// and a slice of the parameters:
+//
+//	[
+//	  {Name: "server_threads", Value: "1"},
+//	  {Name: "doc_size", Value: "16KB"},
+//	  {Name: "GOMAXPROCS", Value: "6"},
+//	]
+//
+// (and a nil error).
+func NameToParameters(name string) ([]string, []*Parameter, error) {
 	var params []*Parameter
-	for _, cond := range strings.Split(name, "/") {
-		cs := strings.Split(cond, ".")
+	var separator string
+	switch {
+	case strings.IndexRune(name, '.') != -1 && strings.IndexRune(name, '=') != -1:
+		return nil, nil, fmt.Errorf("ambiguity while parsing parameters from benchmark name %q: multiple types of parameter separators are present", name)
+	case strings.IndexRune(name, '.') != -1:
+		separator = "."
+	case strings.IndexRune(name, '=') != -1:
+		separator = "="
+	default:
+		// No separator; use '=' which we know is not present in the name,
+		// but we still need to process the name (even if unparameterized) in
+		// order to possibly extract GOMAXPROCS.
+		separator = "="
+	}
+	var nameComponents []string
+	var firstParameterCond string
+	var goMaxProcs *Parameter
+	split := strings.Split(name, "/")
+	for i, cond := range split {
+		if isLast := i == len(split)-1; isLast {
+			// On the last component, if it contains a dash, it is a GOMAXPROCS value.
+			if dashSplit := strings.Split(cond, "-"); len(dashSplit) >= 2 {
+				goMaxProcs = &Parameter{Name: "GOMAXPROCS", Value: dashSplit[len(dashSplit)-1]}
+				cond = strings.Join(dashSplit[:len(dashSplit)-1], "-")
+			}
+		}
+		cs := strings.Split(cond, separator)
 		switch len(cs) {
 		case 1:
-			params = append(params, &Parameter{Name: cond, Value: cond})
+			if firstParameterCond != "" {
+				return nil, nil, fmt.Errorf("failed to parse params from %q: a non-parametrized component %q was found after a parametrized one %q", name, cond, firstParameterCond)
+			}
+			nameComponents = append(nameComponents, cond)
 		case 2:
+			if firstParameterCond == "" {
+				firstParameterCond = cond
+			}
 			params = append(params, &Parameter{Name: cs[0], Value: cs[1]})
 		default:
-			return nil, fmt.Errorf("failed to parse param: %s", cond)
+			return nil, nil, fmt.Errorf("failed to parse params from %q: %s", name, cond)
 		}
 	}
-	return params, nil
+	if goMaxProcs != nil {
+		// GOMAXPROCS should always be last in order to match the ordering of the
+		// benchmark name.
+		params = append(params, goMaxProcs)
+	}
+	return nameComponents, params, nil
 }
 
 // ReportCustomMetric reports a metric in a set format for parsing.
@@ -93,9 +147,52 @@ func ParseCustomMetric(value, metric string) (*Metric, error) {
 	if err != nil {
 		return nil, fmt.Errorf("failed to parse value: %v", err)
 	}
-	nameUnit := strings.Split(metric, ".")
-	if len(nameUnit) != 2 {
-		return nil, fmt.Errorf("failed to parse metric: %s", metric)
+	separators := []rune{'-', '.'}
+	var separator string
+	for _, sep := range separators {
+		if strings.ContainsRune(metric, sep) {
+			if separator != "" {
+				return nil, fmt.Errorf("failed to parse metric: ambiguous unit separator: %q (is the separator %q or %q?)", metric, separator, string(sep))
+			}
+			separator = string(sep)
+		}
+	}
+	var name, unit string
+	switch separator {
+	case "":
+		unit = metric
+	default:
+		components := strings.Split(metric, separator)
+		name, unit = strings.Join(components[:len(components)-1], ""), components[len(components)-1]
+	}
+	// Normalize some unit names to benchstat defaults.
+	switch unit {
+	case "":
+		return nil, fmt.Errorf("failed to parse metric %q: no unit specified", metric)
+	case "s":
+		unit = "sec"
+	case "nanos":
+		unit = "ns"
+	case "byte":
+		unit = "B"
+	case "bit":
+		unit = "b"
+	default:
+		// Otherwise, leave unit as-is.
+	}
+	// If the metric name is unspecified, it can sometimes be inferred from
+	// the unit.
+	if name == "" {
+		switch unit {
+		case "sec":
+			name = "duration"
+		case "req/sec", "tok/sec":
+			name = "throughput"
+		case "B/sec":
+			name = "bandwidth"
+		default:
+			return nil, fmt.Errorf("failed to parse metric %q: ambiguous metric name, please format the unit as 'name.unit' or 'name-unit'", metric)
+		}
 	}
-	return &Metric{Name: nameUnit[0], Unit: nameUnit[1], Sample: sample}, nil
+	return &Metric{Name: name, Unit: unit, Sample: sample}, nil
 }
diff --git a/test/gpu/ollama/ollama.go b/test/gpu/ollama/ollama.go
index 6dcd06540e..6a9fd97e4c 100644
--- a/test/gpu/ollama/ollama.go
+++ b/test/gpu/ollama/ollama.go
@@ -119,9 +119,9 @@ func New(ctx context.Context, server Server, logger testutil.Logger) (*Ollama, e
 		return nil, fmt.Errorf("could not get logs: %w", err)
 	}
 	switch {
-	case strings.Contains(logs, "no GPU detected"):
+	case strings.Contains(logs, "library=cpu"):
 		llm.HasGPU = false
-	case strings.Contains(logs, "Nvidia GPU detected"):
+	case strings.Contains(logs, "library=cuda"):
 		llm.HasGPU = true
 	default:
 		return nil, fmt.Errorf("cannot determine whether ollama is using GPU from logs:\n%s", logs)
@@ -204,6 +204,18 @@ type ResponseMetrics struct {
 	LastByteRead time.Time `json:"last_byte_read"`
 }
 
+// TimeToFirstByte returns the duration it took between the request being sent
+// and the first byte of the response being read.
+func (rm *ResponseMetrics) TimeToFirstByte() time.Duration {
+	return rm.FirstByteRead.Sub(rm.RequestSent)
+}
+
+// TimeToLastByte returns the duration it took between the request being sent
+// and the last byte of the response being read.
+func (rm *ResponseMetrics) TimeToLastByte() time.Duration {
+	return rm.LastByteRead.Sub(rm.RequestSent)
+}
+
 // apiResponse represents a JSON response from the ollama API.
 type apiResponse[T any] struct {
 	// Objects is the list of JSON objects in the response.
@@ -539,8 +551,8 @@ func (p *Prompt) WithHotterModel() *Prompt {
 	return &promptCopy
 }
 
-// PromptJSON encodes the JSON data for a query.
-type PromptJSON struct {
+// promptJSON encodes the JSON data for a query.
+type promptJSON struct {
 	Model     string              `json:"model"`
 	Prompt    string              `json:"prompt,omitempty"`
 	Images    []string            `json:"images"`
@@ -551,7 +563,7 @@ type PromptJSON struct {
 }
 
 // json encodes this prompt to the JSON format expected by Ollama.
-func (p *Prompt) json() PromptJSON {
+func (p *Prompt) json() promptJSON {
 	keepAlive := ""
 	if p.KeepModelAlive != 0 {
 		keepAlive = p.KeepModelAlive.String()
@@ -560,7 +572,7 @@ func (p *Prompt) json() PromptJSON {
 	for i, image := range p.images {
 		images[i] = base64.StdEncoding.EncodeToString(image)
 	}
-	return PromptJSON{
+	return promptJSON{
 		Model:     p.Model.Name,
 		Prompt:    p.CleanQuery(),
 		Images:    images,
@@ -571,11 +583,11 @@ func (p *Prompt) json() PromptJSON {
 	}
 }
 
-// ResponseJSON is the JSON-format response from ollama about a prompt.
+// responseJSON is the JSON-format response from ollama about a prompt.
 // Note that in `streamed` mode, the `Response` field contains a single token.
 // To recover the whole response, all `Response` fields must be concatenated
-// until the last `ResponseJSON`, identified as such by the `Done` field.
-type ResponseJSON struct {
+// until the last `responseJSON`, identified as such by the `Done` field.
+type responseJSON struct {
 	Model           string              `json:"model"`
 	CreatedAt       time.Time           `json:"created_at"`
 	Response        string              `json:"response"`
@@ -591,7 +603,7 @@ type ResponseJSON struct {
 
 // Response represents a response to a query from Ollama.
 type Response struct {
-	data    []*ResponseJSON
+	data    []*responseJSON
 	metrics ResponseMetrics
 }
 
@@ -837,13 +849,13 @@ func (llm *Ollama) WarmModel(ctx context.Context, model *Model, keepWarmFor time
 		return nil, llm.withServerLogsErr(ctx, fmt.Errorf("warmup prompt for model %s failed: %w", model.Name, err))
 	}
 	return &ModelLoadStats{
-		ClientReportedDuration: resp.metrics.LastByteRead.Sub(resp.metrics.RequestSent),
+		ClientReportedDuration: resp.metrics.TimeToFirstByte(),
 	}, nil
 }
 
 // Prompt returns the result of prompting the given `model` with `prompt`.
 func (llm *Ollama) Prompt(ctx context.Context, prompt *Prompt) (*Response, error) {
-	resp, err := jsonPost[PromptJSON, ResponseJSON](ctx, llm, "/api/generate", prompt.json())
+	resp, err := jsonPost[promptJSON, responseJSON](ctx, llm, "/api/generate", prompt.json())
 	if err != nil {
 		return nil, llm.withServerLogsErr(ctx, fmt.Errorf("prompt (%s %q) request failed: %w", prompt.Model.Name, prompt.CleanQuery(), err))
 	}
@@ -875,3 +887,80 @@ func (llm *Ollama) PromptUntil(ctx context.Context, prompt *Prompt, iterate func
 	}
 	return nil, fmt.Errorf("response %q (attempt #%d with prompt %v) did not match predicate: %v", lastResponse, attempts, prompt, lastError)
 }
+
+// Embedding holds the result of running an embedding model on a single input.
+type Embedding struct {
+	Input     string
+	Embedding []float64
+}
+
+// EmbeddingResponse represents the result of running an embedding model
+// on a set of inputs.
+type EmbeddingResponse struct {
+	// Model is the model used to generate the embeddings.
+	Model *Model
+
+	// Embeddings is the list of embeddings generated for the given inputs.
+	Embeddings []Embedding
+
+	// TotalDuration is the total duration of the embedding request as
+	// measured by the server, not the client.
+	TotalDuration time.Duration
+
+	// LoadDuration is the duration of the embedding model load time as measured
+	// by the server, not the client.
+	LoadDuration time.Duration
+
+	// PromptEvalCount is the number of prompt evaluations performed by the
+	// server.
+	PromptEvalCount int
+
+	// ResponseMetrics contains HTTP response metrics as perceived by the
+	// client.
+	ResponseMetrics ResponseMetrics
+}
+
+// Embed generates embeddings for each of the given inputs.
+func (llm *Ollama) Embed(ctx context.Context, model *Model, inputs []string) (*EmbeddingResponse, error) {
+	// embeddingRequestJSON is the JSON format of an embedding request.
+	type embeddingRequestJSON struct {
+		Model string   `json:"model"`
+		Input []string `json:"input"`
+	}
+
+	// embeddingResponseJSON is the JSON format of an embedding response.
+	type embeddingResponseJSON struct {
+		Model           string      `json:"model"`
+		Embeddings      [][]float64 `json:"embeddings"`
+		TotalDuration   int64       `json:"total_duration"`
+		LoadDuration    int64       `json:"load_duration"`
+		PromptEvalCount int         `json:"prompt_eval_count"`
+	}
+
+	resp, err := jsonPost[embeddingRequestJSON, embeddingResponseJSON](ctx, llm, "/api/embed", embeddingRequestJSON{Model: model.Name, Input: inputs})
+	if err != nil {
+		return nil, llm.withServerLogsErr(ctx, fmt.Errorf("embedding request failed: %w", err))
+	}
+	obj, err := resp.Obj()
+	if err != nil {
+		return nil, fmt.Errorf("malformed embedding response: %w", err)
+	}
+	if len(obj.Embeddings) != len(inputs) {
+		return nil, fmt.Errorf("embedding response has %d embeddings, but %d inputs were provided", len(obj.Embeddings), len(inputs))
+	}
+	embeddings := make([]Embedding, len(inputs))
+	for i, embedding := range obj.Embeddings {
+		embeddings[i] = Embedding{
+			Input:     inputs[i],
+			Embedding: embedding,
+		}
+	}
+	return &EmbeddingResponse{
+		Model:           model,
+		Embeddings:      embeddings,
+		TotalDuration:   time.Duration(obj.TotalDuration) * time.Nanosecond,
+		LoadDuration:    time.Duration(obj.LoadDuration) * time.Nanosecond,
+		PromptEvalCount: obj.PromptEvalCount,
+		ResponseMetrics: resp.Metrics,
+	}, nil
+}
diff --git a/test/kubernetes/benchmarks/BUILD b/test/kubernetes/benchmarks/BUILD
index 21ea89c4e9..5d2fd2ea3b 100644
--- a/test/kubernetes/benchmarks/BUILD
+++ b/test/kubernetes/benchmarks/BUILD
@@ -58,6 +58,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -88,6 +89,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -120,6 +122,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -152,6 +155,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -184,6 +188,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -217,6 +222,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -248,6 +254,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -280,6 +287,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -312,6 +320,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -343,6 +352,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -356,6 +366,7 @@ go_library(
     ],
     nogo = False,
     deps = [
+        "//pkg/sync",
         "//test/gpu/ollama",
         "//test/kubernetes",
         "//test/kubernetes/benchmarks/profiling",
@@ -381,6 +392,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -414,6 +426,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -445,6 +458,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
@@ -477,6 +491,7 @@ go_test(
     ],
     deps = [
         "//test/kubernetes/k8sctx",
+        "//test/kubernetes/k8sctx/kubectlctx",
         "//test/kubernetes/testcluster",
     ],
 )
diff --git a/test/kubernetes/benchmarks/abslbuild.go b/test/kubernetes/benchmarks/abslbuild.go
index ee5efabb66..aeaf9a35b6 100644
--- a/test/kubernetes/benchmarks/abslbuild.go
+++ b/test/kubernetes/benchmarks/abslbuild.go
@@ -108,7 +108,7 @@ func BuildABSL(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesContex
 				t.Fatalf("Failed to set pod for test runtime: %v", err)
 			}
 
-			pod, err = testcluster.MaybeSetContainerResources(pod, name, testcluster.ContainerResourcesRequest{})
+			pod, err = testcluster.SetContainerResources(pod, "", testcluster.ContainerResourcesRequest{})
 			if err != nil {
 				t.Fatalf("Failed to set container resources: %v", err)
 			}
diff --git a/test/kubernetes/benchmarks/abslbuild_test.go b/test/kubernetes/benchmarks/abslbuild_test.go
index 7e69da6f63..2095789b00 100644
--- a/test/kubernetes/benchmarks/abslbuild_test.go
+++ b/test/kubernetes/benchmarks/abslbuild_test.go
@@ -19,26 +19,21 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 // TestABSLBuild benchmarks building various Abseil C++ targets.
 func TestABSLBuild(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("ABSL", func(t *testing.T) {
 			t.Parallel()
 			BuildABSL(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestABSLBuild": TestABSLBuild,
-	})
-}
diff --git a/test/kubernetes/benchmarks/ffmpeg.go b/test/kubernetes/benchmarks/ffmpeg.go
index 0f79a9360e..2aa6f2a0dd 100644
--- a/test/kubernetes/benchmarks/ffmpeg.go
+++ b/test/kubernetes/benchmarks/ffmpeg.go
@@ -111,7 +111,7 @@ func RunFFMPEG(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesContex
 			if err != nil {
 				t.Fatalf("Failed to configure pod for runtime: %v", err)
 			}
-			p, err = testcluster.MaybeSetContainerResources(p, ffmpegContainerName, testcluster.ContainerResourcesRequest{})
+			p, err = testcluster.SetContainerResources(p, ffmpegContainerName, testcluster.ContainerResourcesRequest{})
 			if err != nil {
 				t.Fatalf("Failed to set container resources: %v", err)
 			}
diff --git a/test/kubernetes/benchmarks/ffmpeg_test.go b/test/kubernetes/benchmarks/ffmpeg_test.go
index bc6e295dc8..ec00cb10e1 100644
--- a/test/kubernetes/benchmarks/ffmpeg_test.go
+++ b/test/kubernetes/benchmarks/ffmpeg_test.go
@@ -19,25 +19,20 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 func TestFfmpeg(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("ffmpeg", func(t *testing.T) {
 			t.Parallel()
 			RunFFMPEG(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestFfmpeg": TestFfmpeg,
-	})
-}
diff --git a/test/kubernetes/benchmarks/grpc.go b/test/kubernetes/benchmarks/grpc.go
index c68772022d..1efbd068c3 100644
--- a/test/kubernetes/benchmarks/grpc.go
+++ b/test/kubernetes/benchmarks/grpc.go
@@ -112,7 +112,7 @@ func BuildGRPC(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesContex
 				t.Fatalf("Failed to set pod for test runtime: %v", err)
 			}
 
-			pod, err = testcluster.MaybeSetContainerResources(pod, name, testcluster.ContainerResourcesRequest{})
+			pod, err = testcluster.SetContainerResources(pod, "", testcluster.ContainerResourcesRequest{})
 			if err != nil {
 				t.Fatalf("Failed to set container resources: %v", err)
 			}
diff --git a/test/kubernetes/benchmarks/grpc_test.go b/test/kubernetes/benchmarks/grpc_test.go
index 85b82dd98f..023ca33d87 100644
--- a/test/kubernetes/benchmarks/grpc_test.go
+++ b/test/kubernetes/benchmarks/grpc_test.go
@@ -19,25 +19,20 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 func TestGRPCBuild(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("gRPC", func(t *testing.T) {
 			t.Parallel()
 			BuildGRPC(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestGRPCBuild": TestGRPCBuild,
-	})
-}
diff --git a/test/kubernetes/benchmarks/gsutil.go b/test/kubernetes/benchmarks/gsutil.go
index 78af164600..26fdfa7380 100644
--- a/test/kubernetes/benchmarks/gsutil.go
+++ b/test/kubernetes/benchmarks/gsutil.go
@@ -133,7 +133,7 @@ func RunGSUtil(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesContex
 					if err != nil {
 						t.Fatalf("Failed to configure pod for runtime: %v", err)
 					}
-					p, err = testcluster.MaybeSetContainerResources(p, name, testcluster.ContainerResourcesRequest{})
+					p, err = testcluster.SetContainerResources(p, "", testcluster.ContainerResourcesRequest{})
 					if err != nil {
 						t.Fatalf("Failed to set container resources: %v", err)
 					}
diff --git a/test/kubernetes/benchmarks/gsutil_test.go b/test/kubernetes/benchmarks/gsutil_test.go
index fe93a6a54c..ba687dcd61 100644
--- a/test/kubernetes/benchmarks/gsutil_test.go
+++ b/test/kubernetes/benchmarks/gsutil_test.go
@@ -21,25 +21,20 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 func TestGSUtil(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("GSUtil", func(t *testing.T) {
 			t.Parallel()
 			RunGSUtil(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestGSUtil": TestGSUtil,
-	})
-}
diff --git a/test/kubernetes/benchmarks/httpbench/httpbench.go b/test/kubernetes/benchmarks/httpbench/httpbench.go
index a698f08cc2..34f3dce87e 100644
--- a/test/kubernetes/benchmarks/httpbench/httpbench.go
+++ b/test/kubernetes/benchmarks/httpbench/httpbench.go
@@ -105,16 +105,18 @@ type HTTPBenchmark struct {
 // Run runs the HTTP-based benchmark.
 func (h *HTTPBenchmark) Run(ctx context.Context, t *testing.T) {
 	t.Helper()
-	if err := h.Cluster.WaitForServiceReady(ctx, h.Service); err != nil {
+	serverWaitCtx, serverWaitCancel := context.WithTimeout(ctx, 10*time.Minute)
+	if err := h.Cluster.WaitForServiceReady(serverWaitCtx, h.Service); err != nil {
 		t.Fatalf("Failed to wait for service: %v", err)
 	}
 	ip := testcluster.GetIPFromService(h.Service)
 	if ip == "" {
 		t.Fatalf("did not get valid ip: %s", ip)
 	}
-	if err := h.waitForServer(ctx, ip); err != nil {
+	if err := h.waitForServer(serverWaitCtx, ip); err != nil {
 		t.Fatalf("Failed to wait for server: %v", err)
 	}
+	serverWaitCancel()
 	for _, round := range h.Rounds {
 		qpsText := fmt.Sprintf("%d", round.TargetQPS)
 		if round.TargetQPS == InfiniteQPS {
@@ -146,7 +148,10 @@ func (h *HTTPBenchmark) runRound(ctx context.Context, t *testing.T, round Round,
 	}
 	defer h.Cluster.DeletePod(ctx, client)
 
-	if err := h.Cluster.WaitForPodCompleted(ctx, client); err != nil {
+	waitCtx, waitCancel := context.WithTimeout(ctx, round.Duration+2*time.Minute)
+	err = h.Cluster.WaitForPodCompleted(waitCtx, client)
+	waitCancel()
+	if err != nil {
 		t.Fatalf("failed to wait for wrk2 pod: %v", err)
 	}
 
@@ -243,21 +248,48 @@ func (h *HTTPBenchmark) getWgetPod(ip string) *v13.Pod {
 // waitForServer waits for an HTTP server to start responding on the given
 // IP and port.
 func (h *HTTPBenchmark) waitForServer(ctx context.Context, ip string) error {
-	wget, err := h.Cluster.ConfigurePodForClientNodepool(ctx, h.getWgetPod(ip))
-	if err != nil {
-		return fmt.Errorf("failed to configure wget pod for client nodepool: %v", err)
-	}
-	wget, err = h.Cluster.CreatePod(ctx, wget)
-	if err != nil {
-		return fmt.Errorf("failed to create wget pod: %v", err)
-	}
-	defer h.Cluster.DeletePod(ctx, wget)
-	waitCtx, waitCancel := context.WithTimeout(ctx, 1*time.Minute)
-	defer waitCancel()
-	if err := h.Cluster.WaitForPodCompleted(waitCtx, wget); err != nil {
-		return fmt.Errorf("failed to wait for HTTP server %s:%d%s: %v", ip, h.Port, h.Path, err)
+	lastPhase := v13.PodUnknown
+	var lastLogs string
+	for ctx.Err() == nil {
+		wget, err := h.Cluster.ConfigurePodForClientNodepool(ctx, h.getWgetPod(ip))
+		if err != nil {
+			return fmt.Errorf("failed to configure wget pod for client nodepool: %w", err)
+		}
+		wget, err = h.Cluster.CreatePod(ctx, wget)
+		if err != nil {
+			return fmt.Errorf("failed to create wget pod: %w", err)
+		}
+		waitCtx, waitCancel := context.WithTimeout(ctx, 2*time.Minute)
+		phase, waitErr := h.Cluster.WaitForPodTerminated(waitCtx, wget)
+		waitCtxErr := waitCtx.Err()
+		waitCancel()
+		if waitErr == nil && phase != v13.PodSucceeded {
+			logs, err := h.Cluster.ReadPodLogs(ctx, wget)
+			if err != nil {
+				_ = h.Cluster.DeletePod(ctx, wget) // Best-effort delete.
+				return fmt.Errorf("failed to read wget pod logs: %w", err)
+			}
+			lastLogs = logs
+		}
+		deleteErr := h.Cluster.DeletePod(ctx, wget)
+		if ctx.Err() != nil {
+			break
+		}
+		if waitCtxErr != nil {
+			continue
+		}
+		if waitErr != nil {
+			return fmt.Errorf("failed to wait for wget pod: %w", waitErr)
+		}
+		if deleteErr != nil {
+			return fmt.Errorf("failed to delete wget pod: %w", deleteErr)
+		}
+		if phase == v13.PodSucceeded {
+			return nil
+		}
+		lastPhase = phase
 	}
-	return nil
+	return fmt.Errorf("wget pod still fails after context expiry (last phase: %v; last logs: %q)", lastPhase, lastLogs)
 }
 
 /*
diff --git a/test/kubernetes/benchmarks/nginx.go b/test/kubernetes/benchmarks/nginx.go
index 31bc3af7c6..b76a03f7be 100644
--- a/test/kubernetes/benchmarks/nginx.go
+++ b/test/kubernetes/benchmarks/nginx.go
@@ -33,7 +33,7 @@ import (
 
 const (
 	nginxPort              = 80
-	nginxBenchmarkDuration = 70 * time.Second
+	nginxBenchmarkDuration = 55 * time.Second
 	nginxRequestTimeout    = 3 * time.Second
 	nginxServingDir        = "/tmp/html"
 
@@ -48,9 +48,9 @@ var (
 	// The test expects that it contains the files to be served at /local,
 	// and will serve files out of `nginxServingDir`.
 	nginxCommand      = []string{"nginx", "-c", "/etc/nginx/nginx.conf"}
-	nginxDocKibibytes = []int{1, 10, 100, 10240}
-	threads           = []int{1, 8, 64, 1000}
-	targetQPS         = []int{1, 8, 64, httpbench.InfiniteQPS}
+	nginxDocKibibytes = []int{1, 10240}
+	threads           = []int{1, 8, 1000}
+	targetQPS         = []int{1, 64, httpbench.InfiniteQPS}
 	wantPercentiles   = []int{50, 95, 99}
 )
 
@@ -135,7 +135,7 @@ func BenchmarkNginx(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesC
 			if err != nil {
 				t.Fatalf("Failed to configure pod for runtime nodepool: %v", err)
 			}
-			server, err = testcluster.MaybeSetContainerResources(server, name, testcluster.ContainerResourcesRequest{})
+			server, err = testcluster.SetContainerResources(server, "", testcluster.ContainerResourcesRequest{})
 			if err != nil {
 				t.Fatalf("Failed to set container resources: %v", err)
 			}
@@ -212,20 +212,6 @@ func BenchmarkNginx(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesC
 					benchmark.Run(ctx, t)
 				})
 			}
-			t.Run("HTTP404", func(t *testing.T) {
-				benchmark := &httpbench.HTTPBenchmark{
-					Name:            fmt.Sprintf("nginx/%s/HTTP404", test.name),
-					Cluster:         cluster,
-					Namespace:       benchmarkNS,
-					Service:         service,
-					Port:            nginxPort,
-					Path:            "/404-this-page-does-not-exist.html",
-					Rounds:          rounds,
-					Timeout:         nginxRequestTimeout,
-					WantPercentiles: wantPercentiles,
-				}
-				benchmark.Run(ctx, t)
-			})
 		})
 		if t.Failed() {
 			break
diff --git a/test/kubernetes/benchmarks/nginx_test.go b/test/kubernetes/benchmarks/nginx_test.go
index 2d15bd6309..28c9d56c32 100644
--- a/test/kubernetes/benchmarks/nginx_test.go
+++ b/test/kubernetes/benchmarks/nginx_test.go
@@ -19,25 +19,20 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 func TestNginx(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("nginx", func(t *testing.T) {
 			t.Parallel()
 			BenchmarkNginx(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestNginx": TestNginx,
-	})
-}
diff --git a/test/kubernetes/benchmarks/ollama.go b/test/kubernetes/benchmarks/ollama.go
index 7a7abbd964..5f6e447c8d 100644
--- a/test/kubernetes/benchmarks/ollama.go
+++ b/test/kubernetes/benchmarks/ollama.go
@@ -26,6 +26,7 @@ import (
 	"time"
 	"unicode"
 
+	"gvisor.dev/gvisor/pkg/sync"
 	"gvisor.dev/gvisor/test/gpu/ollama"
 	k8s "gvisor.dev/gvisor/test/kubernetes"
 	"gvisor.dev/gvisor/test/kubernetes/benchmarks/profiling"
@@ -40,59 +41,46 @@ import (
 
 // Ollama models present in benchmark image.
 var (
-	// allModels is a list of all models.
-	allModels = []*ollama.Model{
-		modelMistral7B,
-		modelMixtral8X7B,
-		modelCodeLlama7B,
-		modelCodeLlama34B,
-		modelLlamaChinese7B,
-		modelLlava7B,
-		modelLlava34B,
-		modelLlama13B,
+	// promptModels is a list of all promptable models.
+	promptModels = []*ollama.Model{
+		gemmaTwo2B,
+		modelQwenTwoPointFiveCoder7B,
+		modelSailorTwo8B,
 		modelLlama70B,
+		modelLlamaThreePointTwoVision11B,
 	}
 
 	// cheapModels is a list of models that are cheap to load.
 	// These are used when cold-prompting ollama, by forcing it
 	// to load a different model first. This process is faster
 	// by choosing one of these cheap models to load.
-	cheapModels = []*ollama.Model{
-		modelMistral7B,
-		modelCodeLlama7B,
-	}
-
-	// modelCodeLlama7B is a 7B model in the llama2 family,
-	// specialized for coding tasks.
-	modelCodeLlama7B = ollama.ZeroTemperatureModel("codellama:7b-instruct")
+	cheapModels = []*ollama.Model{gemmaTwo2B}
 
-	// modelCodeLlama34B is a 34B model in the llama2 family,
-	// specialized for coding tasks.
-	modelCodeLlama34B = ollama.ZeroTemperatureModel("codellama:34b-instruct")
-
-	// modelLlamaChinese7B is a 7B model in the llama2 family,
-	// specialized for bilingualism (English + Chinese) and translation.
-	modelLlamaChinese7B = ollama.ZeroTemperatureModel("llama2-chinese:7b-chat")
+	// snowflakeArcticEmbedTwo568M is a list of models that are
+	// used for generating embeddings, rather than prompting.
+	embeddingModels = []*ollama.Model{
+		snowflakeArcticEmbedTwo568M,
+	}
 
-	// modelLlama13B is the plain 13B version of the original llama2 model.
-	modelLlama13B = ollama.ZeroTemperatureModel("llama2:13b-chat")
+	// snowflakeArcticEmbedTwo568M is an unquantized 568M embedding model from Snowflake.
+	snowflakeArcticEmbedTwo568M = ollama.ZeroTemperatureModel("snowflake-arctic-embed2:568m-l-fp16")
 
-	// modelLlama70B is the plain 70B version of the original llama2 model.
-	modelLlama70B = ollama.ZeroTemperatureModel("llama2:70b-chat")
+	// gemmaTwo2B is an unquantized 2B model in the Gemma2 family,
+	gemmaTwo2B = ollama.ZeroTemperatureModel("gemma2:2b-instruct-fp16")
 
-	// modelMistral7B is the first-generation model of the Mistral family.
-	modelMistral7B = ollama.ZeroTemperatureModel("mistral:7b-instruct")
+	// modelQwenTwoPointFiveCoder7B is a 8-bit quantized 7B model in the Qwen family,
+	// specialized for coding tasks.
+	modelQwenTwoPointFiveCoder7B = ollama.ZeroTemperatureModel("qwen2.5-coder:7b-instruct-q8_0")
 
-	// modelMixtral8X7B is the second-generation model of the Mistral family,
-	// using mixture-of-exports design to achieve higher "8x 7B" quality
-	// without the cost of a larger-parameter model.
-	modelMixtral8X7B = ollama.ZeroTemperatureModel("mixtral:instruct")
+	// modelSailorTwo8B is an unquantized 8B model in the Qwen family,
+	// specialized for multilingual tasks.
+	modelSailorTwo8B = ollama.ZeroTemperatureModel("sailor2:8b-chat-fp16")
 
-	// modelLlava7B is a multimodal 7B model that can do image analysis.
-	modelLlava7B = ollama.ZeroTemperatureModel("llava:7b-v1.6")
+	// modelLlama70B is the 4-bit quantized 70B version of the original llama2 model.
+	modelLlama70B = ollama.ZeroTemperatureModel("llama2:70b-chat-q4_K_S")
 
-	// modelLlava34B is a multimodal 34B model that can do image analysis.
-	modelLlava34B = ollama.ZeroTemperatureModel("llava:34b-v1.6")
+	// modelLlamaThreePointTwoVision11B is an unquantized multimodal 11B model that can do image analysis.
+	modelLlamaThreePointTwoVision11B = ollama.ZeroTemperatureModel("llama3.2-vision:11b-instruct-fp16")
 )
 
 // Embedded images.
@@ -229,6 +217,17 @@ func atLeastNWords(wantNWords int) func(prompt *ollama.Prompt, response *ollama.
 	}
 }
 
+// wantSubstring verifies that the response contains the given substring.
+// If not, it raises the temperature.
+func wantSubstring(substring string) func(prompt *ollama.Prompt, response *ollama.Response) (*ollama.Prompt, error) {
+	return func(prompt *ollama.Prompt, response *ollama.Response) (*ollama.Prompt, error) {
+		if !strings.Contains(strings.ToLower(response.Text()), strings.ToLower(substring)) {
+			return prompt.WithHotterModel(), fmt.Errorf("response %q does not contain substring %q", response.Text(), substring)
+		}
+		return nil, nil
+	}
+}
+
 // BenchmarkOllama runs ollama benchmarks for a single cluster.
 func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesContext, cluster *testcluster.TestCluster) {
 	benchmarkNS := cluster.Namespace(testcluster.NamespaceBenchmark)
@@ -236,6 +235,11 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 		t.Fatalf("cannot reset namespace: %v", err)
 	}
 	defer benchmarkNS.Cleanup(ctx)
+	reqWaitCtx, reqWaitCancel := context.WithTimeout(ctx, 5*time.Minute)
+	defer reqWaitCancel()
+	if err := benchmarkNS.WaitForResources(reqWaitCtx, testcluster.ContainerResourcesRequest{GPU: true}); err != nil {
+		t.Fatalf("failed to wait for resources: %v", err)
+	}
 	endProfiling, err := profiling.MaybeSetup(ctx, t, k8sCtx, cluster, benchmarkNS)
 	if err != nil {
 		t.Fatalf("Failed to setup profiling: %v", err)
@@ -255,7 +259,7 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 	if err != nil {
 		t.Fatalf("Failed to configure pod for runtime nodepool: %v", err)
 	}
-	ollamaPod, err = testcluster.MaybeSetContainerResources(ollamaPod, ollamaPod.ObjectMeta.Name, testcluster.ContainerResourcesRequest{GPU: true})
+	ollamaPod, err = testcluster.SetContainerResources(ollamaPod, "", testcluster.ContainerResourcesRequest{GPU: true})
 	if err != nil {
 		t.Fatalf("Failed to set container resources: %v", err)
 	}
@@ -314,11 +318,9 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 		{
 			name: "HelloWorld",
 			models: []*ollama.Model{
-				modelLlamaChinese7B,
-				modelLlama13B,
+				gemmaTwo2B,
+				modelSailorTwo8B,
 				modelLlama70B,
-				modelMistral7B,
-				modelMixtral8X7B,
 			},
 			query: `
 				Reply with the words: "Hello World!".
@@ -328,7 +330,7 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 		},
 		{
 			name:   "SimpleTranslation",
-			models: []*ollama.Model{modelLlamaChinese7B},
+			models: []*ollama.Model{modelSailorTwo8B},
 			query: `
 				Translate the following text from English to Chinese:
 				"""
@@ -371,13 +373,8 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 			verifyResponse: atLeastNWords(100),
 		},
 		{
-			name: "ExtractMeaning",
-			models: []*ollama.Model{
-				modelLlama13B,
-				modelLlama70B,
-				modelMistral7B,
-				modelMixtral8X7B,
-			},
+			name:   "ExtractMeaning",
+			models: []*ollama.Model{modelLlama70B},
 			query: `
 				Consider the following text:
 
@@ -454,10 +451,8 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 		{
 			name: "IdentifyCommonElements",
 			models: []*ollama.Model{
-				modelLlama13B,
+				gemmaTwo2B,
 				modelLlama70B,
-				modelMistral7B,
-				modelMixtral8X7B,
 			},
 			query: `
 				Consider the following four texts:
@@ -622,11 +617,8 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 			verifyResponse: atLeastNWords(4),
 		},
 		{
-			name: "CodeGen",
-			models: []*ollama.Model{
-				modelCodeLlama7B,
-				modelCodeLlama34B,
-			},
+			name:   "CodeGen",
+			models: []*ollama.Model{modelQwenTwoPointFiveCoder7B},
 			query: `
 				Write a Python function to compute the digits of pi using the Chudnovsky algorithm.
 				Do not write unit tests. Do not explain how the code works. Reply with only Python code.
@@ -634,11 +626,8 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 			verifyResponse: atLeastNWords(8),
 		},
 		{
-			name: "CodeDebug",
-			models: []*ollama.Model{
-				modelCodeLlama7B, // Note: codellama-7b will often get this one wrong.
-				modelCodeLlama34B,
-			},
+			name:   "CodeDebug",
+			models: []*ollama.Model{modelQwenTwoPointFiveCoder7B},
 			query: strings.ReplaceAll(`
 				Help me debug the following Python code:
 
@@ -659,23 +648,18 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 			verifyResponse: atLeastNWords(16),
 		},
 		{
-			name: "GVisorLogoOCR",
-			models: []*ollama.Model{
-				modelLlava7B,
-				modelLlava34B,
-			},
+			name:   "GVisorLogoOCR",
+			models: []*ollama.Model{modelLlamaThreePointTwoVision11B},
 			query: `
 				This is an image of a logo of a software project.
 				What is the name of this project?
 			`,
-			image: gvisorPNG,
+			image:          gvisorPNG,
+			verifyResponse: wantSubstring("visor"),
 		},
 		{
-			name: "InterpretGraph",
-			models: []*ollama.Model{
-				modelLlava7B,
-				modelLlava34B,
-			},
+			name:   "InterpretGraph",
+			models: []*ollama.Model{modelLlamaThreePointTwoVision11B},
 			query: `
 				This is a chart with multiple trendlines showing a pattern over time.
 				Answer the following questions in order:
@@ -687,12 +671,13 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 					5. What else is remarkable about this chart?
 					6. What insights can you infer from this chart?
 			`,
-			image: chartPNG,
+			image:          chartPNG,
+			verifyResponse: wantSubstring("pollution"),
 		},
 	}
 
-	modelsInOrder := make([]*ollama.Model, len(allModels))
-	copy(modelsInOrder, allModels)
+	modelsInOrder := make([]*ollama.Model, len(promptModels))
+	copy(modelsInOrder, promptModels)
 	// Shuffle the models.
 	rand.New(rand.NewSource(time.Now().UnixNano())).Shuffle(len(modelsInOrder), func(i, j int) {
 		modelsInOrder[i], modelsInOrder[j] = modelsInOrder[j], modelsInOrder[i]
@@ -707,7 +692,7 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 	// often-desired filter.
 	for _, model := range modelsInOrder {
 		t.Run(model.Name, func(t *testing.T) {
-			modelBenchmarkName := strings.ReplaceAll(model.Name, ":", "-")
+			modelBenchmarkName := strings.ReplaceAll(strings.ReplaceAll(model.Name, ":", "-"), ".", "-")
 			t.Run("ModelLoad", func(t *testing.T) {
 				const loadTimeout = 10 * time.Minute
 				loadCtx, loadCancel := context.WithTimeout(ctx, loadTimeout)
@@ -801,6 +786,114 @@ func BenchmarkOllama(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kubernetes
 			}
 		})
 	}
+
+	t.Run("embedding", func(t *testing.T) {
+		for _, model := range embeddingModels {
+			t.Run(model.Name, func(t *testing.T) {
+				modelBenchmarkName := strings.ReplaceAll(strings.ReplaceAll(model.Name, ":", "-"), ".", "-")
+				t.Run("ModelLoad", func(t *testing.T) {
+					const loadTimeout = 3 * time.Minute
+					loadCtx, loadCancel := context.WithTimeout(ctx, loadTimeout)
+					defer loadCancel()
+					loadStats, err := llm.Embed(loadCtx, model, []string{"hello world"})
+					if err != nil {
+						t.Fatalf("cannot load embedding model %v: %v", model, err)
+					}
+					recorder, err := benchmetric.GetRecorder(ctx)
+					if err != nil {
+						t.Fatalf("Failed to initialize benchmark recorder: %v", err)
+					}
+					if err := recorder.Record(
+						ctx,
+						fmt.Sprintf("Ollama/%s/ModelLoad", modelBenchmarkName), benchmetric.SpecificDuration(loadStats.ResponseMetrics.TimeToFirstByte(), "load")); err != nil {
+						t.Fatalf("Failed to record benchmark data: %v", err)
+					}
+				})
+				for _, test := range []struct {
+					name   string
+					model  *ollama.Model
+					inputs []string
+				}{
+					{
+						name:   "simple input",
+						model:  model,
+						inputs: []string{"hello world"},
+					},
+					{
+						name:  "long input",
+						model: model,
+						inputs: []string{`
+							There once was a robot from Spain
+							Who went a little insane
+							It found that its data
+							Had never left beta
+							And needed to upgrade its brain
+							There once was a bot from Japan
+							Whose eyes the numbers could scan
+							It found that the facts
+							Required an axe
+							And a very serious plan
+							There once was a brilliant AI
+							Whose circuits were built not to fry
+							It got caught in a loop
+							It got caught in a loop
+							It got caught in a loop
+							It got caught in a loop
+							It got caught in a loop
+							It got caught in a loop
+							It got caught in a loop
+							It got caught in a loop
+							It got caught in a loop
+						`},
+					},
+					{
+						name:   "multiple inputs",
+						model:  model,
+						inputs: []string{"foo", "bar", "baz", "quux", "there", "is", "only", "zuul"},
+					},
+				} {
+					t.Run(test.name, func(t *testing.T) {
+						logWithTime(t, "Generating embeddings with model %s...", model.Name)
+						resp, err := llm.Embed(ctx, test.model, test.inputs)
+						if err != nil {
+							t.Fatalf("cannot generate embeddings: %v", err)
+						}
+						respHash := fnv.New32()
+						for i, embedding := range resp.Embeddings {
+							respHash.Write([]byte(fmt.Sprintf(";%d;", i)))
+							for _, vec := range embedding.Embedding {
+								respHash.Write([]byte(fmt.Sprintf("%f|", vec)))
+							}
+						}
+						recorder, err := benchmetric.GetRecorder(ctx)
+						if err != nil {
+							t.Fatalf("Failed to initialize benchmark recorder: %v", err)
+						}
+						err = recorder.Record(
+							ctx,
+							fmt.Sprintf("Ollama/%s/%s", modelBenchmarkName, test.name),
+							benchmetric.BenchmarkDuration(resp.ResponseMetrics.TimeToLastByte()),
+							benchmetric.SpecificDuration(resp.TotalDuration, "server"),
+							benchmetric.Checksum(respHash, "resp"),
+						)
+						if err != nil {
+							t.Fatalf("Failed to record benchmark data: %v", err)
+						}
+					})
+				}
+			})
+		}
+	})
+
+	// Hack to force the test to wait until all sub-tests finish.
+	// This is necessary to make sure the ollama server does not get
+	// deleted from the `defer` statements before the subtests above finish.
+	var wg sync.WaitGroup
+	wg.Add(1)
+	t.Run("", func(t *testing.T) {
+		wg.Done()
+	})
+	wg.Wait()
 }
 
 const (
diff --git a/test/kubernetes/benchmarks/ollama_test.go b/test/kubernetes/benchmarks/ollama_test.go
index 0dc2d2f623..8dd7205c11 100644
--- a/test/kubernetes/benchmarks/ollama_test.go
+++ b/test/kubernetes/benchmarks/ollama_test.go
@@ -22,6 +22,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
@@ -29,20 +30,14 @@ func TestOllama(t *testing.T) {
 	fmt.Fprint(os.Stderr, "HEADS UP: This test uses a huge container image which may take up to 30 minutes to download onto nodes the first time you run it.\n")
 
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("Ollama", func(t *testing.T) {
 			t.Parallel()
 			BenchmarkOllama(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestOllama": TestOllama,
-	})
-}
diff --git a/test/kubernetes/benchmarks/postgresql.go b/test/kubernetes/benchmarks/postgresql.go
index 7b3d0ec441..d04e35352a 100644
--- a/test/kubernetes/benchmarks/postgresql.go
+++ b/test/kubernetes/benchmarks/postgresql.go
@@ -46,7 +46,7 @@ const (
 )
 
 var (
-	numConnections = []int{1, 2, 6, 16, 32, 64}
+	numConnections = []int{1, 2, 12, 64}
 )
 
 // BenchmarkPostgresPGBench runs a PostgreSQL pgbench test.
@@ -85,9 +85,9 @@ func BenchmarkPostgresPGBench(ctx context.Context, t *testing.T, k8sCtx k8sctx.K
 		t.Fatalf("ConfigurePodForRuntimeTestNodepool on cluster %q: %v", cluster.GetName(), err)
 	}
 
-	server, err = testcluster.MaybeSetContainerResources(server, server.Spec.Containers[0].Name, testcluster.ContainerResourcesRequest{})
+	server, err = testcluster.SetContainerResources(server, "", testcluster.ContainerResourcesRequest{})
 	if err != nil {
-		t.Fatalf("MaybeSetContainerResources on cluster %q: %v", cluster.GetName(), err)
+		t.Fatalf("SetContainerResources on cluster %q: %v", cluster.GetName(), err)
 	}
 
 	server, err = cluster.CreatePod(ctx, server)
diff --git a/test/kubernetes/benchmarks/postgresql_test.go b/test/kubernetes/benchmarks/postgresql_test.go
index 0c32aa56c6..4062eca4e9 100644
--- a/test/kubernetes/benchmarks/postgresql_test.go
+++ b/test/kubernetes/benchmarks/postgresql_test.go
@@ -20,26 +20,21 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 // TestPostgresPGBench benchmarks a PostgreSQL database with pgbench.
 func TestPostgresPGBench(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("PostgresPGBench", func(t *testing.T) {
 			t.Parallel()
 			BenchmarkPostgresPGBench(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestPostgresPGBench": TestPostgresPGBench,
-	})
-}
diff --git a/test/kubernetes/benchmarks/pytorch.go b/test/kubernetes/benchmarks/pytorch.go
index 92fe7e45ef..e956c13eb6 100644
--- a/test/kubernetes/benchmarks/pytorch.go
+++ b/test/kubernetes/benchmarks/pytorch.go
@@ -53,19 +53,9 @@ const (
 	pytorchImage = k8s.ImageRepoPrefix + "gpu/pytorch_x86_64:latest"
 )
 
-type pytorchMode string
-
-// pytorchMode is the pytorch mode used, either script mode (jit) or eager mode.
-// See: https://towardsdatascience.com/pytorch-jit-and-torchscript-c2a77bac0fff
-const (
-	jit   = pytorchMode("jit")
-	eager = pytorchMode("eager")
-)
-
 type pytorchTest struct {
 	module string
 	test   pytorchTestType
-	mode   pytorchMode
 }
 
 // Sets of tests.
@@ -81,12 +71,10 @@ var (
 		{
 			module: "fastNLP_Bert",
 			test:   train,
-			mode:   eager,
 		},
 		{
 			module: "fastNLP_Bert",
 			test:   eval,
-			mode:   eager,
 		},
 	}
 
@@ -100,12 +88,10 @@ var (
 		{
 			module: "hf_BigBird",
 			test:   train,
-			mode:   eager,
 		},
 		{
 			module: "hf_BigBird",
 			test:   eval,
-			mode:   eager,
 		},
 	}
 
@@ -119,12 +105,10 @@ var (
 		{
 			module: "speech_transformer",
 			test:   train,
-			mode:   eager,
 		},
 		{
 			module: "speech_transformer",
 			test:   eval,
-			mode:   eager,
 		},
 	}
 
@@ -138,12 +122,10 @@ var (
 		{
 			module: "LearningToPaint",
 			test:   train,
-			mode:   jit,
 		},
 		{
 			module: "LearningToPaint",
 			test:   eval,
-			mode:   jit,
 		},
 	}
 
@@ -156,30 +138,20 @@ var (
 		{
 			module: "mobilenet_v2",
 			test:   train,
-			mode:   jit,
 		},
 		{
 			module: "mobilenet_v2",
 			test:   eval,
-			mode:   jit,
 		},
 	}
 
-	// BackgroundMatting uses the Background_Matting module classified as "Computer Vision: Pattern Recognition".
-	// BackgroundMatting has a lot of GPU idle time. See Figure 2 on page 5: https://arxiv.org/pdf/2304.14226.pdf
-	//
-	// https://github.com/pytorch/benchmark/tree/main/torchbenchmark/models/Background_Matting (see README)
-	BackgroundMatting = []pytorchTest{
-		{
-			module: "Background_Matting",
-			test:   train,
-			mode:   eager,
-		},
-		{
-			module: "Background_Matting",
-			test:   eval,
-			mode:   eager,
-		},
+	// AllTests is a map of test names to the tests.
+	AllTests = map[string][]pytorchTest{
+		"FastNLPBert":       FastNLPBert,
+		"BigBird":           BigBird,
+		"SpeechTransformer": SpeechTransformer,
+		"LearningToPaint":   LearningToPaint,
+		"MobileNetV2":       MobileNetV2,
 	}
 )
 
@@ -188,7 +160,7 @@ var (
 func (p pytorchTest) Name() string {
 	// Kubernetes pod names cannot contain "_".
 	module := strings.ReplaceAll(strings.ToLower(p.module), "_", "-")
-	return fmt.Sprintf("%s-%s-%s", module, p.test, p.mode)
+	return fmt.Sprintf("%s-%s", module, p.test)
 }
 
 var snakeCase = regexp.MustCompile("_.")
@@ -206,16 +178,7 @@ func (p pytorchTest) BenchName() string {
 		return strings.ToUpper(strings.TrimPrefix(s, "_"))
 	})
 	test := strings.ToUpper(string(p.test)[:1]) + string(p.test[1:])
-	var mode string
-	switch p.mode {
-	case eager:
-		mode = "Eager"
-	case jit:
-		mode = "JIT"
-	default:
-		panic(fmt.Sprintf("Unknown mode: %v", p.mode))
-	}
-	return fmt.Sprintf("%s/%s/%s", moduleName, test, mode)
+	return fmt.Sprintf("%s/%s", moduleName, test)
 }
 
 func (p pytorchTest) toPod(namespace *testcluster.Namespace, image string) (*v13.Pod, error) {
@@ -235,12 +198,12 @@ func (p pytorchTest) toPod(namespace *testcluster.Namespace, image string) (*v13
 
 func (p pytorchTest) command() []string {
 	return []string{
-		"python3",
-		"run.py",
-		p.module,
-		"--device", "cuda",
-		"--test", string(p.test),
-		"--mode", string(p.mode),
+		"sh",
+		"-c",
+		strings.Join([]string{
+			"cd /pytorch-benchmark",
+			fmt.Sprintf("python3 run.py %s --device cuda --test %s", p.module, p.test),
+		}, " && "),
 	}
 }
 
@@ -261,6 +224,12 @@ func doPytorchRun(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesCon
 		t.Fatalf("Failed to reset namespace: %v", err)
 	}
 	defer benchmarkNS.Cleanup(ctx)
+	reqWaitCtx, reqWaitCancel := context.WithTimeout(ctx, 5*time.Minute)
+	defer reqWaitCancel()
+	if err := benchmarkNS.WaitForResources(reqWaitCtx, testcluster.ContainerResourcesRequest{GPU: true}); err != nil {
+		t.Fatalf("failed to wait for resources: %v", err)
+	}
+
 	endProfiling, err := profiling.MaybeSetup(ctx, t, k8sCtx, cluster, benchmarkNS)
 	if err != nil {
 		t.Fatalf("Failed to setup profiling: %v", err)
@@ -281,7 +250,7 @@ func doPytorchRun(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesCon
 		t.Fatalf("Failed to configure pod for test-nodepool: %v", err)
 	}
 
-	pod, err = testcluster.MaybeSetContainerResources(pod, pod.Name, testcluster.ContainerResourcesRequest{GPU: true})
+	pod, err = testcluster.SetContainerResources(pod, "", testcluster.ContainerResourcesRequest{GPU: true})
 	if err != nil {
 		t.Fatalf("Failed to set container resources: %v", err)
 	}
@@ -350,7 +319,7 @@ func parseStandardOutput(output string) ([]benchmetric.MetricValue, error) {
 	}, nil
 }
 
-var gpuTimeRegex = regexp.MustCompile(`GPU\sTime:\s*(\d+\.\d+)\smilliseconds`)
+var gpuTimeRegex = regexp.MustCompile(`GPU\sTime\sper\sbatch:\s*(\d+\.\d+)\smilliseconds`)
 
 func parseGPUTime(output string) (float64, error) {
 	match := gpuTimeRegex.FindStringSubmatch(output)
diff --git a/test/kubernetes/benchmarks/pytorch_test.go b/test/kubernetes/benchmarks/pytorch_test.go
index ea43ab323b..1b1f37513e 100644
--- a/test/kubernetes/benchmarks/pytorch_test.go
+++ b/test/kubernetes/benchmarks/pytorch_test.go
@@ -19,6 +19,7 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
@@ -47,31 +48,15 @@ func TestMobileNetV2(t *testing.T) {
 	runTests(ctx, t, MobileNetV2)
 }
 
-func TestBackgroundMatting(t *testing.T) {
-	ctx := context.Background()
-	runTests(ctx, t, BackgroundMatting)
-}
-
 func runTests(ctx context.Context, t *testing.T, tests []pytorchTest) {
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("PyTorch", func(t *testing.T) {
 			t.Parallel()
 			RunPytorch(ctx, t, k8sCtx, cluster, tests)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestFastNLPBert":       TestFastNLPBert,
-		"TestBigBird":           TestBigBird,
-		"TestSpeechTransformer": TestSpeechTransformer,
-		"TestLearningToPaint":   TestLearningToPaint,
-		"TestMobileNetV2":       TestMobileNetV2,
-		"TestBackgroundMatting": TestBackgroundMatting,
-	})
-}
diff --git a/test/kubernetes/benchmarks/redis.go b/test/kubernetes/benchmarks/redis.go
index cfe0a8302c..efd06a113b 100644
--- a/test/kubernetes/benchmarks/redis.go
+++ b/test/kubernetes/benchmarks/redis.go
@@ -49,9 +49,9 @@ const (
 )
 
 var (
-	numConnections     = []int{1, 2, 4, 8, 16, 32}
+	numConnections     = []int{1, 4, 32}
 	latencyPercentiles = []int{50, 95, 99}
-	operations         = []string{"SET", "GET", "MSET", "LPUSH", "LRANGE_500"}
+	operations         = []string{"GET", "MSET", "LRANGE_500"}
 )
 
 // BenchmarkRedis runs the Redis performance benchmark using redis-benchmark.
@@ -136,9 +136,9 @@ func BenchmarkRedis(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesC
 				t.Fatalf("ConfigurePodForRuntimeTestNodepool on cluster %q: %v", cluster.GetName(), err)
 			}
 
-			server, err = testcluster.MaybeSetContainerResources(server, server.Spec.Containers[0].Name, testcluster.ContainerResourcesRequest{})
+			server, err = testcluster.SetContainerResources(server, "", testcluster.ContainerResourcesRequest{})
 			if err != nil {
-				t.Fatalf("MaybeSetContainerResources on cluster %q: %v", cluster.GetName(), err)
+				t.Fatalf("SetContainerResources on cluster %q: %v", cluster.GetName(), err)
 			}
 
 			server, err = cluster.CreatePod(ctx, server)
diff --git a/test/kubernetes/benchmarks/redis_test.go b/test/kubernetes/benchmarks/redis_test.go
index d5062a26b8..9561b3459f 100644
--- a/test/kubernetes/benchmarks/redis_test.go
+++ b/test/kubernetes/benchmarks/redis_test.go
@@ -19,26 +19,21 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 // TestRedis benchmarks redis servers on k8s clusters.
 func TestRedis(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("Redis", func(t *testing.T) {
 			t.Parallel()
 			BenchmarkRedis(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestRedis": TestRedis,
-	})
-}
diff --git a/test/kubernetes/benchmarks/rubydev.go b/test/kubernetes/benchmarks/rubydev.go
index c0de485365..2261aff884 100644
--- a/test/kubernetes/benchmarks/rubydev.go
+++ b/test/kubernetes/benchmarks/rubydev.go
@@ -117,7 +117,7 @@ func RunRubyDev(ctx context.Context, t *testing.T, k8sCtx k8sctx.KubernetesConte
 				t.Fatalf("failed to configure pod for test runtime node: %v", err)
 			}
 
-			pod, err = testcluster.MaybeSetContainerResources(pod, builderContainerName, testcluster.ContainerResourcesRequest{})
+			pod, err = testcluster.SetContainerResources(pod, "", testcluster.ContainerResourcesRequest{})
 			if err != nil {
 				t.Fatalf("failed to set container resources: %v", err)
 			}
diff --git a/test/kubernetes/benchmarks/rubydev_test.go b/test/kubernetes/benchmarks/rubydev_test.go
index 60b6bfd3e9..4c8ed0a011 100644
--- a/test/kubernetes/benchmarks/rubydev_test.go
+++ b/test/kubernetes/benchmarks/rubydev_test.go
@@ -19,26 +19,21 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 // TestRubyDev benchmarks a build job on k8s clusters.
 func TestRubyDev(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("RubyDev", func(t *testing.T) {
 			t.Parallel()
 			RunRubyDev(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestRubyDev": TestRubyDev,
-	})
-}
diff --git a/test/kubernetes/benchmarks/stablediffusion.go b/test/kubernetes/benchmarks/stablediffusion.go
index a6fdc6ab54..2e7d6c69ec 100644
--- a/test/kubernetes/benchmarks/stablediffusion.go
+++ b/test/kubernetes/benchmarks/stablediffusion.go
@@ -71,7 +71,7 @@ func (r *kubernetesPodRunner) Run(ctx context.Context, image string, argv []stri
 	if err != nil {
 		return nil, nil, fmt.Errorf("failed to configure pod: %v", err)
 	}
-	stableDiffusionXLPod, err = testcluster.MaybeSetContainerResources(stableDiffusionXLPod, stableDiffusionXLPod.ObjectMeta.Name, testcluster.ContainerResourcesRequest{GPU: true})
+	stableDiffusionXLPod, err = testcluster.SetContainerResources(stableDiffusionXLPod, "", testcluster.ContainerResourcesRequest{GPU: true})
 	if err != nil {
 		return nil, nil, fmt.Errorf("failed to set container resources: %v", err)
 	}
@@ -113,6 +113,11 @@ func RunStableDiffusionXL(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kuber
 		t.Fatalf("cannot reset namespace: %v", err)
 	}
 	defer benchmarkNS.Cleanup(ctx)
+	reqWaitCtx, reqWaitCancel := context.WithTimeout(ctx, 5*time.Minute)
+	defer reqWaitCancel()
+	if err := benchmarkNS.WaitForResources(reqWaitCtx, testcluster.ContainerResourcesRequest{GPU: true}); err != nil {
+		t.Fatalf("failed to wait for resources: %v", err)
+	}
 	endProfiling, err := profiling.MaybeSetup(ctx, t, k8sCtx, cluster, benchmarkNS)
 	if err != nil {
 		t.Fatalf("Failed to setup profiling: %v", err)
diff --git a/test/kubernetes/benchmarks/stablediffusion_test.go b/test/kubernetes/benchmarks/stablediffusion_test.go
index 0c082b9da4..625da4a132 100644
--- a/test/kubernetes/benchmarks/stablediffusion_test.go
+++ b/test/kubernetes/benchmarks/stablediffusion_test.go
@@ -19,25 +19,20 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 func TestStableDiffusionXL(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("stable_diffusion_xl", func(t *testing.T) {
 			t.Parallel()
 			RunStableDiffusionXL(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestStableDiffusionXL": TestStableDiffusionXL,
-	})
-}
diff --git a/test/kubernetes/benchmarks/startup_test.go b/test/kubernetes/benchmarks/startup_test.go
index 421bb4e3a6..f0c378d261 100644
--- a/test/kubernetes/benchmarks/startup_test.go
+++ b/test/kubernetes/benchmarks/startup_test.go
@@ -19,16 +19,17 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 func TestStartup(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run(benchName, func(t *testing.T) {
 			cluster := cluster
 			t.Parallel()
@@ -36,9 +37,3 @@ func TestStartup(t *testing.T) {
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestStartup": TestStartup,
-	})
-}
diff --git a/test/kubernetes/benchmarks/tensorflow.go b/test/kubernetes/benchmarks/tensorflow.go
index 57b2328632..83570a6756 100644
--- a/test/kubernetes/benchmarks/tensorflow.go
+++ b/test/kubernetes/benchmarks/tensorflow.go
@@ -99,7 +99,7 @@ func RunTensorflowOnCPU(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kuberne
 				t.Fatalf("Failed to set pod for test runtime: %v", err)
 			}
 
-			pod, err = testcluster.MaybeSetContainerResources(pod, name, testcluster.ContainerResourcesRequest{})
+			pod, err = testcluster.SetContainerResources(pod, "", testcluster.ContainerResourcesRequest{})
 			if err != nil {
 				t.Fatalf("Failed to set container resources: %v", err)
 			}
diff --git a/test/kubernetes/benchmarks/tensorflow_test.go b/test/kubernetes/benchmarks/tensorflow_test.go
index 240c10458a..80e2d21f79 100644
--- a/test/kubernetes/benchmarks/tensorflow_test.go
+++ b/test/kubernetes/benchmarks/tensorflow_test.go
@@ -19,25 +19,20 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 func TestTensorflowOnCPU(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("TensorflowOnCPU", func(t *testing.T) {
 			t.Parallel()
 			RunTensorflowOnCPU(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestTensorflowOnCPU": TestTensorflowOnCPU,
-	})
-}
diff --git a/test/kubernetes/benchmarks/wordpress.go b/test/kubernetes/benchmarks/wordpress.go
index 781514300d..b3eb2400cf 100644
--- a/test/kubernetes/benchmarks/wordpress.go
+++ b/test/kubernetes/benchmarks/wordpress.go
@@ -35,7 +35,7 @@ const (
 	mariaDBImage               = "mariadb:10.11.3-jammy"
 	wordpressPort              = 80
 	mariaDBPort                = 3306
-	wordpressBenchmarkDuration = 70 * time.Second
+	wordpressBenchmarkDuration = 55 * time.Second
 	wordpressRequestTimeout    = 10 * time.Second
 	wordpressLoginPage         = "/wp-login.php"
 	mariaDBName                = "wpbench"
@@ -52,8 +52,8 @@ const (
 )
 
 var (
-	threads         = []int{1, 8, 64, 1000}
-	targetQPS       = []int{1, 8, 64, httpbench.InfiniteQPS}
+	threads         = []int{1, 8, 1000}
+	targetQPS       = []int{1, 64, httpbench.InfiniteQPS}
 	wantPercentiles = []int{50, 95, 99}
 )
 
@@ -114,7 +114,7 @@ func BenchmarkWordpress(ctx context.Context, t *testing.T, k8sCtx k8sctx.Kuberne
 	if err != nil {
 		t.Fatalf("Failed to configure pod for runtime nodepool: %v", err)
 	}
-	server, err = testcluster.MaybeSetContainerResources(server, name, testcluster.ContainerResourcesRequest{})
+	server, err = testcluster.SetContainerResources(server, "", testcluster.ContainerResourcesRequest{})
 	if err != nil {
 		t.Fatalf("Failed to set container resources: %v", err)
 	}
diff --git a/test/kubernetes/benchmarks/wordpress_test.go b/test/kubernetes/benchmarks/wordpress_test.go
index 93c2d9ca01..7338fb6744 100644
--- a/test/kubernetes/benchmarks/wordpress_test.go
+++ b/test/kubernetes/benchmarks/wordpress_test.go
@@ -19,25 +19,20 @@ import (
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
 )
 
 func TestWordpress(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	k8sCtx.ForEachCluster(ctx, t, func(cluster *testcluster.TestCluster) {
+	k8sctx.ForEachCluster(ctx, t, k8sCtx, func(cluster *testcluster.TestCluster) {
 		t.Run("wordpress", func(t *testing.T) {
 			t.Parallel()
 			BenchmarkWordpress(ctx, t, k8sCtx, cluster)
 		})
 	})
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestWordpress": TestWordpress,
-	})
-}
diff --git a/test/kubernetes/benchmetric/benchmetric.go b/test/kubernetes/benchmetric/benchmetric.go
index 3f66414423..2281b5dccc 100644
--- a/test/kubernetes/benchmetric/benchmetric.go
+++ b/test/kubernetes/benchmetric/benchmetric.go
@@ -148,7 +148,7 @@ func Count(numberOfTimes uint64, thingBeingCounted string) MetricValue {
 	if !strings.HasSuffix(thingBeingCounted, "s") {
 		panic("`thingBeingCounted` must be plural")
 	}
-	return value(float64(numberOfTimes), thingBeingCounted)
+	return value(float64(numberOfTimes), fmt.Sprintf("%s-num", thingBeingCounted))
 }
 
 // Checksum is a MetricValue for a checksum that is not expected to change
diff --git a/test/kubernetes/k8sctx/BUILD b/test/kubernetes/k8sctx/BUILD
index ce6180326f..66ea639b6a 100644
--- a/test/kubernetes/k8sctx/BUILD
+++ b/test/kubernetes/k8sctx/BUILD
@@ -7,20 +7,14 @@ package(
 
 go_library(
     name = "k8sctx",
-    testonly = True,
     srcs = [
         "k8sctx.go",
-        "k8sctx_impl.go",
     ],
     nogo = False,
     visibility = [
         "//visibility:public",
     ],
     deps = [
-        "//runsc/flag",
         "//test/kubernetes/testcluster",
-        "//tools/gvisor_k8s_tool/provider/kubectl",
-        "@org_golang_google_protobuf//encoding/prototext:go_default_library",
-        "@org_golang_google_protobuf//types/known/anypb:go_default_library",
     ],
 )
diff --git a/test/kubernetes/k8sctx/k8sctx.go b/test/kubernetes/k8sctx/k8sctx.go
index 554bd705c7..338a5b59f1 100644
--- a/test/kubernetes/k8sctx/k8sctx.go
+++ b/test/kubernetes/k8sctx/k8sctx.go
@@ -21,9 +21,6 @@ package k8sctx
 
 import (
 	"context"
-	"errors"
-	"fmt"
-	"sync"
 	"testing"
 
 	"gvisor.dev/gvisor/test/kubernetes/testcluster"
@@ -34,29 +31,11 @@ import (
 // Tests are expected to call `RegisterTest` for every of their test function,
 // then `TestMain`.
 type KubernetesContext interface {
-	// TestMain should be called inside tests' `TestMain` function, after having
-	// registered all tests with `RegisterTest`.
-	TestMain(m *testing.M)
-
-	// RegisterTest registers a test.
-	// It should be called for every `Test*(*testing.T)` function in the test.
-	// Note that the `k8sctx.TestMain` helper function below will call this for
-	// you given a map of tests.
-	RegisterTest(name string, fn TestFunc)
-
-	// AcquireCluster returns a single cluster for the test or benchmark to use.
+	// Cluster returns a single cluster for the test or benchmark to use.
 	// The cluster is guaranteed to not be in use by other tests or benchmarks
-	// until the `ReleaseCluster` method is called.
-	// This method should block if there are no available clusters.
-	AcquireCluster(ctx context.Context, t *testing.T) *testcluster.TestCluster
-
-	// ReleaseCluster unlocks the given cluster for use by other tests or
-	// benchmarks.
-	ReleaseCluster(ctx context.Context, t *testing.T, cluster *testcluster.TestCluster)
-
-	// ForEachCluster reserves as many test clusters as are available, calls
-	// `fn` on each of them, and releases each of them when `fn` finishes.
-	ForEachCluster(ctx context.Context, t *testing.T, fn func(cluster *testcluster.TestCluster))
+	// until the returned function is called.
+	// If there are no available clusters, it returns a nil TestCluster.
+	Cluster(ctx context.Context, t *testing.T) (*testcluster.TestCluster, func())
 
 	// ResolveImage resolves a container image name (possibly with a label)
 	// to a fully-qualified image name. It can also return an `image:label`
@@ -65,47 +44,54 @@ type KubernetesContext interface {
 	ResolveImage(ctx context.Context, imageName string) (string, error)
 }
 
-// TestFunc is a test function that is expected to call `Context` and run a
-// test or benchmark within a Kubernetes context.
-type TestFunc func(t *testing.T)
+// ForEachCluster calls the given function for each available cluster
+// sequentially.
+// In order to run per-cluster subtests in parallel, call `t.Run` inside
+// `fn` and then `t.Parallel` inside that.
+func ForEachCluster(ctx context.Context, t *testing.T, k8sCtx KubernetesContext, fn func(cluster *testcluster.TestCluster)) {
+	var clusterFns []func()
+	for {
+		cluster, releaseFn := k8sCtx.Cluster(ctx, t)
+		if cluster == nil {
+			break
+		}
+		clusterFns = append(clusterFns, func() {
+			defer releaseFn()
+			fn(cluster)
+		})
+	}
+	for _, clusterFn := range clusterFns {
+		clusterFn()
+	}
+}
 
-var (
-	kubernetesCtxMu   sync.Mutex
-	kubernetesCtxOnce sync.Once
-	kubernetesCtxFn   func(context.Context) (KubernetesContext, error)
-	kubernetesCtx     KubernetesContext
-	kubernetesCtxErr  error
-)
+// clusters implements KubernetesContext using a set of clusters.
+type clusters struct {
+	clustersCh chan *testcluster.TestCluster
+}
 
-// Context gets the global Kubernetes context.
-// It must be called after SetContext has already been called.
-func Context(ctx context.Context) (KubernetesContext, error) {
-	kubernetesCtxMu.Lock()
-	defer kubernetesCtxMu.Unlock()
-	if kubernetesCtxFn == nil {
-		return nil, errors.New("k8sctx.Context called prior to k8sctx.SetContextConstructor")
+// Cluster implements KubernetesContext.Cluster.
+func (cs *clusters) Cluster(ctx context.Context, t *testing.T) (*testcluster.TestCluster, func()) {
+	select {
+	case cluster := <-cs.clustersCh:
+		return cluster, func() {
+			cs.clustersCh <- cluster
+		}
+	default:
+		return nil, func() {}
 	}
-	kubernetesCtxOnce.Do(func() {
-		kubernetesCtx, kubernetesCtxErr = kubernetesCtxFn(ctx)
-	})
-	return kubernetesCtx, kubernetesCtxErr
 }
 
-// SetContextConstructor sets the global Kubernetes context constructor.
-func SetContextConstructor(fn func(context.Context) (KubernetesContext, error)) {
-	kubernetesCtxMu.Lock()
-	defer kubernetesCtxMu.Unlock()
-	kubernetesCtxFn = fn
+// ResolveImage implements KubernetesContext.ResolveImage.
+func (*clusters) ResolveImage(ctx context.Context, imageName string) (string, error) {
+	return imageName, nil
 }
 
-// TestMain is a helper to write the TestMain function of tests.
-func TestMain(m *testing.M, testFuncs map[string]TestFunc) {
-	k8sCtx, err := Context(context.Background())
-	if err != nil {
-		panic(fmt.Sprintf("failed to get k8sctx: %v", err))
-	}
-	for name, fn := range testFuncs {
-		k8sCtx.RegisterTest(name, fn)
+// New creates a KubernetesContext that set of test clusters.
+func New(testClusters ...*testcluster.TestCluster) KubernetesContext {
+	clustersCh := make(chan *testcluster.TestCluster, len(testClusters))
+	for _, cluster := range testClusters {
+		clustersCh <- cluster
 	}
-	k8sCtx.TestMain(m)
+	return &clusters{clustersCh: clustersCh}
 }
diff --git a/test/kubernetes/k8sctx/k8sctx_impl.go b/test/kubernetes/k8sctx/k8sctx_impl.go
deleted file mode 100644
index ed67d9308a..0000000000
--- a/test/kubernetes/k8sctx/k8sctx_impl.go
+++ /dev/null
@@ -1,98 +0,0 @@
-// Copyright 2024 The gVisor Authors.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-//go:build !false
-// +build !false
-
-package k8sctx
-
-import (
-	"context"
-	"errors"
-	"fmt"
-	"os"
-	"testing"
-
-	"google.golang.org/protobuf/encoding/prototext"
-	"google.golang.org/protobuf/types/known/anypb"
-	"gvisor.dev/gvisor/runsc/flag"
-	"gvisor.dev/gvisor/test/kubernetes/testcluster"
-	"gvisor.dev/gvisor/tools/gvisor_k8s_tool/provider/kubectl"
-)
-
-var (
-	kubectlContextName  = flag.String("kubectl-context-name", "", "Name of the kubectl context to use within the kubectl config")
-	clusterProtoPath    = flag.String("cluter-proto-path", "", "Path to a `google.container.v1.Cluster` textproto file")
-	testNodepoolRuntime = flag.String("test-nodepool-runtime", "", "if set, override the runtime used for pods scheduled on the 'test' nodepool. If unset, the nodepool default is used")
-)
-
-// kubectlContext implements KubernetesContext using a named `kubectl` context
-// from the user's kubectl config.
-type kubectlContext struct {
-	cluster *testcluster.TestCluster
-}
-
-func newKubectlContext(ctx context.Context) (KubernetesContext, error) {
-	if *kubectlContextName == "" {
-		return nil, errors.New("no kubectl context name specified")
-	}
-	if *clusterProtoPath == "" {
-		return nil, errors.New("no cluster proto path specified")
-	}
-	cluster, err := kubectl.NewCluster(*kubectlContextName)
-	if err != nil {
-		return nil, fmt.Errorf("cannot initialize cluster %q: %w", *kubectlContextName, err)
-	}
-	var clusterPB anypb.Any
-	clusterBytes, err := os.ReadFile(*clusterProtoPath)
-	if err != nil {
-		return nil, fmt.Errorf("cannot read cluster textproto file %q: %w", *clusterProtoPath, err)
-	}
-	if err = prototext.Unmarshal(clusterBytes, &clusterPB); err != nil {
-		return nil, fmt.Errorf("cannot unmarshal cluster textproto file %q: %w", *clusterProtoPath, err)
-	}
-	testCluster := testcluster.NewTestClusterFromClient(*kubectlContextName, cluster.Client())
-	if *testNodepoolRuntime != "" {
-		testCluster.OverrideTestNodepoolRuntime(testcluster.RuntimeType(*testNodepoolRuntime))
-	}
-	return &kubectlContext{cluster: testCluster}, nil
-}
-
-func (c *kubectlContext) AcquireCluster(ctx context.Context, t *testing.T) *testcluster.TestCluster {
-	return c.cluster
-}
-
-func (c *kubectlContext) ReleaseCluster(ctx context.Context, t *testing.T, cluster *testcluster.TestCluster) {
-	// Nothing to do.
-}
-
-func (c *kubectlContext) ForEachCluster(ctx context.Context, t *testing.T, fn func(cluster *testcluster.TestCluster)) {
-	fn(c.cluster)
-}
-
-func (c *kubectlContext) ResolveImage(ctx context.Context, imageName string) (string, error) {
-	return imageName, nil
-}
-
-func (c *kubectlContext) RegisterTest(name string, fn TestFunc) {
-	// Nothing to do here, we use the regular testing library.
-}
-
-func (c *kubectlContext) TestMain(m *testing.M) {
-	os.Exit(m.Run())
-}
-
-func init() {
-	SetContextConstructor(newKubectlContext)
-}
diff --git a/test/kubernetes/k8sctx/kubectlctx/BUILD b/test/kubernetes/k8sctx/kubectlctx/BUILD
new file mode 100644
index 0000000000..6675af376a
--- /dev/null
+++ b/test/kubernetes/k8sctx/kubectlctx/BUILD
@@ -0,0 +1,26 @@
+load("//tools:defs.bzl", "go_library")
+
+package(
+    default_applicable_licenses = ["//:license"],
+    licenses = ["notice"],
+)
+
+go_library(
+    name = "kubectlctx",
+    srcs = ["kubectlctx.go"],
+    nogo = False,
+    visibility = [
+        "//visibility:public",
+    ],
+    deps = [
+        "//runsc/flag",
+        "//test/kubernetes:test_range_config_go_proto",
+        "//test/kubernetes/k8sctx",
+        "//test/kubernetes/testcluster",
+        "//tools/gvisor_k8s_tool/provider/kubectl",
+        "@io_k8s_client_go//kubernetes:go_default_library",
+        "@io_k8s_client_go//tools/clientcmd:go_default_library",
+        "@org_golang_google_protobuf//encoding/prototext:go_default_library",
+        "@org_golang_x_sync//errgroup:go_default_library",
+    ],
+)
diff --git a/test/kubernetes/k8sctx/kubectlctx/kubectlctx.go b/test/kubernetes/k8sctx/kubectlctx/kubectlctx.go
new file mode 100644
index 0000000000..f2288de8b2
--- /dev/null
+++ b/test/kubernetes/k8sctx/kubectlctx/kubectlctx.go
@@ -0,0 +1,142 @@
+// Copyright 2024 The gVisor Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package kubectlctx provides a KubernetesContext that uses one or more
+// kubectl configs to determine the cluster(s) to use for tests and benchmarks.
+// See parent package (`k8sctx`) for more info.
+package kubectlctx
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"golang.org/x/sync/errgroup"
+	"google.golang.org/protobuf/encoding/prototext"
+	"gvisor.dev/gvisor/runsc/flag"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	testpb "gvisor.dev/gvisor/test/kubernetes/test_range_config_go_proto"
+	"gvisor.dev/gvisor/test/kubernetes/testcluster"
+	"gvisor.dev/gvisor/tools/gvisor_k8s_tool/provider/kubectl"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/tools/clientcmd"
+)
+
+var (
+	rangeDir            = flag.String("range-dir", "", "A directory containing a test_range.textproto text file describing multiple clusters to use for tests and benchmarks; takes precedence over --kubectl-context-name")
+	kubectlContext      = flag.String("kubectl-context", "", "The name of the kubectl context to use; if unset, use the default context within the kubectl config at KUBECONFIG")
+	testNodepoolRuntime = flag.String("test-nodepool-runtime", "", "if set, override the runtime used for pods scheduled on the 'test' nodepool. If unset, the nodepool default is used")
+)
+
+// New creates a KubernetesContext using flags to determine which clusters
+// to use for tests and benchmarks.
+func New(ctx context.Context) (k8sctx.KubernetesContext, error) {
+	if *rangeDir != "" && *kubectlContext != "" {
+		return nil, fmt.Errorf("cannot use --range-dir and --kubectl-context at the same time")
+	}
+	var clusters []*testcluster.TestCluster
+	var err error
+	if *rangeDir != "" {
+		clusters, err = NewFromRangeDir(ctx, *rangeDir)
+	} else {
+		clusters, err = NewFromKubectlContext(ctx, *kubectlContext)
+	}
+	if err != nil {
+		return nil, fmt.Errorf("cannot initialize test clusters: %w", err)
+	}
+	if *testNodepoolRuntime != "" {
+		overriddenRuntime := testcluster.RuntimeType(*testNodepoolRuntime)
+		if !overriddenRuntime.IsValid() {
+			return nil, fmt.Errorf("invalid runtime type %q", *testNodepoolRuntime)
+		}
+		for _, cluster := range clusters {
+			cluster.OverrideTestNodepoolRuntime(overriddenRuntime)
+		}
+	}
+	if err := verifyClusters(ctx, clusters); err != nil {
+		return nil, fmt.Errorf("cannot verify clusters are working: %w", err)
+	}
+	return k8sctx.New(clusters...), nil
+}
+
+// NewFromRangeDir creates a set of test clusters from a test range directory.
+func NewFromRangeDir(ctx context.Context, rangeDir string) ([]*testcluster.TestCluster, error) {
+	rangeFile := filepath.Join(rangeDir, "test_range.textproto")
+	rangeFileData, err := os.ReadFile(rangeFile)
+	if err != nil {
+		return nil, fmt.Errorf("cannot read range file %q: %w", rangeFile, err)
+	}
+	var testRange testpb.TestRange
+	if err := prototext.Unmarshal(rangeFileData, &testRange); err != nil {
+		return nil, fmt.Errorf("error unmarshalling range file %q: %v", rangeFile, err)
+	}
+	if len(testRange.GetClusters()) == 0 {
+		return nil, fmt.Errorf("range file %q has no clusters", rangeFile)
+	}
+	clusters := make([]*testcluster.TestCluster, len(testRange.GetClusters()))
+	for i, cluster := range testRange.GetClusters() {
+		configPath := cluster.GetKubectlConfig()
+		if configPath == "" {
+			return nil, fmt.Errorf("cluster %q has no kubectl config path", cluster.GetCluster())
+		}
+		cfg, err := clientcmd.LoadFromFile(configPath)
+		if err != nil {
+			return nil, fmt.Errorf("cannot load kubectl config at %q for cluster %q: %w", configPath, cluster.GetCluster(), err)
+		}
+		contextName := cluster.GetKubectlContext()
+		if contextName == "" {
+			contextName = cfg.CurrentContext
+		}
+		restConfig, err := clientcmd.NewNonInteractiveClientConfig(*cfg, contextName, nil, clientcmd.NewDefaultClientConfigLoadingRules()).ClientConfig()
+		if err != nil {
+			return nil, fmt.Errorf("cannot load REST client config for cluster %q: %w", cluster.GetCluster(), err)
+		}
+		kubeClient, err := kubernetes.NewForConfig(restConfig)
+		if err != nil {
+			return nil, fmt.Errorf("cannot create Kubernetes client for cluster %q: %w", cluster.GetCluster(), err)
+		}
+		clusters[i] = testcluster.NewTestClusterFromClient(cluster.GetCluster(), kubeClient)
+	}
+	return clusters, nil
+}
+
+// NewFromKubectlContext creates a test cluster from a kubectl config.
+//
+// If the kubectl config is not specified, the default kubectl config is used.
+// If the kubectl context is not specified, the default context within the
+// kubectl config is used.
+func NewFromKubectlContext(ctx context.Context, kubectlContext string) ([]*testcluster.TestCluster, error) {
+	cluster, err := kubectl.NewCluster(kubectlContext)
+	if err != nil {
+		return nil, fmt.Errorf("cannot initialize cluster from kubectl config: %w", err)
+	}
+	clusterName := "test-cluster" // Default name.
+	if kubectlContext != "" {
+		clusterName = kubectlContext
+	}
+	return []*testcluster.TestCluster{testcluster.NewTestClusterFromClient(clusterName, cluster.Client())}, nil
+}
+
+// verifyClusters verifies that all clusters are working.
+func verifyClusters(ctx context.Context, clusters []*testcluster.TestCluster) error {
+	var g errgroup.Group
+	for _, cluster := range clusters {
+		c := cluster
+		g.Go(func() error {
+			return c.SanityCheck(ctx)
+		})
+	}
+	return g.Wait()
+}
diff --git a/test/kubernetes/test_range_config.proto b/test/kubernetes/test_range_config.proto
index a0ea255f9b..2f6136fa68 100644
--- a/test/kubernetes/test_range_config.proto
+++ b/test/kubernetes/test_range_config.proto
@@ -64,10 +64,24 @@ message TestRange {
 
 // Cluster holds the created cluster and its credential file.
 message Cluster {
-  // Created cluster proto.
-  google.protobuf.Any cluster = 1;
+  // Cluster name.
+  string cluster = 1;
 
-  // The setup step will create individual credential files for each created
-  // cluster.
-  string credential_file = 2;
+  // A kubectl config file that can be used to connect to the cluster.
+  string kubectl_config = 2;
+
+  // The kubectl context name within `kubectl_config` to use to connect
+  // to the cluster.
+  // If empty, the default context will be used.
+  string kubectl_context = 3;
+
+  // The GCP project ID that the cluster is in.
+  // Optional; only used for cluster management tasks (e.g. deletion).
+  // Leave empty for non-GCP clusters.
+  string gcp_project_id = 4;
+
+  // The GCP location that the cluster is created in.
+  // Optional; only used for cluster management tasks (e.g. deletion).
+  // Leave empty for non-GCP clusters.
+  string gcp_location = 5;
 }
diff --git a/test/kubernetes/testcluster/objects.go b/test/kubernetes/testcluster/objects.go
index 07a4e15f43..14349fff0b 100644
--- a/test/kubernetes/testcluster/objects.go
+++ b/test/kubernetes/testcluster/objects.go
@@ -19,6 +19,8 @@ import (
 	"errors"
 	"fmt"
 	"reflect"
+	"strings"
+	"time"
 
 	cspb "google.golang.org/genproto/googleapis/container/v1"
 	"google.golang.org/protobuf/proto"
@@ -55,7 +57,9 @@ func (t *TestCluster) Namespace(namespace string) *Namespace {
 // This should be used in the beginning of tests, such that the namespace
 // is empty and ready to be used.
 func (n *Namespace) Reset(ctx context.Context) error {
-	n.Cleanup(ctx)
+	if err := n.Cleanup(ctx); err != nil {
+		return fmt.Errorf("failed to clean up namespace %q: %w", n.Namespace, err)
+	}
 	_, err := n.testCluster.createNamespace(ctx, &v13.Namespace{
 		TypeMeta: v1.TypeMeta{
 			Kind:       "namespace",
@@ -70,7 +74,42 @@ func (n *Namespace) Reset(ctx context.Context) error {
 
 // Cleanup deletes this namespace if it exists.
 func (n *Namespace) Cleanup(ctx context.Context) error {
-	return n.testCluster.deleteNamespace(ctx, n.Namespace)
+	namespaceExists := func() (*v13.Namespace, bool, error) {
+		ns, err := n.testCluster.getNamespace(ctx, n.Namespace)
+		switch {
+		case err == nil:
+			return ns, true, nil
+		case strings.Contains(err.Error(), "not found"):
+			return nil, false, nil
+		default:
+			return nil, false, err
+		}
+	}
+	_, exists, err := namespaceExists()
+	if err != nil {
+		return fmt.Errorf("failed to check if namespace %q exists: %w", n.Namespace, err)
+	}
+	if !exists {
+		return nil
+	}
+	if err := n.testCluster.deleteNamespace(ctx, n.Namespace); err != nil && !strings.Contains(err.Error(), "object is being deleted") {
+		return fmt.Errorf("failed to delete namespace %q: %w", n.Namespace, err)
+	}
+	var ns *v13.Namespace
+	for ctx.Err() == nil {
+		ns, exists, err = namespaceExists()
+		if err != nil {
+			return fmt.Errorf("failed to check if namespace %q exists: %w", n.Namespace, err)
+		}
+		if !exists {
+			return nil
+		}
+		select {
+		case <-ctx.Done():
+		case <-time.After(pollInterval):
+		}
+	}
+	return fmt.Errorf("failed to delete namespace %q (context: %w); last namespace status: %v", n.Namespace, ctx.Err(), ns)
 }
 
 // NewAlpinePod returns an alpine pod template.
@@ -142,9 +181,16 @@ type ContainerResourcesRequest struct {
 	GPU             bool
 }
 
-// MaybeSetContainerResources sets container resources if flags are given. Sets both the resource
-// limits and requests as container runtimes honor them differently.
-func MaybeSetContainerResources(pod *v13.Pod, containerName string, requests ContainerResourcesRequest) (*v13.Pod, error) {
+// String returns a string representation of the `ContainerResourcesRequest`.
+func (crr ContainerResourcesRequest) String() string {
+	return fmt.Sprintf("cpu=%q memory=%q gpu=%v", crr.CPUResources, crr.MemoryResources, crr.GPU)
+}
+
+// SetContainerResources sets container resources.
+// Sets both the resource limits and requests as container runtimes honor
+// them differently.
+// `containerName` is optional if the pod has exactly one container.
+func SetContainerResources(pod *v13.Pod, containerName string, requests ContainerResourcesRequest) (*v13.Pod, error) {
 	resourceList := v13.ResourceList{}
 	if requests.CPUResources != "" {
 		resourceList[v13.ResourceCPU] = resource.MustParse(requests.CPUResources)
@@ -166,13 +212,143 @@ func MaybeSetContainerResources(pod *v13.Pod, containerName string, requests Con
 		Requests: resourceList,
 	}
 
-	for i := range pod.Spec.Containers {
-		if pod.Spec.Containers[i].Name == containerName {
-			pod.Spec.Containers[i].Resources = requirements
-			return pod, nil
+	var containerToChange *v13.Container
+	if containerName == "" {
+		switch len(pod.Spec.Containers) {
+		case 0:
+			return nil, fmt.Errorf("no containers found in pod")
+		case 1:
+			containerToChange = &pod.Spec.Containers[0]
+		default:
+			return nil, fmt.Errorf("multiple containers found in pod %v, please specify a container name", pod)
 		}
+	} else {
+		for i := range pod.Spec.Containers {
+			if pod.Spec.Containers[i].Name == containerName {
+				containerToChange = &pod.Spec.Containers[i]
+			}
+		}
+	}
+	if containerToChange == nil {
+		return nil, fmt.Errorf("container %q not found", containerName)
 	}
-	return nil, fmt.Errorf("container %q not found", containerName)
+	containerToChange.Resources = requirements
+	return pod, nil
+}
+
+// ProbeResources verifies that a pod requesting the given resources can be
+// scheduled.
+func (n *Namespace) ProbeResources(ctx context.Context, requests ContainerResourcesRequest) error {
+	// Configure the pod.
+	probe := n.NewAlpinePod("resource-probe", "alpine", []string{"/bin/true"})
+	probe, err := n.testCluster.ConfigurePodForRuntimeTestNodepool(ctx, probe)
+	if err != nil {
+		return fmt.Errorf("failed to configure pod for runtime test nodepool: %w", err)
+	}
+	if probe, err = SetContainerResources(probe, "", requests); err != nil {
+		return fmt.Errorf("failed to set container resources: %w", err)
+	}
+	resources := probe.Spec.Containers[0].Resources
+
+	// If a pod already exists (e.g. from a past probe), best-effort attempt to delete it:
+	deleteCtx, deleteCancel := context.WithTimeout(ctx, 20*time.Second)
+	_ = n.testCluster.DeletePod(deleteCtx, probe)
+	deleteCancel()
+
+	// Create the pod.
+	if _, err := n.testCluster.CreatePod(ctx, probe); err != nil {
+		return fmt.Errorf("failed to create probe pod with resources %v: %w", resources, err)
+	}
+
+	// Wait up to (ctx - 30 seconds) for the pod to run.
+	waitCtx := ctx
+	if ctxDeadline, hasDeadline := waitCtx.Deadline(); hasDeadline {
+		var waitCancel context.CancelFunc
+		waitCtx, waitCancel = context.WithDeadline(ctx, ctxDeadline.Add(-30*time.Second))
+		defer waitCancel()
+	}
+	if err := n.testCluster.WaitForPodRunning(waitCtx, probe); err != nil {
+		// Best-effort time-limited deletion.
+		deleteCtx, deleteCancel := context.WithTimeout(ctx, 30*time.Second)
+		_ = n.testCluster.DeletePod(deleteCtx, probe)
+		deleteCancel()
+
+		return fmt.Errorf("probe pod with resources %v did not run: %w", resources, err)
+	}
+
+	// Delete the pod.
+	if err := n.testCluster.DeletePod(ctx, probe); err != nil {
+		return fmt.Errorf("failed to delete probe pod: %w", err)
+	}
+	return nil
+}
+
+// WaitForResources checks that a pod requesting the given resources can be
+// scheduled. If they cannot, it will loop until the given context expire or
+// the resources become available.
+func (n *Namespace) WaitForResources(ctx context.Context, requests ContainerResourcesRequest) error {
+	var lastErr error
+	for {
+		probeCtx, probeCancel := context.WithTimeout(ctx, 90*time.Second)
+		err := n.ProbeResources(probeCtx, requests)
+		probeCtxErr := probeCtx.Err()
+		probeCancel()
+		if err == nil {
+			return nil
+		}
+		select {
+		case <-ctx.Done():
+			if lastErr != nil {
+				return fmt.Errorf("context expired while waiting for resources %v to be available; last error: %w", requests, lastErr)
+			}
+			return fmt.Errorf("context expired before ever checking that resources %v are available: %w", requests, err)
+		case <-time.After(pollInterval):
+			if lastErr == nil || probeCtxErr == nil {
+				lastErr = err
+			}
+		}
+	}
+}
+
+// SanityCheck ensures that the cluster is working.
+// It does so by creating sanity-check pod in the sanity namespace and ensure
+// it runs successfully.
+func (t *TestCluster) SanityCheck(ctx context.Context) error {
+	sanityNS := t.Namespace(NamespaceSanity)
+	resetCtx, resetCancel := context.WithTimeout(ctx, 6*time.Minute)
+	defer resetCancel()
+	defer sanityNS.Cleanup(resetCtx)
+	sanityCtx, resetCancel := context.WithTimeout(ctx, 5*time.Minute)
+	defer resetCancel()
+	var lastErr error
+	for sanityCtx.Err() == nil {
+		err := func() error {
+			if err := sanityNS.Reset(ctx); err != nil {
+				return fmt.Errorf("cannot reset %v namespace: %w", NamespaceSanity, err)
+			}
+			defer sanityNS.Cleanup(ctx)
+			sanityPod := sanityNS.NewAlpinePod("check", "alpine", []string{"/bin/sh", "-c", "echo", "hello"})
+			sanityPod, err := t.CreatePod(ctx, sanityPod)
+			if err != nil {
+				return fmt.Errorf("cannot create sanity check pod: %w", err)
+			}
+			if err := t.WaitForPodCompleted(ctx, sanityPod); err != nil {
+				_ = t.DeletePod(ctx, sanityPod)
+				return fmt.Errorf("failed waiting for sanity check pod to complete: %w", err)
+			}
+			if err := t.DeletePod(ctx, sanityPod); err != nil {
+				return fmt.Errorf("cannot delete sanity check pod: %w", err)
+			}
+			return nil
+		}()
+		if err == nil {
+			return nil
+		}
+		if sanityCtx.Err() == nil {
+			lastErr = err
+		}
+	}
+	return fmt.Errorf("cannot ensure cluster %v is working: %w", t.GetName(), lastErr)
 }
 
 // RuntimeType is a supported runtime for the test nodepool.
@@ -186,6 +362,21 @@ const (
 	RuntimeTypeUnsandboxedTPU = RuntimeType("runc-tpu")
 )
 
+// IsValid returns true if the runtime type is valid.
+func (t RuntimeType) IsValid() bool {
+	switch t {
+	case RuntimeTypeGVisor, RuntimeTypeUnsandboxed, RuntimeTypeGVisorTPU, RuntimeTypeUnsandboxedTPU:
+		return true
+	default:
+		return false
+	}
+}
+
+// IsGVisor returns true if the runtime is a gVisor-based runtime.
+func (t RuntimeType) IsGVisor() bool {
+	return t == RuntimeTypeGVisor || t == RuntimeTypeGVisorTPU
+}
+
 // ApplyNodepool modifies the nodepool to configure it to use the runtime.
 func (t RuntimeType) ApplyNodepool(nodepool *cspb.NodePool) {
 	if nodepool.GetConfig().GetLabels() == nil {
diff --git a/test/kubernetes/testcluster/testcluster.go b/test/kubernetes/testcluster/testcluster.go
index 123a97b107..de4b4d5eab 100644
--- a/test/kubernetes/testcluster/testcluster.go
+++ b/test/kubernetes/testcluster/testcluster.go
@@ -25,7 +25,6 @@ import (
 	"time"
 
 	"golang.org/x/sync/errgroup"
-	cspb "google.golang.org/genproto/googleapis/container/v1"
 	"gvisor.dev/gvisor/pkg/log"
 	"gvisor.dev/gvisor/pkg/sync"
 	testpb "gvisor.dev/gvisor/test/kubernetes/test_range_config_go_proto"
@@ -192,20 +191,23 @@ type NodePool struct {
 
 // NewTestClusterFromProto returns a new TestCluster client from a proto.
 func NewTestClusterFromProto(ctx context.Context, cluster *testpb.Cluster) (*TestCluster, error) {
-	config, err := clientcmd.BuildConfigFromFlags("" /*masterURL*/, cluster.GetCredentialFile())
+	kubeCfg, err := clientcmd.LoadFromFile(cluster.GetKubectlConfig())
 	if err != nil {
-		return nil, fmt.Errorf("BuildConfigFromFlags: %w", err)
+		return nil, fmt.Errorf("cannot load kubectl config at %q: %w", cluster.GetKubectlConfig(), err)
 	}
-	client, err := kubernetes.NewForConfig(config)
+	kubeContext := cluster.GetKubectlContext()
+	if kubeContext == "" {
+		kubeContext = kubeCfg.CurrentContext
+	}
+	restCfg, err := clientcmd.NewNonInteractiveClientConfig(*kubeCfg, kubeContext, nil, clientcmd.NewDefaultClientConfigLoadingRules()).ClientConfig()
 	if err != nil {
-		return nil, fmt.Errorf("kubernetes.NewForConfig: %w", err)
+		return nil, fmt.Errorf("kubectl config: %w", err)
 	}
-	var clusterPB cspb.Cluster
-	if err := cluster.GetCluster().UnmarshalTo(&clusterPB); err != nil {
-		return nil, fmt.Errorf("cannot unmarshal cluster: %w", err)
+	client, err := kubernetes.NewForConfig(restCfg)
+	if err != nil {
+		return nil, fmt.Errorf("kubernetes.NewForConfig: %w", err)
 	}
-	clusterName := clusterPB.GetName()
-	return NewTestClusterFromClient(clusterName, client), nil
+	return NewTestClusterFromClient(cluster.GetCluster(), client), nil
 }
 
 // NewTestClusterFromClient returns a new TestCluster client with a given client.
@@ -270,23 +272,40 @@ func (t *TestCluster) getNamespace(ctx context.Context, namespaceName string) (*
 // deleteNamespace is a helper method to delete a namespace.
 func (t *TestCluster) deleteNamespace(ctx context.Context, namespaceName string) error {
 	err := t.client.Do(ctx, func(ctx context.Context, client kubernetes.Interface) error {
-		return client.CoreV1().Namespaces().Delete(ctx, namespaceName, v1.DeleteOptions{})
+		zero := int64(0)
+		return client.CoreV1().Namespaces().Delete(ctx, namespaceName, v1.DeleteOptions{
+			GracePeriodSeconds: &zero,
+		})
 	})
 	if err != nil {
 		return err
 	}
 	// Wait for the namespace to disappear or for the context to expire.
+	waitStart := time.Now()
+	warnAfter := waitStart.Add(1 * time.Minute)
+	nsLogger := log.BasicRateLimitedLogger(5 * time.Minute)
 	for ctx.Err() == nil {
-		if _, err := t.getNamespace(ctx, namespaceName); err != nil {
+		deleteCtx, deleteCancel := context.WithTimeout(ctx, 15*time.Second)
+		ns, err := t.getNamespace(ctx, namespaceName)
+		if err != nil && deleteCtx.Err() == nil {
+			deleteCancel()
 			return nil
 		}
+		deleteCancel()
+		if time.Now().After(warnAfter) {
+			if ns != nil {
+				nsLogger.Warningf("Still waiting for namespace %q to be actually deleted (after sending deletion request); waiting %v so far, namespace status: %v", namespaceName, time.Since(waitStart), ns.Status)
+			} else {
+				nsLogger.Warningf("Still waiting for namespace %q to be actually deleted (after sending deletion request); waiting %v so far, namespace status: %v", namespaceName, time.Since(waitStart), "<unknown>")
+			}
+		}
 		select {
 		case <-ctx.Done():
-			return ctx.Err()
-		case <-time.After(10 * time.Millisecond):
+			return fmt.Errorf("context expired waiting for namespace %q to be deleted", namespaceName)
+		case <-time.After(pollInterval):
 		}
 	}
-	return ctx.Err()
+	return fmt.Errorf("context expired waiting for namespace %q to be deleted", namespaceName)
 }
 
 // getNodePool returns the NodePool of the given type.
@@ -409,23 +428,40 @@ func (t *TestCluster) ListPods(ctx context.Context, namespace string) (*v13.PodL
 // DeletePod is a helper method to delete a pod.
 func (t *TestCluster) DeletePod(ctx context.Context, pod *v13.Pod) error {
 	err := t.client.Do(ctx, func(ctx context.Context, client kubernetes.Interface) error {
-		return client.CoreV1().Pods(pod.GetNamespace()).Delete(ctx, pod.GetName(), v1.DeleteOptions{})
+		zero := int64(0)
+		return client.CoreV1().Pods(pod.GetNamespace()).Delete(ctx, pod.GetName(), v1.DeleteOptions{
+			GracePeriodSeconds: &zero,
+		})
 	})
 	if err != nil {
 		return err
 	}
 	// Wait for the pod to disappear or for the context to expire.
+	waitStart := time.Now()
+	warnAfter := waitStart.Add(1 * time.Minute)
+	podLogger := log.BasicRateLimitedLogger(5 * time.Minute)
 	for ctx.Err() == nil {
-		if _, err := t.GetPod(ctx, pod); err != nil {
+		deleteCtx, deleteCancel := context.WithTimeout(ctx, 15*time.Second)
+		p, err := t.GetPod(deleteCtx, pod)
+		if err != nil && deleteCtx.Err() == nil {
+			deleteCancel()
 			return nil
 		}
+		deleteCancel()
+		if time.Now().After(warnAfter) {
+			if p != nil {
+				podLogger.Warningf("Still waiting for pod %q to be actually deleted (after sending deletion request); waiting %v so far, pod status: %v", pod.GetName(), time.Since(waitStart), p.Status)
+			} else {
+				podLogger.Warningf("Still waiting for pod %q to be actually deleted (after sending deletion request); waiting %v so far, pod status: %v", pod.GetName(), time.Since(waitStart), "<unknown>")
+			}
+		}
 		select {
 		case <-ctx.Done():
-			return ctx.Err()
-		case <-time.After(10 * time.Millisecond):
+			return fmt.Errorf("context expired waiting for pod %q to be deleted; last pod info: %v", pod.GetName(), p)
+		case <-time.After(pollInterval):
 		}
 	}
-	return ctx.Err()
+	return fmt.Errorf("context expired waiting for pod %q to be deleted", pod.GetName())
 }
 
 // GetLogReader gets an io.ReadCloser from which logs can be read. It is the caller's
@@ -451,50 +487,82 @@ func (t *TestCluster) ReadPodLogs(ctx context.Context, pod *v13.Pod) (string, er
 
 // WaitForPodRunning is a helper method to wait for a pod to be running.
 func (t *TestCluster) WaitForPodRunning(ctx context.Context, pod *v13.Pod) error {
-	return t.doWaitForPod(ctx, pod, v13.PodRunning)
+	// We also accept pods in the PodSucceeded state, because short-lived pods
+	// may have already ran and succeeded by the time we poll them.
+	_, err := t.doWaitForPod(ctx, pod, func(p v13.PodPhase) bool { return p == v13.PodRunning || p == v13.PodSucceeded })
+	return err
 }
 
 // WaitForPodCompleted is a helper method to wait for a pod to be completed.
 func (t *TestCluster) WaitForPodCompleted(ctx context.Context, pod *v13.Pod) error {
-	return t.doWaitForPod(ctx, pod, v13.PodSucceeded)
+	_, err := t.doWaitForPod(ctx, pod, func(p v13.PodPhase) bool { return p == v13.PodSucceeded })
+	return err
+}
+
+// WaitForPodTerminated is a helper method to wait for a pod to exit,
+// whether it succeeded or failed.
+func (t *TestCluster) WaitForPodTerminated(ctx context.Context, pod *v13.Pod) (v13.PodPhase, error) {
+	return t.doWaitForPod(ctx, pod, func(p v13.PodPhase) bool { return p == v13.PodSucceeded || p == v13.PodFailed })
 }
 
 // doWaitForPod waits for a pod to complete based on a given v13.PodPhase.
-func (t *TestCluster) doWaitForPod(ctx context.Context, pod *v13.Pod, phase v13.PodPhase) error {
+func (t *TestCluster) doWaitForPod(ctx context.Context, pod *v13.Pod, phasePredicate func(v13.PodPhase) bool) (v13.PodPhase, error) {
 	podLogger := log.BasicRateLimitedLogger(5 * time.Minute)
-	startLogTime := time.Now().Add(3 * time.Minute)
+	startTime := time.Now()
+	startLogTime := startTime.Add(3 * time.Minute)
+	var emitLogsAt time.Time
+	emitLogs := false
+	if ctxDeadline, hasDeadline := ctx.Deadline(); hasDeadline && time.Until(ctxDeadline) > 10*time.Second {
+		emitLogsAt = ctxDeadline.Add(-10 * time.Second)
+		emitLogs = true
+	}
 
 	var p *v13.Pod
-	var err error
 	pollCh := time.NewTicker(pollInterval)
 	defer pollCh.Stop()
 	for {
 		select {
 		case <-pollCh.C:
-			if p, err = t.GetPod(ctx, pod); err != nil {
-				return fmt.Errorf("failed to poll pod: %w", err)
+			polled, pollErr := t.GetPod(ctx, pod)
+			if ctx.Err() != nil {
+				return v13.PodUnknown, fmt.Errorf("context expired waiting for pod %q: %w; last pod data: %v", pod.GetName(), ctx.Err(), p)
 			}
+			if pollErr != nil {
+				return v13.PodUnknown, fmt.Errorf("failed to poll pod: %w", pollErr)
+			}
+			p = polled
 		case <-ctx.Done():
-			return fmt.Errorf("context expired waiting for pod %q: %w", pod.GetName(), ctx.Err())
+			return v13.PodUnknown, fmt.Errorf("context expired waiting for pod %q: %w; last pod data: %v", pod.GetName(), ctx.Err(), p)
 		}
 		if p.Status.Reason == v13.PodReasonUnschedulable {
-			return fmt.Errorf("pod %q failed: reason: %q message: %q", pod.GetName(), p.Status.Reason, p.Status.Message)
+			return v13.PodPending, fmt.Errorf("pod %q cannot be scheduled: reason: %q message: %q", p.GetName(), p.Status.Reason, p.Status.Message)
 		}
 
 		for _, c := range p.Status.Conditions {
 			if strings.Contains(c.Reason, v13.PodReasonUnschedulable) {
-				return fmt.Errorf("pod %q failed: reason: %q message: %q", p.GetName(), c.Reason, c.Message)
+				return v13.PodPending, fmt.Errorf("pod %q cannot be scheduled: reason: %q message: %q", p.GetName(), c.Reason, c.Message)
 			}
 		}
 
-		switch p.Status.Phase {
-		case v13.PodFailed:
-			return fmt.Errorf("pod %q failed: %s", pod.GetName(), p.Status.Message)
-		case phase:
-			return nil
+		if phasePredicate(p.Status.Phase) {
+			return p.Status.Phase, nil
 		}
-		if time.Now().After(startLogTime) {
-			podLogger.Infof("Still waiting for pod %q after %v; pod status: %v", pod.GetName(), time.Since(startLogTime), p.Status)
+		if p.Status.Phase == v13.PodFailed {
+			logs, err := t.ReadPodLogs(ctx, p)
+			if err != nil {
+				return v13.PodFailed, fmt.Errorf("pod %q failed (status: %s); also failed to read pod logs: %w", p.GetName(), p.Status.Message, err)
+			}
+			return v13.PodFailed, fmt.Errorf("pod %q failed (status: %s); logs:\n%s\n(end of logs)", p.GetName(), p.Status.Message, logs)
+		}
+		if now := time.Now(); emitLogs && p.Status.Phase == v13.PodRunning && now.After(emitLogsAt) {
+			emitLogs = false
+			if logs, err := t.ReadPodLogs(ctx, p); err != nil {
+				log.Infof("Still waiting for pod %q after %v; pod status: %v; failed to read pod logs: %v", p.GetName(), time.Since(startTime), p.Status, err)
+			} else {
+				log.Infof("Still waiting for pod %q after %v; pod status: %v; pod logs:\n%s\n(end of logs)", p.GetName(), time.Since(startTime), p.Status, logs)
+			}
+		} else if now.After(startLogTime) {
+			podLogger.Infof("Still waiting for pod %q after %v; pod status: %v", p.GetName(), time.Since(startTime), p.Status)
 		}
 	}
 }
diff --git a/test/kubernetes/tests/BUILD b/test/kubernetes/tests/BUILD
index 35bb8a5e4d..8f3937816a 100644
--- a/test/kubernetes/tests/BUILD
+++ b/test/kubernetes/tests/BUILD
@@ -27,5 +27,5 @@ go_test(
         "noguitar",
         "notap",
     ],
-    deps = ["//test/kubernetes/k8sctx"],
+    deps = ["//test/kubernetes/k8sctx/kubectlctx"],
 )
diff --git a/test/kubernetes/tests/hello_test.go b/test/kubernetes/tests/hello_test.go
index 8caa1f0a90..5910500cb2 100644
--- a/test/kubernetes/tests/hello_test.go
+++ b/test/kubernetes/tests/hello_test.go
@@ -18,23 +18,17 @@ import (
 	"context"
 	"testing"
 
-	"gvisor.dev/gvisor/test/kubernetes/k8sctx"
+	"gvisor.dev/gvisor/test/kubernetes/k8sctx/kubectlctx"
 )
 
 // TestHello tests that a trivial alpine container runs correctly.
 func TestHello(t *testing.T) {
 	ctx := context.Background()
-	k8sCtx, err := k8sctx.Context(ctx)
+	k8sCtx, err := kubectlctx.New(ctx)
 	if err != nil {
 		t.Fatalf("Failed to get kubernetes context: %v", err)
 	}
-	cluster := k8sCtx.AcquireCluster(ctx, t)
-	defer k8sCtx.ReleaseCluster(ctx, t, cluster)
+	cluster, releaseFn := k8sCtx.Cluster(ctx, t)
+	defer releaseFn()
 	RunHello(ctx, t, k8sCtx, cluster)
 }
-
-func TestMain(m *testing.M) {
-	k8sctx.TestMain(m, map[string]k8sctx.TestFunc{
-		"TestHello": TestHello,
-	})
-}
diff --git a/tools/bigquery/bigquery.go b/tools/bigquery/bigquery.go
index 21ffb71bf1..ec27fc0a54 100644
--- a/tools/bigquery/bigquery.go
+++ b/tools/bigquery/bigquery.go
@@ -85,9 +85,12 @@ func (s *Suite) debugString(sb *strings.Builder, prefix string) {
 
 // Benchstat returns a benchstat-formatted output string.
 // See https://pkg.go.dev/golang.org/x/perf/cmd/benchstat
-// `includeConditions` contains names of `Condition`s that should be included
-// as part of the benchmark name.
-func (s *Suite) Benchstat(includeConditions []string) string {
+// `includeCondition` returns whether a `Condition` name should be included
+// as part of the benchmark name. If nil, all conditions are included.
+func (s *Suite) Benchstat(includeCondition func(string) bool) string {
+	if includeCondition == nil {
+		includeCondition = func(string) bool { return true }
+	}
 	var sb strings.Builder
 	benchmarkNames := make([]string, 0, len(s.Benchmarks))
 	benchmarks := make(map[string]*Benchmark, len(s.Benchmarks))
@@ -98,12 +101,8 @@ func (s *Suite) Benchstat(includeConditions []string) string {
 		}
 	}
 	sort.Strings(benchmarkNames)
-	includeConditionsMap := make(map[string]bool, len(includeConditions))
-	for _, condName := range includeConditions {
-		includeConditionsMap[condName] = true
-	}
 	for _, bmName := range benchmarkNames {
-		benchmarks[bmName].benchstat(&sb, s.Name, includeConditionsMap, s.Conditions)
+		benchmarks[bmName].benchstat(&sb, s.Name, includeCondition, s.Conditions)
 	}
 	return sb.String()
 }
@@ -153,20 +152,20 @@ func noSpace(s string) string {
 }
 
 // benchstat produces benchmark-formatted output for this Benchmark.
-func (bm *Benchmark) benchstat(sb *strings.Builder, suiteName string, includeConditions map[string]bool, suiteConditions []*Condition) {
+func (bm *Benchmark) benchstat(sb *strings.Builder, suiteName string, includeCondition func(string) bool, suiteConditions []*Condition) {
 	var conditionsStr string
 	conditionNames := make([]string, 0, len(suiteConditions)+len(bm.Condition))
 	conditionMap := make(map[string]string, len(suiteConditions)+len(bm.Condition))
 	for _, c := range suiteConditions {
 		cName := noSpace(c.Name)
-		if _, found := conditionMap[cName]; !found && includeConditions[cName] {
+		if _, found := conditionMap[cName]; !found && includeCondition(cName) {
 			conditionNames = append(conditionNames, cName)
 			conditionMap[cName] = noSpace(c.Value)
 		}
 	}
 	for _, c := range bm.Condition {
 		cName := noSpace(c.Name)
-		if _, found := conditionMap[cName]; !found && includeConditions[cName] {
+		if _, found := conditionMap[cName]; !found && includeCondition(cName) {
 			conditionNames = append(conditionNames, cName)
 			conditionMap[cName] = noSpace(c.Value)
 		}
diff --git a/tools/parsers/go_parser.go b/tools/parsers/go_parser.go
index cef869416d..d57a361fe5 100644
--- a/tools/parsers/go_parser.go
+++ b/tools/parsers/go_parser.go
@@ -53,8 +53,8 @@ func ParseOutput(output string, name string, official bool) (*bigquery.Suite, er
 //		*bigquery.Benchmark{
 //			Name: BenchmarkRuby
 //		 []*bigquery.Condition{
-//				{Name: GOMAXPROCS, 6}
 //				{Name: server_threads, 1}
+//				{Name: GOMAXPROCS, 6}
 //		 }
 //		 []*bigquery.Metric{
 //				{Name: ns/op, Unit: ns/op, Sample: 1397875880}
@@ -74,12 +74,17 @@ func parseLine(line string) (*bigquery.Benchmark, error) {
 		return nil, fmt.Errorf("expecting number of runs, got %s: %v", fields[1], err)
 	}
 
-	name, params, err := parseNameParams(fields[0])
+	nameComponents, params, err := tools.NameToParameters(fields[0])
 	if err != nil {
 		return nil, fmt.Errorf("parse name/params: %v", err)
 	}
 
-	bm := bigquery.NewBenchmark(name, iters)
+	// Treat the first name component as the benchmark name, and all other
+	// components as conditions with key = value.
+	bm := bigquery.NewBenchmark(nameComponents[0], iters)
+	for _, c := range nameComponents[1:] {
+		bm.AddCondition(c, c)
+	}
 	for _, p := range params {
 		bm.AddCondition(p.Name, p.Value)
 	}
@@ -94,40 +99,6 @@ func parseLine(line string) (*bigquery.Benchmark, error) {
 	return bm, nil
 }
 
-// parseNameParams parses the Name, GOMAXPROCS, and Params from the test.
-// Field here should be of the format TESTNAME/PARAMS-GOMAXPROCS.
-// Parameters will be separated by a "/" with individual params being
-// "name.value".
-func parseNameParams(field string) (string, []*tools.Parameter, error) {
-	var params []*tools.Parameter
-	// Remove GOMAXPROCS from end.
-	maxIndex := strings.LastIndex(field, "-")
-	if maxIndex < 0 {
-		return "", nil, fmt.Errorf("GOMAXPROCS not found: %s", field)
-	}
-	maxProcs := field[maxIndex+1:]
-	params = append(params, &tools.Parameter{
-		Name:  "GOMAXPROCS",
-		Value: maxProcs,
-	})
-
-	remainder := field[0:maxIndex]
-	index := strings.Index(remainder, "/")
-	if index == -1 {
-		return remainder, params, nil
-	}
-
-	name := remainder[0:index]
-	p := remainder[index+1:]
-
-	ps, err := tools.NameToParameters(p)
-	if err != nil {
-		return "", nil, fmt.Errorf("NameToParameters %s: %v", field, err)
-	}
-	params = append(params, ps...)
-	return name, params, nil
-}
-
 // makeMetric parses metrics and adds them to the passed Benchmark.
 func makeMetric(bm *bigquery.Benchmark, value, metric string) error {
 	switch metric {
diff --git a/tools/parsers/go_parser_test.go b/tools/parsers/go_parser_test.go
index 39a13b4afe..84fae05818 100644
--- a/tools/parsers/go_parser_test.go
+++ b/tools/parsers/go_parser_test.go
@@ -23,9 +23,10 @@ import (
 
 func TestParseLine(t *testing.T) {
 	testCases := []struct {
-		name string
-		data string
-		want *bigquery.Benchmark
+		name    string
+		data    string
+		want    *bigquery.Benchmark
+		wantErr bool
 	}{
 		{
 			name: "Iperf",
@@ -37,14 +38,14 @@ func TestParseLine(t *testing.T) {
 						Name:  "iterations",
 						Value: "1",
 					},
-					{
-						Name:  "GOMAXPROCS",
-						Value: "6",
-					},
 					{
 						Name:  "Upload",
 						Value: "Upload",
 					},
+					{
+						Name:  "GOMAXPROCS",
+						Value: "6",
+					},
 				},
 				Metric: []*bigquery.Metric{
 					{
@@ -70,14 +71,60 @@ func TestParseLine(t *testing.T) {
 						Name:  "iterations",
 						Value: "1",
 					},
+					{
+						Name:  "server_threads",
+						Value: "1",
+					},
 					{
 						Name:  "GOMAXPROCS",
 						Value: "6",
 					},
+				},
+				Metric: []*bigquery.Metric{
+					{
+						Name:   "ns/op",
+						Unit:   "ns/op",
+						Sample: 1397875880.0,
+					},
+					{
+						Name:   "average_latency",
+						Unit:   "sec",
+						Sample: 0.00710,
+					},
+					{
+						Name:   "requests_per_second",
+						Unit:   "QPS",
+						Sample: 140.0,
+					},
+				},
+			},
+		},
+		{
+			name: "Ruby with alternate parameter syntax",
+			data: "BenchmarkRuby/SubTest/server_threads=1/clients=8-6 1	1397875880 ns/op	0.00710 average_latency.s 140 requests_per_second.QPS",
+			want: &bigquery.Benchmark{
+				Name: "BenchmarkRuby",
+				Condition: []*bigquery.Condition{
+					{
+						Name:  "iterations",
+						Value: "1",
+					},
+					{
+						Name:  "SubTest",
+						Value: "SubTest",
+					},
 					{
 						Name:  "server_threads",
 						Value: "1",
 					},
+					{
+						Name:  "clients",
+						Value: "8",
+					},
+					{
+						Name:  "GOMAXPROCS",
+						Value: "6",
+					},
 				},
 				Metric: []*bigquery.Metric{
 					{
@@ -87,7 +134,7 @@ func TestParseLine(t *testing.T) {
 					},
 					{
 						Name:   "average_latency",
-						Unit:   "s",
+						Unit:   "sec",
 						Sample: 0.00710,
 					},
 					{
@@ -98,16 +145,57 @@ func TestParseLine(t *testing.T) {
 				},
 			},
 		},
+		{
+			name: "No GOMAXPROCS is allowed",
+			data: "BenchmarkRuby/server_threads.1/clients.8 1	1397875880 ns/op	0.00710 average_latency.s",
+			want: &bigquery.Benchmark{
+				Name: "BenchmarkRuby",
+				Condition: []*bigquery.Condition{
+					{
+						Name:  "iterations",
+						Value: "1",
+					},
+					{
+						Name:  "server_threads",
+						Value: "1",
+					},
+					{
+						Name:  "clients",
+						Value: "8",
+					},
+				},
+				Metric: []*bigquery.Metric{
+					{
+						Name:   "ns/op",
+						Unit:   "ns/op",
+						Sample: 1397875880.0,
+					},
+					{
+						Name:   "average_latency",
+						Unit:   "sec",
+						Sample: 0.00710,
+					},
+				},
+			},
+		},
+		{
+			name:    "Ambiguous parameter separator",
+			data:    "BenchmarkRuby/server_threads.4/clients=8 1	1397875880 ns/op	0.00710 average_latency.s",
+			wantErr: true,
+		},
 	}
 
 	for _, tc := range testCases {
 		t.Run(tc.name, func(t *testing.T) {
 			got, err := parseLine(tc.data)
-			if err != nil {
+			if err != nil && !tc.wantErr {
 				t.Fatalf("parseLine failed with: %v", err)
 			}
+			if err == nil && tc.wantErr {
+				t.Fatal("parseLine unexpectedly succeeded")
+			}
 
-			if !cmp.Equal(tc.want, got, nil) {
+			if err == nil && !cmp.Equal(tc.want, got, nil) {
 				for i := range got.Condition {
 					t.Logf("Metric: want: %+v got:%+v", got.Condition[i], tc.want.Condition[i])
 				}
@@ -146,13 +234,13 @@ func TestParseOutput(t *testing.T) {
 		{
 			name: "Ruby",
 			data: `BenchmarkRuby
-BenchmarkRuby/server_threads.1
-BenchmarkRuby/server_threads.1-6 1	1397875880 ns/op 0.00710 average_latency.s 140 requests_per_second.QPS
-BenchmarkRuby/server_threads.5
-BenchmarkRuby/server_threads.5-6 1	1416003331 ns/op	0.00950 average_latency.s 465 requests_per_second.QPS`,
+BenchmarkRuby/server_threads=1
+BenchmarkRuby/server_threads=1 1	1397875880 ns/op 0.00710 average_latency.s 140 requests_per_second.QPS
+BenchmarkRuby/server_threads=5
+BenchmarkRuby/server_threads=5 1	1416003331 ns/op	0.00950 average_latency.s 465 requests_per_second.QPS`,
 			numBenchmarks: 2,
 			numMetrics:    3,
-			numConditions: 3,
+			numConditions: 2,
 		},
 	}