aws · cartermckinnon · Jul 18, 2024 · Jun 27, 2024 · Jun 27, 2024 · Jul 11, 2024
diff --git a/e2e2/test/cases/inference/bert_inference_test.go b/e2e2/test/cases/inference/bert_inference_test.go
@@ -0,0 +1,75 @@
+package inference
+
+import (
+	"context"
+	_ "embed"
+	"fmt"
+	"testing"
+	"time"
+
+	fwext "github.com/aws/aws-k8s-tester/e2e2/internal/framework_extensions"
+	"sigs.k8s.io/e2e-framework/klient/wait"
+	"sigs.k8s.io/e2e-framework/pkg/envconf"
+	"sigs.k8s.io/e2e-framework/pkg/features"
+
+	batchv1 "k8s.io/api/batch/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+var (
+	// bertInferenceManifest is the raw Job manifest template embedded from
+	// manifests/bert-inference.yaml.
+	//
+	//go:embed manifests/bert-inference.yaml
+	bertInferenceManifest         []byte
+	// renderedBertInferenceManifest is the template after rendering. It is
+	// assigned in TestBertInference's Setup step and read again in Teardown so
+	// the same rendered bytes are used for apply and delete.
+	renderedBertInferenceManifest []byte
+)
+
+// bertInferenceManifestTplVars holds the values passed to
+// fwext.RenderManifests when rendering the bert-inference Job template.
+type bertInferenceManifestTplVars struct {
+	// BertInferenceImage fills the {{.BertInferenceImage}} placeholder (the
+	// container image of the Job).
+	BertInferenceImage string
+	// InferenceMode fills the {{.InferenceMode}} placeholder (the value of the
+	// INFERENCE_MODE environment variable).
+	InferenceMode      string
+}
+
+// TestBertInference renders and applies the bert-inference Job manifest,
+// waits for the Job to succeed, and deletes the manifest on teardown.
+func TestBertInference(t *testing.T) {
+	// applyJob renders the embedded template from the flag values and applies
+	// the result to the cluster.
+	applyJob := func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
+		if *bertInferenceImage == "" {
+			t.Fatal(fmt.Errorf("bertInferenceImage must be set to run the test"))
+		}
+
+		tplVars := bertInferenceManifestTplVars{
+			BertInferenceImage: *bertInferenceImage,
+			InferenceMode:      *inferenceMode,
+		}
+
+		// Assign straight into the package-level variable so Teardown can
+		// delete exactly what was applied here.
+		var renderErr error
+		renderedBertInferenceManifest, renderErr = fwext.RenderManifests(bertInferenceManifest, tplVars)
+		if renderErr != nil {
+			t.Fatal(renderErr)
+		}
+
+		if applyErr := fwext.ApplyManifests(cfg.Client().RESTConfig(), renderedBertInferenceManifest); applyErr != nil {
+			t.Fatal(applyErr)
+		}
+		return ctx
+	}
+
+	// awaitJob blocks until the bert-inference Job in the default namespace
+	// reports success, or fails the test after 20 minutes.
+	awaitJob := func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
+		target := &batchv1.Job{
+			ObjectMeta: metav1.ObjectMeta{Name: "bert-inference", Namespace: "default"},
+		}
+		succeeded := fwext.NewConditionExtension(cfg.Client().Resources()).JobSucceeded(target)
+		if waitErr := wait.For(succeeded, wait.WithTimeout(time.Minute*20)); waitErr != nil {
+			t.Fatal(waitErr)
+		}
+		return ctx
+	}
+
+	// deleteJob removes the objects created from the rendered manifest.
+	deleteJob := func(ctx context.Context, t *testing.T, cfg *envconf.Config) context.Context {
+		if deleteErr := fwext.DeleteManifests(cfg.Client().RESTConfig(), renderedBertInferenceManifest); deleteErr != nil {
+			t.Fatal(deleteErr)
+		}
+		return ctx
+	}
+
+	feature := features.New("bert-inference").
+		WithLabel("suite", "nvidia").
+		WithLabel("hardware", "gpu").
+		Setup(applyJob).
+		Assess("BERT inference Job succeeds", awaitJob).
+		Teardown(deleteJob).
+		Feature()
+
+	testenv.Test(t, feature)
+}
diff --git a/e2e2/test/cases/inference/main_test.go b/e2e2/test/cases/inference/main_test.go
@@ -0,0 +1,79 @@
+package inference
+
+import (
+	"context"
+	_ "embed"
+	"flag"
+	"log"
+	"os"
+	"slices"
+	"testing"
+	"time"
+
+	fwext "github.com/aws/aws-k8s-tester/e2e2/internal/framework_extensions"
+	appsv1 "k8s.io/api/apps/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/e2e-framework/klient/wait"
+	"sigs.k8s.io/e2e-framework/pkg/env"
+	"sigs.k8s.io/e2e-framework/pkg/envconf"
+)
+
+var (
+	// testenv is the shared test environment; TestMain creates it and the
+	// tests in this package run their features through it.
+	testenv            env.Environment
+	// bertInferenceImage points at the value of the -bertInferenceImage flag
+	// (registered in TestMain, default "").
+	bertInferenceImage *string
+	// inferenceMode points at the value of the -inferenceMode flag
+	// (registered in TestMain, default "throughput").
+	inferenceMode      *string
+)
+
+var (
+	// nvidiaDevicePluginManifest is the NVIDIA device plugin DaemonSet
+	// manifest embedded from manifests/nvidia-device-plugin.yaml. TestMain
+	// applies it before the tests run and deletes it afterwards.
+	//
+	//go:embed manifests/nvidia-device-plugin.yaml
+	nvidiaDevicePluginManifest []byte
+)
+
+// TestMain registers the suite's flags, builds the shared test environment,
+// installs the NVIDIA device plugin before any test runs, and removes it
+// again once all tests have finished. The process exits with the status code
+// returned by the environment's Run.
+func TestMain(m *testing.M) {
+	// Register suite-specific flags before the environment config is built
+	// from the command-line flags.
+	bertInferenceImage = flag.String("bertInferenceImage", "", "test image for BERT inference")
+	inferenceMode = flag.String("inferenceMode", "throughput", "inference mode to set for BERT inference test")
+	cfg, err := envconf.NewFromFlags()
+	if err != nil {
+		log.Fatalf("failed to initialize test environment: %v", err)
+	}
+	testenv = env.NewWithConfig(cfg)
+
+	// all inference tests require the NVIDIA device plugin
+	manifests := [][]byte{
+		nvidiaDevicePluginManifest,
+	}
+
+	testenv.Setup(
+		// Apply the shared manifests.
+		func(ctx context.Context, config *envconf.Config) (context.Context, error) {
+			if err := fwext.ApplyManifests(config.Client().RESTConfig(), manifests...); err != nil {
+				return ctx, err
+			}
+			return ctx, nil
+		},
+		// Wait up to five minutes for the device plugin DaemonSet to be ready.
+		func(ctx context.Context, config *envconf.Config) (context.Context, error) {
+			ds := appsv1.DaemonSet{
+				ObjectMeta: metav1.ObjectMeta{Name: "nvidia-device-plugin-daemonset", Namespace: "kube-system"},
+			}
+			ready := fwext.NewConditionExtension(config.Client().Resources()).DaemonSetReady(&ds)
+			if err := wait.For(ready, wait.WithTimeout(time.Minute*5)); err != nil {
+				return ctx, err
+			}
+			return ctx, nil
+		},
+	)
+
+	testenv.Finish(
+		func(ctx context.Context, config *envconf.Config) (context.Context, error) {
+			// Delete in the reverse of the apply order. Reverse a copy so the
+			// manifests slice shared with the Setup step keeps its order.
+			reversed := slices.Clone(manifests)
+			slices.Reverse(reversed)
+			// Keep the error local to this closure instead of writing to the
+			// err variable captured from TestMain.
+			if err := fwext.DeleteManifests(config.Client().RESTConfig(), reversed...); err != nil {
+				return ctx, err
+			}
+			return ctx, nil
+		},
+	)
+
+	os.Exit(testenv.Run(m))
+}
diff --git a/e2e2/test/cases/inference/manifests/bert-inference.yaml b/e2e2/test/cases/inference/manifests/bert-inference.yaml
@@ -0,0 +1,18 @@
+# Job template for the BERT inference test. The {{.BertInferenceImage}} and
+# {{.InferenceMode}} placeholders are filled from bertInferenceManifestTplVars
+# before the manifest is applied.
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: bert-inference
+spec:
+  template:
+    spec:
+      containers:
+        - name: bert-inference
+          image: {{.BertInferenceImage}}
+          command: ["python", "infer.py"]
+          env:
+            - name: INFERENCE_MODE
+              # Quoted so the rendered value is always a YAML string: env var
+              # values must be strings, and an unquoted value such as 1 or
+              # true would be parsed as a number or boolean instead.
+              value: "{{.InferenceMode}}"
+          resources:
+            limits:
+              # Request a single GPU for the inference container.
+              nvidia.com/gpu: 1
+      restartPolicy: OnFailure
diff --git a/e2e2/test/cases/inference/manifests/nvidia-device-plugin.yaml b/e2e2/test/cases/inference/manifests/nvidia-device-plugin.yaml
@@ -0,0 +1,56 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DaemonSet for the NVIDIA k8s-device-plugin container. TestMain applies this
+# manifest and waits for the DaemonSet named below (in kube-system) to become
+# ready before any test runs.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-daemonset
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      name: nvidia-device-plugin-ds
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        name: nvidia-device-plugin-ds
+    spec:
+      # Tolerate the nvidia.com/gpu NoSchedule taint so the pod can be placed
+      # on nodes carrying that taint.
+      tolerations:
+      - key: nvidia.com/gpu
+        operator: Exists
+        effect: NoSchedule
+      # Mark this pod as a critical add-on; when enabled, the critical add-on
+      # scheduler reserves resources for critical add-on pods so that they can
+      # be rescheduled after a failure.
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      priorityClassName: "system-node-critical"
+      containers:
+      - image: nvcr.io/nvidia/k8s-device-plugin:v0.14.2
+        name: nvidia-device-plugin-ctr
+        env:
+          # NOTE(review): presumably keeps the plugin running when
+          # initialization fails (e.g. on a node without a GPU) - confirm
+          # against the device plugin documentation.
+          - name: FAIL_ON_INIT_ERROR
+            value: "false"
+        # Run without privilege escalation and with all capabilities dropped.
+        securityContext:
+          allowPrivilegeEscalation: false
+          capabilities:
+            drop: ["ALL"]
+        # The host's /var/lib/kubelet/device-plugins directory is mounted into
+        # the container at the same path.
+        volumeMounts:
+        - name: device-plugin
+          mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+      - name: device-plugin
+        hostPath:
+          path: /var/lib/kubelet/device-plugins