Skip to content

Commit

Permalink
Support the DevicePluginCDIDevices feature gate
Browse files Browse the repository at this point in the history
This patch adds support for the `DevicePluginCDIDevices` feature gate by
adding `spec.operator.useDevicePluginCDIDevicesFeature` to
`ClusterPolicy`.  When this field is set, the operator sets the
`DEVICE_LIST_STRATEGY` device plug-in environment variable to `cdi-cri`.

Signed-off-by: Jean-Francois Roy <[email protected]>
  • Loading branch information
jfroy committed Nov 6, 2024
1 parent 8fa9ed5 commit ef79ad3
Show file tree
Hide file tree
Showing 8 changed files with 39 additions and 1 deletion.
12 changes: 12 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,9 @@ type OperatorSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="On OpenShift, enable DriverToolkit image to build and install driver modules"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
UseOpenShiftDriverToolkit *bool `json:"use_ocp_driver_toolkit,omitempty"`

// UseDevicePluginCDIDevicesFeature indicates if the device plug-in should be configured to use the CDI devices feature
UseDevicePluginCDIDevicesFeature *bool `json:"useDevicePluginCDIDevicesFeature,omitempty"`
}

// HostPathsSpec defines various paths on the host needed by GPU Operator components
Expand Down Expand Up @@ -1827,6 +1830,15 @@ func ImagePullPolicy(pullPolicy string) corev1.PullPolicy {
return imagePullPolicy
}

// DevicePluginCDIDevicesFeatureEnabled reports whether the operator spec opts
// in to the DevicePluginCDIDevices feature. An unset field counts as disabled.
func (s *OperatorSpec) DevicePluginCDIDevicesFeatureEnabled() bool {
	if flag := s.UseDevicePluginCDIDevicesFeature; flag != nil {
		return *flag
	}
	// The user did not specify the field, so the feature stays off.
	return false
}

// IsEnabled returns true if driver install is enabled(default) through gpu-operator
func (d *DriverSpec) IsEnabled() bool {
if d.Enabled == nil {
Expand Down
5 changes: 5 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1558,6 +1558,10 @@ spec:
image should be used on OpenShift to build and install driver
modules
type: boolean
useDevicePluginCDIDevicesFeature:
description: UseDevicePluginCDIDevicesFeature indicates if the device plug-in
should be configured to use the CDI devices feature
type: boolean
required:
- defaultRuntime
type: object
Expand Down
4 changes: 4 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1558,6 +1558,10 @@ spec:
image should be used on OpenShift to build and install driver
modules
type: boolean
useDevicePluginCDIDevicesFeature:
description: UseDevicePluginCDIDevicesFeature indicates if the
device plug-in should be configured to use the CDI devices feature
type: boolean
required:
- defaultRuntime
type: object
Expand Down
6 changes: 5 additions & 1 deletion controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -1398,7 +1398,11 @@ func TransformDevicePlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpe
// update env required for CDI support
if config.CDI.IsEnabled() {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIEnabledEnvName, "true")
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "envvar,cdi-annotations")
if config.Operator.DevicePluginCDIDevicesFeatureEnabled() {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "cdi-cri")
} else {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), DeviceListStrategyEnvName, "envvar,cdi-annotations")
}
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), CDIAnnotationPrefixEnvName, "nvidia.cdi.k8s.io/")
if config.Toolkit.IsEnabled() {
setContainerEnv(&(obj.Spec.Template.Spec.Containers[0]), NvidiaCDIHookPathEnvName, filepath.Join(config.Toolkit.InstallDir, "toolkit/nvidia-cdi-hook"))
Expand Down
4 changes: 4 additions & 0 deletions deployments/gpu-operator/crds/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1558,6 +1558,10 @@ spec:
image should be used on OpenShift to build and install driver
modules
type: boolean
useDevicePluginCDIDevicesFeature:
description: UseDevicePluginCDIDevicesFeature indicates if the device plug-in
should be configured to use the CDI devices feature
type: boolean
required:
- defaultRuntime
type: object
Expand Down
3 changes: 3 additions & 0 deletions deployments/gpu-operator/templates/clusterpolicy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ spec:
{{- if .Values.operator.use_ocp_driver_toolkit }}
use_ocp_driver_toolkit: {{ .Values.operator.use_ocp_driver_toolkit }}
{{- end }}
{{- if .Values.operator.useDevicePluginCDIDevicesFeature }}
useDevicePluginCDIDevicesFeature: {{ .Values.operator.useDevicePluginCDIDevicesFeature }}
{{- end }}
daemonsets:
labels:
{{- include "gpu-operator.operand-labels" . | nindent 6 }}
Expand Down
2 changes: 2 additions & 0 deletions deployments/gpu-operator/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ operator:
# upgrade CRD on chart upgrade, requires --disable-openapi-validation flag
# to be passed during helm upgrade.
upgradeCRD: true
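# configure the device plug-in for the DevicePluginCDIDevices feature gate;
# when CDI is enabled, this sets DEVICE_LIST_STRATEGY to cdi-cri.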
useDevicePluginCDIDevicesFeature: false
initContainer:
image: cuda
repository: nvcr.io/nvidia
Expand Down

0 comments on commit ef79ad3

Please sign in to comment.