Skip to content

Commit

Permalink
Merge branch 'CNT-4706/open-kernel-modules' into 'master'
Browse files Browse the repository at this point in the history
Add useOpenKernelModules option to both ClusterPolicy and NVIDIADriver API

See merge request nvidia/kubernetes/gpu-operator!924
  • Loading branch information
cdesiniotis committed Nov 2, 2023
2 parents 65d2850 + 7b0b609 commit 911c9e4
Show file tree
Hide file tree
Showing 12 changed files with 66 additions and 1 deletion.
15 changes: 15 additions & 0 deletions api/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,12 @@ type DriverSpec struct {
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
UsePrecompiled *bool `json:"usePrecompiled,omitempty"`

// UseOpenKernelModules indicates if the open GPU kernel modules should be used
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable use of open GPU kernel modules"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
UseOpenKernelModules *bool `json:"useOpenKernelModules,omitempty"`

// Enabled indicates if deployment of NVIDIA Driver through operator is enabled
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA Driver deployment through GPU Operator"
Expand Down Expand Up @@ -1755,6 +1761,15 @@ func (d *DriverSpec) UsePrecompiledDrivers() bool {
return *d.UsePrecompiled
}

// OpenKernelModulesEnabled reports whether the driver should be installed
// using the open GPU kernel modules. An unset UseOpenKernelModules field is
// treated as false.
func (d *DriverSpec) OpenKernelModulesEnabled() bool {
	if flag := d.UseOpenKernelModules; flag != nil {
		return *flag
	}
	// the option was not specified by the user, so fall back to the default
	return false
}

// IsEnabled returns true if device-plugin is enabled (default) through gpu-operator
func (p *DevicePluginSpec) IsEnabled() bool {
if p.Enabled == nil {
Expand Down
5 changes: 5 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions api/v1alpha1/nvidiadriver_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ type NVIDIADriverSpec struct {
// +kubebuilder:validation:XValidation:rule="self == oldSelf",message="usePrecompiled is an immutable field. Please create a new NvidiaDriver resource instead when you want to change this setting."
UsePrecompiled *bool `json:"usePrecompiled,omitempty"`

// UseOpenKernelModules indicates if the open GPU kernel modules should be used
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable use of open GPU kernel modules"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
UseOpenKernelModules *bool `json:"useOpenKernelModules,omitempty"`

// NVIDIA Driver container startup probe settings
StartupProbe *ContainerProbeSpec `json:"startupProbe,omitempty"`

Expand Down
5 changes: 5 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions bundle/manifests/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,10 @@ spec:
description: UseNvidiaDriverCRD indicates if the deployment of
NVIDIA Driver is managed by the NVIDIADriver CRD type
type: boolean
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA
Driver using pre-compiled modules is enabled
Expand Down
4 changes: 4 additions & 0 deletions bundle/manifests/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,10 @@ spec:
minimum: 1
type: integer
type: object
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA Driver
using pre-compiled modules is enabled
Expand Down
4 changes: 4 additions & 0 deletions config/crd/bases/nvidia.com_clusterpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,10 @@ spec:
description: UseNvidiaDriverCRD indicates if the deployment of
NVIDIA Driver is managed by the NVIDIADriver CRD type
type: boolean
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA
Driver using pre-compiled modules is enabled
Expand Down
4 changes: 4 additions & 0 deletions config/crd/bases/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,10 @@ spec:
minimum: 1
type: integer
type: object
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA Driver
using pre-compiled modules is enabled
Expand Down
8 changes: 7 additions & 1 deletion controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,8 @@ const (
PodControllerRevisionHashLabelKey = "controller-revision-hash"
// DefaultCCModeEnvName is the name of the envvar for configuring default CC mode on all compatible GPUs on the node
DefaultCCModeEnvName = "DEFAULT_CC_MODE"
// OpenKernelModulesEnabledEnvName is the name of the driver-container envvar for enabling open GPU kernel module support
OpenKernelModulesEnabledEnvName = "OPEN_KERNEL_MODULES_ENABLED"
)

// ContainerProbe defines container probe types
Expand Down Expand Up @@ -2859,12 +2861,16 @@ func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy
if len(config.Driver.Args) > 0 {
driverContainer.Args = config.Driver.Args
}
// set/append environment variables for exporter container
// set/append environment variables for driver container
if len(config.Driver.Env) > 0 {
for _, env := range config.Driver.Env {
setContainerEnv(driverContainer, env.Name, env.Value)
}
}
if config.Driver.OpenKernelModulesEnabled() {
setContainerEnv(driverContainer, OpenKernelModulesEnabledEnvName, "true")
}

// set container probe timeouts
if config.Driver.StartupProbe != nil {
setContainerProbe(driverContainer, config.Driver.StartupProbe, Startup)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -939,6 +939,10 @@ spec:
description: UseNvidiaDriverCRD indicates if the deployment of
NVIDIA Driver is managed by the NVIDIADriver CRD type
type: boolean
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA
Driver using pre-compiled modules is enabled
Expand Down
4 changes: 4 additions & 0 deletions deployments/gpu-operator/crds/nvidia.com_nvidiadrivers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,10 @@ spec:
minimum: 1
type: integer
type: object
useOpenKernelModules:
description: UseOpenKernelModules indicates if the open GPU kernel
modules should be used
type: boolean
usePrecompiled:
description: UsePrecompiled indicates if deployment of NVIDIA Driver
using pre-compiled modules is enabled
Expand Down
4 changes: 4 additions & 0 deletions manifests/state-driver/0500_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,10 @@ spec:
# always use runc for driver containers
- name: NVIDIA_VISIBLE_DEVICES
value: void
# Set OPEN_KERNEL_MODULES_ENABLED on the driver container when open GPU kernel modules are requested.
# NOTE(review): UseOpenKernelModules looks like a *bool on the driver spec; Go templates treat any
# non-nil pointer as true, so an explicit `false` would presumably still emit this env var - confirm.
{{- if .Driver.Spec.UseOpenKernelModules }}
- name: OPEN_KERNEL_MODULES_ENABLED
value: "true"
{{- end }}
{{- if and (.Openshift) (.Runtime.OpenshiftVersion) }}
- name: OPENSHIFT_VERSION
value: {{ .Runtime.OpenshiftVersion | quote }}
Expand Down

0 comments on commit 911c9e4

Please sign in to comment.