Merge branch 'nvdriver-fix-conditions' into 'master'
Ensure NVIDIADriver CR status.state is non-empty when setting conditions

See merge request nvidia/kubernetes/gpu-operator!927
cdesiniotis committed Oct 31, 2023
2 parents 0555bc5 + d553c48 commit 65d2850
Showing 2 changed files with 19 additions and 2 deletions.
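The gist of the change, as a minimal self-contained sketch. The State and Driver types below are simplified stand-ins for the operator's nvidiav1alpha1 API, and this setConditionsError is a hypothetical reduction of the real updater, not the repository's code:

// Sketch of the pattern this commit enforces: callers set status.state on
// error paths, and the condition updater falls back to NotReady if they
// did not, so status.state is never persisted empty.
package main

import "fmt"

type State string

const NotReady State = "notReady"

type Driver struct {
	Status struct {
		State State
	}
}

// setConditionsError mirrors the guard added in internal/conditions/nvidiadriver.go:
// default Status.State to NotReady before the status update is persisted.
func setConditionsError(d *Driver, message string) {
	if d.Status.State == "" {
		d.Status.State = NotReady // fallback so status.state is never empty
	}
	fmt.Printf("state=%q message=%q\n", d.Status.State, message)
}

func main() {
	// Error path as in the controller changes: the caller sets the state
	// explicitly before recording the error condition.
	withState := &Driver{}
	withState.Status.State = NotReady
	setConditionsError(withState, "Error getting ClusterPolicy list")

	// A caller that forgets to set the state still ends up with a
	// non-empty status.state thanks to the updater's fallback.
	forgotten := &Driver{}
	setConditionsError(forgotten, "no ClusterPolicy object found in the cluster")
}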
13 changes: 11 additions & 2 deletions controllers/nvidiadriver_controller.go
@@ -89,11 +89,12 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 		}
 		err = fmt.Errorf("Error getting NVIDIADriver object: %w", err)
 		logger.V(consts.LogLevelError).Error(nil, err.Error())
-		// Error reading the object - requeue the request.
+		instance.Status.State = nvidiav1alpha1.NotReady
 		condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
 		if condErr != nil {
 			logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
 		}
+		// Error reading the object - requeue the request.
 		return reconcile.Result{}, err
 	}
 
@@ -103,6 +104,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 	if err != nil {
 		err = fmt.Errorf("Error getting ClusterPolicy list: %v", err)
 		logger.V(consts.LogLevelError).Error(nil, err.Error())
+		instance.Status.State = nvidiav1alpha1.NotReady
 		condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
 		if condErr != nil {
 			logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
@@ -113,6 +115,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 	if len(clusterPolicyList.Items) == 0 {
 		err = fmt.Errorf("no ClusterPolicy object found in the cluster")
 		logger.V(consts.LogLevelError).Error(nil, err.Error())
+		instance.Status.State = nvidiav1alpha1.NotReady
 		condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
 		if condErr != nil {
 			logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
@@ -135,13 +138,18 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 	// is deployed per GPU node.
 	err = r.nodeSelectorValidator.Validate(ctx, instance)
 	if err != nil {
-		_ = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ConflictingNodeSelector, err.Error())
+		logger.V(consts.LogLevelError).Error(nil, err.Error())
+		condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ConflictingNodeSelector, err.Error())
+		if condErr != nil {
+			logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
+		}
 		return reconcile.Result{}, nil
 	}
 
 	if instance.Spec.UsePrecompiledDrivers() && instance.Spec.IsGDSEnabled() {
 		err = fmt.Errorf("GPUDirect Storage driver (nvidia-fs) is not supported along with pre-compiled NVIDIA drivers")
 		logger.V(consts.LogLevelError).Error(nil, err.Error())
+		instance.Status.State = nvidiav1alpha1.NotReady
 		condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
 		if condErr != nil {
 			logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
@@ -152,6 +160,7 @@ func (r *NVIDIADriverReconciler) Reconcile(ctx context.Context, req ctrl.Request
 	if instance.Spec.DriverType == nvidiav1alpha1.VGPUHostManager {
 		err = fmt.Errorf("vgpu-host-manager driver type is not supported through NVIDIADriver CR")
 		logger.V(consts.LogLevelError).Error(nil, err.Error())
+		instance.Status.State = nvidiav1alpha1.NotReady
 		condErr = r.conditionUpdater.SetConditionsError(ctx, instance, conditions.ReconcileFailed, err.Error())
 		if condErr != nil {
 			logger.V(consts.LogLevelDebug).Error(nil, condErr.Error())
8 changes: 8 additions & 0 deletions internal/conditions/nvidiadriver.go
@@ -102,5 +102,13 @@ func (u *nvDriverUpdater) setConditionsError(ctx context.Context, cr *nvidiav1al
 		Message: message,
 	})
 
+	// Ensure status.state is not empty when updating the CR status.
+	// The caller should set the state appropriately in the CR
+	// depending on the error condition.
+	instance.Status.State = cr.Status.State
+	if instance.Status.State == "" {
+		instance.Status.State = nvidiav1alpha1.NotReady
+	}
+
 	return u.client.Status().Update(ctx, instance)
 }
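The design is belt-and-braces: each controller error path now sets status.state explicitly, and setConditionsError enforces a NotReady fallback so the field can never be persisted empty. A hedged sketch of how that fallback could be unit-tested; defaultState and these types are hypothetical, not taken from the repository:

package main

import "testing"

type State string

const NotReady State = "notReady"

// defaultState reproduces the added guard in isolation: copy the
// caller-provided state and fall back to NotReady when it is empty.
func defaultState(callerState State) State {
	if callerState == "" {
		return NotReady
	}
	return callerState
}

func TestDefaultState(t *testing.T) {
	cases := []struct {
		in, want State
	}{
		{"", NotReady},           // empty state is defaulted
		{"notReady", "notReady"}, // explicit state is preserved
		{"ready", "ready"},
	}
	for _, c := range cases {
		if got := defaultState(c.in); got != c.want {
			t.Errorf("defaultState(%q) = %q, want %q", c.in, got, c.want)
		}
	}
}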
