diff --git a/assets/state-node-status-exporter/0200_role.yaml b/assets/state-node-status-exporter/0200_role.yaml index da164c69d..d74b46a94 100644 --- a/assets/state-node-status-exporter/0200_role.yaml +++ b/assets/state-node-status-exporter/0200_role.yaml @@ -19,3 +19,11 @@ rules: verbs: - get - list +- apiGroups: + - apps + resources: + - daemonsets + verbs: + - get + - list + - watch diff --git a/assets/state-node-status-exporter/0700_daemonset.yaml b/assets/state-node-status-exporter/0700_daemonset.yaml index 6ec4036bc..2d6d830ad 100644 --- a/assets/state-node-status-exporter/0700_daemonset.yaml +++ b/assets/state-node-status-exporter/0700_daemonset.yaml @@ -40,6 +40,10 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: OPERATOR_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace ports: - name: node-status containerPort: 8000 diff --git a/cmd/gpu-operator/main.go b/cmd/gpu-operator/main.go index bbbe14e6d..076362c12 100644 --- a/cmd/gpu-operator/main.go +++ b/cmd/gpu-operator/main.go @@ -27,6 +27,7 @@ import ( // to ensure that exec-entrypoint and run can make use of them. "go.uber.org/zap/zapcore" _ "k8s.io/client-go/plugin/pkg/client/auth" + "sigs.k8s.io/controller-runtime/pkg/cache" apiconfigv1 "github.com/openshift/api/config/v1" apiimagev1 "github.com/openshift/api/image/v1" @@ -49,6 +50,7 @@ import ( nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1" "github.com/NVIDIA/gpu-operator/controllers" "github.com/NVIDIA/gpu-operator/controllers/clusterinfo" + "github.com/NVIDIA/gpu-operator/internal/consts" "github.com/NVIDIA/gpu-operator/internal/info" // +kubebuilder:scaffold:imports ) @@ -104,6 +106,16 @@ func main() { Port: 9443, }) + operatorNamespace := os.Getenv("OPERATOR_NAMESPACE") + openshiftNamespace := consts.OpenshiftNamespace + cacheOptions := cache.Options{ + DefaultNamespaces: map[string]cache.Config{ + operatorNamespace: {}, + // Also cache resources in the openshift namespace to retrieve ImageStreams when on an openshift cluster + openshiftNamespace: {}, + }, + } + options := ctrl.Options{ Scheme: scheme, Metrics: metricsOptions, @@ -111,6 +123,7 @@ func main() { LeaderElection: enableLeaderElection, LeaderElectionID: "53822513.nvidia.com", WebhookServer: webhookServer, + Cache: cacheOptions, } if enableLeaderElection && int(renewDeadline) != 0 { diff --git a/controllers/clusterinfo/clusterinfo.go b/controllers/clusterinfo/clusterinfo.go index 8f0cec78c..b1de7fe43 100644 --- a/controllers/clusterinfo/clusterinfo.go +++ b/controllers/clusterinfo/clusterinfo.go @@ -341,7 +341,7 @@ func getOpenshiftDTKImages(ctx context.Context, c *rest.Config) map[string]strin logger := log.FromContext(ctx) name := "driver-toolkit" - namespace := "openshift" + namespace := consts.OpenshiftNamespace ocpImageClient, err := imagesv1.NewForConfig(c) if err != nil { diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 376302fb7..7073b8378 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -52,6 +52,7 @@ import ( "sigs.k8s.io/yaml" gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1" + "github.com/NVIDIA/gpu-operator/internal/consts" "github.com/NVIDIA/gpu-operator/internal/utils" ) @@ -3705,7 +3706,7 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) { ctx := n.ctx found := &apiimagev1.ImageStream{} name := "driver-toolkit" - namespace := "openshift" + namespace := consts.OpenshiftNamespace err := n.client.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, found) if err != nil { if apierrors.IsNotFound(err) { diff --git a/internal/consts/consts.go b/internal/consts/consts.go index a65c3027d..c2850f419 100644 --- a/internal/consts/consts.go +++ b/internal/consts/consts.go @@ -39,6 +39,9 @@ const ( // Containerd runtime Containerd = "containerd" + // OpenshiftNamespace indicates the main namespace of an Openshift cluster + OpenshiftNamespace = "openshift" + OcpDriverToolkitVersionLabel = "openshift.driver-toolkit.rhcos" OcpDriverToolkitIdentificationLabel = "openshift.driver-toolkit" NfdOSTreeVersionLabelKey = "feature.node.kubernetes.io/system-os_release.OSTREE_VERSION" diff --git a/validator/metrics.go b/validator/metrics.go index 0d62119bd..4105dd166 100644 --- a/validator/metrics.go +++ b/validator/metrics.go @@ -238,6 +238,7 @@ func (nm *NodeMetrics) watchDriverValidation() { nm.driverValidation.Set(1) nm.driverValidationLastSuccess.Set(float64(time.Now().Unix())) } else { + log.Errorf("failed to validate driver: %v", err) nm.driverValidation.Set(0) } time.Sleep(driverValidationCheckDelaySeconds * time.Second)