Skip to content

Commit

Permalink
Merge pull request #894 from NVIDIA/cherry-pick_fixes-for-openshift
Browse files Browse the repository at this point in the history
Cherry-pick fixes for OpenShift
  • Loading branch information
tariq1890 authored Aug 2, 2024
2 parents 1d0a78c + 2032e2d commit 2ad39e6
Show file tree
Hide file tree
Showing 7 changed files with 32 additions and 2 deletions.
8 changes: 8 additions & 0 deletions assets/state-node-status-exporter/0200_role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,11 @@ rules:
verbs:
- get
- list
- apiGroups:
- apps
resources:
- daemonsets
verbs:
- get
- list
- watch
4 changes: 4 additions & 0 deletions assets/state-node-status-exporter/0700_daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
- name: OPERATOR_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
ports:
- name: node-status
containerPort: 8000
Expand Down
13 changes: 13 additions & 0 deletions cmd/gpu-operator/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
// to ensure that exec-entrypoint and run can make use of them.
"go.uber.org/zap/zapcore"
_ "k8s.io/client-go/plugin/pkg/client/auth"
"sigs.k8s.io/controller-runtime/pkg/cache"

apiconfigv1 "github.com/openshift/api/config/v1"
apiimagev1 "github.com/openshift/api/image/v1"
Expand All @@ -49,6 +50,7 @@ import (
nvidiav1alpha1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1alpha1"
"github.com/NVIDIA/gpu-operator/controllers"
"github.com/NVIDIA/gpu-operator/controllers/clusterinfo"
"github.com/NVIDIA/gpu-operator/internal/consts"
"github.com/NVIDIA/gpu-operator/internal/info"
// +kubebuilder:scaffold:imports
)
Expand Down Expand Up @@ -104,13 +106,24 @@ func main() {
Port: 9443,
})

operatorNamespace := os.Getenv("OPERATOR_NAMESPACE")
openshiftNamespace := consts.OpenshiftNamespace
cacheOptions := cache.Options{
DefaultNamespaces: map[string]cache.Config{
operatorNamespace: {},
// Also cache resources in the "openshift" namespace so that ImageStreams can be retrieved when running on an OpenShift cluster.
openshiftNamespace: {},
},
}

options := ctrl.Options{
Scheme: scheme,
Metrics: metricsOptions,
HealthProbeBindAddress: probeAddr,
LeaderElection: enableLeaderElection,
LeaderElectionID: "53822513.nvidia.com",
WebhookServer: webhookServer,
Cache: cacheOptions,
}

if enableLeaderElection && int(renewDeadline) != 0 {
Expand Down
2 changes: 1 addition & 1 deletion controllers/clusterinfo/clusterinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,7 +341,7 @@ func getOpenshiftDTKImages(ctx context.Context, c *rest.Config) map[string]strin
logger := log.FromContext(ctx)

name := "driver-toolkit"
namespace := "openshift"
namespace := consts.OpenshiftNamespace

ocpImageClient, err := imagesv1.NewForConfig(c)
if err != nil {
Expand Down
3 changes: 2 additions & 1 deletion controllers/object_controls.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ import (
"sigs.k8s.io/yaml"

gpuv1 "github.com/NVIDIA/gpu-operator/api/nvidia/v1"
"github.com/NVIDIA/gpu-operator/internal/consts"
"github.com/NVIDIA/gpu-operator/internal/utils"
)

Expand Down Expand Up @@ -3705,7 +3706,7 @@ func ocpHasDriverToolkitImageStream(n *ClusterPolicyController) (bool, error) {
ctx := n.ctx
found := &apiimagev1.ImageStream{}
name := "driver-toolkit"
namespace := "openshift"
namespace := consts.OpenshiftNamespace
err := n.client.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, found)
if err != nil {
if apierrors.IsNotFound(err) {
Expand Down
3 changes: 3 additions & 0 deletions internal/consts/consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ const (
// Containerd runtime
Containerd = "containerd"

// OpenshiftNamespace is the name of the "openshift" namespace on an OpenShift cluster, where the driver-toolkit ImageStream is looked up.
OpenshiftNamespace = "openshift"

OcpDriverToolkitVersionLabel = "openshift.driver-toolkit.rhcos"
OcpDriverToolkitIdentificationLabel = "openshift.driver-toolkit"
NfdOSTreeVersionLabelKey = "feature.node.kubernetes.io/system-os_release.OSTREE_VERSION"
Expand Down
1 change: 1 addition & 0 deletions validator/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,7 @@ func (nm *NodeMetrics) watchDriverValidation() {
nm.driverValidation.Set(1)
nm.driverValidationLastSuccess.Set(float64(time.Now().Unix()))
} else {
log.Errorf("failed to validate driver: %v", err)
nm.driverValidation.Set(0)
}
time.Sleep(driverValidationCheckDelaySeconds * time.Second)
Expand Down

0 comments on commit 2ad39e6

Please sign in to comment.