diff --git a/operator/api/config/v1alpha1/defaults.go b/operator/api/config/v1alpha1/defaults.go index e2752790..5f6aad18 100644 --- a/operator/api/config/v1alpha1/defaults.go +++ b/operator/api/config/v1alpha1/defaults.go @@ -79,6 +79,19 @@ func SetDefaults_ServerConfiguration(serverConfig *ServerConfiguration) { serverConfig.Webhooks.ServerCertDir = defaultWebhookServerTLSServerCertDir } + if serverConfig.Webhooks.CertManagement == nil { + serverConfig.Webhooks.CertManagement = &WebhookCertManagement{} + } + if serverConfig.Webhooks.CertManagement.AutoProvision == nil { + serverConfig.Webhooks.CertManagement.AutoProvision = ptr.To(true) + } + if serverConfig.Webhooks.CertManagement.SecretName == "" { + serverConfig.Webhooks.CertManagement.SecretName = "grove-webhook-server-cert" + } + if serverConfig.Webhooks.CertManagement.CertManagerEnabled == nil { + serverConfig.Webhooks.CertManagement.CertManagerEnabled = ptr.To(false) + } + if serverConfig.HealthProbes == nil { serverConfig.HealthProbes = &Server{} } diff --git a/operator/api/config/v1alpha1/types.go b/operator/api/config/v1alpha1/types.go index 2c2c8df0..870859e0 100644 --- a/operator/api/config/v1alpha1/types.go +++ b/operator/api/config/v1alpha1/types.go @@ -135,6 +135,27 @@ type WebhookServer struct { Server `json:",inline"` // ServerCertDir is the directory containing the server certificate and key. ServerCertDir string `json:"serverCertDir"` + // CertManagement defines the certificate management configuration. + // +optional + CertManagement *WebhookCertManagement `json:"certManagement,omitempty"` +} + +// WebhookCertManagement defines how webhook certificates are managed. +type WebhookCertManagement struct { + // AutoProvision enables automatic certificate generation and rotation. + // If set to false, you must provide your own certificates via Secret. 
+ // Default: true + // +optional + AutoProvision *bool `json:"autoProvision,omitempty"` + // SecretName is the name of the secret containing the webhook server certificate. + // Default: grove-webhook-server-cert + // +optional + SecretName string `json:"secretName,omitempty"` + // CertManagerEnabled indicates whether to annotate webhook configurations for cert-manager CA injection. + // This requires cert-manager to be installed in the cluster. + // Default: false + // +optional + CertManagerEnabled *bool `json:"certManagerEnabled,omitempty"` } // Server contains information for HTTP(S) server configuration. diff --git a/operator/charts/templates/_helpers.tpl b/operator/charts/templates/_helpers.tpl index 8d7dc59f..1e0912d0 100644 --- a/operator/charts/templates/_helpers.tpl +++ b/operator/charts/templates/_helpers.tpl @@ -16,6 +16,11 @@ config.yaml: | server: webhooks: port: {{ .Values.config.server.webhooks.port }} + serverCertDir: {{ .Values.config.server.webhooks.certDir }} + certManagement: + autoProvision: {{ .Values.config.server.webhooks.certManagement.autoProvision }} + secretName: {{ .Values.config.server.webhooks.certManagement.secretName }} + certManagerEnabled: {{ .Values.config.server.webhooks.certManagement.certManagerEnabled }} healthProbes: port: {{ .Values.config.server.healthProbes.port }} metrics: diff --git a/operator/charts/templates/authorizer-webhook-config.yaml b/operator/charts/templates/authorizer-webhook-config.yaml index db325a6e..e8f4c221 100644 --- a/operator/charts/templates/authorizer-webhook-config.yaml +++ b/operator/charts/templates/authorizer-webhook-config.yaml @@ -5,6 +5,10 @@ kind: ValidatingWebhookConfiguration metadata: name: authorizer-webhook namespace: {{ .Release.Namespace }} + {{- if .Values.config.server.webhooks.certManagement.certManagerEnabled }} + annotations: + cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ .Values.config.server.webhooks.certManagement.secretName }} + {{- end }} labels: {{- include 
"operator.authorizer.webhook.labels" . | nindent 4 }} webhooks: diff --git a/operator/charts/templates/deployment.yaml b/operator/charts/templates/deployment.yaml index 9508b970..1ea9c3af 100644 --- a/operator/charts/templates/deployment.yaml +++ b/operator/charts/templates/deployment.yaml @@ -60,7 +60,7 @@ spec: mountPath: /var/run/secrets/kubernetes.io/serviceaccount readOnly: true - name: grove-webhook-server-cert - mountPath: /etc/grove-operator/webhook-certs + mountPath: {{ .Values.webhookServerCert.mountPath }} readOnly: true env: - name: GROVE_OPERATOR_SERVICE_ACCOUNT_NAME @@ -91,6 +91,10 @@ spec: configMap: name: {{ include "operator.config.name" . }} - name: grove-webhook-server-cert + {{- if .Values.webhookServerCert.volume }} + {{- toYaml .Values.webhookServerCert.volume | nindent 10 }} + {{- else }} secret: - secretName: grove-webhook-server-cert + secretName: {{ .Values.config.server.webhooks.certManagement.secretName }} defaultMode: 420 + {{- end }} diff --git a/operator/charts/templates/pcs-defaulting-webhook-config.yaml b/operator/charts/templates/pcs-defaulting-webhook-config.yaml index 993320e3..bfd99e2f 100644 --- a/operator/charts/templates/pcs-defaulting-webhook-config.yaml +++ b/operator/charts/templates/pcs-defaulting-webhook-config.yaml @@ -4,6 +4,10 @@ kind: MutatingWebhookConfiguration metadata: name: podcliqueset-defaulting-webhook namespace: {{ .Release.Namespace }} + {{- if .Values.config.server.webhooks.certManagement.certManagerEnabled }} + annotations: + cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ .Values.config.server.webhooks.certManagement.secretName }} + {{- end }} labels: {{- include "operator.pcs.defaulting.webhook.labels" . 
| nindent 4 }} webhooks: diff --git a/operator/charts/templates/pcs-validating-webhook-config.yaml b/operator/charts/templates/pcs-validating-webhook-config.yaml index 244976e7..e1c95e84 100644 --- a/operator/charts/templates/pcs-validating-webhook-config.yaml +++ b/operator/charts/templates/pcs-validating-webhook-config.yaml @@ -4,6 +4,10 @@ kind: ValidatingWebhookConfiguration metadata: name: podcliqueset-validating-webhook namespace: {{ .Release.Namespace }} + {{- if .Values.config.server.webhooks.certManagement.certManagerEnabled }} + annotations: + cert-manager.io/inject-ca-from: {{ .Release.Namespace }}/{{ .Values.config.server.webhooks.certManagement.secretName }} + {{- end }} labels: {{- include "operator.pcs.validating.webhook.labels" . | nindent 4 }} webhooks: diff --git a/operator/charts/templates/webhook-server-cert-secret.yaml b/operator/charts/templates/webhook-server-cert-secret.yaml index 7c906471..b9a7ad71 100644 --- a/operator/charts/templates/webhook-server-cert-secret.yaml +++ b/operator/charts/templates/webhook-server-cert-secret.yaml @@ -1,7 +1,8 @@ +{{- if .Values.config.server.webhooks.certManagement.autoProvision }} apiVersion: v1 kind: Secret metadata: - name: grove-webhook-server-cert + name: {{ .Values.config.server.webhooks.certManagement.secretName }} namespace: {{ .Release.Namespace }} labels: {{- include "operator.server.secret.labels" . | nindent 4 }} @@ -9,3 +10,4 @@ type: kubernetes.io/tls data: tls.crt: "" tls.key: "" +{{- end }} diff --git a/operator/charts/values.yaml b/operator/charts/values.yaml index deed7719..37d04a25 100644 --- a/operator/charts/values.yaml +++ b/operator/charts/values.yaml @@ -46,6 +46,19 @@ config: server: webhooks: port: 9443 + # certDir is the directory path where certificate files are located. + # The operator will look for tls.crt and tls.key in this directory. 
+ # Default: /etc/grove-operator/webhook-certs + certDir: /etc/grove-operator/webhook-certs + certManagement: + # autoProvision enables automatic certificate generation and rotation. + # Set to false to use external certificates (e.g., cert-manager). + autoProvision: true + # secretName is the name of the secret containing the webhook certificate. + secretName: grove-webhook-server-cert + # certManagerEnabled adds cert-manager.io/inject-ca-from annotation to webhook configs. + # Requires cert-manager to be installed in the cluster. + certManagerEnabled: false healthProbes: enable: false port: 9444 @@ -134,6 +147,40 @@ webhookServerSecret: app.kubernetes.io/name: grove-webhook-server-secret app.kubernetes.io/part-of: grove +# Webhook server certificate volume configuration +webhookServerCert: + # mountPath is where the certificate volume will be mounted in the container. + # This should typically match config.server.webhooks.certDir + mountPath: /etc/grove-operator/webhook-certs + + # volume defines the volume source for certificates. 
+ # If not specified, defaults to using the Secret defined in config.server.webhooks.certManagement.secretName + # + # Example 1 - Using hostPath (for local certificate files): + # volume: + # hostPath: + # path: /host/path/to/certs + # type: Directory + # + # Example 2 - Using CSI driver (Vault): + # volume: + # csi: + # driver: secrets-store.csi.k8s.io + # readOnly: true + # volumeAttributes: + # secretProviderClass: "grove-webhook-certs" + # + # Example 3 - Using ConfigMap (for testing only): + # volume: + # configMap: + # name: grove-webhook-certs + # + # Example 4 - Using emptyDir (for testing): + # volume: + # emptyDir: {} + # + volume: null + priorityClass: enabled: false name: grove-operator-priority-critical diff --git a/operator/cmd/main.go b/operator/cmd/main.go index d9ec4b87..aae207d8 100644 --- a/operator/cmd/main.go +++ b/operator/cmd/main.go @@ -62,8 +62,16 @@ func main() { } webhookCertsReadyCh := make(chan struct{}) - if err = cert.ManageWebhookCerts(mgr, operatorCfg.Server.Webhooks.ServerCertDir, operatorCfg.Authorizer.Enabled, webhookCertsReadyCh); err != nil { - logger.Error(err, "failed to setup cert rotation") + certMgmt := operatorCfg.Server.Webhooks.CertManagement + if err = cert.ManageWebhookCerts( + mgr, + operatorCfg.Server.Webhooks.ServerCertDir, + certMgmt.SecretName, + operatorCfg.Authorizer.Enabled, + *certMgmt.AutoProvision, + webhookCertsReadyCh, + ); err != nil { + logger.Error(err, "failed to setup certificate management") os.Exit(1) } diff --git a/operator/internal/controller/cert/cert.go b/operator/internal/controller/cert/cert.go index 2cd60b4d..ce3f1007 100644 --- a/operator/internal/controller/cert/cert.go +++ b/operator/internal/controller/cert/cert.go @@ -20,6 +20,7 @@ import ( "fmt" "os" "strings" + "time" "github.com/ai-dynamo/grove/operator/internal/constants" authorizationwebhook "github.com/ai-dynamo/grove/operator/internal/webhook/admission/pcs/authorization" @@ -39,16 +40,30 @@ const ( ) // ManageWebhookCerts 
registers the cert-controller with the manager which will be used to manage -// webhook certificates. -func ManageWebhookCerts(mgr ctrl.Manager, certDir string, authorizerEnabled bool, certsReadyCh chan struct{}) error { +// webhook certificates. If autoProvision is false, it skips automatic certificate management +// and only ensures that certificates are mounted from the provided Secret. +func ManageWebhookCerts(mgr ctrl.Manager, certDir string, secretName string, authorizerEnabled bool, autoProvision bool, certsReadyCh chan struct{}) error { namespace, err := getOperatorNamespace() if err != nil { return err } + + // If autoProvision is disabled, just verify certs exist and notify readiness + if !autoProvision { + logger := ctrl.Log.WithName("cert-external") + logger.Info("Automatic certificate provisioning is disabled, using external certificates", + "secretName", secretName, "certDir", certDir) + + // Start a goroutine to wait for externally managed certificates + go waitForExternalCerts(logger, certDir, certsReadyCh) + return nil + } + + // Auto-provision mode: use cert-controller rotator := &cert.CertRotator{ SecretKey: types.NamespacedName{ Namespace: namespace, - Name: "grove-webhook-server-cert", + Name: secretName, }, CertDir: certDir, CAName: certificateAuthorityName, @@ -109,3 +124,51 @@ func getOperatorNamespace() (string, error) { } return namespace, nil } + +// waitForExternalCerts waits for externally managed certificates to be available +// in the specified directory. This is used when autoProvision is disabled. 
+//
+// It polls certDir for tls.crt and tls.key up to maxRetries times, sleeping
+// retryInterval between attempts (a little under a minute in total). A file
+// counts as ready only when it exists, is not a directory and is non-empty:
+// a zero-length tls.crt/tls.key (for example one mounted from a placeholder
+// Secret whose entries are empty strings) can never be loaded as a key pair,
+// so signalling readiness for it would only defer the failure.
+//
+// On success certsReadyCh is closed. On timeout an error is logged and the
+// channel is deliberately left open.
+func waitForExternalCerts(logger logr.Logger, certDir string, certsReadyCh chan struct{}) {
+	const (
+		maxRetries    = 30
+		retryInterval = 2 * time.Second
+		certFileName  = "tls.crt"
+		keyFileName   = "tls.key"
+	)
+
+	certPath := fmt.Sprintf("%s/%s", certDir, certFileName)
+	keyPath := fmt.Sprintf("%s/%s", certDir, keyFileName)
+
+	for attempt := 1; attempt <= maxRetries; attempt++ {
+		certReady := fileHasContent(certPath)
+		keyReady := fileHasContent(keyPath)
+
+		if certReady && keyReady {
+			logger.Info("External certificates found and ready",
+				"certPath", certPath, "keyPath", keyPath)
+			close(certsReadyCh)
+			return
+		}
+
+		// No point sleeping after the final attempt; fall through to the timeout.
+		if attempt == maxRetries {
+			break
+		}
+
+		// Report existence and readiness separately so that a missing file can
+		// be told apart from one that is present but still empty.
+		logger.Info("Waiting for external certificates to be mounted",
+			"attempt", attempt, "maxRetries", maxRetries,
+			"certExists", fileExists(certPath), "keyExists", fileExists(keyPath),
+			"certReady", certReady, "keyReady", keyReady)
+		time.Sleep(retryInterval)
+	}
+
+	logger.Error(fmt.Errorf("timeout waiting for external certificates"),
+		"Failed to find certificates after maximum retries",
+		"certPath", certPath, "keyPath", keyPath)
+	// Don't close the channel. NOTE(review): the original author's intent is
+	// that an unclosed channel makes the readiness check fail - confirm against
+	// the consumer of certsReadyCh.
+}
+
+// fileExists checks if a file exists and is not a directory. Any error from
+// os.Stat (including permission errors) is reported as "does not exist".
+func fileExists(path string) bool {
+	info, err := os.Stat(path)
+	if err != nil {
+		return false
+	}
+	return !info.IsDir()
+}
+
+// fileHasContent reports whether path names an existing non-directory whose
+// size is greater than zero. Any error from os.Stat yields false.
+func fileHasContent(path string) bool {
+	info, err := os.Stat(path)
+	if err != nil {
+		return false
+	}
+	return !info.IsDir() && info.Size() > 0
+}