@@ -36,6 +36,7 @@ import (
3636 "github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
3737 "github.com/openshift/cluster-etcd-operator/pkg/operator/externaletcdsupportcontroller"
3838 "github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
39+ "github.com/openshift/cluster-etcd-operator/pkg/tlshelpers"
3940 "github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/jobs"
4041 "github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/tools"
4142)
4546 fencingUpdateTriggered bool
4647 // fencingUpdateMutex is used to make usage of fencingUpdateTriggered thread safe
4748 fencingUpdateMutex sync.Mutex
49+ // etcdCertUpdateTriggered is set to true when a etcd cert update is already triggered
50+ etcdCertUpdateTriggered bool
51+ // etcdCertUpdateMutex is used to make usage of etcdCertUpdateTriggered thread safe
52+ etcdCertUpdateMutex sync.Mutex
4853)
4954
5055// HandleDualReplicaClusters checks feature gate and control plane topology,
@@ -102,12 +107,15 @@ func HandleDualReplicaClusters(
102107 _ , err = kubeInformersForNamespaces .InformersFor (operatorclient .TargetNamespace ).Core ().V1 ().Secrets ().Informer ().AddEventHandler (cache.ResourceEventHandlerFuncs {
103108 AddFunc : func (obj interface {}) {
104109 handleFencingSecretChange (ctx , kubeClient , nil , obj )
110+ handleEtcdCertChange (ctx , controllerContext , operatorClient , kubeInformersForNamespaces , controlPlaneNodeLister , kubeClient , nil , obj )
105111 },
106112 UpdateFunc : func (oldObj , newObj interface {}) {
107113 handleFencingSecretChange (ctx , kubeClient , oldObj , newObj )
114+ handleEtcdCertChange (ctx , controllerContext , operatorClient , kubeInformersForNamespaces , controlPlaneNodeLister , kubeClient , oldObj , newObj )
108115 },
109116 DeleteFunc : func (obj interface {}) {
110117 handleFencingSecretChange (ctx , kubeClient , nil , obj )
118+ handleEtcdCertChange (ctx , controllerContext , operatorClient , kubeInformersForNamespaces , controlPlaneNodeLister , kubeClient , nil , obj )
111119 },
112120 })
113121 if err != nil {
@@ -275,6 +283,128 @@ func runJobController(ctx context.Context, jobType tools.JobType, nodeName *stri
275283 go tnfJobController .Run (ctx , 1 )
276284}
277285
286+ func handleEtcdCertChange (ctx context.Context , controllerContext * controllercmd.ControllerContext , operatorClient v1helpers.StaticPodOperatorClient , kubeInformersForNamespaces v1helpers.KubeInformersForNamespaces , controlPlaneNodeLister corev1listers.NodeLister , client kubernetes.Interface , oldObj , obj interface {}) {
287+ secret , ok := obj .(* corev1.Secret )
288+ if ! ok {
289+ klog .Warningf ("failed to convert added / modified / deleted object to Secret %+v" , obj )
290+ return
291+ }
292+
293+ if secret .GetName () != tlshelpers .EtcdAllCertsSecretName {
294+ return
295+ }
296+
297+ var oldSecret * corev1.Secret
298+ if oldObj != nil {
299+ oldSecret , ok = oldObj .(* corev1.Secret )
300+ if ! ok {
301+ klog .Warningf ("failed to convert old object to Secret %+v" , oldObj )
302+ return
303+ }
304+ } else {
305+ // Nothing to do, no old cert was found, things should have progressed as normal
306+ // Might need to revisit this in the future if some edge case is found
307+ return
308+ }
309+
310+ certsChanged := false
311+ for key , oldValue := range oldSecret .Data {
312+ newValue , exists := secret .Data [key ]
313+ if ! exists || ! bytes .Equal (oldValue , newValue ) {
314+ klog .Infof ("etcd certs changed, restarting etcd" )
315+ certsChanged = true
316+ break
317+ }
318+ }
319+ if ! certsChanged {
320+ klog .Infof ("etcd certs did not change, skipping restart of podman-etcd" )
321+ return
322+ }
323+ klog .Infof ("etcd certs changed, restarting etcd" )
324+
325+ etcdCertUpdateMutex .Lock ()
326+ if etcdCertUpdateTriggered {
327+ klog .Infof ("etcd cert update triggered already, skipping recreation of etcd job for secret %s" , secret .GetName ())
328+ etcdCertUpdateMutex .Unlock ()
329+ return
330+ }
331+ etcdCertUpdateTriggered = true
332+ etcdCertUpdateMutex .Unlock ()
333+
334+ defer func () {
335+ etcdCertUpdateMutex .Lock ()
336+ etcdCertUpdateTriggered = false
337+ etcdCertUpdateMutex .Unlock ()
338+ }()
339+
340+ nodeList , err := controlPlaneNodeLister .List (labels .Everything ())
341+ if err != nil {
342+ klog .Errorf ("failed to list control plane nodes while waiting to create TNF jobs: %v" , err )
343+ return
344+ }
345+ if len (nodeList ) != 2 {
346+ klog .Info ("not starting TNF jobs yet, waiting for 2 control plane nodes to exist" )
347+ return
348+ }
349+
350+ for _ , node := range nodeList {
351+ runJobController (ctx , tools .JobTypeRestartEtcd , & node .Name , controllerContext , operatorClient , client , kubeInformersForNamespaces )
352+ }
353+
354+ jobsFound := false
355+ jobsDone := map [string ]bool {}
356+
357+ // helper func for waiting for a running job
358+ // finished = Complete, Failed, or not found
359+ isResrtartJobFinished := func (context.Context ) (finished bool , returnErr error ) {
360+ var err error
361+ jobsFound = false
362+ jobs , err := client .BatchV1 ().Jobs (operatorclient .TargetNamespace ).List (ctx , metav1.ListOptions {
363+ LabelSelector : fmt .Sprintf ("app.kubernetes.io/name=%s" , tools .JobTypeRestartEtcd .GetNameLabelValue ()),
364+ })
365+ if err != nil {
366+ if apierrors .IsNotFound (err ) {
367+ return true , nil
368+ }
369+ klog .Errorf ("failed to get fencing job, will retry: %v" , err )
370+ return false , nil
371+ }
372+ jobsFound = true
373+ for _ , job := range jobs .Items {
374+ if tools .IsConditionTrue (job .Status .Conditions , batchv1 .JobComplete ) || tools .IsConditionTrue (job .Status .Conditions , batchv1 .JobFailed ) {
375+ jobsDone [job .Name ] = true
376+ }
377+ }
378+ if len (jobsDone ) == len (nodeList ) {
379+ return true , nil
380+ }
381+ klog .Infof ("fencing job still running, skipping recreation for now, will retry" )
382+ return false , nil
383+ }
384+
385+ // wait as long as the fencing job waits as well, plus some execution time
386+ err = wait .PollUntilContextTimeout (ctx , tools .JobPollIntervall , tools .RestartEtcdJobCompletedTimeout , true , isResrtartJobFinished )
387+ if err != nil {
388+ // if we set timeouts right, this should not happen...
389+ klog .Errorf ("timed out waiting for fencing job to complete: %v" , err )
390+ return
391+ }
392+
393+ if ! jobsFound {
394+ klog .Errorf ("fencing job not found, nothing to do" )
395+ return
396+ }
397+
398+ klog .Info ("deleting fencing job for recreation" )
399+ for jobName , _ := range jobsDone {
400+ err = client .BatchV1 ().Jobs (operatorclient .TargetNamespace ).Delete (ctx , jobName , metav1.DeleteOptions {})
401+ if err != nil && ! apierrors .IsNotFound (err ) {
402+ // TODO how to trigger a retry here...
403+ klog .Errorf ("failed to delete fencing job: %v" , err )
404+ }
405+ }
406+ }
407+
278408func handleFencingSecretChange (ctx context.Context , client kubernetes.Interface , oldObj , obj interface {}) {
279409 secret , ok := obj .(* corev1.Secret )
280410 if ! ok {
0 commit comments