Commit e4b1946

feat: added logic to handle tnf cert rotation

Added a handler func during event watch to trigger a job on both nodes to restart the podman-etcd service.

Signed-off-by: ehila <[email protected]>

1 parent e06c84b

File tree (5 files changed: +198 -4 lines changed)

cmd/tnf-setup-runner/main.go
pkg/tnf/operator/starter.go
pkg/tnf/pkg/pcs/etcd.go
pkg/tnf/pkg/tools/jobs.go
pkg/tnf/restart-etcd/runner.go
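For orientation before the per-file diffs, here is a minimal sketch (not the operator's actual wiring) of the "handler func during event watch" pattern the commit message describes: an event handler on the Secrets informer that reacts to changes of the etcd certs secret. The secret name and the handler shape are assumptions; the real registration is in pkg/tnf/operator/starter.go below.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/client-go/tools/cache"
)

func main() {
	// react only to the secret that bundles the etcd certs; the name is assumed here,
	// the operator compares against tlshelpers.EtcdAllCertsSecretName
	onCertSecretEvent := func(obj interface{}) {
		secret, ok := obj.(*corev1.Secret)
		if !ok || secret.GetName() != "etcd-all-certs" {
			return
		}
		fmt.Println("etcd cert secret event: decide whether to trigger restart jobs on both nodes")
	}

	handler := cache.ResourceEventHandlerFuncs{
		AddFunc:    onCertSecretEvent,
		UpdateFunc: func(oldObj, newObj interface{}) { onCertSecretEvent(newObj) },
		DeleteFunc: onCertSecretEvent,
	}
	_ = handler // would be registered on the Secrets informer via AddEventHandler(handler)
}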

cmd/tnf-setup-runner/main.go

Lines changed: 15 additions & 0 deletions

@@ -19,6 +19,7 @@ import (
 	tnfauth "github.com/openshift/cluster-etcd-operator/pkg/tnf/auth"
 	tnffencing "github.com/openshift/cluster-etcd-operator/pkg/tnf/fencing"
 	"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/tools"
+	tnfrestartetcd "github.com/openshift/cluster-etcd-operator/pkg/tnf/restart-etcd"
 	tnfsetup "github.com/openshift/cluster-etcd-operator/pkg/tnf/setup"
 )

@@ -58,6 +59,7 @@ func NewTnfSetupRunnerCommand() *cobra.Command {
 	cmd.AddCommand(NewSetupCommand())
 	cmd.AddCommand(NewAfterSetupCommand())
 	cmd.AddCommand(NewFencingCommand())
+	cmd.AddCommand(NewRestartEtcdCommand())

 	return cmd
 }
@@ -113,3 +115,16 @@ func NewFencingCommand() *cobra.Command {
 		},
 	}
 }
+
+func NewRestartEtcdCommand() *cobra.Command {
+	return &cobra.Command{
+		Use:   tools.JobTypeRestartEtcd.GetSubCommand(),
+		Short: "Run restart etcd steps for cert change",
+		Run: func(cmd *cobra.Command, args []string) {
+			err := tnfrestartetcd.RunEtcdRestart()
+			if err != nil {
+				klog.Fatal(err)
+			}
+		},
+	}
+}

pkg/tnf/operator/starter.go

Lines changed: 130 additions & 0 deletions

@@ -36,6 +36,7 @@ import (
 	"github.com/openshift/cluster-etcd-operator/pkg/operator/ceohelpers"
 	"github.com/openshift/cluster-etcd-operator/pkg/operator/externaletcdsupportcontroller"
 	"github.com/openshift/cluster-etcd-operator/pkg/operator/operatorclient"
+	"github.com/openshift/cluster-etcd-operator/pkg/tlshelpers"
 	"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/jobs"
 	"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/tools"
 )
@@ -45,6 +46,10 @@ var (
 	fencingUpdateTriggered bool
 	// fencingUpdateMutex is used to make usage of fencingUpdateTriggered thread safe
 	fencingUpdateMutex sync.Mutex
+	// etcdCertUpdateTriggered is set to true when an etcd cert update is already triggered
+	etcdCertUpdateTriggered bool
+	// etcdCertUpdateMutex is used to make usage of etcdCertUpdateTriggered thread safe
+	etcdCertUpdateMutex sync.Mutex
 )

 // HandleDualReplicaClusters checks feature gate and control plane topology,
@@ -102,12 +107,15 @@ func HandleDualReplicaClusters(
 	_, err = kubeInformersForNamespaces.InformersFor(operatorclient.TargetNamespace).Core().V1().Secrets().Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
 		AddFunc: func(obj interface{}) {
 			handleFencingSecretChange(ctx, kubeClient, nil, obj)
+			handleEtcdCertChange(ctx, controllerContext, operatorClient, kubeInformersForNamespaces, controlPlaneNodeLister, kubeClient, nil, obj)
 		},
 		UpdateFunc: func(oldObj, newObj interface{}) {
 			handleFencingSecretChange(ctx, kubeClient, oldObj, newObj)
+			handleEtcdCertChange(ctx, controllerContext, operatorClient, kubeInformersForNamespaces, controlPlaneNodeLister, kubeClient, oldObj, newObj)
 		},
 		DeleteFunc: func(obj interface{}) {
 			handleFencingSecretChange(ctx, kubeClient, nil, obj)
+			handleEtcdCertChange(ctx, controllerContext, operatorClient, kubeInformersForNamespaces, controlPlaneNodeLister, kubeClient, nil, obj)
 		},
 	})
 	if err != nil {
@@ -275,6 +283,128 @@ func runJobController(ctx context.Context, jobType tools.JobType, nodeName *stri
 	go tnfJobController.Run(ctx, 1)
 }

+func handleEtcdCertChange(ctx context.Context, controllerContext *controllercmd.ControllerContext, operatorClient v1helpers.StaticPodOperatorClient, kubeInformersForNamespaces v1helpers.KubeInformersForNamespaces, controlPlaneNodeLister corev1listers.NodeLister, client kubernetes.Interface, oldObj, obj interface{}) {
+	secret, ok := obj.(*corev1.Secret)
+	if !ok {
+		klog.Warningf("failed to convert added / modified / deleted object to Secret %+v", obj)
+		return
+	}
+
+	if secret.GetName() != tlshelpers.EtcdAllCertsSecretName {
+		return
+	}
+
+	var oldSecret *corev1.Secret
+	if oldObj != nil {
+		oldSecret, ok = oldObj.(*corev1.Secret)
+		if !ok {
+			klog.Warningf("failed to convert old object to Secret %+v", oldObj)
+			return
+		}
+	} else {
+		// Nothing to do, no old cert was found, things should have progressed as normal.
+		// Might need to revisit this in the future if some edge case is found.
+		return
+	}
+
+	certsChanged := false
+	for key, oldValue := range oldSecret.Data {
+		newValue, exists := secret.Data[key]
+		if !exists || !bytes.Equal(oldValue, newValue) {
+			klog.Infof("etcd cert data changed for key %s", key)
+			certsChanged = true
+			break
+		}
+	}
+	if !certsChanged {
+		klog.Infof("etcd certs did not change, skipping restart of podman-etcd")
+		return
+	}
+	klog.Infof("etcd certs changed, restarting etcd")
+
+	etcdCertUpdateMutex.Lock()
+	if etcdCertUpdateTriggered {
+		klog.Infof("etcd cert update triggered already, skipping recreation of etcd job for secret %s", secret.GetName())
+		etcdCertUpdateMutex.Unlock()
+		return
+	}
+	etcdCertUpdateTriggered = true
+	etcdCertUpdateMutex.Unlock()
+
+	defer func() {
+		etcdCertUpdateMutex.Lock()
+		etcdCertUpdateTriggered = false
+		etcdCertUpdateMutex.Unlock()
+	}()
+
+	nodeList, err := controlPlaneNodeLister.List(labels.Everything())
+	if err != nil {
+		klog.Errorf("failed to list control plane nodes while waiting to create TNF jobs: %v", err)
+		return
+	}
+	if len(nodeList) != 2 {
+		klog.Info("not starting TNF jobs yet, waiting for 2 control plane nodes to exist")
+		return
+	}
+
+	for _, node := range nodeList {
+		runJobController(ctx, tools.JobTypeRestartEtcd, &node.Name, controllerContext, operatorClient, client, kubeInformersForNamespaces)
+	}
+
+	jobsFound := false
+	jobsDone := map[string]bool{}
+
+	// helper func for waiting for a running job
+	// finished = Complete, Failed, or not found
+	isRestartJobFinished := func(context.Context) (finished bool, returnErr error) {
+		var err error
+		jobsFound = false
+		jobs, err := client.BatchV1().Jobs(operatorclient.TargetNamespace).List(ctx, metav1.ListOptions{
+			LabelSelector: fmt.Sprintf("app.kubernetes.io/name=%s", tools.JobTypeRestartEtcd.GetNameLabelValue()),
+		})
+		if err != nil {
+			if apierrors.IsNotFound(err) {
+				return true, nil
+			}
+			klog.Errorf("failed to list restart etcd jobs, will retry: %v", err)
+			return false, nil
+		}
+		jobsFound = true
+		for _, job := range jobs.Items {
+			if tools.IsConditionTrue(job.Status.Conditions, batchv1.JobComplete) || tools.IsConditionTrue(job.Status.Conditions, batchv1.JobFailed) {
+				jobsDone[job.Name] = true
+			}
+		}
+		if len(jobsDone) == len(nodeList) {
+			return true, nil
+		}
+		klog.Infof("restart etcd jobs still running, will retry")
+		return false, nil
+	}
+
+	// wait as long as the restart etcd jobs might need, plus some execution time
+	err = wait.PollUntilContextTimeout(ctx, tools.JobPollIntervall, tools.RestartEtcdJobCompletedTimeout, true, isRestartJobFinished)
+	if err != nil {
+		// if we set timeouts right, this should not happen...
+		klog.Errorf("timed out waiting for restart etcd jobs to complete: %v", err)
+		return
+	}
+
+	if !jobsFound {
+		klog.Errorf("restart etcd jobs not found, nothing to do")
+		return
+	}
+
+	klog.Info("deleting restart etcd jobs for recreation")
+	for jobName := range jobsDone {
+		err = client.BatchV1().Jobs(operatorclient.TargetNamespace).Delete(ctx, jobName, metav1.DeleteOptions{})
+		if err != nil && !apierrors.IsNotFound(err) {
+			// TODO how to trigger a retry here...
+			klog.Errorf("failed to delete restart etcd job: %v", err)
+		}
+	}
+}
+
 func handleFencingSecretChange(ctx context.Context, client kubernetes.Interface, oldObj, obj interface{}) {
 	secret, ok := obj.(*corev1.Secret)
 	if !ok {
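To see the cert-change check from handleEtcdCertChange in isolation, here is a small self-contained sketch of the key-by-key comparison of the old and new secret Data maps; the key names and values are made up.

package main

import (
	"bytes"
	"fmt"
)

// certsChanged mirrors the loop above: any key of the old secret that is missing from,
// or has different bytes in, the new secret counts as a change.
func certsChanged(oldData, newData map[string][]byte) bool {
	for key, oldValue := range oldData {
		newValue, exists := newData[key]
		if !exists || !bytes.Equal(oldValue, newValue) {
			return true
		}
	}
	return false
}

func main() {
	oldData := map[string][]byte{"tls.crt": []byte("old-cert"), "tls.key": []byte("same-key")}
	newData := map[string][]byte{"tls.crt": []byte("new-cert"), "tls.key": []byte("same-key")}
	fmt.Println(certsChanged(oldData, newData)) // true: tls.crt differs, so a restart would be triggered
}

Note that the loop only ranges over the old secret's keys, so a key that exists only in the updated secret would not, by itself, be treated as a change.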

pkg/tnf/pkg/pcs/etcd.go

Lines changed: 22 additions & 0 deletions

@@ -33,6 +33,28 @@ func ConfigureEtcd(ctx context.Context, cfg config.ClusterConfig) error {
 	return nil
 }

+func RestartEtcd(ctx context.Context) error {
+	klog.Info("Checking pcs resources")
+
+	stdOut, stdErr, err := exec.Execute(ctx, "/usr/sbin/pcs resource status")
+	if err != nil || len(stdErr) > 0 {
+		klog.Error(err, "Failed to get pcs resource status", "stdout", stdOut, "stderr", stdErr, "err", err)
+		return err
+	}
+	if !strings.Contains(stdOut, "etcd") {
+		klog.Info("etcd resource not found, nothing to do")
+		return nil
+	}
+
+	klog.Info("Restarting etcd")
+	stdOut, stdErr, err = exec.Execute(ctx, "/usr/sbin/pcs resource restart etcd")
+	if err != nil || len(stdErr) > 0 {
+		klog.Error(err, "Failed to restart etcd", "stdout", stdOut, "stderr", stdErr, "err", err)
+		return err
+	}
+	return nil
+}
+
 // ConfigureConstraints configures the etcd constraints
 func ConfigureConstraints(ctx context.Context) (bool, error) {
 	klog.Info("Checking pcs constraints")
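As a rough standalone illustration (not part of the commit), the same check-then-restart sequence can be exercised with os/exec instead of the repo's exec.Execute helper; the pcs binary path and the resource name match the diff above, everything else is assumed.

package main

import (
	"context"
	"fmt"
	"os/exec"
	"strings"
)

// restartEtcdResource runs `pcs resource status` and, if an etcd resource is listed,
// restarts it: the same two pcs invocations RestartEtcd makes.
func restartEtcdResource(ctx context.Context) error {
	out, err := exec.CommandContext(ctx, "/usr/sbin/pcs", "resource", "status").CombinedOutput()
	if err != nil {
		return fmt.Errorf("pcs resource status failed: %w: %s", err, out)
	}
	if !strings.Contains(string(out), "etcd") {
		// no etcd resource configured, nothing to restart
		return nil
	}
	if out, err = exec.CommandContext(ctx, "/usr/sbin/pcs", "resource", "restart", "etcd").CombinedOutput(); err != nil {
		return fmt.Errorf("pcs resource restart etcd failed: %w: %s", err, out)
	}
	return nil
}

func main() {
	if err := restartEtcdResource(context.Background()); err != nil {
		fmt.Println("error:", err)
	}
}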

pkg/tnf/pkg/tools/jobs.go

Lines changed: 8 additions & 4 deletions

@@ -11,10 +11,11 @@ import (
 // setup job: waits for auth jobs to complete
 // after setup jobs: waits for setup job to complete
 const (
-	JobPollIntervall           = 15 * time.Second
-	AuthJobCompletedTimeout    = 10 * time.Minute
-	SetupJobCompletedTimeout   = 20 * time.Minute
-	FencingJobCompletedTimeout = 25 * time.Minute
+	JobPollIntervall               = 15 * time.Second
+	AuthJobCompletedTimeout        = 10 * time.Minute
+	SetupJobCompletedTimeout       = 20 * time.Minute
+	FencingJobCompletedTimeout     = 25 * time.Minute
+	RestartEtcdJobCompletedTimeout = 10 * time.Minute
 )

 // JobType represent the different jobs we run, with some methods needed
@@ -26,6 +27,7 @@ const (
 	JobTypeSetup
 	JobTypeAfterSetup
 	JobTypeFencing
+	JobTypeRestartEtcd
 )

 func (t JobType) GetSubCommand() string {
@@ -38,6 +40,8 @@ func (t JobType) GetSubCommand() string {
 		return "after-setup"
 	case JobTypeFencing:
 		return "fencing"
+	case JobTypeRestartEtcd:
+		return "restart-etcd"
 	default:
 		return ""
 	}

pkg/tnf/restart-etcd/runner.go

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
+package restartetcd
+
+import (
+	"context"
+
+	"k8s.io/klog/v2"
+
+	"github.com/openshift/cluster-etcd-operator/pkg/tnf/pkg/pcs"
+)
+
+func RunEtcdRestart() error {
+	klog.Info("Running TNF etcd restart")
+	ctx := context.Background()
+
+	err := pcs.RestartEtcd(ctx)
+	if err != nil {
+		return err
+	}
+
+	klog.Infof("Etcd restart done!")
+
+	return nil
+}
