Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add a webhook to prevent eviction of pods on kosmos NotReady nodes #342

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
feat: add a webhook to prevent eviction of pods on kosmos NotReady nodes
Signed-off-by: wangyizhi1 <[email protected]>
wangyizhi1 committed Dec 22, 2023

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit db94de3323cd7ae903acef108b803d54061d47b5
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -18,6 +18,7 @@ TARGETS := clusterlink-controller-manager \
clusterlink-proxy \
clustertree-cluster-manager \
scheduler \
webhook

CTL_TARGETS := kosmosctl

63 changes: 63 additions & 0 deletions cmd/webhook/app/options/options.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package options

import (
"github.com/spf13/pflag"
"k8s.io/client-go/tools/leaderelection/resourcelock"
componentbaseconfig "k8s.io/component-base/config"

"github.com/kosmos.io/kosmos/pkg/utils"
"github.com/kosmos.io/kosmos/pkg/webhook"
)

type Options struct {
LeaderElection componentbaseconfig.LeaderElectionConfiguration
KubernetesOptions KubernetesOptions
WebhookServerOptions WebhookServerOptions
PodValidatorOptions webhook.PodValidatorOptions
}

type KubernetesOptions struct {
KubeConfig string `json:"kubeconfig" yaml:"kubeconfig"`
Master string `json:"master,omitempty" yaml:"master,omitempty"`
QPS float32 `json:"qps,omitempty" yaml:"qps,omitempty"`
Burst int `json:"burst,omitempty" yaml:"burst,omitempty"`
}

type WebhookServerOptions struct {
Host string
Port int
CertDir string
CertName string
KeyName string
}

func NewOptions() *Options {
return &Options{
LeaderElection: componentbaseconfig.LeaderElectionConfiguration{
LeaderElect: true,
ResourceLock: resourcelock.LeasesResourceLock,
ResourceNamespace: utils.DefaultNamespace,
ResourceName: "network-manager",
},
}
}

func (o *Options) AddFlags(flags *pflag.FlagSet) {
if o == nil {
return
}

flags.BoolVar(&o.LeaderElection.LeaderElect, "leader-elect", true, "Start a leader election client and gain leadership before executing the main loop. Enable this when running replicated components for high availability.")
flags.StringVar(&o.LeaderElection.ResourceName, "leader-elect-resource-name", "kosmos-webhook", "The name of resource object that is used for locking during leader election.")
flags.StringVar(&o.LeaderElection.ResourceNamespace, "leader-elect-resource-namespace", utils.DefaultNamespace, "The namespace of resource object that is used for locking during leader election.")
flags.Float32Var(&o.KubernetesOptions.QPS, "kube-qps", 40.0, "QPS to use while talking with kube-apiserver.")
flags.IntVar(&o.KubernetesOptions.Burst, "kube-burst", 60, "Burst to use while talking with kube-apiserver.")
flags.StringVar(&o.KubernetesOptions.KubeConfig, "kubeconfig", "", "Path for kubernetes kubeconfig file, if left blank, will use in cluster way.")
flags.StringVar(&o.KubernetesOptions.Master, "master", "", "Used to generate kubeconfig for downloading, if not specified, will use host in kubeconfig.")
flags.StringVar(&o.WebhookServerOptions.Host, "bind-address", "0.0.0.0", "The IP address on which to listen for the --secure-port port.")
flags.IntVar(&o.WebhookServerOptions.Port, "secure-port", 9443, "The secure port on which to serve HTTPS.")
flags.StringVar(&o.WebhookServerOptions.CertDir, "cert-dir", "/etc/certs", "The directory that contains the server key and certificate.")
flags.StringVar(&o.WebhookServerOptions.CertName, "tls-cert-file-name", "tls.crt", "The name of server certificate.")
flags.StringVar(&o.WebhookServerOptions.KeyName, "tls-private-key-file-name", "tls.key", "The name of server key.")
flags.StringArrayVar(&o.PodValidatorOptions.UsernamesNeedToPrevent, "usernames-need-to-prevent", []string{"system:serviceaccount:kube-system:node-controller"}, "Usernames that need to prevent deleting pods on NotReady kosmos nodes.")
}
10 changes: 10 additions & 0 deletions cmd/webhook/app/options/validation.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package options

import "k8s.io/apimachinery/pkg/util/validation/field"

// Validate checks Options and return a slice of found errs.
func (o *Options) Validate() field.ErrorList {
errs := field.ErrorList{}

return errs
}
102 changes: 102 additions & 0 deletions cmd/webhook/app/webhook.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
package app

import (
"context"
"fmt"
"net/http"

"github.com/spf13/cobra"
"k8s.io/client-go/tools/clientcmd"
cliflag "k8s.io/component-base/cli/flag"
"k8s.io/klog/v2"
controllerruntime "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/webhook"

"github.com/kosmos.io/kosmos/cmd/webhook/app/options"
"github.com/kosmos.io/kosmos/pkg/scheme"
"github.com/kosmos.io/kosmos/pkg/sharedcli/klogflag"
kosmoswebhook "github.com/kosmos.io/kosmos/pkg/webhook"
)

func NewWebhookCommand(ctx context.Context) *cobra.Command {
opts := options.NewOptions()

cmd := &cobra.Command{
Use: "kosmos-webhook",
Long: `TODO`,
RunE: func(cmd *cobra.Command, args []string) error {
if errs := opts.Validate(); len(errs) != 0 {
return errs.ToAggregate()
}
if err := Run(ctx, opts); err != nil {
return err
}
return nil
},
Args: func(cmd *cobra.Command, args []string) error {
for _, arg := range args {
if len(arg) > 0 {
return fmt.Errorf("%q does not take any arguments, got %q", cmd.CommandPath(), args)
}
}
return nil
},
}

fss := cliflag.NamedFlagSets{}

genericFlagSet := fss.FlagSet("generic")
opts.AddFlags(genericFlagSet)

logsFlagSet := fss.FlagSet("logs")
klogflag.Add(logsFlagSet)

cmd.Flags().AddFlagSet(genericFlagSet)
cmd.Flags().AddFlagSet(logsFlagSet)

return cmd
}

func Run(ctx context.Context, opts *options.Options) error {
config, err := clientcmd.BuildConfigFromFlags(opts.KubernetesOptions.Master, opts.KubernetesOptions.KubeConfig)
if err != nil {
panic(err)
}
config.QPS, config.Burst = opts.KubernetesOptions.QPS, opts.KubernetesOptions.Burst

mgr, err := controllerruntime.NewManager(config, controllerruntime.Options{
Logger: klog.Background(),
Scheme: scheme.NewSchema(),
WebhookServer: &webhook.Server{
Host: opts.WebhookServerOptions.Host,
Port: opts.WebhookServerOptions.Port,
CertDir: opts.WebhookServerOptions.CertDir,
CertName: opts.WebhookServerOptions.CertName,
KeyName: opts.WebhookServerOptions.KeyName,
},
MetricsBindAddress: "0",
HealthProbeBindAddress: "0",
LeaderElection: opts.LeaderElection.LeaderElect,
LeaderElectionID: opts.LeaderElection.ResourceName,
LeaderElectionNamespace: opts.LeaderElection.ResourceNamespace,
})
if err != nil {
klog.Errorf("failed to build webhook server: %v", err)
return err
}

hookServer := mgr.GetWebhookServer()
hookServer.Register("/validate-delete-pod", &webhook.Admission{Handler: &kosmoswebhook.PodValidator{
Client: mgr.GetClient(),
Options: opts.PodValidatorOptions,
}})
hookServer.WebhookMux.Handle("/health", http.StripPrefix("/health", &healthz.Handler{}))

if err := mgr.Start(ctx); err != nil {
klog.Errorf("failed to start webhook manager: %v", err)
return err
}

return nil
}
17 changes: 17 additions & 0 deletions cmd/webhook/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
package main

import (
"os"

apiserver "k8s.io/apiserver/pkg/server"
"k8s.io/component-base/cli"

"github.com/kosmos.io/kosmos/cmd/webhook/app"
)

func main() {
ctx := apiserver.SetupSignalContext()
cmd := app.NewWebhookCommand(ctx)
code := cli.Run(cmd)
os.Exit(code)
}
53 changes: 53 additions & 0 deletions deploy/webhook/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: kosmos-webhook
namespace: kosmos-system
spec:
selector:
matchLabels:
app: kosmos-webhook
template:
metadata:
labels:
app: kosmos-webhook
spec:
serviceAccountName: kosmos-webhook
affinity:
nodeAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
nodeSelectorTerms:
- matchExpressions:
- key: kosmos.io/node
operator: DoesNotExist
podAntiAffinity:
requiredDuringSchedulingIgnoredDuringExecution:
- labelSelector:
matchExpressions:
- key: app
operator: In
values:
- kosmos-webhook
namespaces:
- kosmos-system
topologyKey: kubernetes.io/hostname
containers:
- image: ghcr.io/kosmos-io/webhook:__VERSION__
name: kosmos-webhook
volumeMounts:
- name: tls
mountPath: "/etc/certs"
command:
- webhook
- --v=4
resources:
limits:
memory: 500Mi
cpu: 500m
requests:
cpu: 500m
memory: 500Mi
volumes:
- name: tls
secret:
secretName: kosmos-webhook-tls
27 changes: 27 additions & 0 deletions deploy/webhook/rbac.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: kosmos-webhook
namespace: kosmos-system
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kosmos-webhook
rules:
- apiGroups: ['*']
resources: ["nodes", "pods"]
verbs: ["*"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kosmos-webhook
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kosmos-webhook
subjects:
- kind: ServiceAccount
name: kosmos-webhook
namespace: kosmos-system
12 changes: 12 additions & 0 deletions deploy/webhook/service.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
apiVersion: v1
kind: Service
metadata:
name: kosmos-webhook
namespace: kosmos-system
spec:
ports:
- port: 9443
protocol: TCP
targetPort: 9443
selector:
app: kosmos-webhook
9 changes: 9 additions & 0 deletions deploy/webhook/tls-secret.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
apiVersion: v1
data:
tls.crt: __BASE64_SERVER_CRT__
tls.key: __BASE64_SERVER_KEY__
kind: Secret
metadata:
name: kosmos-webhook-tls
namespace: kosmos-system
type: kubernetes.io/tls
26 changes: 26 additions & 0 deletions deploy/webhook/validate-delete-pod-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
apiVersion: admissionregistration.k8s.io/v1
kind: ValidatingWebhookConfiguration
metadata:
name: "validate-delete-pod.kosmos.io"
webhooks:
- name: "validate-delete-pod.kosmos.io"
rules:
- apiGroups: [""]
apiVersions: ["v1"]
operations: ["DELETE"]
resources: ["pods"]
scope: "*"
admissionReviewVersions: ["v1"]
# FailurePolicy defines how unrecognized errors from the admission endpoint are handled - allowed values are
# Ignore or Fail. Defaults to Fail.
failurePolicy: Ignore
sideEffects: None
timeoutSeconds: 3
clientConfig:
service:
namespace: kosmos-system
name: kosmos-webhook
path: /validate-delete-pod
port: 9443
caBundle: |
__BASE64_CA_CRT__
36 changes: 36 additions & 0 deletions hack/gen-webhook-certs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
#!/bin/bash

WEBHOOK_NAME="kosmos-webhook"
NAMESPACE="kosmos-system"
DAYS="36500"

openssl genrsa -out ca.key 2048

openssl req -new -x509 -days ${DAYS} -key ca.key \
-subj "/C=CN/CN=${WEBHOOK_NAME}"\
-out ca.crt

openssl req -newkey rsa:2048 -nodes -keyout server.key \
-subj "/C=CN/CN=${WEBHOOK_NAME}" \
-out server.csr

openssl x509 -req \
-extfile <(printf "subjectAltName=DNS:${WEBHOOK_NAME}.${NAMESPACE}.svc") \
-days ${DAYS} \
-in server.csr \
-CA ca.crt -CAkey ca.key -CAcreateserial \
-out server.crt

echo
echo ">> Generating kube secrets..."
kubectl create secret tls ${WEBHOOK_NAME}-tls \
--cert=server.crt \
--key=server.key \
--dry-run=client -o yaml \
> tls-secret.yaml

echo
echo ">> MutatingWebhookConfiguration caBundle:"
cat ca.crt | base64 | fold

rm ca.crt ca.key ca.srl server.crt server.csr server.key
1 change: 1 addition & 0 deletions hack/util.sh
Original file line number Diff line number Diff line change
@@ -23,6 +23,7 @@ CLUSTERLINK_TARGET_SOURCE=(
clusterlink-controller-manager=cmd/clusterlink/controller-manager
clustertree-cluster-manager=cmd/clustertree/cluster-manager
kosmosctl=cmd/kosmosctl
webhook=cmd/webhook
)

#https://textkool.com/en/ascii-art-generator?hl=default&vl=default&font=DOS%20Rebel&text=KOSMOS
9 changes: 9 additions & 0 deletions pkg/utils/k8s.go
Original file line number Diff line number Diff line change
@@ -342,3 +342,12 @@ func ListResourceClusters(anno map[string]string) []string {
owners := strings.Split(anno[KosmosResourceOwnersAnnotations], ",")
return owners
}

func IsNotReady(node *corev1.Node) bool {
for _, condition := range node.Status.Conditions {
if condition.Type == corev1.NodeReady && condition.Status == corev1.ConditionTrue {
return false
}
}
return true
}
70 changes: 70 additions & 0 deletions pkg/webhook/validate_delete_pods.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package webhook

import (
"context"

admissionv1 "k8s.io/api/admission/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/klog/v2"
"k8s.io/utils/strings/slices"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/webhook/admission"

"github.com/kosmos.io/kosmos/pkg/utils"
)

type PodValidator struct {
client.Client
decoder *admission.Decoder
Options PodValidatorOptions
}

type PodValidatorOptions struct {
UsernamesNeedToPrevent []string
}

var _ admission.Handler = &PodValidator{}
var _ admission.DecoderInjector = &PodValidator{}

func (v *PodValidator) Handle(ctx context.Context, req admission.Request) admission.Response {
if req.Operation != admissionv1.Delete {
return admission.Allowed("")
}

pod := &corev1.Pod{}
err := v.decoder.DecodeRaw(req.OldObject, pod)
if err != nil {
klog.Warningf("Decode oldObject error: %v, skip", err)
return admission.Allowed("")
}

if pod.Spec.NodeName == "" {
klog.V(4).Infof("Pod %s's nodeName is empty, skip", err)
return admission.Allowed("")
}

node := &corev1.Node{}
if err = v.Client.Get(ctx, types.NamespacedName{
Name: pod.Spec.NodeName,
}, node); err != nil {
klog.V(4).Infof("Failed to get pod %s/%s's node, nodeName: %s, error: %v", pod.Namespace, pod.Name, pod.Spec.NodeName, err)
return admission.Allowed("")
}

if utils.IsKosmosNode(node) && utils.IsNotReady(node) && v.needToPrevent(req.UserInfo.Username) {
klog.Infof("Kosmos prevents pod deletion, name: %s, ns: %s", pod.Name, pod.Namespace)
return admission.Denied("Deleting pods of notReady kosmos nodes is not allowed.")
}

return admission.Allowed("")
}

func (v *PodValidator) InjectDecoder(d *admission.Decoder) error {
v.decoder = d
return nil
}

func (v *PodValidator) needToPrevent(username string) bool {
return slices.Contains(v.Options.UsernamesNeedToPrevent, username)
}