Add pod anti affinity rule by default
Right now, the Kubernetes scheduler can place replicas of the same
service on the same node. This is undesirable for high availability (HA).
It is also impossible for the user to specify any kind of affinity rules
for the pods.

This PR adds the ability to configure an affinity spec in the config. It
also adds a default anti-affinity rule so that the Kubernetes scheduler
does its best to schedule different replicas of a service onto different
nodes.
rgrandl committed Jun 17, 2024
1 parent b7b4fd5 commit 55c239b
Showing 2 changed files with 50 additions and 3 deletions.
6 changes: 6 additions & 0 deletions internal/impl/config.go
@@ -83,6 +83,12 @@ type kubeConfig struct {
     // [1] https://pkg.go.dev/k8s.io/kubernetes/pkg/apis/autoscaling#HorizontalPodAutoscalerSpec.
     ScalingSpec *autoscalingv2.HorizontalPodAutoscalerSpec

+    // Specs for pod affinity. Note that the affinity specs should satisfy
+    // the format specified in [1].
+    //
+    // [1] https://pkg.go.dev/k8s.io/api/core/v1#Affinity
+    AffinitySpec *corev1.Affinity
+
     // Volumes that should be provided to all the running components.
     StorageSpec volumeSpecs

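For illustration only, here is a hedged sketch of the kind of value the new AffinitySpec field can carry, written with the Kubernetes Go types from k8s.io/api/core/v1 (the zone label value is a made-up example, not part of this change):

    // A hypothetical user-supplied affinity: only schedule pods onto nodes
    // in a particular zone.
    affinity := &corev1.Affinity{
        NodeAffinity: &corev1.NodeAffinity{
            RequiredDuringSchedulingIgnoredDuringExecution: &corev1.NodeSelector{
                NodeSelectorTerms: []corev1.NodeSelectorTerm{{
                    MatchExpressions: []corev1.NodeSelectorRequirement{{
                        Key:      "topology.kubernetes.io/zone",
                        Operator: corev1.NodeSelectorOpIn,
                        Values:   []string{"us-central1-a"}, // made-up zone
                    }},
                }},
            },
        },
    }

A value like this would then be merged with the default anti-affinity rule by updateAffinitySpec, shown in kube.go below.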
47 changes: 44 additions & 3 deletions internal/impl/kube.go
@@ -96,6 +96,10 @@ func buildDeployment(d deployment, g group) (*appsv1.Deployment, error) {
         dnsPolicy = corev1.DNSClusterFirstWithHostNet
     }

+    matchLabels := map[string]string{
+        "serviceweaver/name": name,
+    }
+
     // Create container.
     container, err := buildContainer(d, g)
     if err != nil {
@@ -121,9 +125,7 @@ func buildDeployment(d deployment, g group) (*appsv1.Deployment, error) {
         },
         Spec: appsv1.DeploymentSpec{
             Selector: &metav1.LabelSelector{
-                MatchLabels: map[string]string{
-                    "serviceweaver/name": name,
-                },
+                MatchLabels: matchLabels,
             },
             Template: corev1.PodTemplateSpec{
                 ObjectMeta: metav1.ObjectMeta{
@@ -138,6 +140,7 @@ func buildDeployment(d deployment, g group) (*appsv1.Deployment, error) {
                 Containers:  []corev1.Container{container},
                 DNSPolicy:   dnsPolicy,
                 HostNetwork: d.config.UseHostNetwork,
+                Affinity:    updateAffinitySpec(d.config.AffinitySpec, matchLabels),
                 Volumes: []corev1.Volume{
                     {
                         Name: "config",
@@ -711,7 +714,45 @@ func newDeployment(app *protos.AppConfig, cfg *kubeConfig, depId, image string)
         app:    app,
         groups: sorted,
     }, nil
 }

+// updateAffinitySpec updates an affinity spec with a rule that instructs the
+// kubernetes scheduler to try its best to assign different replicas for the same
+// deployment to different nodes.
+//
+// Note that this rule isn't necessarily enforced, and the scheduler can ignore
+// it if there is no way this can be done (e.g., number of replicas is greater or
+// equal to the number of nodes).
+func updateAffinitySpec(spec *corev1.Affinity, labels map[string]string) *corev1.Affinity {
+    updated := &corev1.Affinity{PodAntiAffinity: &corev1.PodAntiAffinity{
+        PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{
+            {
+                Weight: 100,
+                PodAffinityTerm: corev1.PodAffinityTerm{
+                    LabelSelector: &metav1.LabelSelector{
+                        MatchLabels: labels,
+                    },
+                    TopologyKey: corev1.LabelHostname,
+                },
+            },
+        },
+    },
+    }
+    if spec == nil {
+        return updated
+    }
+    if spec.NodeAffinity != nil {
+        updated.NodeAffinity = spec.NodeAffinity
+    }
+    if spec.PodAffinity != nil {
+        updated.PodAffinity = spec.PodAffinity
+    }
+    if spec.PodAntiAffinity != nil {
+        updated.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution = spec.PodAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution
+        updated.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution = append(
+            updated.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution, spec.PodAntiAffinity.PreferredDuringSchedulingIgnoredDuringExecution...)
+    }
+    return updated
+}

 // newListener returns a new listener.
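To make the merge semantics concrete, here is a hypothetical usage sketch (the "cache" and "echo" labels are invented for illustration): a user-supplied PodAntiAffinity is not discarded; its required terms are copied over and its preferred terms are appended after the default spreading rule.

    // Hypothetical user spec: never co-locate these pods with "cache" pods
    // on the same node.
    userSpec := &corev1.Affinity{
        PodAntiAffinity: &corev1.PodAntiAffinity{
            RequiredDuringSchedulingIgnoredDuringExecution: []corev1.PodAffinityTerm{{
                LabelSelector: &metav1.LabelSelector{
                    MatchLabels: map[string]string{"app": "cache"},
                },
                TopologyKey: corev1.LabelHostname,
            }},
        },
    }

    merged := updateAffinitySpec(userSpec, map[string]string{"serviceweaver/name": "echo"})
    // merged now holds the user's required anti-affinity term plus the default
    // preferred term that spreads the "echo" replicas across nodes.
    _ = merged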
