Skip to content

Commit 169b2c0

Browse files
authored
Merge pull request #2658 from 08volt/empty-zones
Instance Group Manager does not remove nodes with an empty string in the zone field
2 parents 2d7ee88 + 35c187b commit 169b2c0

File tree

6 files changed

+140
-24
lines changed

6 files changed

+140
-24
lines changed

pkg/instancegroups/manager.go

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,17 @@ func (m *manager) Sync(nodes []string, logger klog.Logger) (err error) {
310310
// https://github.com/kubernetes/cloud-provider-gcp/blob/fca628cb3bf9267def0abb509eaae87d2d4040f3/providers/gce/gce_loadbalancer_internal.go#L606C1-L675C1
311311
// the m.maxIGSize should be set to 1000 as is in the cloud-provider-gcp.
312312
zonedNodes := m.splitNodesByZone(nodes, iglogger)
313+
iglogger.Info(fmt.Sprintf("Syncing nodes: %d nodes over %d zones", len(nodes), len(zonedNodes)))
314+
315+
emptyZoneNodesNames := sets.NewString(zonedNodes[zonegetter.EmptyZone]...)
316+
if len(emptyZoneNodesNames) > 0 {
317+
iglogger.Info(fmt.Sprintf("%d nodes have empty zone: %v. They will not be removed from instance group as long as zone is missing", len(emptyZoneNodesNames), emptyZoneNodesNames))
318+
}
319+
313320
for zone, kubeNodesFromZone := range zonedNodes {
321+
if zone == zonegetter.EmptyZone {
322+
continue // skip ensuring instance group for empty zone
323+
}
314324
igName := m.namer.InstanceGroup()
315325
if len(kubeNodesFromZone) > m.maxIGSize {
316326
sortedKubeNodesFromZone := sets.NewString(kubeNodesFromZone...).List()
@@ -335,7 +345,12 @@ func (m *manager) Sync(nodes []string, logger klog.Logger) (err error) {
335345
gceNodes.Insert(instance)
336346
}
337347

338-
removeNodes := gceNodes.Difference(kubeNodes).List()
348+
removalCandidates := gceNodes.Difference(kubeNodes)
349+
iglogger.V(2).Info("Nodes that are removal candidates", "removalCandidates", events.TruncatedStringList(removalCandidates.List()))
350+
351+
removeNodes := removalCandidates.Difference(emptyZoneNodesNames).List() // Do not remove nodes which zone label still need to be assigned
352+
iglogger.V(2).Info("Removing nodes (after ignoring nodes without zone assigned)", "removeNodes", events.TruncatedStringList(removeNodes)) // Do not remove nodes which zone label still need to be assigned
353+
339354
addNodes := kubeNodes.Difference(gceNodes).List()
340355

341356
iglogger.V(2).Info("Removing nodes", "removeNodes", events.TruncatedStringList(removeNodes))

pkg/instancegroups/manager_test.go

Lines changed: 101 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,14 @@ package instancegroups
1818

1919
import (
2020
"fmt"
21-
apiv1 "k8s.io/api/core/v1"
22-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
23-
"k8s.io/ingress-gce/pkg/utils"
2421
"net/http"
2522
"strings"
2623
"testing"
2724

25+
apiv1 "k8s.io/api/core/v1"
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/ingress-gce/pkg/utils"
28+
2829
"google.golang.org/api/googleapi"
2930
"k8s.io/klog/v2"
3031

@@ -62,6 +63,103 @@ func newNodePool(f Provider, maxIGSize int) Manager {
6263
return pool
6364
}
6465

66+
// getNodeNames returns the keys of the given node-name → zone map as a slice.
// Map iteration order in Go is randomized, so the order of the returned slice
// is unspecified; callers must not rely on it.
func getNodeNames(nodes map[string]string) []string {
	// Pre-size to the known final length to avoid append re-growth.
	names := make([]string, 0, len(nodes))
	for name := range nodes { // idiomatic: drop the redundant blank identifier
		names = append(names, name)
	}
	return names
}
73+
74+
func TestNodePoolSyncWithEmptyZone(t *testing.T) {
75+
76+
testCases := []struct {
77+
desc string
78+
instanceGroupVMs []string // VMs present before the sync, all in defaultTestZone
79+
kubeNodes map[string]string // map of node:zone during the sync "ensure instance group"
80+
wantedInstanceGroupVMs []string // VMs that should be there at the end of the updates
81+
}{
82+
{
83+
desc: "Both nodes have zone during update do not get deleted",
84+
instanceGroupVMs: []string{"n1", "n2"},
85+
kubeNodes: map[string]string{
86+
"n1": defaultTestZone,
87+
"n2": defaultTestZone,
88+
},
89+
wantedInstanceGroupVMs: []string{"n1", "n2"},
90+
},
91+
{
92+
desc: "Create node when zone ready and do not delete node when zone empty",
93+
instanceGroupVMs: []string{"n1"},
94+
kubeNodes: map[string]string{
95+
"n1": "",
96+
"n2": defaultTestZone,
97+
},
98+
wantedInstanceGroupVMs: []string{"n1", "n2"},
99+
},
100+
{
101+
desc: "Do not delete nodes if zone is empty but delete if node not there",
102+
instanceGroupVMs: []string{"n1", "n2", "n3"},
103+
kubeNodes: map[string]string{
104+
"n2": "",
105+
"n3": defaultTestZone,
106+
},
107+
wantedInstanceGroupVMs: []string{"n2", "n3"},
108+
},
109+
{
110+
desc: "Do not create one Node without zone assigned",
111+
instanceGroupVMs: []string{"n1"},
112+
kubeNodes: map[string]string{
113+
"n1": defaultTestZone,
114+
"n2": "",
115+
},
116+
wantedInstanceGroupVMs: []string{"n1"},
117+
},
118+
}
119+
120+
for _, tc := range testCases {
121+
t.Run(tc.desc, func(t *testing.T) {
122+
// create fake gce node pool with nodes in instanceGroupVMs
123+
ig := &compute.InstanceGroup{Name: defaultNamer.InstanceGroup()}
124+
zonesToIGs := map[string]IGsToInstances{
125+
defaultTestZone: {
126+
ig: sets.NewString(tc.instanceGroupVMs...),
127+
},
128+
}
129+
fakeGCEInstanceGroups := NewFakeInstanceGroups(zonesToIGs, 10)
130+
131+
// assigne zones to nodes in kubeNodes
132+
pool := newNodePool(fakeGCEInstanceGroups, 10)
133+
for name, zone := range tc.kubeNodes {
134+
manager := pool.(*manager)
135+
zonegetter.AddFakeNodes(manager.ZoneGetter, zone, name)
136+
}
137+
138+
// run sync step
139+
nodeNames := getNodeNames(tc.kubeNodes)
140+
err := pool.Sync(nodeNames, klog.TODO())
141+
if err != nil {
142+
t.Fatalf("pool.Sync(%v) returned error %v, want nil", nodeNames, err)
143+
}
144+
145+
instancesList, err := fakeGCEInstanceGroups.ListInstancesInInstanceGroup(ig.Name, defaultTestZone, allInstances)
146+
if err != nil {
147+
t.Fatalf("fakeGCEInstanceGroups.ListInstancesInInstanceGroup(%s, %s, %s) returned error %v, want nil", ig.Name, defaultTestZone, allInstances, err)
148+
}
149+
instances, err := test.InstancesListToNameSet(instancesList)
150+
if err != nil {
151+
t.Fatalf("test.InstancesListToNameSet(%v) returned error %v, want nil", ig, err)
152+
}
153+
154+
// check nodes are exactly the ones we expect to have in the instance group after the sync
155+
wantedIGVMsSet := sets.NewString(tc.wantedInstanceGroupVMs...)
156+
if !wantedIGVMsSet.Equal(instances) {
157+
t.Errorf("Expected kubeNodeSet = %v is not equal to instance set = %v", wantedIGVMsSet, instances)
158+
}
159+
})
160+
}
161+
}
162+
65163
func TestNodePoolSync(t *testing.T) {
66164
maxIGSize := 1000
67165

pkg/neg/syncers/endpoints_calculator.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ func (l *LocalL4EndpointsCalculator) CalculateEndpoints(eds []types.EndpointsDat
107107
continue
108108
}
109109
zone, subnet, err := l.zoneGetter.ZoneAndSubnetForNode(node.Name, l.logger)
110-
if err != nil {
110+
if err != nil || zone == zonegetter.EmptyZone {
111111
l.logger.Error(err, "Unable to find zone for node, skipping", "nodeName", node.Name)
112112
metrics.PublishNegControllerErrorCountMetrics(err, true)
113113
continue
@@ -184,7 +184,7 @@ func (l *ClusterL4EndpointsCalculator) CalculateEndpoints(_ []types.EndpointsDat
184184
continue
185185
}
186186
zone, subnet, err := l.zoneGetter.ZoneAndSubnetForNode(node.Name, l.logger)
187-
if err != nil {
187+
if err != nil || zone == zonegetter.EmptyZone {
188188
l.logger.Error(err, "Unable to find zone for node skipping", "nodeName", node.Name)
189189
metrics.PublishNegControllerErrorCountMetrics(err, true)
190190
continue

pkg/neg/syncers/utils.go

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -470,15 +470,17 @@ func toZoneNetworkEndpointMapDegradedMode(eds []negtypes.EndpointsData, zoneGett
470470
localEPCount[negtypes.NodeMissing]++
471471
continue
472472
}
473-
zone, subnet, err := zoneGetter.ZoneAndSubnetForNode(nodeName, logger)
474-
if err != nil {
475-
metrics.PublishNegControllerErrorCountMetrics(err, true)
476-
if enableMultiSubnetCluster && errors.Is(err, zonegetter.ErrNodeNotInDefaultSubnet) {
477-
epLogger.Error(err, "Detected endpoint not from default subnet. Skipping", "nodeName", nodeName)
478-
localEPCount[negtypes.NodeInNonDefaultSubnet]++
479-
continue
473+
zone, subnet, getZoneErr := zoneGetter.ZoneAndSubnetForNode(nodeName, logger)
474+
if getZoneErr != nil || zone == zonegetter.EmptyZone {
475+
if getZoneErr != nil {
476+
metrics.PublishNegControllerErrorCountMetrics(getZoneErr, true)
477+
if enableMultiSubnetCluster && errors.Is(getZoneErr, zonegetter.ErrNodeNotInDefaultSubnet) {
478+
epLogger.Error(getZoneErr, "Detected endpoint not from default subnet. Skipping", "nodeName", nodeName)
479+
localEPCount[negtypes.NodeInNonDefaultSubnet]++
480+
continue
481+
}
480482
}
481-
epLogger.Error(err, "Endpoint's corresponding node does not have valid zone information, skipping", "nodeName", nodeName)
483+
epLogger.Error(getZoneErr, "Endpoint's corresponding node does not have valid zone information, skipping", "nodeName", nodeName)
482484
localEPCount[negtypes.NodeNotFound]++
483485
continue
484486
}

pkg/utils/zonegetter/zone_getter.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@ const (
4848
AllNodesFilter = Filter("AllNodesFilter")
4949
CandidateNodesFilter = Filter("CandidateNodesFilter")
5050
CandidateAndUnreadyNodesFilter = Filter("CandidateAndUnreadyNodesFilter")
51+
EmptyZone = ""
5152
)
5253

5354
var ErrProviderIDNotFound = errors.New("providerID not found")
@@ -58,7 +59,7 @@ var ErrNodePodCIDRNotSet = errors.New("Node does not have PodCIDR set")
5859

5960
// providerIDRE is the regex to process providerID.
6061
// A providerID is build out of '${ProviderName}://${project-id}/${zone}/${instance-name}'
61-
var providerIDRE = regexp.MustCompile(`^` + "gce" + `://([^/]+)/([^/]+)/([^/]+)$`)
62+
var providerIDRE = regexp.MustCompile(`^` + "gce" + `://([^/]+)/([^/]*)/([^/]+)$`)
6263

6364
// ZoneGetter manages lookups for GCE instances to zones.
6465
type ZoneGetter struct {
@@ -103,6 +104,9 @@ func (z *ZoneGetter) ZoneAndSubnetForNode(name string, logger klog.Logger) (stri
103104
nodeLogger.Error(err, "Failed to get zone from the providerID")
104105
return "", "", err
105106
}
107+
if zone == EmptyZone {
108+
return EmptyZone, "", nil
109+
}
106110

107111
subnet, err := getSubnet(node, z.defaultSubnetURL)
108112
if err != nil {
@@ -168,7 +172,7 @@ func (z *ZoneGetter) ListZones(filter Filter, logger klog.Logger) ([]string, err
168172
zones := sets.String{}
169173
for _, n := range nodes {
170174
zone, err := getZone(n)
171-
if err != nil {
175+
if err != nil || zone == EmptyZone {
172176
filterLogger.Error(err, "Failed to get zone from providerID", "nodeName", n.Name)
173177
continue
174178
}
@@ -321,9 +325,6 @@ func getZone(node *api_v1.Node) (string, error) {
321325
if len(matches) != 4 {
322326
return "", fmt.Errorf("%w: providerID %q of node %s is not valid", ErrSplitProviderID, node.Spec.ProviderID, node.Name)
323327
}
324-
if matches[2] == "" {
325-
return "", fmt.Errorf("%w: node %s has an empty zone", ErrSplitProviderID, node.Name)
326-
}
327328
return matches[2], nil
328329
}
329330

pkg/utils/zonegetter/zone_getter_test.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ func TestListZones(t *testing.T) {
6868
t.Errorf("For test case %q with onlyIncludeDefaultSubnetNodes = %v, got %d zones, want %d zones", tc.desc, enableMultiSubnetCluster, len(zones), tc.expectLen)
6969
}
7070
for _, zone := range zones {
71-
if zone == "" {
71+
if zone == EmptyZone {
7272
t.Errorf("For test case %q with onlyIncludeDefaultSubnetNodes = %v, got an empty zone,", tc.desc, enableMultiSubnetCluster)
7373
}
7474
}
@@ -113,7 +113,7 @@ func TestListZonesMultipleSubnets(t *testing.T) {
113113
t.Errorf("For test case %q with multi subnet cluster enabled, got %d zones, want %d zones", tc.desc, len(zones), tc.expectLen)
114114
}
115115
for _, zone := range zones {
116-
if zone == "" {
116+
if zone == EmptyZone {
117117
t.Errorf("For test case %q with multi subnet cluster enabled, got an empty zone,", tc.desc)
118118
}
119119
}
@@ -245,7 +245,7 @@ func TestZoneAndSubnetForNode(t *testing.T) {
245245
nodeName: "instance-empty-zone-providerID",
246246
expectZone: "",
247247
expectedSubnet: "",
248-
expectErr: ErrSplitProviderID,
248+
expectErr: nil,
249249
},
250250
}
251251
for _, tc := range testCases {
@@ -374,8 +374,8 @@ func TestGetZone(t *testing.T) {
374374
ProviderID: "gce://foo-project//bar-node",
375375
},
376376
},
377-
expectZone: "",
378-
expectErr: ErrSplitProviderID,
377+
expectZone: EmptyZone,
378+
expectErr: nil,
379379
},
380380
} {
381381
zone, err := getZone(&tc.node)

0 commit comments

Comments
 (0)