20 changes: 10 additions & 10 deletions cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go
@@ -1046,7 +1046,7 @@ func runSimpleScaleUpTest(t *testing.T, config *ScaleUpTestConfig) *ScaleUpTestR
err = autoscalingCtx.ClusterSnapshot.SetClusterState(nodes, kube_util.ScheduledPods(pods), nil)
assert.NoError(t, err)
nodeInfos, err := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).
- Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
assert.NoError(t, err)
clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, autoscalingCtx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(options.NodeGroupDefaults), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
clusterState.UpdateNodes(nodes, nodeInfos, time.Now())
@@ -1155,7 +1155,7 @@ func TestScaleUpUnhealthy(t *testing.T) {
assert.NoError(t, err)
err = autoscalingCtx.ClusterSnapshot.SetClusterState(nodes, pods, nil)
assert.NoError(t, err)
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, autoscalingCtx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
clusterState.UpdateNodes(nodes, nodeInfos, time.Now())
p3 := BuildTestPod("p-new", 550, 0)
@@ -1199,7 +1199,7 @@ func TestBinpackingLimiter(t *testing.T) {
err = autoscalingCtx.ClusterSnapshot.SetClusterState(nodes, nil, nil)
assert.NoError(t, err)
nodeInfos, err := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).
- Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
assert.NoError(t, err)

clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, autoscalingCtx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
@@ -1258,7 +1258,7 @@ func TestScaleUpNoHelp(t *testing.T) {
assert.NoError(t, err)
err = autoscalingCtx.ClusterSnapshot.SetClusterState(nodes, pods, nil)
assert.NoError(t, err)
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, autoscalingCtx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
clusterState.UpdateNodes(nodes, nodeInfos, time.Now())
p3 := BuildTestPod("p-new", 500, 0)
@@ -1413,7 +1413,7 @@ func TestComputeSimilarNodeGroups(t *testing.T) {
assert.NoError(t, err)
err = autoscalingCtx.ClusterSnapshot.SetClusterState(nodes, nil, nil)
assert.NoError(t, err)
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, autoscalingCtx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
assert.NoError(t, clusterState.UpdateNodes(nodes, nodeInfos, time.Now()))

@@ -1497,7 +1497,7 @@ func TestScaleUpBalanceGroups(t *testing.T) {
assert.NoError(t, err)
err = autoscalingCtx.ClusterSnapshot.SetClusterState(nodes, podList, nil)
assert.NoError(t, err)
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, autoscalingCtx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
clusterState.UpdateNodes(nodes, nodeInfos, time.Now())

@@ -1568,7 +1568,7 @@ func TestScaleUpAutoprovisionedNodeGroup(t *testing.T) {
processors.NodeGroupManager = &MockAutoprovisioningNodeGroupManager{T: t, ExtraGroups: 0}

nodes := []*apiv1.Node{}
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())

suOrchestrator := New()
suOrchestrator.Initialize(&autoscalingCtx, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{})
@@ -1619,7 +1619,7 @@ func TestScaleUpBalanceAutoprovisionedNodeGroups(t *testing.T) {
processors.NodeGroupManager = &MockAutoprovisioningNodeGroupManager{T: t, ExtraGroups: 2}

nodes := []*apiv1.Node{}
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())

suOrchestrator := New()
suOrchestrator.Initialize(&autoscalingCtx, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{})
@@ -1673,7 +1673,7 @@ func TestScaleUpToMeetNodeGroupMinSize(t *testing.T) {
nodes := []*apiv1.Node{n1, n2}
err = autoscalingCtx.ClusterSnapshot.SetClusterState(nodes, nil, nil)
assert.NoError(t, err)
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())
processors := processorstest.NewTestProcessors(&autoscalingCtx)
clusterState := clusterstate.NewClusterStateRegistry(provider, clusterstate.ClusterStateRegistryConfig{}, autoscalingCtx.LogRecorder, NewBackoff(), nodegroupconfig.NewDefaultNodeGroupConfigProcessor(config.NodeGroupAutoscalingOptions{MaxNodeProvisionTime: 15 * time.Minute}), asyncnodegroups.NewDefaultAsyncNodeGroupStateChecker())
clusterState.UpdateNodes(nodes, nodeInfos, time.Now())
@@ -1768,7 +1768,7 @@ func TestScaleupAsyncNodeGroupsEnabled(t *testing.T) {
processors.AsyncNodeGroupStateChecker = &asyncnodegroups.MockAsyncNodeGroupStateChecker{IsUpcomingNodeGroup: tc.isUpcomingMockMap}

nodes := []*apiv1.Node{}
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())

suOrchestrator := New()
suOrchestrator.Initialize(&autoscalingCtx, processors, clusterState, newEstimatorBuilder(), taints.TaintConfig{})
8 changes: 4 additions & 4 deletions cluster-autoscaler/core/scaleup/resource/manager_test.go
@@ -73,7 +73,7 @@ func TestDeltaForNode(t *testing.T) {
group, nodes := newNodeGroup(t, cp, ng.Name, ng.Min, ng.Max, ng.Size, ng.CPU, ng.Mem)
err := autoscalingCtx.ClusterSnapshot.SetClusterState(nodes, nil, nil)
assert.NoError(t, err)
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*corev1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())

rm := NewManager(processors.CustomResourcesProcessor)
delta, err := rm.DeltaForNode(&autoscalingCtx, nodeInfos[ng.Name], group)
@@ -116,7 +116,7 @@ func TestResourcesLeft(t *testing.T) {
_, nodes := newNodeGroup(t, cp, ng.Name, ng.Min, ng.Max, ng.Size, ng.CPU, ng.Mem)
err := autoscalingCtx.ClusterSnapshot.SetClusterState(nodes, nil, nil)
assert.NoError(t, err)
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*corev1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())

rm := NewManager(processors.CustomResourcesProcessor)
left, err := rm.ResourcesLeft(&autoscalingCtx, nodeInfos, nodes)
@@ -169,7 +169,7 @@ func TestApplyLimits(t *testing.T) {
group, nodes := newNodeGroup(t, cp, ng.Name, ng.Min, ng.Max, ng.Size, ng.CPU, ng.Mem)
err := autoscalingCtx.ClusterSnapshot.SetClusterState(nodes, nil, nil)
assert.NoError(t, err)
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*corev1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())

rm := NewManager(processors.CustomResourcesProcessor)
newCount, err := rm.ApplyLimits(&autoscalingCtx, testCase.newNodeCount, testCase.resourcesLeft, nodeInfos[testCase.nodeGroupConfig.Name], group)
@@ -236,7 +236,7 @@ func TestResourceManagerWithGpuResource(t *testing.T) {
nodes := []*corev1.Node{n1}
err = autoscalingCtx.ClusterSnapshot.SetClusterState(nodes, nil, nil)
assert.NoError(t, err)
- nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())
+ nodeInfos, _ := nodeinfosprovider.NewDefaultTemplateNodeInfoProvider(nil, false).Process(&autoscalingCtx, nodes, []*corev1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, time.Now())

rm := NewManager(processors.CustomResourcesProcessor)

2 changes: 1 addition & 1 deletion cluster-autoscaler/core/static_autoscaler.go
@@ -348,7 +348,7 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
return typedErr.AddPrefix("failed to initialize RemainingPdbTracker: ")
}

- nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingCtx, readyNodes, daemonsets, a.taintConfig, currentTime)
+ nodeInfosForGroups, autoscalerError := a.processors.TemplateNodeInfoProvider.Process(autoscalingCtx, readyNodes, allNodes, daemonsets, a.taintConfig, currentTime)
if autoscalerError != nil {
klog.Errorf("Failed to get node infos for groups: %v", autoscalerError)
return autoscalerError.AddPrefix("failed to build node infos for node groups: ")
@@ -47,8 +47,8 @@ func NewCustomAnnotationNodeInfoProvider(templateNodeInfoProvider TemplateNodeIn
}

// Process returns the nodeInfos set for this cluster.
- func (p *AnnotationNodeInfoProvider) Process(autoscalingCtx *ca_context.AutoscalingContext, nodes []*apiv1.Node, daemonsets []*appsv1.DaemonSet, taintConfig taints.TaintConfig, currentTime time.Time) (map[string]*framework.NodeInfo, errors.AutoscalerError) {
- nodeInfos, err := p.templateNodeInfoProvider.Process(autoscalingCtx, nodes, daemonsets, taintConfig, currentTime)
+ func (p *AnnotationNodeInfoProvider) Process(autoscalingCtx *ca_context.AutoscalingContext, readyNodes []*apiv1.Node, allNodes []*apiv1.Node, daemonsets []*appsv1.DaemonSet, taintConfig taints.TaintConfig, currentTime time.Time) (map[string]*framework.NodeInfo, errors.AutoscalerError) {
+ nodeInfos, err := p.templateNodeInfoProvider.Process(autoscalingCtx, readyNodes, allNodes, daemonsets, taintConfig, currentTime)
if err != nil {
return nil, err
}
@@ -40,8 +40,8 @@ func NewAsgTagResourceNodeInfoProvider(t *time.Duration, forceDaemonSets bool) *
}

// Process returns the nodeInfos set for this cluster.
- func (p *AsgTagResourceNodeInfoProvider) Process(autoscalingCtx *ca_context.AutoscalingContext, nodes []*apiv1.Node, daemonsets []*appsv1.DaemonSet, taintConfig taints.TaintConfig, currentTime time.Time) (map[string]*framework.NodeInfo, errors.AutoscalerError) {
- nodeInfos, err := p.mixedTemplateNodeInfoProvider.Process(autoscalingCtx, nodes, daemonsets, taintConfig, currentTime)
+ func (p *AsgTagResourceNodeInfoProvider) Process(autoscalingCtx *ca_context.AutoscalingContext, readyNodes []*apiv1.Node, allNodes []*apiv1.Node, daemonsets []*appsv1.DaemonSet, taintConfig taints.TaintConfig, currentTime time.Time) (map[string]*framework.NodeInfo, errors.AutoscalerError) {
+ nodeInfos, err := p.mixedTemplateNodeInfoProvider.Process(autoscalingCtx, readyNodes, allNodes, daemonsets, taintConfig, currentTime)
if err != nil {
return nil, err
}
@@ -72,7 +72,7 @@ func (p *MixedTemplateNodeInfoProvider) CleanUp() {
}

// Process returns the nodeInfos set for this cluster
- func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.AutoscalingContext, nodes []*apiv1.Node, daemonsets []*appsv1.DaemonSet, taintConfig taints.TaintConfig, now time.Time) (map[string]*framework.NodeInfo, caerror.AutoscalerError) {
+ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.AutoscalingContext, readyNodes []*apiv1.Node, allNodes []*apiv1.Node, daemonsets []*appsv1.DaemonSet, taintConfig taints.TaintConfig, now time.Time) (map[string]*framework.NodeInfo, caerror.AutoscalerError) {
// TODO(mwielgus): This returns map keyed by url, while most code (including scheduler) uses node.Name for a key.
// TODO(mwielgus): Review error policy - sometimes we may continue with partial errors.
result := make(map[string]*framework.NodeInfo)
@@ -103,7 +103,7 @@ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.Autos
return false, "", nil
}

- for _, node := range nodes {
+ for _, node := range readyNodes {
// Broken nodes might have some stuff missing. Skipping.
if !isNodeGoodTemplateCandidate(node, now) {
continue
@@ -156,7 +156,10 @@ func (p *MixedTemplateNodeInfoProvider) Process(autoscalingCtx *ca_context.Autos
}

// Last resort - unready/unschedulable nodes.
- for _, node := range nodes {
+ // we want to check not only the ready nodes, but also ready unschedulable nodes.
+ // this needs to combine readyNodes and allNodes due to filtering that occurs at
+ // a higher level.
+ for _, node := range append(readyNodes, allNodes...) {

Contributor:
Two things:

Isn't readyNodes a subset of allNodes? In which case this will range over the nodes in readyNodes twice.

Also, how do we know the diff of allNodes - readyNodes are nodes of type Ready + Unschedulable? Aren't there going to be other types of nodes not classified as readyNodes in that set (for example, various flavors of NotReady nodes)?

Contributor Author:

> Isn't readyNodes a subset of allNodes? In which case this will range over the nodes in readyNodes twice.

readyNodes is not a pure subset of allNodes; there is some filtering that removes some of the nodes in readyNodes from allNodes.

If you change this line to use only allNodes, you will see some unit tests fail.

> Also, how do we know the diff of allNodes - readyNodes are nodes of type Ready + Unschedulable? Aren't there going to be other types of nodes not classified as readyNodes in that set (for example, various flavors of NotReady nodes)?

I did look at how readyNodes and allNodes are created; the filtering happens in this function: https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/core/static_autoscaler.go#L993

I have run a series of tests using allNodes, and also a version of this patch that specifically creates a readyUnschedulableNodes list. After running both tests and putting them through CI, I am more convinced that using allNodes here is the appropriate thing to do. Adding a new lister for "ready unschedulable" nodes did not change the results of my testing, and it makes the code more complicated. This is why I went with the allNodes approach.
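To make the set difference being discussed concrete, a throwaway sketch along these lines (hypothetical, not part of the patch) would list the nodes that appear in allNodes but not in readyNodes after the upstream filtering:

```go
// Hypothetical helper, not in this PR: show which nodes the last-resort loop gains
// by also iterating allNodes, i.e. nodes present in allNodes but not in readyNodes.
package main

import (
	"fmt"

	apiv1 "k8s.io/api/core/v1"
)

func extraNodes(readyNodes, allNodes []*apiv1.Node) []*apiv1.Node {
	ready := make(map[string]bool, len(readyNodes))
	for _, node := range readyNodes {
		ready[node.Name] = true
	}
	var extra []*apiv1.Node
	for _, node := range allNodes {
		if !ready[node.Name] {
			extra = append(extra, node)
		}
	}
	return extra
}

func main() {
	// Fed with the lists produced by the filtering linked above, this would print the
	// nodes (e.g. NotReady or ready-but-unschedulable ones) that are missing from readyNodes.
	for _, node := range extraNodes(nil, nil) {
		fmt.Println(node.Name)
	}
}
```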

Contributor Author:

> Aren't there going to be other types of nodes not classified as readyNodes in that set (for example, various flavors of NotReady nodes)?

One more point about this: looking at the function in the mixed node info processor, we can see that conditions other than just "ready unschedulable" are checked for. I think the original intent of this function was to look at all nodes.

https://github.com/kubernetes/autoscaler/blob/master/cluster-autoscaler/processors/nodeinfosprovider/mixed_nodeinfos_processor.go#L181

Contributor:

@towca does this seem like the right path forward to you as well?

Contributor:

> there is some filtering that occurs to remove some of the nodes in readyNodes from allNodes.

Should we be more precise about how we append allNodes to readyNodes to avoid duplicates? Or are we confident that the effort to de-dup is equivalent or more costly than duplicate processing?

Collaborator:

I attempted a bit of archaeology and found the PR that added the filtering by readiness to this logic: #72. This is pretty bizarre, because even way back then it seems that this logic would only see Ready Nodes:

In any case, IMO the most readable change would be to:

  • Start passing allNodes instead of readyNodes to TemplateNodeInfoProvider.Process() without changing the signature. This is what the interface definition suggests anyway.
  • At the beginning of MixedTemplateNodeInfoProvider.Process(), group the passed allNodes into good and bad candidates utilizing isNodeGoodTemplateCandidate(). Then iterate over the good ones in the first loop, and over the bad ones in the last loop.

This should work because:

  • readyNodes should be a subset of allNodes. So the logic should see all the same Nodes as before + additional ones. This is a bit murky because the Node lists are modified by CustomResourcesProcessor after being listed. CustomResourcesProcessor implementations should only remove Nodes from readyNodes and hack their Ready condition in allNodes. This is what the in-tree implementations do; if an out-of-tree implementation breaks the assumption, the lists might not be subsets, but IMO this isn't a supported case and such implementations should be ready for things breaking.
  • The set of conditions checked in isNodeGoodTemplateCandidate() (ready && stable && schedulable && !toBeDeleted) is a superset of the conditions by which readyNodes are filtered from allNodes (ready && schedulable). Both places use the same kube_util.GetReadinessState() function for determining the ready part; the schedulable part is just checking the same field on the Node.
  • Based on the two above if a Node is in allNodes, but not readyNodes, isNodeGoodTemplateCandidate() should always return false for it. So all the Nodes from allNodes - readyNodes should be categorized as bad candidates like we want.
  • And if a Node is in readyNodes, it should also be in allNodes and the result of isNodeGoodTemplateCandidate() should be identical for both versions. So the good candidates determined from allNodes via isNodeGoodTemplateCandidate() should be exactly the same as determined from readyNodes like we do now.

Does that make sense? @x13n could you double-check my logic here?

@elmiko IIUC you attempted something like this and got unit test failures? Could you describe what kind? I could definitely see the tests just being too coupled to the current implementation.
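For illustration, a minimal sketch of the grouping step proposed above, assuming it lives in the same package as MixedTemplateNodeInfoProvider and that the predicate has the same shape as isNodeGoodTemplateCandidate — this is a sketch of the suggestion, not code from this PR:

```go
// Sketch only: partition the incoming allNodes once, so Process() could run its first
// loop over the good template candidates and its last-resort loop over the rest.
package nodeinfosprovider

import (
	"time"

	apiv1 "k8s.io/api/core/v1"
)

func splitTemplateCandidates(allNodes []*apiv1.Node, now time.Time,
	isGood func(node *apiv1.Node, now time.Time) bool) (good, bad []*apiv1.Node) {
	for _, node := range allNodes {
		if isGood(node, now) {
			good = append(good, node)
		} else {
			bad = append(bad, node)
		}
	}
	return good, bad
}
```

With a split like this, readyNodes would no longer need to be passed separately, which matches the first bullet above.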

Contributor Author:

> Should we be more precise about how we append allNodes to readyNodes to avoid duplicates? Or are we confident that the effort to de-dup is equivalent or more costly than duplicate processing?

At this point in the processing, I don't think the duplicates are an issue.
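If de-duplication ever did become necessary here, a name-keyed filter would be a small change; the sketch below is hypothetical and not part of this PR:

```go
// Hypothetical alternative, not in this PR: visit each node once even when it appears
// in both readyNodes and allNodes, by tracking node names already seen.
package nodeinfosprovider

import apiv1 "k8s.io/api/core/v1"

func uniqueNodes(readyNodes, allNodes []*apiv1.Node) []*apiv1.Node {
	seen := make(map[string]bool, len(readyNodes)+len(allNodes))
	result := make([]*apiv1.Node, 0, len(readyNodes)+len(allNodes))
	for _, node := range append(readyNodes, allNodes...) {
		if seen[node.Name] {
			continue
		}
		seen[node.Name] = true
		result = append(result, node)
	}
	return result
}
```

Given the conclusion above that duplicates are harmless at this point, the extra bookkeeping is arguably not worth it.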

Contributor Author:

> @elmiko IIUC you attempted something like this and got unit test failures? Could you describe what kind? I could definitely see the tests just being too coupled to the current implementation.

I will need to run the unit tests again, but essentially, if only allNodes is used in the final clause of the mixed node infos processor's Process function, then a few tests fail. My impression is that the filtering done by the custom node processor, or by the processor that filters out nodes with startup taints, is causing the issues.

I can certainly take another look at passing only allNodes to Process. I didn't want to break anything else though XD

Contributor Author:

This is one of the main failures I see when changing to use only allNodes:

Generated by running `go test ./...` in the cluster-autoscaler/core directory.

--- FAIL: TestScaleUpToMeetNodeGroupMinSize (0.00s)
    orchestrator_test.go:1684:
                Error Trace:    /home/mike/dev/kubernetes-autoscaler/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go:1684
                Error:          Received unexpected error:
                                could not compute total resources: No node info for: ng1
                Test:           TestScaleUpToMeetNodeGroupMinSize
    orchestrator_test.go:1685:
                Error Trace:    /home/mike/dev/kubernetes-autoscaler/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go:1685
                Error:          Should be true
                Test:           TestScaleUpToMeetNodeGroupMinSize
    orchestrator_test.go:1686:
                Error Trace:    /home/mike/dev/kubernetes-autoscaler/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go:1686
                Error:          Not equal:
                                expected: 1
                                actual  : 0
                Test:           TestScaleUpToMeetNodeGroupMinSize
panic: runtime error: index out of range [0] with length 0 [recovered]
        panic: runtime error: index out of range [0] with length 0

goroutine 81 [running]:
testing.tRunner.func1.2({0x2acda20, 0xc0003fd5a8})
        /home/mike/sdk/go1.24.0/src/testing/testing.go:1734 +0x21c
testing.tRunner.func1()
        /home/mike/sdk/go1.24.0/src/testing/testing.go:1737 +0x35e
panic({0x2acda20?, 0xc0003fd5a8?})
        /home/mike/sdk/go1.24.0/src/runtime/panic.go:787 +0x132
k8s.io/autoscaler/cluster-autoscaler/core/scaleup/orchestrator.TestScaleUpToMeetNodeGroupMinSize(0xc000d16e00)
        /home/mike/dev/kubernetes-autoscaler/cluster-autoscaler/core/scaleup/orchestrator/orchestrator_test.go:1687 +0x11fa
testing.tRunner(0xc000d16e00, 0x2e31b28)
        /home/mike/sdk/go1.24.0/src/testing/testing.go:1792 +0xf4
created by testing.(*T).Run in goroutine 1
        /home/mike/sdk/go1.24.0/src/testing/testing.go:1851 +0x413
FAIL    k8s.io/autoscaler/cluster-autoscaler/core/scaleup/orchestrator  0.044s
--- FAIL: TestDeltaForNode (0.00s)
panic: runtime error: invalid memory address or nil pointer dereference [recovered]
        panic: runtime error: invalid memory address or nil pointer dereference
[signal SIGSEGV: segmentation violation code=0x1 addr=0x0 pc=0x246c737]

goroutine 98 [running]:
testing.tRunner.func1.2({0x27e4140, 0x4c18f50})
        /home/mike/sdk/go1.24.0/src/testing/testing.go:1734 +0x21c
testing.tRunner.func1()
        /home/mike/sdk/go1.24.0/src/testing/testing.go:1737 +0x35e
panic({0x27e4140?, 0x4c18f50?})
        /home/mike/sdk/go1.24.0/src/runtime/panic.go:787 +0x132
k8s.io/autoscaler/cluster-autoscaler/simulator/framework.(*NodeInfo).Node(...)
        /home/mike/dev/kubernetes-autoscaler/cluster-autoscaler/simulator/framework/infos.go:66
k8s.io/autoscaler/cluster-autoscaler/core/scaleup/resource.(*Manager).DeltaForNode(0xc000ab1ce0, 0xc000aa6008, 0x0, {0x30df800, 0xc000703700})
        /home/mike/dev/kubernetes-autoscaler/cluster-autoscaler/core/scaleup/resource/manager.go:64 +0x57
k8s.io/autoscaler/cluster-autoscaler/core/scaleup/resource.TestDeltaForNode(0xc000103500)
        /home/mike/dev/kubernetes-autoscaler/cluster-autoscaler/core/scaleup/resource/manager_test.go:79 +0x5e5
testing.tRunner(0xc000103500, 0x2ddcaa0)
        /home/mike/sdk/go1.24.0/src/testing/testing.go:1792 +0xf4
created by testing.(*T).Run in goroutine 1
        /home/mike/sdk/go1.24.0/src/testing/testing.go:1851 +0x413
FAIL    k8s.io/autoscaler/cluster-autoscaler/core/scaleup/resource      0.029s
?       k8s.io/autoscaler/cluster-autoscaler/core/test  [no test files]
ok      k8s.io/autoscaler/cluster-autoscaler/core/utils 0.025s
FAIL

Contributor Author:

Actually, looking at this failure again, I think it's due to my change in the test.

// Allowing broken nodes
if isNodeGoodTemplateCandidate(node, now) {
continue
@@ -95,7 +95,7 @@ func TestGetNodeInfosForGroups(t *testing.T) {
ListerRegistry: registry,
},
}
- res, err := NewMixedTemplateNodeInfoProvider(&cacheTtl, false).Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ res, err := NewMixedTemplateNodeInfoProvider(&cacheTtl, false).Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
assert.NoError(t, err)
assert.Equal(t, 6, len(res))
info, found := res["ng1"]
@@ -125,7 +125,7 @@ func TestGetNodeInfosForGroups(t *testing.T) {
ListerRegistry: registry,
},
}
- res, err = NewMixedTemplateNodeInfoProvider(&cacheTtl, false).Process(&autoscalingCtx, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ res, err = NewMixedTemplateNodeInfoProvider(&cacheTtl, false).Process(&autoscalingCtx, []*apiv1.Node{}, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
assert.NoError(t, err)
assert.Equal(t, 0, len(res))
}
@@ -184,7 +184,7 @@ func TestGetNodeInfosForGroupsCache(t *testing.T) {
},
}
niProcessor := NewMixedTemplateNodeInfoProvider(&cacheTtl, false)
- res, err := niProcessor.Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ res, err := niProcessor.Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
assert.NoError(t, err)
// Check results
assert.Equal(t, 4, len(res))
@@ -218,7 +218,7 @@ func TestGetNodeInfosForGroupsCache(t *testing.T) {
assert.Equal(t, "ng3", lastDeletedGroup)

// Check cache with all nodes removed
- res, err = niProcessor.Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ res, err = niProcessor.Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
assert.NoError(t, err)
// Check results
assert.Equal(t, 2, len(res))
@@ -239,7 +239,7 @@ func TestGetNodeInfosForGroupsCache(t *testing.T) {
// Fill cache manually
infoNg4Node6 := framework.NewTestNodeInfo(ready6.DeepCopy())
niProcessor.nodeInfoCache = map[string]cacheItem{"ng4": {NodeInfo: infoNg4Node6, added: now}}
- res, err = niProcessor.Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ res, err = niProcessor.Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
// Check if cache was used
assert.NoError(t, err)
assert.Equal(t, 2, len(res))
@@ -285,7 +285,7 @@ func TestGetNodeInfosCacheExpired(t *testing.T) {
provider.AddNode("ng1", ready1)

assert.Equal(t, 2, len(niProcessor1.nodeInfoCache))
- _, err = niProcessor1.Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ _, err = niProcessor1.Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
assert.NoError(t, err)
assert.Equal(t, 1, len(niProcessor1.nodeInfoCache))

@@ -296,7 +296,7 @@ func TestGetNodeInfosCacheExpired(t *testing.T) {
"ng2": {NodeInfo: tni, added: now.Add(-2 * time.Second)},
}
assert.Equal(t, 2, len(niProcessor2.nodeInfoCache))
- _, err = niProcessor1.Process(&autoscalingCtx, nodes, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ _, err = niProcessor1.Process(&autoscalingCtx, nodes, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
assert.NoError(t, err)
assert.Equal(t, 2, len(niProcessor2.nodeInfoCache))

@@ -319,7 +319,7 @@ func TestProcessHandlesTemplateNodeInfoErrors(t *testing.T) {
ClusterSnapshot: testsnapshot.NewTestSnapshotOrDie(t),
}

- res, err := NewMixedTemplateNodeInfoProvider(&cacheTtl, false).Process(&autoscalingCtx, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)
+ res, err := NewMixedTemplateNodeInfoProvider(&cacheTtl, false).Process(&autoscalingCtx, []*apiv1.Node{}, []*apiv1.Node{}, []*appsv1.DaemonSet{}, taints.TaintConfig{}, now)

// Should not fail despite ng1 error - continues processing
assert.NoError(t, err)