@@ -9,6 +9,7 @@ package clusterchecks
9
9
10
10
import (
11
11
"fmt"
12
+ "math/rand"
12
13
"time"
13
14
14
15
"github.com/DataDog/datadog-agent/pkg/autodiscovery/integration"
@@ -75,38 +76,57 @@ func (d *dispatcher) processNodeStatus(nodeName, clientIP string, status types.N
75
76
return false
76
77
}
77
78
78
- // getLeastBusyNode returns the name of the node that is assigned
79
- // the lowest number of checks. In case of equality, one is chosen
80
- // randomly, based on map iterations being randomized.
81
- func (d * dispatcher ) getLeastBusyNode () string {
82
- var leastBusyNode string
83
- minCheckCount := int (- 1 )
84
- minBusyness := int (- 1 )
79
+ // getNodeToScheduleCheck returns the node where a new check should be scheduled
80
+
81
+ // Advanced dispatching relies on the check stats fetched from the cluster check
82
+ // runners API to distribute the checks. The stats are only updated when the
83
+ // checks are rebalanced, they are not updated every time a check is scheduled.
84
+ // That's why it's not a good idea to pick the least busy node. Rebalance
85
+ // happens every few minutes, so all the checks added during that time would get
86
+ // scheduled to the same node. It's a better solution to pick a random node and
87
+ // rely on rebalancing to distribute when needed.
88
+ //
89
+ // On the other hand, when advanced dispatching is not used, we can pick the
90
+ // node with fewer checks. It's because the number of checks is kept up to date.
91
+ func (d * dispatcher ) getNodeToScheduleCheck () string {
92
+ if d .advancedDispatching {
93
+ return d .getRandomNode ()
94
+ }
95
+
96
+ return d .getNodeWithLessChecks ()
97
+ }
98
+
99
+ func (d * dispatcher ) getRandomNode () string {
100
+ d .store .RLock ()
101
+ defer d .store .RUnlock ()
102
+
103
+ var nodes []string
104
+ for name := range d .store .nodes {
105
+ nodes = append (nodes , name )
106
+ }
107
+
108
+ if len (nodes ) == 0 {
109
+ return ""
110
+ }
111
+
112
+ return nodes [rand .Intn (len (nodes ))]
113
+ }
85
114
115
+ func (d * dispatcher ) getNodeWithLessChecks () string {
86
116
d .store .RLock ()
87
117
defer d .store .RUnlock ()
88
118
119
+ var selectedNode string
120
+ minNumChecks := 0
121
+
89
122
for name , store := range d .store .nodes {
90
- if name == "" {
91
- continue
92
- }
93
- if d .advancedDispatching && store .busyness > defaultBusynessValue {
94
- // dispatching based on clc runners stats
95
- // only when advancedDispatching is true and
96
- // started collecting busyness values
97
- if minBusyness == - 1 || store .busyness < minBusyness {
98
- leastBusyNode = name
99
- minBusyness = store .busyness
100
- }
101
- } else {
102
- // count-based round robin dispatching
103
- if minCheckCount == - 1 || len (store .digestToConfig ) < minCheckCount {
104
- leastBusyNode = name
105
- minCheckCount = len (store .digestToConfig )
106
- }
123
+ if selectedNode == "" || len (store .digestToConfig ) < minNumChecks {
124
+ selectedNode = name
125
+ minNumChecks = len (store .digestToConfig )
107
126
}
108
127
}
109
- return leastBusyNode
128
+
129
+ return selectedNode
110
130
}
111
131
112
132
// expireNodes iterates over nodes and removes the ones that have not
0 commit comments