@@ -68,11 +68,18 @@ type healthStreamer struct {
68
68
degradedThreshold time.Duration
69
69
unhealthyThreshold atomic.Int64
70
70
71
- mu sync.Mutex
72
- ctx context.Context
73
- cancel context.CancelFunc
74
- clients map [chan * querypb.StreamHealthResponse ]struct {}
75
- state * querypb.StreamHealthResponse
71
+ // cancelMu is a mutex used to protect the cancel variable
72
+ // and to ensure we don't call setup functions in parallel.
73
+ cancelMu sync.Mutex
74
+ ctx context.Context
75
+ cancel context.CancelFunc
76
+
77
+ // fieldsMu is used to protect access to the fields below.
78
+ // We require two separate mutexes, so that we don't have to acquire the same mutex
79
+ // in Close and reload, which can lead to the deadlock described in https://github.com/vitessio/vitess/issues/17229#issuecomment-2476136610.
80
+ fieldsMu sync.Mutex
81
+ clients map [chan * querypb.StreamHealthResponse ]struct {}
82
+ state * querypb.StreamHealthResponse
76
83
// isServingPrimary stores if this tablet is currently the serving primary or not.
77
84
isServingPrimary bool
78
85
@@ -126,8 +133,8 @@ func (hs *healthStreamer) InitDBConfig(target *querypb.Target, cp dbconfigs.Conn
126
133
}
127
134
128
135
func (hs * healthStreamer ) Open () {
129
- hs .mu .Lock ()
130
- defer hs .mu .Unlock ()
136
+ hs .cancelMu .Lock ()
137
+ defer hs .cancelMu .Unlock ()
131
138
132
139
if hs .cancel != nil {
133
140
return
@@ -140,8 +147,8 @@ func (hs *healthStreamer) Open() {
140
147
}
141
148
142
149
func (hs * healthStreamer ) Close () {
143
- hs .mu .Lock ()
144
- defer hs .mu .Unlock ()
150
+ hs .cancelMu .Lock ()
151
+ defer hs .cancelMu .Unlock ()
145
152
146
153
if hs .cancel != nil {
147
154
hs .se .UnregisterNotifier ("healthStreamer" )
@@ -182,13 +189,16 @@ func (hs *healthStreamer) Stream(ctx context.Context, callback func(*querypb.Str
182
189
}
183
190
184
191
func (hs * healthStreamer ) register () (chan * querypb.StreamHealthResponse , context.Context ) {
185
- hs .mu .Lock ()
186
- defer hs .mu .Unlock ()
192
+ hs .cancelMu .Lock ()
193
+ defer hs .cancelMu .Unlock ()
187
194
188
195
if hs .cancel == nil {
189
196
return nil , nil
190
197
}
191
198
199
+ hs .fieldsMu .Lock ()
200
+ defer hs .fieldsMu .Unlock ()
201
+
192
202
ch := make (chan * querypb.StreamHealthResponse , streamHealthBufferSize )
193
203
hs .clients [ch ] = struct {}{}
194
204
@@ -198,15 +208,15 @@ func (hs *healthStreamer) register() (chan *querypb.StreamHealthResponse, contex
198
208
}
199
209
200
210
func (hs * healthStreamer ) unregister (ch chan * querypb.StreamHealthResponse ) {
201
- hs .mu .Lock ()
202
- defer hs .mu .Unlock ()
211
+ hs .fieldsMu .Lock ()
212
+ defer hs .fieldsMu .Unlock ()
203
213
204
214
delete (hs .clients , ch )
205
215
}
206
216
207
217
func (hs * healthStreamer ) ChangeState (tabletType topodatapb.TabletType , ptsTimestamp time.Time , lag time.Duration , err error , serving bool ) {
208
- hs .mu .Lock ()
209
- defer hs .mu .Unlock ()
218
+ hs .fieldsMu .Lock ()
219
+ defer hs .fieldsMu .Unlock ()
210
220
211
221
hs .state .Target .TabletType = tabletType
212
222
if tabletType == topodatapb .TabletType_PRIMARY {
@@ -260,8 +270,8 @@ func (hs *healthStreamer) broadCastToClients(shr *querypb.StreamHealthResponse)
260
270
}
261
271
262
272
func (hs * healthStreamer ) AppendDetails (details []* kv ) []* kv {
263
- hs .mu .Lock ()
264
- defer hs .mu .Unlock ()
273
+ hs .fieldsMu .Lock ()
274
+ defer hs .fieldsMu .Unlock ()
265
275
if hs .state .Target .TabletType == topodatapb .TabletType_PRIMARY {
266
276
return details
267
277
}
@@ -306,8 +316,8 @@ func (hs *healthStreamer) SetUnhealthyThreshold(v time.Duration) {
306
316
// MakePrimary tells the healthstreamer that the current tablet is now the primary,
307
317
// so it can read and write to the MySQL instance for schema-tracking.
308
318
func (hs * healthStreamer ) MakePrimary (serving bool ) {
309
- hs .mu .Lock ()
310
- defer hs .mu .Unlock ()
319
+ hs .fieldsMu .Lock ()
320
+ defer hs .fieldsMu .Unlock ()
311
321
hs .isServingPrimary = serving
312
322
// We register for notifications from the schema Engine only when schema tracking is enabled,
313
323
// and we are going to a serving primary state.
@@ -322,15 +332,15 @@ func (hs *healthStreamer) MakePrimary(serving bool) {
322
332
323
333
// MakeNonPrimary tells the healthstreamer that the current tablet is now not a primary.
324
334
func (hs * healthStreamer ) MakeNonPrimary () {
325
- hs .mu .Lock ()
326
- defer hs .mu .Unlock ()
335
+ hs .fieldsMu .Lock ()
336
+ defer hs .fieldsMu .Unlock ()
327
337
hs .isServingPrimary = false
328
338
}
329
339
330
340
// reload reloads the schema from the underlying mysql for the tables that we get the alert on.
331
341
func (hs * healthStreamer ) reload (full map [string ]* schema.Table , created , altered , dropped []* schema.Table ) error {
332
- hs .mu .Lock ()
333
- defer hs .mu .Unlock ()
342
+ hs .fieldsMu .Lock ()
343
+ defer hs .fieldsMu .Unlock ()
334
344
// Schema Reload to happen only on primary when it is serving.
335
345
// We can be in a state where the primary is not serving after we have run DemotePrimary. In that case,
336
346
// we don't want to run any queries in MySQL, so we shouldn't reload anything in the healthStreamer.
0 commit comments