Skip to content

Commit

Permalink
enhance: Log error instead of panicking if load lock wait timeout (#3…
Browse files Browse the repository at this point in the history
…9308)

Related to #39205
Previous PR #39206

This PR change wait timeout behavior to log error and return to avoid
making other collection read failure in only some collections have
deadlock

Signed-off-by: Congqi Xia <[email protected]>
  • Loading branch information
congqixia authored Jan 15, 2025
1 parent a5a83a0 commit 57e5652
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 10 deletions.
8 changes: 4 additions & 4 deletions internal/querynodev2/segments/segment.go
Original file line number Diff line number Diff line change
Expand Up @@ -1176,10 +1176,10 @@ func (s *LocalSegment) WarmupChunkCache(ctx context.Context, fieldID int64, mmap
}).Await()
case "async":
task := func() (any, error) {
// bad implemtation, warmup is async at another goroutine and hold the rlock.
// the state transition of segment in segment loader will blocked.
// add a waiter to avoid it.
s.ptrLock.BlockUntilDataLoadedOrReleased()
// failed to wait for state update, return directly
if !s.ptrLock.BlockUntilDataLoadedOrReleased() {
return nil, nil
}
if s.PinIfNotReleased() != nil {
return nil, nil
}
Expand Down
10 changes: 7 additions & 3 deletions internal/querynodev2/segments/state/load_state_lock.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ import (

"github.com/cockroachdb/errors"
"go.uber.org/atomic"
"go.uber.org/zap"

"github.com/milvus-io/milvus/pkg/log"
"github.com/milvus-io/milvus/pkg/util/paramtable"
)

Expand Down Expand Up @@ -168,10 +170,12 @@ func (ls *LoadStateLock) StartReleaseAll() (g LoadStateLockGuard) {
}

// blockUntilDataLoadedOrReleased blocks until the segment is loaded or released.
func (ls *LoadStateLock) BlockUntilDataLoadedOrReleased() {
func (ls *LoadStateLock) BlockUntilDataLoadedOrReleased() bool {
var ok bool
ls.waitOrPanic(func(state loadStateEnum) bool {
return state == LoadStateDataLoaded || state == LoadStateReleased
}, noop)
}, func() { ok = true })
return ok
}

// waitUntilCanReleaseData waits until segment is release data able.
Expand Down Expand Up @@ -199,7 +203,7 @@ func (ls *LoadStateLock) waitOrPanic(ready func(state loadStateEnum) bool, then

select {
case <-time.After(maxWaitTime):
panic(fmt.Sprintf("max WLock wait time(%v) excceeded", maxWaitTime))
log.Error("load state lock wait timeout", zap.Duration("maxWaitTime", maxWaitTime))
case <-ch:
}
}
Expand Down
9 changes: 7 additions & 2 deletions internal/querynodev2/segments/state/load_state_lock_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -197,25 +197,30 @@ func TestWaitOrPanic(t *testing.T) {
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.MaxWLockConditionalWaitTime.Key)

l := NewLoadStateLock(LoadStateDataLoaded)
executed := false

assert.NotPanics(t, func() {
l.waitOrPanic(func(state loadStateEnum) bool {
return state == LoadStateDataLoaded
}, noop)
}, func() { executed = true })
})

assert.True(t, executed)
})

t.Run("timeout_panic", func(t *testing.T) {
paramtable.Get().Save(paramtable.Get().CommonCfg.MaxWLockConditionalWaitTime.Key, "1")
defer paramtable.Get().Reset(paramtable.Get().CommonCfg.MaxWLockConditionalWaitTime.Key)

l := NewLoadStateLock(LoadStateOnlyMeta)
executed := false

assert.Panics(t, func() {
assert.NotPanics(t, func() {
l.waitOrPanic(func(state loadStateEnum) bool {
return state == LoadStateDataLoaded
}, noop)
})
assert.False(t, executed)
})
}

Expand Down
4 changes: 3 additions & 1 deletion pkg/util/hardware/gpu_mem_info_cuda.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,11 @@ int getAllGPUMemoryInfo(GPUMemoryInfo** infos) {
}
*/
import "C"

import (
"github.com/cockroachdb/errors"
"unsafe"

"github.com/cockroachdb/errors"
)

// GPUMemoryInfo represents a single GPU's memory information.
Expand Down

0 comments on commit 57e5652

Please sign in to comment.