Skip to content

Commit 8f45611

Browse files
committed
runtime/pprof: deflake TestGoroutineLeakProfileConcurrency
Issue #76540 reports failures in this test from the leaked goroutine count being too small. The test makes an effort to wait for the goroutines to leak, but there's no guarantee. This change instead changes TestGoroutineLeakProfileConcurrency to wait for the number of leaked goroutines to reach at least the minimum expected before proceeding. This deflakes this particular issue. The downside of this change is that a failure to detect leaked goroutines due to a bug will lead to a timeout instead of an instant failure, but this change makes an effort to log and report that it was waiting for the goroutines to leak and timed out for that reason, so at least the failure is more obvious. Overall, this is still better than random flakes. While we're here, I also make some minor stylistic changes and document the helper functions a little more. I also noticed that checkFrames was using the wrong *testing.T, so this change fixes that too. Fixes #76540. Change-Id: I0508855dc39b91f8c6b72d059ce88dbfc68fe748 Reviewed-on: https://go-review.googlesource.com/c/go/+/729280 Reviewed-by: Cherry Mui <[email protected]> LUCI-TryBot-Result: Go LUCI <[email protected]>
1 parent 34af879 commit 8f45611

File tree

1 file changed

+74
-58
lines changed

1 file changed

+74
-58
lines changed

src/runtime/pprof/pprof_test.go

Lines changed: 74 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -1590,14 +1590,30 @@ func TestGoroutineLeakProfileConcurrency(t *testing.T) {
15901590
goroutineProf := Lookup("goroutine")
15911591
goroutineLeakProf := goroutineLeakProfile
15921592

1593-
// Check that a profile with debug information contains
1594-
includesLeak := func(t *testing.T, name, s string) {
1595-
if !strings.Contains(s, "runtime/pprof.goroutineLeakExample") {
1596-
t.Errorf("%s profile does not contain expected leaked goroutine (runtime/pprof.goroutineLeakExample): %s", name, s)
1593+
// We use this helper to count the total number of leaked goroutines in a text profile.
1594+
countLeaks := func(t *testing.T, profText string) int64 {
1595+
t.Helper()
1596+
1597+
// Strip the profile header
1598+
parts := regexLeakCount.Split(profText, -1)
1599+
if len(parts) < 2 {
1600+
t.Fatalf("goroutineleak profile does not contain 'goroutineleak profile: total ': %s\nparts: %v", profText, parts)
15971601
}
1602+
1603+
parts = whiteSpace.Split(parts[1], -1)
1604+
1605+
count, err := strconv.ParseInt(parts[0], 10, 64)
1606+
if err != nil {
1607+
t.Fatalf("goroutineleak profile count is not a number: %s\nerror: %v", profText, err)
1608+
}
1609+
return count
15981610
}
15991611

1600-
checkFrame := func(i int, j int, locations []*profile.Location, expectedFunctionName string) {
1612+
// checkFrame looks for a specific frame in the stack.
1613+
//
1614+
// i is the location index in the profile and j is the location line index for the location.
1615+
// (Inlining may cause aliasing to the same location.)
1616+
checkFrame := func(t *testing.T, i int, j int, locations []*profile.Location, funcName string) {
16011617
if len(locations) <= i {
16021618
t.Errorf("leaked goroutine stack locations: out of range index %d, length %d", i, len(locations))
16031619
return
@@ -1607,47 +1623,13 @@ func TestGoroutineLeakProfileConcurrency(t *testing.T) {
16071623
t.Errorf("leaked goroutine stack location lines: out of range index %d, length %d", j, len(location.Line))
16081624
return
16091625
}
1610-
if location.Line[j].Function.Name != expectedFunctionName {
1611-
t.Errorf("leaked goroutine stack expected %s as the location[%d].Line[%d] but found %s (%s:%d)", expectedFunctionName, i, j, location.Line[j].Function.Name, location.Line[j].Function.Filename, location.Line[j].Line)
1612-
}
1613-
}
1614-
1615-
// We use this helper to count the total number of leaked goroutines in the profile.
1616-
//
1617-
// NOTE(vsaioc): This value should match for the number of leaks produced in this test,
1618-
// but other tests could also leak goroutines, in which case we would have a mismatch
1619-
// when bulk-running tests.
1620-
//
1621-
// The two mismatching outcomes are therefore:
1622-
// - More leaks than expected, which is a correctness issue with other tests.
1623-
// In this case, this test effectively checks other tests wrt
1624-
// goroutine leaks during bulk executions (e.g., running all.bash).
1625-
//
1626-
// - Fewer leaks than expected; this is an unfortunate symptom of scheduling
1627-
// non-determinism, which may occur once in a blue moon. We make
1628-
// a best-effort attempt to allow the expected leaks to occur, by yielding
1629-
// the main thread, but it is never a guarantee.
1630-
countLeaks := func(t *testing.T, number int, s string) {
1631-
// Strip the profile header
1632-
parts := regexLeakCount.Split(s, -1)
1633-
if len(parts) < 2 {
1634-
t.Fatalf("goroutineleak profile does not contain 'goroutineleak profile: total ': %s\nparts: %v", s, parts)
1635-
return
1636-
}
1637-
1638-
parts = whiteSpace.Split(parts[1], -1)
1639-
1640-
count, err := strconv.ParseInt(parts[0], 10, 64)
1641-
if err != nil {
1642-
t.Fatalf("goroutineleak profile count is not a number: %s\nerror: %v", s, err)
1643-
}
1644-
1645-
// Check that the total number of leaked goroutines is exactly the expected number.
1646-
if count != int64(number) {
1647-
t.Errorf("goroutineleak profile does not contain exactly %d leaked goroutines: %d", number, count)
1626+
if location.Line[j].Function.Name != funcName {
1627+
t.Errorf("leaked goroutine stack expected %s as location[%d].Line[%d] but found %s (%s:%d)", funcName, i, j, location.Line[j].Function.Name, location.Line[j].Function.Filename, location.Line[j].Line)
16481628
}
16491629
}
16501630

1631+
// checkLeakStack hooks into profile parsing and performs validation, looking for specific stacks for
1632+
// the goroutines we'll leak in this test.
16511633
checkLeakStack := func(t *testing.T) func(pc uintptr, locations []*profile.Location, _ map[string][]string) {
16521634
return func(pc uintptr, locations []*profile.Location, _ map[string][]string) {
16531635
if pc != leakCount {
@@ -1665,16 +1647,18 @@ func TestGoroutineLeakProfileConcurrency(t *testing.T) {
16651647
return
16661648
}
16671649
// We expect a receive operation. This is the typical stack.
1668-
checkFrame(0, 0, locations, "runtime.gopark")
1669-
checkFrame(1, 0, locations, "runtime.chanrecv")
1670-
checkFrame(2, 0, locations, "runtime.chanrecv1")
1671-
checkFrame(3, 0, locations, "runtime/pprof.goroutineLeakExample")
1650+
checkFrame(t, 0, 0, locations, "runtime.gopark")
1651+
checkFrame(t, 1, 0, locations, "runtime.chanrecv")
1652+
checkFrame(t, 2, 0, locations, "runtime.chanrecv1")
1653+
checkFrame(t, 3, 0, locations, "runtime/pprof.goroutineLeakExample")
16721654
if len(locations) == 5 {
1673-
checkFrame(4, 0, locations, "runtime/pprof.TestGoroutineLeakProfileConcurrency.func5")
1655+
checkFrame(t, 4, 0, locations, "runtime/pprof.TestGoroutineLeakProfileConcurrency.func4")
16741656
}
16751657
}
16761658
}
1659+
16771660
// Leak some goroutines that will feature in the goroutine leak profile
1661+
const totalLeaked = leakCount * 2
16781662
for i := 0; i < leakCount; i++ {
16791663
go goroutineLeakExample()
16801664
go func() {
@@ -1683,13 +1667,36 @@ func TestGoroutineLeakProfileConcurrency(t *testing.T) {
16831667
goroutineLeakExample()
16841668
panic("unreachable")
16851669
}()
1686-
// Yield several times to allow the goroutines to leak.
1687-
runtime.Gosched()
1688-
runtime.Gosched()
16891670
}
16901671

1691-
// Give all goroutines a chance to leak.
1692-
time.Sleep(time.Second)
1672+
// Wait for the goroutines to leak. We might wait here until the timeout,
1673+
// but this is better than intermittent flakes because we didn't wait long
1674+
// enough. If we actually time out, then there's likely a bug.
1675+
attempts := 0
1676+
startTime := time.Now()
1677+
waitFor := 10 * time.Millisecond
1678+
for {
1679+
//
1680+
// If they never get detected, we'll get a timeout.
1681+
time.Sleep(waitFor)
1682+
1683+
var w strings.Builder
1684+
goroutineLeakProf.WriteTo(&w, 1)
1685+
n := countLeaks(t, w.String())
1686+
if n >= totalLeaked {
1687+
break
1688+
}
1689+
1690+
// Log some messages so if a timeout is seen
1691+
attempts++
1692+
t.Logf("waiting for leak: attempt %d (t=%s): found %d leaked goroutines", attempts, time.Since(startTime), n)
1693+
1694+
// Wait a little longer to avoid spamming the log.
1695+
waitFor *= 2
1696+
if waitFor > time.Second {
1697+
waitFor = time.Second
1698+
}
1699+
}
16931700

16941701
t.Run("profile contains leak", func(t *testing.T) {
16951702
var w strings.Builder
@@ -1707,6 +1714,11 @@ func TestGoroutineLeakProfileConcurrency(t *testing.T) {
17071714

17081715
// Concurrent calls to the goroutine leak profiler should not trigger data races
17091716
// or corruption.
1717+
quickCheckForGoroutine := func(t *testing.T, profType, leak, profText string) {
1718+
if !strings.Contains(profText, leak) {
1719+
t.Errorf("%s profile does not contain expected leaked goroutine %s: %s", profType, leak, profText)
1720+
}
1721+
}
17101722
t.Run("overlapping profile requests", func(t *testing.T) {
17111723
ctx := context.Background()
17121724
ctx, cancel := context.WithTimeout(ctx, time.Second)
@@ -1721,8 +1733,10 @@ func TestGoroutineLeakProfileConcurrency(t *testing.T) {
17211733
for ctx.Err() == nil {
17221734
var w strings.Builder
17231735
goroutineLeakProf.WriteTo(&w, 1)
1724-
countLeaks(t, 2*leakCount, w.String())
1725-
includesLeak(t, "goroutineleak", w.String())
1736+
if n := countLeaks(t, w.String()); n != totalLeaked {
1737+
t.Errorf("expected %d goroutines leaked, got %d: %s", totalLeaked, n, w.String())
1738+
}
1739+
quickCheckForGoroutine(t, "goroutineleak", "runtime/pprof.goroutineLeakExample", w.String())
17261740
}
17271741
}()
17281742
})
@@ -1746,8 +1760,10 @@ func TestGoroutineLeakProfileConcurrency(t *testing.T) {
17461760
for ctx.Err() == nil {
17471761
var w strings.Builder
17481762
goroutineLeakProf.WriteTo(&w, 1)
1749-
countLeaks(t, 2*leakCount, w.String())
1750-
includesLeak(t, "goroutineleak", w.String())
1763+
if n := countLeaks(t, w.String()); n != totalLeaked {
1764+
t.Errorf("expected %d goroutines leaked, got %d: %s", totalLeaked, n, w.String())
1765+
}
1766+
quickCheckForGoroutine(t, "goroutineleak", "runtime/pprof.goroutineLeakExample", w.String())
17511767
}
17521768
}()
17531769
go func() {
@@ -1758,7 +1774,7 @@ func TestGoroutineLeakProfileConcurrency(t *testing.T) {
17581774
// The regular goroutine profile should see the leaked
17591775
// goroutines. We simply check that the goroutine leak
17601776
// profile does not corrupt the goroutine profile state.
1761-
includesLeak(t, "goroutine", w.String())
1777+
quickCheckForGoroutine(t, "goroutine", "runtime/pprof.goroutineLeakExample", w.String())
17621778
}
17631779
}()
17641780
})

0 commit comments

Comments
 (0)