Commit ecfce58

runtime: skip work recheck for non-spinning Ms
When an M transitions from spinning to non-spinning state, it must
recheck most sources of work to avoid missing work submitted between
its initial check and decrementing sched.nmspinning (see "delicate
dance" comment).

Ever since the scheduler rewrite in Go 1.1 (golang.org/cl/7314062), we
have performed this recheck on all Ms before stopping, regardless of
whether or not they were spinning.

Unfortunately, there is a problem with this approach: non-spinning Ms
are not eligible to steal work (note the skip over the stealWork
block), but can detect work during the recheck. If there is work
available, this non-spinning M will jump to top, skip stealing, land in
recheck again, and repeat. i.e., it will spin uselessly.

The spin is bounded. This can only occur if there is another spinning
M, which will either take the work, allowing this M to stop, or take
some other work, allowing this M to upgrade to spinning. But the
spinning is ultimately just a fancy spin-wait.

golang.org/issue/43997 discusses several ways to address this. This CL
takes the simplest approach: skipping the recheck on non-spinning Ms
and allowing them to go to stop.

Results for scheduler-relevant runtime and time benchmarks can be found
at https://perf.golang.org/search?q=upload:20210420.5.

The new BenchmarkCreateGoroutinesSingle is a characteristic example
workload that hits this issue hard. A single M readies lots of work
without itself parking. Other Ms must spin to steal work, which is very
short-lived, forcing those Ms to spin again. Some of the Ms will be
non-spinning and hit the above bug.

With this fixed, that benchmark drops in CPU usage by a massive 68%,
and wall time 24%. BenchmarkNetpollBreak shows similar drops because it
is unintentionally almost the same benchmark (create short-living Gs in
a loop).

Typical well-behaved programs show little change. We also measure
scheduling latency (time from goready to execute). Note that many of
these benchmarks are very noisy because they don't involve much
scheduling. Those that do, like CreateGoroutinesSingle, are expected to
increase as we are replacing unintentional spin waiting with a real
park.

Fixes #43997

Change-Id: Ie1d1e1800f393cee1792455412caaa5865d13562
Reviewed-on: https://go-review.googlesource.com/c/go/+/310850
Trust: Michael Pratt <[email protected]>
Run-TryBot: Michael Pratt <[email protected]>
TryBot-Result: Go Bot <[email protected]>
Reviewed-by: Michael Knyszek <[email protected]>
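To make the "delicate dance" above concrete, the following is a deliberately simplified, self-contained toy model of the hand-off, not the runtime's actual code: every name in it (workQueue, nSpinning, submit, tryTake, worker) is invented for illustration. It shows the ordering requirement the commit message relies on: a worker leaving the spinning state must decrement the spinning counter first and only then recheck for work, because a producer that observed a nonzero spinning count skips the wakeup and counts on a spinner to pick the work up.

// Toy model of the spinning hand-off (NOT the runtime's code). All names
// here (workQueue, nSpinning, submit, tryTake, worker) are invented.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

var (
	mu        sync.Mutex
	workQueue []int // stand-in for the run queues
	nSpinning int32 // stand-in for sched.nmspinning
)

// submit publishes one item. Like the scheduler's wakeup rule, it only needs
// to wake someone when nobody is spinning; otherwise a spinner will find it.
func submit(v int) {
	mu.Lock()
	workQueue = append(workQueue, v)
	mu.Unlock()
	if atomic.LoadInt32(&nSpinning) == 0 {
		// This is where the real runtime would wake an M; the toy workers
		// below never block, so nothing is needed here.
	}
}

// tryTake pops one item, if any.
func tryTake() (int, bool) {
	mu.Lock()
	defer mu.Unlock()
	if len(workQueue) == 0 {
		return 0, false
	}
	v := workQueue[0]
	workQueue = workQueue[1:]
	return v, true
}

// worker models an M: look for work while "spinning"; when giving up, leave
// the spinning state FIRST, then recheck once. Without that recheck, an item
// submitted between the failed tryTake and the decrement could be stranded,
// because submit saw nSpinning > 0 and skipped the wakeup.
func worker(processed *int32, total int32) {
	for atomic.LoadInt32(processed) < total {
		atomic.AddInt32(&nSpinning, 1) // enter spinning
		v, ok := tryTake()
		atomic.AddInt32(&nSpinning, -1) // stop spinning first...
		if !ok {
			v, ok = tryTake() // ...then recheck: the "delicate dance"
		}
		if ok {
			_ = v // pretend to run the work item
			atomic.AddInt32(processed, 1)
		}
		// A real M would park here if both checks failed; the toy just loops.
	}
}

func main() {
	const items = 1000
	var processed int32
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			worker(&processed, items)
		}()
	}
	for i := 0; i < items; i++ {
		submit(i)
	}
	wg.Wait()
	fmt.Println("processed:", processed)
}

The bug fixed by this commit lived one level above this model: in the real scheduler, Ms that had never been spinning also executed the recheck path, could observe work they were not eligible to steal, and looped back to the top, which is the useless spin described above.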
1 parent b6ff3c6 commit ecfce58

2 files changed: +79 -28 lines


src/runtime/proc.go

+30 -28
@@ -2841,44 +2841,46 @@ top:
 		if int32(atomic.Xadd(&sched.nmspinning, -1)) < 0 {
 			throw("findrunnable: negative nmspinning")
 		}
-	}
 
-	// Check all runqueues once again.
-	_p_ = checkRunqsNoP(allpSnapshot, idlepMaskSnapshot)
-	if _p_ != nil {
-		acquirep(_p_)
-		if wasSpinning {
+		// Note the for correctness, only the last M transitioning from
+		// spinning to non-spinning must perform these rechecks to
+		// ensure no missed work. We are performing it on every M that
+		// transitions as a conservative change to monitor effects on
+		// latency. See golang.org/issue/43997.
+
+		// Check all runqueues once again.
+		_p_ = checkRunqsNoP(allpSnapshot, idlepMaskSnapshot)
+		if _p_ != nil {
+			acquirep(_p_)
 			_g_.m.spinning = true
 			atomic.Xadd(&sched.nmspinning, 1)
+			goto top
 		}
-		goto top
-	}
 
-	// Check for idle-priority GC work again.
-	_p_, gp = checkIdleGCNoP()
-	if _p_ != nil {
-		acquirep(_p_)
-		if wasSpinning {
+		// Check for idle-priority GC work again.
+		_p_, gp = checkIdleGCNoP()
+		if _p_ != nil {
+			acquirep(_p_)
 			_g_.m.spinning = true
 			atomic.Xadd(&sched.nmspinning, 1)
-		}
 
-		// Run the idle worker.
-		_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
-		casgstatus(gp, _Gwaiting, _Grunnable)
-		if trace.enabled {
-			traceGoUnpark(gp, 0)
+			// Run the idle worker.
+			_p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
+			casgstatus(gp, _Gwaiting, _Grunnable)
+			if trace.enabled {
+				traceGoUnpark(gp, 0)
+			}
+			return gp, false
 		}
-		return gp, false
-	}
 
-	// Finally, check for timer creation or expiry concurrently with
-	// transitioning from spinning to non-spinning.
-	//
-	// Note that we cannot use checkTimers here because it calls
-	// adjusttimers which may need to allocate memory, and that isn't
-	// allowed when we don't have an active P.
-	pollUntil = checkTimersNoP(allpSnapshot, timerpMaskSnapshot, pollUntil)
+		// Finally, check for timer creation or expiry concurrently with
+		// transitioning from spinning to non-spinning.
+		//
+		// Note that we cannot use checkTimers here because it calls
+		// adjusttimers which may need to allocate memory, and that isn't
+		// allowed when we don't have an active P.
+		pollUntil = checkTimersNoP(allpSnapshot, timerpMaskSnapshot, pollUntil)
+	}
 
 	// Poll network until next timer.
 	if netpollinited() && (atomic.Load(&netpollWaiters) > 0 || pollUntil != 0) && atomic.Xchg64(&sched.lastpoll, 0) != 0 {

src/runtime/proc_test.go

+49
@@ -692,6 +692,55 @@ func BenchmarkCreateGoroutinesCapture(b *testing.B) {
 	}
 }
 
+// warmupScheduler ensures the scheduler has at least targetThreadCount threads
+// in its thread pool.
+func warmupScheduler(targetThreadCount int) {
+	var wg sync.WaitGroup
+	var count int32
+	for i := 0; i < targetThreadCount; i++ {
+		wg.Add(1)
+		go func() {
+			atomic.AddInt32(&count, 1)
+			for atomic.LoadInt32(&count) < int32(targetThreadCount) {
+				// spin until all threads started
+			}
+
+			// spin a bit more to ensure they are all running on separate CPUs.
+			doWork(time.Millisecond)
+			wg.Done()
+		}()
+	}
+	wg.Wait()
+}
+
+func doWork(dur time.Duration) {
+	start := time.Now()
+	for time.Since(start) < dur {
+	}
+}
+
+// BenchmarkCreateGoroutinesSingle creates many goroutines, all from a single
+// producer (the main benchmark goroutine).
+//
+// Compared to BenchmarkCreateGoroutines, this causes different behavior in the
+// scheduler because Ms are much more likely to need to steal work from the
+// main P rather than having work in the local run queue.
+func BenchmarkCreateGoroutinesSingle(b *testing.B) {
+	// Since we are interested in stealing behavior, warm the scheduler to
+	// get all the Ps running first.
+	warmupScheduler(runtime.GOMAXPROCS(0))
+	b.ResetTimer()
+
+	var wg sync.WaitGroup
+	wg.Add(b.N)
+	for i := 0; i < b.N; i++ {
+		go func() {
+			wg.Done()
+		}()
+	}
+	wg.Wait()
+}
+
 func BenchmarkClosureCall(b *testing.B) {
 	sum := 0
 	off1 := 1