Skip to content

Commit 8cb350d

Browse files
prattmicgopherbot
authored and committed
runtime: synchronize P wakeup and dropping Ps
CL 310850 dropped work re-checks on non-spinning Ms to fix #43997. This introduced a new race condition: a non-spinning M may drop its P and then park at the same time a spinning M attempts to wake a P to handle some new work. The spinning M fails to find an idle P (because the non-spinning M hasn't quite made its P idle yet), and does nothing assuming that the system is fully loaded. This results in loss of work conservation. In the worst case we could have a complete deadlock if injectglist fails to wake anything just as all Ps are going idle. sched.needspinning adds new synchronization to cover this case. If work submission fails to find a P, it sets needspinning to indicate that a spinning M is required. When non-spinning Ms prepare to drop their P, they check needspinning and abort going idle to become a spinning M instead. This addresses the race without extra spurious wakeups. In the normal (non-racing case), an M will become spinning via the normal path and clear the flag. injectglist must change in addition to wakep because it is a similar form of work submission, notably used following netpoll at a point when we might not have a P that would guarantee the work runs. Fixes #45867 Change-Id: Ieb623a6d4162fb8c2be7b4ff8acdebcc3a0d69a8 Reviewed-on: https://go-review.googlesource.com/c/go/+/389014 TryBot-Result: Gopher Robot <[email protected]> Reviewed-by: Michael Knyszek <[email protected]> Run-TryBot: Michael Pratt <[email protected]> Auto-Submit: Michael Pratt <[email protected]>
1 parent 54cf1b1 commit 8cb350d

File tree

2 files changed

+139
-45
lines changed

2 files changed

+139
-45
lines changed

src/runtime/proc.go

+133-42
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ var modinfo string
7373
// If there is at least one spinning thread (sched.nmspinning>1), we don't
7474
// unpark new threads when submitting work. To compensate for that, if the last
7575
// spinning thread finds work and stops spinning, it must unpark a new spinning
76-
// thread. This approach smooths out unjustified spikes of thread unparking,
76+
// thread. This approach smooths out unjustified spikes of thread unparking,
7777
// but at the same time guarantees eventual maximal CPU parallelism
7878
// utilization.
7979
//
@@ -827,6 +827,12 @@ func mcommoninit(mp *m, id int64) {
827827
}
828828
}
829829

830+
func (mp *m) becomeSpinning() {
831+
mp.spinning = true
832+
sched.nmspinning.Add(1)
833+
sched.needspinning.Store(0)
834+
}
835+
830836
var fastrandseed uintptr
831837

832838
func fastrandinit() {
@@ -2242,8 +2248,8 @@ func mspinning() {
22422248
// Schedules some M to run the p (creates an M if necessary).
22432249
// If p==nil, tries to get an idle P, if no idle P's does nothing.
22442250
// May run with m.p==nil, so write barriers are not allowed.
2245-
// If spinning is set, the caller has incremented nmspinning and startm will
2246-
// either decrement nmspinning or set m.spinning in the newly started M.
2251+
// If spinning is set, the caller has incremented nmspinning and must provide a
2252+
// P. startm will set m.spinning in the newly started M.
22472253
//
22482254
// Callers passing a non-nil P must call from a non-preemptible context. See
22492255
// comment on acquirem below.
@@ -2271,16 +2277,15 @@ func startm(pp *p, spinning bool) {
22712277
mp := acquirem()
22722278
lock(&sched.lock)
22732279
if pp == nil {
2280+
if spinning {
2281+
// TODO(prattmic): All remaining calls to this function
2282+
// with _p_ == nil could be cleaned up to find a P
2283+
// before calling startm.
2284+
throw("startm: P required for spinning=true")
2285+
}
22742286
pp, _ = pidleget(0)
22752287
if pp == nil {
22762288
unlock(&sched.lock)
2277-
if spinning {
2278-
// The caller incremented nmspinning, but there are no idle Ps,
2279-
// so it's okay to just undo the increment and give up.
2280-
if sched.nmspinning.Add(-1) < 0 {
2281-
throw("startm: negative nmspinning")
2282-
}
2283-
}
22842289
releasem(mp)
22852290
return
22862291
}
@@ -2358,6 +2363,7 @@ func handoffp(pp *p) {
23582363
// no local work, check that there are no spinning/idle M's,
23592364
// otherwise our help is not required
23602365
if sched.nmspinning.Load()+sched.npidle.Load() == 0 && sched.nmspinning.CompareAndSwap(0, 1) { // TODO: fast atomic
2366+
sched.needspinning.Store(0)
23612367
startm(pp, true)
23622368
return
23632369
}
@@ -2404,15 +2410,41 @@ func handoffp(pp *p) {
24042410

24052411
// Tries to add one more P to execute G's.
24062412
// Called when a G is made runnable (newproc, ready).
2413+
// Must be called with a P.
24072414
func wakep() {
2408-
if sched.npidle.Load() == 0 {
2415+
// Be conservative about spinning threads, only start one if none exist
2416+
// already.
2417+
if sched.nmspinning.Load() != 0 || !sched.nmspinning.CompareAndSwap(0, 1) {
24092418
return
24102419
}
2411-
// be conservative about spinning threads
2412-
if sched.nmspinning.Load() != 0 || !sched.nmspinning.CompareAndSwap(0, 1) {
2420+
2421+
// Disable preemption until ownership of pp transfers to the next M in
2422+
// startm. Otherwise preemption here would leave pp stuck waiting to
2423+
// enter _Pgcstop.
2424+
//
2425+
// See preemption comment on acquirem in startm for more details.
2426+
mp := acquirem()
2427+
2428+
var pp *p
2429+
lock(&sched.lock)
2430+
pp, _ = pidlegetSpinning(0)
2431+
if pp == nil {
2432+
if sched.nmspinning.Add(-1) < 0 {
2433+
throw("wakep: negative nmspinning")
2434+
}
2435+
unlock(&sched.lock)
2436+
releasem(mp)
24132437
return
24142438
}
2415-
startm(nil, true)
2439+
// Since we always have a P, the race in the "No M is available"
2440+
// comment in startm doesn't apply during the small window between the
2441+
// unlock here and lock in startm. A checkdead in between will always
2442+
// see at least one running M (ours).
2443+
unlock(&sched.lock)
2444+
2445+
startm(pp, true)
2446+
2447+
releasem(mp)
24162448
}
24172449

24182450
// Stops execution of the current m that is locked to a g until the g is runnable again.
@@ -2646,8 +2678,7 @@ top:
26462678
// GOMAXPROCS>>1 but the program parallelism is low.
26472679
if mp.spinning || 2*sched.nmspinning.Load() < gomaxprocs-sched.npidle.Load() {
26482680
if !mp.spinning {
2649-
mp.spinning = true
2650-
sched.nmspinning.Add(1)
2681+
mp.becomeSpinning()
26512682
}
26522683

26532684
gp, inheritTime, tnow, w, newWork := stealWork(now)
@@ -2723,6 +2754,12 @@ top:
27232754
unlock(&sched.lock)
27242755
return gp, false, false
27252756
}
2757+
if !mp.spinning && sched.needspinning.Load() == 1 {
2758+
// See "Delicate dance" comment below.
2759+
mp.becomeSpinning()
2760+
unlock(&sched.lock)
2761+
goto top
2762+
}
27262763
if releasep() != pp {
27272764
throw("findrunnable: wrong p")
27282765
}
@@ -2743,12 +2780,28 @@ top:
27432780
// * New/modified-earlier timers on a per-P timer heap.
27442781
// * Idle-priority GC work (barring golang.org/issue/19112).
27452782
//
2746-
// If we discover new work below, we need to restore m.spinning as a signal
2747-
// for resetspinning to unpark a new worker thread (because there can be more
2748-
// than one starving goroutine). However, if after discovering new work
2749-
// we also observe no idle Ps it is OK to skip unparking a new worker
2750-
// thread: the system is fully loaded so no spinning threads are required.
2751-
// Also see "Worker thread parking/unparking" comment at the top of the file.
2783+
// If we discover new work below, we need to restore m.spinning as a
2784+
// signal for resetspinning to unpark a new worker thread (because
2785+
// there can be more than one starving goroutine).
2786+
//
2787+
// However, if after discovering new work we also observe no idle Ps
2788+
// (either here or in resetspinning), we have a problem. We may be
2789+
// racing with a non-spinning M in the block above, having found no
2790+
// work and preparing to release its P and park. Allowing that P to go
2791+
// idle will result in loss of work conservation (idle P while there is
2792+
// runnable work). This could result in complete deadlock in the
2793+
// unlikely event that we discover new work (from netpoll) right as we
2794+
// are racing with _all_ other Ps going idle.
2795+
//
2796+
// We use sched.needspinning to synchronize with non-spinning Ms going
2797+
// idle. If needspinning is set when they are about to drop their P,
2798+
// they abort the drop and instead become a new spinning M on our
2799+
// behalf. If we are not racing and the system is truly fully loaded
2800+
// then no spinning threads are required, and the next thread to
2801+
// naturally become spinning will clear the flag.
2802+
//
2803+
// Also see "Worker thread parking/unparking" comment at the top of the
2804+
// file.
27522805
wasSpinning := mp.spinning
27532806
if mp.spinning {
27542807
mp.spinning = false
@@ -2758,25 +2811,26 @@ top:
27582811

27592812
// Note the for correctness, only the last M transitioning from
27602813
// spinning to non-spinning must perform these rechecks to
2761-
// ensure no missed work. We are performing it on every M that
2762-
// transitions as a conservative change to monitor effects on
2763-
// latency. See golang.org/issue/43997.
2814+
// ensure no missed work. However, the runtime has some cases
2815+
// of transient increments of nmspinning that are decremented
2816+
// without going through this path, so we must be conservative
2817+
// and perform the check on all spinning Ms.
2818+
//
2819+
// See https://go.dev/issue/43997.
27642820

27652821
// Check all runqueues once again.
27662822
pp := checkRunqsNoP(allpSnapshot, idlepMaskSnapshot)
27672823
if pp != nil {
27682824
acquirep(pp)
2769-
mp.spinning = true
2770-
sched.nmspinning.Add(1)
2825+
mp.becomeSpinning()
27712826
goto top
27722827
}
27732828

27742829
// Check for idle-priority GC work again.
27752830
pp, gp := checkIdleGCNoP()
27762831
if pp != nil {
27772832
acquirep(pp)
2778-
mp.spinning = true
2779-
sched.nmspinning.Add(1)
2833+
mp.becomeSpinning()
27802834

27812835
// Run the idle worker.
27822836
pp.gcMarkWorkerMode = gcMarkWorkerIdleMode
@@ -2844,8 +2898,7 @@ top:
28442898
return gp, false, false
28452899
}
28462900
if wasSpinning {
2847-
mp.spinning = true
2848-
sched.nmspinning.Add(1)
2901+
mp.becomeSpinning()
28492902
}
28502903
goto top
28512904
}
@@ -2964,17 +3017,18 @@ func checkRunqsNoP(allpSnapshot []*p, idlepMaskSnapshot pMask) *p {
29643017
for id, p2 := range allpSnapshot {
29653018
if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(p2) {
29663019
lock(&sched.lock)
2967-
pp, _ := pidleget(0)
2968-
unlock(&sched.lock)
2969-
if pp != nil {
2970-
return pp
3020+
pp, _ := pidlegetSpinning(0)
3021+
if pp == nil {
3022+
// Can't get a P, don't bother checking remaining Ps.
3023+
unlock(&sched.lock)
3024+
return nil
29713025
}
2972-
2973-
// Can't get a P, don't bother checking remaining Ps.
2974-
break
3026+
unlock(&sched.lock)
3027+
return pp
29753028
}
29763029
}
29773030

3031+
// No work available.
29783032
return nil
29793033
}
29803034

@@ -3030,7 +3084,7 @@ func checkIdleGCNoP() (*p, *g) {
30303084
// the assumption in gcControllerState.findRunnableGCWorker that an
30313085
// empty gcBgMarkWorkerPool is only possible if gcMarkDone is running.
30323086
lock(&sched.lock)
3033-
pp, now := pidleget(0)
3087+
pp, now := pidlegetSpinning(0)
30343088
if pp == nil {
30353089
unlock(&sched.lock)
30363090
return nil, nil
@@ -3130,8 +3184,20 @@ func injectglist(glist *gList) {
31303184
*glist = gList{}
31313185

31323186
startIdle := func(n int) {
3133-
for ; n != 0 && sched.npidle.Load() != 0; n-- {
3134-
startm(nil, false)
3187+
for i := 0; i < n; i++ {
3188+
mp := acquirem() // See comment in startm.
3189+
lock(&sched.lock)
3190+
3191+
pp, _ := pidlegetSpinning(0)
3192+
if pp == nil {
3193+
unlock(&sched.lock)
3194+
releasem(mp)
3195+
break
3196+
}
3197+
3198+
unlock(&sched.lock)
3199+
startm(pp, false)
3200+
releasem(mp)
31353201
}
31363202
}
31373203

@@ -5406,7 +5472,7 @@ func schedtrace(detailed bool) {
54065472
}
54075473

54085474
lock(&sched.lock)
5409-
print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle.Load(), " threads=", mcount(), " spinningthreads=", sched.nmspinning.Load(), " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize)
5475+
print("SCHED ", (now-starttime)/1e6, "ms: gomaxprocs=", gomaxprocs, " idleprocs=", sched.npidle.Load(), " threads=", mcount(), " spinningthreads=", sched.nmspinning.Load(), " needspinning=", sched.needspinning.Load(), " idlethreads=", sched.nmidle, " runqueue=", sched.runqsize)
54105476
if detailed {
54115477
print(" gcwaiting=", sched.gcwaiting.Load(), " nmidlelocked=", sched.nmidlelocked, " stopwait=", sched.stopwait, " sysmonwait=", sched.sysmonwait.Load(), "\n")
54125478
}
@@ -5742,6 +5808,31 @@ func pidleget(now int64) (*p, int64) {
57425808
return pp, now
57435809
}
57445810

5811+
// pidlegetSpinning tries to get a p from the _Pidle list, acquiring ownership.
5812+
// This is called by spinning Ms (or callers that need a spinning M) that have
5813+
// found work. If no P is available, this must synchronize with non-spinning
5814+
// Ms that may be preparing to drop their P without discovering this work.
5815+
//
5816+
// sched.lock must be held.
5817+
//
5818+
// May run during STW, so write barriers are not allowed.
5819+
//
5820+
//go:nowritebarrierrec
5821+
func pidlegetSpinning(now int64) (*p, int64) {
5822+
assertLockHeld(&sched.lock)
5823+
5824+
pp, now := pidleget(now)
5825+
if pp == nil {
5826+
// See "Delicate dance" comment in findrunnable. We found work
5827+
// that we cannot take, we must synchronize with non-spinning
5828+
// Ms that may be preparing to drop their P.
5829+
sched.needspinning.Store(1)
5830+
return nil, now
5831+
}
5832+
5833+
return pp, now
5834+
}
5835+
57455836
// runqempty reports whether pp has no Gs on its local run queue.
57465837
// It never returns true spuriously.
57475838
func runqempty(pp *p) bool {

src/runtime/runtime2.go

+6-3
Original file line numberDiff line numberDiff line change
@@ -777,9 +777,10 @@ type schedt struct {
777777

778778
ngsys atomic.Int32 // number of system goroutines
779779

780-
pidle puintptr // idle p's
781-
npidle atomic.Int32
782-
nmspinning atomic.Int32 // See "Worker thread parking/unparking" comment in proc.go.
780+
pidle puintptr // idle p's
781+
npidle atomic.Int32
782+
nmspinning atomic.Int32 // See "Worker thread parking/unparking" comment in proc.go.
783+
needspinning atomic.Uint32 // See "Delicate dance" comment in proc.go. Boolean. Must hold sched.lock to set to 1.
783784

784785
// Global runnable queue.
785786
runq gQueue
@@ -840,6 +841,8 @@ type schedt struct {
840841
// with the rest of the runtime.
841842
sysmonlock mutex
842843

844+
_ uint32 // ensure timeToRun has 8-byte alignment
845+
843846
// timeToRun is a distribution of scheduling latencies, defined
844847
// as the sum of time a G spends in the _Grunnable state before
845848
// it transitions to _Grunning.

0 commit comments

Comments
 (0)