Skip to content

Commit 8fdc79e

Browse files
ChrisHines authored and ALTree committed
runtime: reduce timer latency
Change the scheduler to treat expired timers with the same approach it uses to steal runnable G's. Previously the scheduler ignored timers on P's not marked for preemption. That had the downside that any G's waiting on those expired timers starved until the G running on their P completed or was preempted. That could take as long as 20ms if sysmon was in a 10ms wake up cycle. In addition, a spinning P that ignored an expired timer and found no other work would stop despite there being available work, missing the opportunity for greater parallelism. With this change the scheduler no longer ignores timers on non-preemptable P's or relies on sysmon as a backstop to start threads when timers expire. Instead it wakes an idle P, if needed, when creating a new timer because it cannot predict if the current P will have a scheduling opportunity before the new timer expires. The P it wakes will determine how long to sleep and block on the netpoller for the required time, potentially stealing the new timer when it wakes. This change also eliminates a race between a spinning P transitioning to idle concurrently with timer creation using the same pattern used for submission of new goroutines in the same window. Benchmark analysis: CL 232199, which was included in Go 1.15 improved timer latency over Go 1.14 by allowing P's to steal timers from P's not marked for preemption. The benchmarks added in this CL measure that improvement in the ParallelTimerLatency benchmark seen below. However, Go 1.15 still relies on sysmon to notice expired timers in some situations and sysmon can sleep for up to 10ms before waking to check timers. This CL fixes that shortcoming with modest regression on other benchmarks. 
name \ avg-late-ns go14.time.bench go15.time.bench fix.time.bench ParallelTimerLatency-8 17.3M ± 3% 7.9M ± 0% 0.2M ± 3% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=1-8 53.4k ±23% 50.7k ±31% 252.4k ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=2-8 204k ±14% 90k ±58% 188k ±12% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=3-8 1.17M ± 0% 0.11M ± 5% 0.11M ± 2% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=4-8 1.81M ±44% 0.10M ± 4% 0.10M ± 2% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=5-8 2.28M ±66% 0.09M ±13% 0.08M ±21% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=6-8 2.84M ±85% 0.07M ±15% 0.07M ±18% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=7-8 2.13M ±27% 0.06M ± 4% 0.06M ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=8-8 2.63M ± 6% 0.06M ±11% 0.06M ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=9-8 3.32M ±17% 0.06M ±16% 0.07M ±14% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=10-8 8.46M ±20% 4.37M ±21% 5.03M ±23% StaggeredTickerLatency/work-dur=2ms/tickers-per-P=1-8 1.02M ± 1% 0.20M ± 2% 0.20M ± 2% name \ max-late-ns go14.time.bench go15.time.bench fix.time.bench ParallelTimerLatency-8 18.3M ± 1% 8.2M ± 0% 0.5M ±12% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=1-8 141k ±19% 127k ±19% 1129k ± 3% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=2-8 2.78M ± 4% 1.23M ±15% 1.26M ± 5% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=3-8 6.05M ± 5% 0.67M ±56% 0.81M ±33% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=4-8 7.93M ±20% 0.71M ±46% 0.76M ±41% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=5-8 9.41M ±30% 0.92M ±23% 0.81M ±44% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=6-8 10.8M ±42% 0.8M ±41% 0.8M ±30% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=7-8 9.62M ±24% 0.77M ±38% 0.88M ±27% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=8-8 10.6M ±10% 0.8M ±32% 0.7M ±27% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=9-8 11.9M ±36% 0.6M ±46% 
0.8M ±38% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=10-8 36.8M ±21% 24.7M ±21% 27.5M ±16% StaggeredTickerLatency/work-dur=2ms/tickers-per-P=1-8 2.12M ± 2% 1.02M ±11% 1.03M ± 7% Other time benchmarks: name \ time/op go14.time.bench go15.time.bench fix.time.bench AfterFunc-8 137µs ± 4% 123µs ± 4% 131µs ± 2% After-8 212µs ± 3% 195µs ± 4% 204µs ± 7% Stop-8 165µs ± 6% 156µs ± 2% 151µs ±12% SimultaneousAfterFunc-8 260µs ± 3% 248µs ± 3% 284µs ± 2% StartStop-8 65.8µs ± 9% 64.4µs ± 7% 67.3µs ±15% Reset-8 13.6µs ± 2% 9.6µs ± 2% 9.1µs ± 4% Sleep-8 307µs ± 4% 306µs ± 3% 320µs ± 2% Ticker-8 53.0µs ± 5% 54.5µs ± 5% 57.0µs ±11% TickerReset-8 9.24µs ± 2% 9.51µs ± 3% TickerResetNaive-8 149µs ± 5% 145µs ± 5% Fixes #38860 Updates #25471 Updates #27707 Change-Id: If52680509b0f3b66dbd1d0c13fa574bd2d0bbd57 Reviewed-on: https://go-review.googlesource.com/c/go/+/232298 Run-TryBot: Alberto Donizetti <[email protected]> TryBot-Result: Go Bot <[email protected]> Reviewed-by: Austin Clements <[email protected]> Trust: Ian Lance Taylor <[email protected]>
1 parent b3f7f60 commit 8fdc79e

File tree

3 files changed

+294
-83
lines changed

3 files changed

+294
-83
lines changed

src/runtime/lockrank.go

+11-11
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,12 @@ const (
4141
lockRankCpuprof
4242
lockRankSweep
4343

44+
lockRankPollDesc
4445
lockRankSched
4546
lockRankDeadlock
4647
lockRankPanic
4748
lockRankAllg
4849
lockRankAllp
49-
lockRankPollDesc
5050

5151
lockRankTimers // Multiple timers locked simultaneously in destroy()
5252
lockRankItab
@@ -120,12 +120,12 @@ var lockNames = []string{
120120
lockRankCpuprof: "cpuprof",
121121
lockRankSweep: "sweep",
122122

123+
lockRankPollDesc: "pollDesc",
123124
lockRankSched: "sched",
124125
lockRankDeadlock: "deadlock",
125126
lockRankPanic: "panic",
126127
lockRankAllg: "allg",
127128
lockRankAllp: "allp",
128-
lockRankPollDesc: "pollDesc",
129129

130130
lockRankTimers: "timers",
131131
lockRankItab: "itab",
@@ -182,14 +182,14 @@ func (rank lockRank) String() string {
182182
return lockNames[rank]
183183
}
184184

185-
// lockPartialOrder is a partial order among the various lock types, listing the immediate
186-
// ordering that has actually been observed in the runtime. Each entry (which
187-
// corresponds to a particular lock rank) specifies the list of locks that can be
188-
// already be held immediately "above" it.
185+
// lockPartialOrder is a partial order among the various lock types, listing the
186+
// immediate ordering that has actually been observed in the runtime. Each entry
187+
// (which corresponds to a particular lock rank) specifies the list of locks
188+
// that can already be held immediately "above" it.
189189
//
190-
// So, for example, the lockRankSched entry shows that all the locks preceding it in
191-
// rank can actually be held. The fin lock shows that only the sched, timers, or
192-
// hchan lock can be held immediately above it when it is acquired.
190+
// So, for example, the lockRankSched entry shows that all the locks preceding
191+
// it in rank can actually be held. The allp lock shows that only the sysmon or
192+
// sched lock can be held immediately above it when it is acquired.
193193
var lockPartialOrder [][]lockRank = [][]lockRank{
194194
lockRankDummy: {},
195195
lockRankSysmon: {},
@@ -199,12 +199,12 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
199199
lockRankAssistQueue: {},
200200
lockRankCpuprof: {},
201201
lockRankSweep: {},
202-
lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep},
202+
lockRankPollDesc: {},
203+
lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc},
203204
lockRankDeadlock: {lockRankDeadlock},
204205
lockRankPanic: {lockRankDeadlock},
205206
lockRankAllg: {lockRankSysmon, lockRankSched, lockRankPanic},
206207
lockRankAllp: {lockRankSysmon, lockRankSched},
207-
lockRankPollDesc: {},
208208
lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers},
209209
lockRankItab: {},
210210
lockRankReflectOffs: {lockRankItab},

src/runtime/proc.go

+96-72
Original file line numberDiff line numberDiff line change
@@ -2264,11 +2264,16 @@ func handoffp(_p_ *p) {
22642264
startm(_p_, false)
22652265
return
22662266
}
2267-
if when := nobarrierWakeTime(_p_); when != 0 {
2268-
wakeNetPoller(when)
2269-
}
2267+
2268+
// The scheduler lock cannot be held when calling wakeNetPoller below
2269+
// because wakeNetPoller may call wakep which may call startm.
2270+
when := nobarrierWakeTime(_p_)
22702271
pidleput(_p_)
22712272
unlock(&sched.lock)
2273+
2274+
if when != 0 {
2275+
wakeNetPoller(when)
2276+
}
22722277
}
22732278

22742279
// Tries to add one more P to execute G's.
@@ -2477,40 +2482,33 @@ top:
24772482
_g_.m.spinning = true
24782483
atomic.Xadd(&sched.nmspinning, 1)
24792484
}
2480-
for i := 0; i < 4; i++ {
2485+
const stealTries = 4
2486+
for i := 0; i < stealTries; i++ {
2487+
stealTimersOrRunNextG := i == stealTries-1
2488+
24812489
for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
24822490
if sched.gcwaiting != 0 {
24832491
goto top
24842492
}
2485-
stealRunNextG := i > 2 // first look for ready queues with more than 1 g
24862493
p2 := allp[enum.position()]
24872494
if _p_ == p2 {
24882495
continue
24892496
}
24902497

2491-
// Don't bother to attempt to steal if p2 is idle.
2492-
if !idlepMask.read(enum.position()) {
2493-
if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil {
2494-
return gp, false
2495-
}
2496-
}
2497-
2498-
// Consider stealing timers from p2.
2499-
// This call to checkTimers is the only place where
2500-
// we hold a lock on a different P's timers.
2501-
// Lock contention can be a problem here, so
2502-
// initially avoid grabbing the lock if p2 is running
2503-
// and is not marked for preemption. If p2 is running
2504-
// and not being preempted we assume it will handle its
2505-
// own timers.
2498+
// Steal timers from p2. This call to checkTimers is the only place
2499+
// where we might hold a lock on a different P's timers. We do this
2500+
// once on the last pass before checking runnext because stealing
2501+
// from the other P's runnext should be the last resort, so if there
2502+
// are timers to steal do that first.
25062503
//
2507-
// If we're still looking for work after checking all
2508-
// the P's, then go ahead and steal from an active P.
2504+
// We only check timers on one of the stealing iterations because
2505+
// the time stored in now doesn't change in this loop and checking
2506+
// the timers for each P more than once with the same value of now
2507+
// is probably a waste of time.
25092508
//
2510-
// TODO(prattmic): Maintain a global look-aside similar
2511-
// to idlepMask to avoid looking at p2 if it can't
2512-
// possibly have timers.
2513-
if i > 2 || (i > 1 && shouldStealTimers(p2)) {
2509+
// TODO(prattmic): Maintain a global look-aside similar to idlepMask
2510+
// to avoid looking at p2 if it can't possibly have timers.
2511+
if stealTimersOrRunNextG {
25142512
tnow, w, ran := checkTimers(p2, now)
25152513
now = tnow
25162514
if w != 0 && (pollUntil == 0 || w < pollUntil) {
@@ -2531,6 +2529,13 @@ top:
25312529
ranTimer = true
25322530
}
25332531
}
2532+
2533+
// Don't bother to attempt to steal if p2 is idle.
2534+
if !idlepMask.read(enum.position()) {
2535+
if gp := runqsteal(_p_, p2, stealTimersOrRunNextG); gp != nil {
2536+
return gp, false
2537+
}
2538+
}
25342539
}
25352540
}
25362541
if ranTimer {
@@ -2606,7 +2611,7 @@ stop:
26062611
// drop nmspinning first and then check all per-P queues again (with
26072612
// #StoreLoad memory barrier in between). If we do it the other way around,
26082613
// another thread can submit a goroutine after we've checked all run queues
2609-
// but before we drop nmspinning; as the result nobody will unpark a thread
2614+
// but before we drop nmspinning; as a result nobody will unpark a thread
26102615
// to run the goroutine.
26112616
// If we discover new work below, we need to restore m.spinning as a signal
26122617
// for resetspinning to unpark a new worker thread (because there can be more
@@ -2640,6 +2645,35 @@ stop:
26402645
}
26412646
}
26422647

2648+
// Similar to above, check for timer creation or expiry concurrently with
2649+
// transitioning from spinning to non-spinning. Note that we cannot use
2650+
// checkTimers here because it calls adjusttimers which may need to allocate
2651+
// memory, and that isn't allowed when we don't have an active P.
2652+
for _, _p_ := range allpSnapshot {
2653+
// This is similar to nobarrierWakeTime, but minimizes calls to
2654+
// nanotime.
2655+
if atomic.Load(&_p_.adjustTimers) > 0 {
2656+
if now == 0 {
2657+
now = nanotime()
2658+
}
2659+
pollUntil = now
2660+
} else {
2661+
w := int64(atomic.Load64(&_p_.timer0When))
2662+
if w != 0 && (pollUntil == 0 || w < pollUntil) {
2663+
pollUntil = w
2664+
}
2665+
}
2666+
}
2667+
if pollUntil != 0 {
2668+
if now == 0 {
2669+
now = nanotime()
2670+
}
2671+
delta = pollUntil - now
2672+
if delta < 0 {
2673+
delta = 0
2674+
}
2675+
}
2676+
26432677
// Check for idle-priority GC work again.
26442678
if gcBlackenEnabled != 0 && gcMarkWorkAvailable(nil) {
26452679
lock(&sched.lock)
@@ -2735,9 +2769,9 @@ func pollWork() bool {
27352769
return false
27362770
}
27372771

2738-
// wakeNetPoller wakes up the thread sleeping in the network poller,
2739-
// if there is one, and if it isn't going to wake up anyhow before
2740-
// the when argument.
2772+
// wakeNetPoller wakes up the thread sleeping in the network poller if it isn't
2773+
// going to wake up before the when argument; or it wakes an idle P to service
2774+
// timers and the network poller if there isn't one already.
27412775
func wakeNetPoller(when int64) {
27422776
if atomic.Load64(&sched.lastpoll) == 0 {
27432777
// In findrunnable we ensure that when polling the pollUntil
@@ -2748,6 +2782,10 @@ func wakeNetPoller(when int64) {
27482782
if pollerPollUntil == 0 || pollerPollUntil > when {
27492783
netpollBreak()
27502784
}
2785+
} else {
2786+
// There are no threads in the network poller, try to get
2787+
// one there so it can handle new timers.
2788+
wakep()
27512789
}
27522790
}
27532791

@@ -3034,25 +3072,6 @@ func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
30343072
return rnow, pollUntil, ran
30353073
}
30363074

3037-
// shouldStealTimers reports whether we should try stealing the timers from p2.
3038-
// We don't steal timers from a running P that is not marked for preemption,
3039-
// on the assumption that it will run its own timers. This reduces
3040-
// contention on the timers lock.
3041-
func shouldStealTimers(p2 *p) bool {
3042-
if p2.status != _Prunning {
3043-
return true
3044-
}
3045-
mp := p2.m.ptr()
3046-
if mp == nil || mp.locks > 0 {
3047-
return false
3048-
}
3049-
gp := mp.curg
3050-
if gp == nil || gp.atomicstatus != _Grunning || !gp.preempt {
3051-
return false
3052-
}
3053-
return true
3054-
}
3055-
30563075
func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
30573076
unlock((*mutex)(lock))
30583077
return true
@@ -4603,7 +4622,7 @@ func procresize(nprocs int32) *p {
46034622
}
46044623
sched.procresizetime = now
46054624

4606-
maskWords := (nprocs+31) / 32
4625+
maskWords := (nprocs + 31) / 32
46074626

46084627
// Grow allp if necessary.
46094628
if nprocs > int32(len(allp)) {
@@ -4927,11 +4946,28 @@ func sysmon() {
49274946
}
49284947
usleep(delay)
49294948
mDoFixup()
4949+
4950+
// sysmon should not enter deep sleep if schedtrace is enabled so that
4951+
// it can print that information at the right time.
4952+
//
4953+
// It should also not enter deep sleep if there are any active P's so
4954+
// that it can retake P's from syscalls, preempt long running G's, and
4955+
// poll the network if all P's are busy for long stretches.
4956+
//
4957+
// It should wakeup from deep sleep if any P's become active either due
4958+
// to exiting a syscall or waking up due to a timer expiring so that it
4959+
// can resume performing those duties. If it wakes from a syscall it
4960+
// resets idle and delay as a bet that since it had retaken a P from a
4961+
// syscall before, it may need to do it again shortly after the
4962+
// application starts work again. It does not reset idle when waking
4963+
// from a timer to avoid adding system load to applications that spend
4964+
// most of their time sleeping.
49304965
now := nanotime()
4931-
next, _ := timeSleepUntil()
49324966
if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) {
49334967
lock(&sched.lock)
49344968
if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
4969+
syscallWake := false
4970+
next, _ := timeSleepUntil()
49354971
if next > now {
49364972
atomic.Store(&sched.sysmonwait, 1)
49374973
unlock(&sched.lock)
@@ -4945,33 +4981,27 @@ func sysmon() {
49454981
if shouldRelax {
49464982
osRelax(true)
49474983
}
4948-
notetsleep(&sched.sysmonnote, sleep)
4984+
syscallWake = notetsleep(&sched.sysmonnote, sleep)
49494985
mDoFixup()
49504986
if shouldRelax {
49514987
osRelax(false)
49524988
}
4953-
now = nanotime()
4954-
next, _ = timeSleepUntil()
49554989
lock(&sched.lock)
49564990
atomic.Store(&sched.sysmonwait, 0)
49574991
noteclear(&sched.sysmonnote)
49584992
}
4959-
idle = 0
4960-
delay = 20
4993+
if syscallWake {
4994+
idle = 0
4995+
delay = 20
4996+
}
49614997
}
49624998
unlock(&sched.lock)
49634999
}
5000+
49645001
lock(&sched.sysmonlock)
4965-
{
4966-
// If we spent a long time blocked on sysmonlock
4967-
// then we want to update now and next since it's
4968-
// likely stale.
4969-
now1 := nanotime()
4970-
if now1-now > 50*1000 /* 50µs */ {
4971-
next, _ = timeSleepUntil()
4972-
}
4973-
now = now1
4974-
}
5002+
// Update now in case we blocked on sysmonnote or spent a long time
5003+
// blocked on schedlock or sysmonlock above.
5004+
now = nanotime()
49755005

49765006
// trigger libc interceptors if needed
49775007
if *cgo_yield != nil {
@@ -4996,12 +5026,6 @@ func sysmon() {
49965026
}
49975027
}
49985028
mDoFixup()
4999-
if next < now {
5000-
// There are timers that should have already run,
5001-
// perhaps because there is an unpreemptible P.
5002-
// Try to start an M to run them.
5003-
startm(nil, false)
5004-
}
50055029
if atomic.Load(&scavenge.sysmonWake) != 0 {
50065030
// Kick the scavenger awake if someone requested it.
50075031
wakeScavenger()

0 commit comments

Comments
 (0)