Skip to content

Commit 8fdc79e

Browse files
ChrisHines authored and ALTree committed
runtime: reduce timer latency
Change the scheduler to treat expired timers with the same approach it uses to steal runnable G's. Previously the scheduler ignored timers on P's not marked for preemption. That had the downside that any G's waiting on those expired timers starved until the G running on their P completed or was preempted. That could take as long as 20ms if sysmon was in a 10ms wake up cycle. In addition, a spinning P that ignored an expired timer and found no other work would stop despite there being available work, missing the opportunity for greater parallelism. With this change the scheduler no longer ignores timers on non-preemptable P's or relies on sysmon as a backstop to start threads when timers expire. Instead it wakes an idle P, if needed, when creating a new timer because it cannot predict if the current P will have a scheduling opportunity before the new timer expires. The P it wakes will determine how long to sleep and block on the netpoller for the required time, potentially stealing the new timer when it wakes. This change also eliminates a race between a spinning P transitioning to idle concurrently with timer creation using the same pattern used for submission of new goroutines in the same window. Benchmark analysis: CL 232199, which was included in Go 1.15 improved timer latency over Go 1.14 by allowing P's to steal timers from P's not marked for preemption. The benchmarks added in this CL measure that improvement in the ParallelTimerLatency benchmark seen below. However, Go 1.15 still relies on sysmon to notice expired timers in some situations and sysmon can sleep for up to 10ms before waking to check timers. This CL fixes that shortcoming with modest regression on other benchmarks. 
name \ avg-late-ns go14.time.bench go15.time.bench fix.time.bench ParallelTimerLatency-8 17.3M ± 3% 7.9M ± 0% 0.2M ± 3% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=1-8 53.4k ±23% 50.7k ±31% 252.4k ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=2-8 204k ±14% 90k ±58% 188k ±12% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=3-8 1.17M ± 0% 0.11M ± 5% 0.11M ± 2% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=4-8 1.81M ±44% 0.10M ± 4% 0.10M ± 2% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=5-8 2.28M ±66% 0.09M ±13% 0.08M ±21% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=6-8 2.84M ±85% 0.07M ±15% 0.07M ±18% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=7-8 2.13M ±27% 0.06M ± 4% 0.06M ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=8-8 2.63M ± 6% 0.06M ±11% 0.06M ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=9-8 3.32M ±17% 0.06M ±16% 0.07M ±14% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=10-8 8.46M ±20% 4.37M ±21% 5.03M ±23% StaggeredTickerLatency/work-dur=2ms/tickers-per-P=1-8 1.02M ± 1% 0.20M ± 2% 0.20M ± 2% name \ max-late-ns go14.time.bench go15.time.bench fix.time.bench ParallelTimerLatency-8 18.3M ± 1% 8.2M ± 0% 0.5M ±12% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=1-8 141k ±19% 127k ±19% 1129k ± 3% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=2-8 2.78M ± 4% 1.23M ±15% 1.26M ± 5% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=3-8 6.05M ± 5% 0.67M ±56% 0.81M ±33% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=4-8 7.93M ±20% 0.71M ±46% 0.76M ±41% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=5-8 9.41M ±30% 0.92M ±23% 0.81M ±44% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=6-8 10.8M ±42% 0.8M ±41% 0.8M ±30% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=7-8 9.62M ±24% 0.77M ±38% 0.88M ±27% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=8-8 10.6M ±10% 0.8M ±32% 0.7M ±27% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=9-8 11.9M ±36% 0.6M ±46% 
0.8M ±38% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=10-8 36.8M ±21% 24.7M ±21% 27.5M ±16% StaggeredTickerLatency/work-dur=2ms/tickers-per-P=1-8 2.12M ± 2% 1.02M ±11% 1.03M ± 7% Other time benchmarks: name \ time/op go14.time.bench go15.time.bench fix.time.bench AfterFunc-8 137µs ± 4% 123µs ± 4% 131µs ± 2% After-8 212µs ± 3% 195µs ± 4% 204µs ± 7% Stop-8 165µs ± 6% 156µs ± 2% 151µs ±12% SimultaneousAfterFunc-8 260µs ± 3% 248µs ± 3% 284µs ± 2% StartStop-8 65.8µs ± 9% 64.4µs ± 7% 67.3µs ±15% Reset-8 13.6µs ± 2% 9.6µs ± 2% 9.1µs ± 4% Sleep-8 307µs ± 4% 306µs ± 3% 320µs ± 2% Ticker-8 53.0µs ± 5% 54.5µs ± 5% 57.0µs ±11% TickerReset-8 9.24µs ± 2% 9.51µs ± 3% TickerResetNaive-8 149µs ± 5% 145µs ± 5% Fixes #38860 Updates #25471 Updates #27707 Change-Id: If52680509b0f3b66dbd1d0c13fa574bd2d0bbd57 Reviewed-on: https://go-review.googlesource.com/c/go/+/232298 Run-TryBot: Alberto Donizetti <[email protected]> TryBot-Result: Go Bot <[email protected]> Reviewed-by: Austin Clements <[email protected]> Trust: Ian Lance Taylor <[email protected]>
1 parent b3f7f60 commit 8fdc79e

File tree

3 files changed

+294
-83
lines changed

3 files changed

+294
-83
lines changed

src/runtime/lockrank.go

+11-11
Original file line numberDiff line numberDiff line change
@@ -41,12 +41,12 @@ const (
4141
lockRankCpuprof
4242
lockRankSweep
4343

44+
lockRankPollDesc
4445
lockRankSched
4546
lockRankDeadlock
4647
lockRankPanic
4748
lockRankAllg
4849
lockRankAllp
49-
lockRankPollDesc
5050

5151
lockRankTimers // Multiple timers locked simultaneously in destroy()
5252
lockRankItab
@@ -120,12 +120,12 @@ var lockNames = []string{
120120
lockRankCpuprof: "cpuprof",
121121
lockRankSweep: "sweep",
122122

123+
lockRankPollDesc: "pollDesc",
123124
lockRankSched: "sched",
124125
lockRankDeadlock: "deadlock",
125126
lockRankPanic: "panic",
126127
lockRankAllg: "allg",
127128
lockRankAllp: "allp",
128-
lockRankPollDesc: "pollDesc",
129129

130130
lockRankTimers: "timers",
131131
lockRankItab: "itab",
@@ -182,14 +182,14 @@ func (rank lockRank) String() string {
182182
return lockNames[rank]
183183
}
184184

185-
// lockPartialOrder is a partial order among the various lock types, listing the immediate
186-
// ordering that has actually been observed in the runtime. Each entry (which
187-
// corresponds to a particular lock rank) specifies the list of locks that can be
188-
// already be held immediately "above" it.
185+
// lockPartialOrder is a partial order among the various lock types, listing the
186+
// immediate ordering that has actually been observed in the runtime. Each entry
187+
// (which corresponds to a particular lock rank) specifies the list of locks
188+
// that can already be held immediately "above" it.
189189
//
190-
// So, for example, the lockRankSched entry shows that all the locks preceding it in
191-
// rank can actually be held. The fin lock shows that only the sched, timers, or
192-
// hchan lock can be held immediately above it when it is acquired.
190+
// So, for example, the lockRankSched entry shows that all the locks preceding
191+
// it in rank can actually be held. The allp lock shows that only the sysmon or
192+
// sched lock can be held immediately above it when it is acquired.
193193
var lockPartialOrder [][]lockRank = [][]lockRank{
194194
lockRankDummy: {},
195195
lockRankSysmon: {},
@@ -199,12 +199,12 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
199199
lockRankAssistQueue: {},
200200
lockRankCpuprof: {},
201201
lockRankSweep: {},
202-
lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep},
202+
lockRankPollDesc: {},
203+
lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc},
203204
lockRankDeadlock: {lockRankDeadlock},
204205
lockRankPanic: {lockRankDeadlock},
205206
lockRankAllg: {lockRankSysmon, lockRankSched, lockRankPanic},
206207
lockRankAllp: {lockRankSysmon, lockRankSched},
207-
lockRankPollDesc: {},
208208
lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers},
209209
lockRankItab: {},
210210
lockRankReflectOffs: {lockRankItab},

src/runtime/proc.go

+96-72
Original file line numberDiff line numberDiff line change
@@ -2264,11 +2264,16 @@ func handoffp(_p_ *p) {
22642264
startm(_p_, false)
22652265
return
22662266
}
2267-
if when := nobarrierWakeTime(_p_); when != 0 {
2268-
wakeNetPoller(when)
2269-
}
2267+
2268+
// The scheduler lock cannot be held when calling wakeNetPoller below
2269+
// because wakeNetPoller may call wakep which may call startm.
2270+
when := nobarrierWakeTime(_p_)
22702271
pidleput(_p_)
22712272
unlock(&sched.lock)
2273+
2274+
if when != 0 {
2275+
wakeNetPoller(when)
2276+
}
22722277
}
22732278

22742279
// Tries to add one more P to execute G's.
@@ -2477,40 +2482,33 @@ top:
24772482
_g_.m.spinning = true
24782483
atomic.Xadd(&sched.nmspinning, 1)
24792484
}
2480-
for i := 0; i < 4; i++ {
2485+
const stealTries = 4
2486+
for i := 0; i < stealTries; i++ {
2487+
stealTimersOrRunNextG := i == stealTries-1
2488+
24812489
for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
24822490
if sched.gcwaiting != 0 {
24832491
goto top
24842492
}
2485-
stealRunNextG := i > 2 // first look for ready queues with more than 1 g
24862493
p2 := allp[enum.position()]
24872494
if _p_ == p2 {
24882495
continue
24892496
}
24902497

2491-
// Don't bother to attempt to steal if p2 is idle.
2492-
if !idlepMask.read(enum.position()) {
2493-
if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil {
2494-
return gp, false
2495-
}
2496-
}
2497-
2498-
// Consider stealing timers from p2.
2499-
// This call to checkTimers is the only place where
2500-
// we hold a lock on a different P's timers.
2501-
// Lock contention can be a problem here, so
2502-
// initially avoid grabbing the lock if p2 is running
2503-
// and is not marked for preemption. If p2 is running
2504-
// and not being preempted we assume it will handle its
2505-
// own timers.
2498+
// Steal timers from p2. This call to checkTimers is the only place
2499+
// where we might hold a lock on a different P's timers. We do this
2500+
// once on the last pass before checking runnext because stealing
2501+
// from the other P's runnext should be the last resort, so if there
2502+
// are timers to steal do that first.
25062503
//
2507-
// If we're still looking for work after checking all
2508-
// the P's, then go ahead and steal from an active P.
2504+
// We only check timers on one of the stealing iterations because
2505+
// the time stored in now doesn't change in this loop and checking
2506+
// the timers for each P more than once with the same value of now
2507+
// is probably a waste of time.
25092508
//
2510-
// TODO(prattmic): Maintain a global look-aside similar
2511-
// to idlepMask to avoid looking at p2 if it can't
2512-
// possibly have timers.
2513-
if i > 2 || (i > 1 && shouldStealTimers(p2)) {
2509+
// TODO(prattmic): Maintain a global look-aside similar to idlepMask
2510+
// to avoid looking at p2 if it can't possibly have timers.
2511+
if stealTimersOrRunNextG {
25142512
tnow, w, ran := checkTimers(p2, now)
25152513
now = tnow
25162514
if w != 0 && (pollUntil == 0 || w < pollUntil) {
@@ -2531,6 +2529,13 @@ top:
25312529
ranTimer = true
25322530
}
25332531
}
2532+
2533+
// Don't bother to attempt to steal if p2 is idle.
2534+
if !idlepMask.read(enum.position()) {
2535+
if gp := runqsteal(_p_, p2, stealTimersOrRunNextG); gp != nil {
2536+
return gp, false
2537+
}
2538+
}
25342539
}
25352540
}
25362541
if ranTimer {
@@ -2606,7 +2611,7 @@ stop:
26062611
// drop nmspinning first and then check all per-P queues again (with
26072612
// #StoreLoad memory barrier in between). If we do it the other way around,
26082613
// another thread can submit a goroutine after we've checked all run queues
2609-
// but before we drop nmspinning; as the result nobody will unpark a thread
2614+
// but before we drop nmspinning; as a result nobody will unpark a thread
26102615
// to run the goroutine.
26112616
// If we discover new work below, we need to restore m.spinning as a signal
26122617
// for resetspinning to unpark a new worker thread (because there can be more
@@ -2640,6 +2645,35 @@ stop:
26402645
}
26412646
}
26422647

2648+
// Similar to above, check for timer creation or expiry concurrently with
2649+
// transitioning from spinning to non-spinning. Note that we cannot use
2650+
// checkTimers here because it calls adjusttimers which may need to allocate
2651+
// memory, and that isn't allowed when we don't have an active P.
2652+
for _, _p_ := range allpSnapshot {
2653+
// This is similar to nobarrierWakeTime, but minimizes calls to
2654+
// nanotime.
2655+
if atomic.Load(&_p_.adjustTimers) > 0 {
2656+
if now == 0 {
2657+
now = nanotime()
2658+
}
2659+
pollUntil = now
2660+
} else {
2661+
w := int64(atomic.Load64(&_p_.timer0When))
2662+
if w != 0 && (pollUntil == 0 || w < pollUntil) {
2663+
pollUntil = w
2664+
}
2665+
}
2666+
}
2667+
if pollUntil != 0 {
2668+
if now == 0 {
2669+
now = nanotime()
2670+
}
2671+
delta = pollUntil - now
2672+
if delta < 0 {
2673+
delta = 0
2674+
}
2675+
}
2676+
26432677
// Check for idle-priority GC work again.
26442678
if gcBlackenEnabled != 0 && gcMarkWorkAvailable(nil) {
26452679
lock(&sched.lock)
@@ -2735,9 +2769,9 @@ func pollWork() bool {
27352769
return false
27362770
}
27372771

2738-
// wakeNetPoller wakes up the thread sleeping in the network poller,
2739-
// if there is one, and if it isn't going to wake up anyhow before
2740-
// the when argument.
2772+
// wakeNetPoller wakes up the thread sleeping in the network poller if it isn't
2773+
// going to wake up before the when argument; or it wakes an idle P to service
2774+
// timers and the network poller if there isn't one already.
27412775
func wakeNetPoller(when int64) {
27422776
if atomic.Load64(&sched.lastpoll) == 0 {
27432777
// In findrunnable we ensure that when polling the pollUntil
@@ -2748,6 +2782,10 @@ func wakeNetPoller(when int64) {
27482782
if pollerPollUntil == 0 || pollerPollUntil > when {
27492783
netpollBreak()
27502784
}
2785+
} else {
2786+
// There are no threads in the network poller, try to get
2787+
// one there so it can handle new timers.
2788+
wakep()
27512789
}
27522790
}
27532791

@@ -3034,25 +3072,6 @@ func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
30343072
return rnow, pollUntil, ran
30353073
}
30363074

3037-
// shouldStealTimers reports whether we should try stealing the timers from p2.
3038-
// We don't steal timers from a running P that is not marked for preemption,
3039-
// on the assumption that it will run its own timers. This reduces
3040-
// contention on the timers lock.
3041-
func shouldStealTimers(p2 *p) bool {
3042-
if p2.status != _Prunning {
3043-
return true
3044-
}
3045-
mp := p2.m.ptr()
3046-
if mp == nil || mp.locks > 0 {
3047-
return false
3048-
}
3049-
gp := mp.curg
3050-
if gp == nil || gp.atomicstatus != _Grunning || !gp.preempt {
3051-
return false
3052-
}
3053-
return true
3054-
}
3055-
30563075
func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
30573076
unlock((*mutex)(lock))
30583077
return true
@@ -4603,7 +4622,7 @@ func procresize(nprocs int32) *p {
46034622
}
46044623
sched.procresizetime = now
46054624

4606-
maskWords := (nprocs+31) / 32
4625+
maskWords := (nprocs + 31) / 32
46074626

46084627
// Grow allp if necessary.
46094628
if nprocs > int32(len(allp)) {
@@ -4927,11 +4946,28 @@ func sysmon() {
49274946
}
49284947
usleep(delay)
49294948
mDoFixup()
4949+
4950+
// sysmon should not enter deep sleep if schedtrace is enabled so that
4951+
// it can print that information at the right time.
4952+
//
4953+
// It should also not enter deep sleep if there are any active P's so
4954+
// that it can retake P's from syscalls, preempt long running G's, and
4955+
// poll the network if all P's are busy for long stretches.
4956+
//
4957+
// It should wakeup from deep sleep if any P's become active either due
4958+
// to exiting a syscall or waking up due to a timer expiring so that it
4959+
// can resume performing those duties. If it wakes from a syscall it
4960+
// resets idle and delay as a bet that since it had retaken a P from a
4961+
// syscall before, it may need to do it again shortly after the
4962+
// application starts work again. It does not reset idle when waking
4963+
// from a timer to avoid adding system load to applications that spend
4964+
// most of their time sleeping.
49304965
now := nanotime()
4931-
next, _ := timeSleepUntil()
49324966
if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) {
49334967
lock(&sched.lock)
49344968
if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
4969+
syscallWake := false
4970+
next, _ := timeSleepUntil()
49354971
if next > now {
49364972
atomic.Store(&sched.sysmonwait, 1)
49374973
unlock(&sched.lock)
@@ -4945,33 +4981,27 @@ func sysmon() {
49454981
if shouldRelax {
49464982
osRelax(true)
49474983
}
4948-
notetsleep(&sched.sysmonnote, sleep)
4984+
syscallWake = notetsleep(&sched.sysmonnote, sleep)
49494985
mDoFixup()
49504986
if shouldRelax {
49514987
osRelax(false)
49524988
}
4953-
now = nanotime()
4954-
next, _ = timeSleepUntil()
49554989
lock(&sched.lock)
49564990
atomic.Store(&sched.sysmonwait, 0)
49574991
noteclear(&sched.sysmonnote)
49584992
}
4959-
idle = 0
4960-
delay = 20
4993+
if syscallWake {
4994+
idle = 0
4995+
delay = 20
4996+
}
49614997
}
49624998
unlock(&sched.lock)
49634999
}
5000+
49645001
lock(&sched.sysmonlock)
4965-
{
4966-
// If we spent a long time blocked on sysmonlock
4967-
// then we want to update now and next since it's
4968-
// likely stale.
4969-
now1 := nanotime()
4970-
if now1-now > 50*1000 /* 50µs */ {
4971-
next, _ = timeSleepUntil()
4972-
}
4973-
now = now1
4974-
}
5002+
// Update now in case we blocked on sysmonnote or spent a long time
5003+
// blocked on schedlock or sysmonlock above.
5004+
now = nanotime()
49755005

49765006
// trigger libc interceptors if needed
49775007
if *cgo_yield != nil {
@@ -4996,12 +5026,6 @@ func sysmon() {
49965026
}
49975027
}
49985028
mDoFixup()
4999-
if next < now {
5000-
// There are timers that should have already run,
5001-
// perhaps because there is an unpreemptible P.
5002-
// Try to start an M to run them.
5003-
startm(nil, false)
5004-
}
50055029
if atomic.Load(&scavenge.sysmonWake) != 0 {
50065030
// Kick the scavenger awake if someone requested it.
50075031
wakeScavenger()

0 commit comments

Comments
 (0)