
Commit 130baf3

mknyszek authored and gopherbot committed
runtime: improve tickspersecond
Currently tickspersecond forces a 100 millisecond sleep the first time it's called. This isn't great for profiling short-lived programs, since both CPU profiling and block profiling might call into it. 100 milliseconds is a long time, but it's chosen to try and capture a decent estimate of the conversion on platforms with coarse-granularity clocks. If the granularity is 15 ms, it'll only be 15% off at worst.

Let's try a different strategy. First, let's require 5 milliseconds of time to have elapsed at a minimum. This should be plenty on platforms with nanosecond time granularity from the system clock, provided the caller of tickspersecond intends to use it for calculating durations, not timestamps. Next, grab a timestamp as close to process start as possible, so that we can cover some of those 5 milliseconds just during runtime start. Finally, this function is only ever called from normal goroutine contexts. Let's do a regular goroutine sleep instead of a thread-level sleep under a runtime lock, which has all sorts of nasty effects on preemption.

While we're here, let's also rename tickspersecond to ticksPerSecond. Also, let's write down some explicit rules of thumb on when to use this function. Clocks are hard, and using this for timestamp conversion is likely to make lining up those timestamps with other clocks on the system difficult, if not impossible.

Note that while this improves ticksPerSecond on platforms with good clocks, we still end up with a pretty coarse sleep on platforms with coarse clocks, and a pretty coarse result. On these platforms, keep the minimum required elapsed time at 100 ms. There's not much we can do about these platforms except spin and try to catch the clock boundary, but at 10+ ms of granularity, that might be a lot of spinning.

Fixes #63103.
Fixes #63078.

Change-Id: Ic32a4ba70a03bdf5c13cb80c2669c4064aa4cca2
Reviewed-on: https://go-review.googlesource.com/c/go/+/538898
Auto-Submit: Michael Knyszek <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Mauri de Souza Meneguzzo <[email protected]>
Reviewed-by: Michael Pratt <[email protected]>
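Editor's note on the "durations, not timestamps" requirement: the rate ticksPerSecond returns is meant to scale a delta between two cputicks readings into a duration; any error in the rate grows with whatever it multiplies, so small deltas stay accurate while absolute timestamps drift. Below is a minimal sketch of that intended usage pattern, outside the runtime, with an invented helper and an invented 3 GHz rate (both purely illustrative, not part of this change):

package main

import (
	"fmt"
	"time"
)

// ticksToDuration converts a delta between two tick readings into a
// time.Duration, given a ticks-per-second rate like the one
// ticksPerSecond reports. The rate's small relative error only matters
// in proportion to the delta, which is why durations are safe to
// convert this way but absolute timestamps are not.
func ticksToDuration(deltaTicks, ticksPerSec int64) time.Duration {
	return time.Duration(float64(deltaTicks) * 1e9 / float64(ticksPerSec))
}

func main() {
	// Hypothetical numbers: a 3 GHz tick rate and a 6,000,000-tick delta.
	fmt.Println(ticksToDuration(6_000_000, 3_000_000_000)) // 2ms
}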
1 parent 25895d1 commit 130baf3

File tree

4 files changed: +104 -22 lines

src/runtime/cpuprof.go (+1 -1)

@@ -211,7 +211,7 @@ func CPUProfile() []byte {
 
 //go:linkname runtime_pprof_runtime_cyclesPerSecond runtime/pprof.runtime_cyclesPerSecond
 func runtime_pprof_runtime_cyclesPerSecond() int64 {
-	return tickspersecond()
+	return ticksPerSecond()
 }
 
 // readProfile, provided to runtime/pprof, returns the next chunk of

src/runtime/mprof.go (+1 -1)

@@ -471,7 +471,7 @@ func SetBlockProfileRate(rate int) {
 		r = 1 // profile everything
 	} else {
 		// convert ns to cycles, use float64 to prevent overflow during multiplication
-		r = int64(float64(rate) * float64(tickspersecond()) / (1000 * 1000 * 1000))
+		r = int64(float64(rate) * float64(ticksPerSecond()) / (1000 * 1000 * 1000))
 		if r == 0 {
 			r = 1
 		}
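
Only the rename changes in this hunk, but the conversion it performs is worth spelling out: the rate passed to SetBlockProfileRate is in nanoseconds, and multiplying it by ticks-per-second over 1e9 rewrites it as a threshold in cputicks. A small worked example, assuming a purely illustrative 3 GHz tick rate (not a value from this change):

package main

import "fmt"

func main() {
	// SetBlockProfileRate's rate is nanoseconds per sampled blocking event;
	// the runtime stores the threshold in cputicks. With an assumed 3 GHz
	// tick rate, a 10,000 ns rate becomes 30,000 ticks.
	rate := 10_000 // ns
	ticksPerSec := int64(3_000_000_000)
	r := int64(float64(rate) * float64(ticksPerSec) / (1000 * 1000 * 1000))
	fmt.Println(r) // 30000 ticks between sampled blocking events
}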

src/runtime/proc.go (+17 -1)

@@ -8,6 +8,7 @@ import (
 	"internal/abi"
 	"internal/cpu"
 	"internal/goarch"
+	"internal/goos"
 	"runtime/internal/atomic"
 	"runtime/internal/sys"
 	"unsafe"
@@ -772,6 +773,7 @@ func schedinit() {
 	// The world starts stopped.
 	worldStopped()
 
+	ticks.init() // run as early as possible
 	moduledataverify()
 	stackinit()
 	mallocinit()
@@ -937,6 +939,20 @@ func (mp *m) hasCgoOnStack() bool {
 	return mp.ncgo > 0 || mp.isextra
 }
 
+const (
+	// osHasLowResTimer indicates that the platform's internal timer system has a low resolution,
+	// typically on the order of 1 ms or more.
+	osHasLowResTimer = GOOS == "windows" || GOOS == "openbsd" || GOOS == "netbsd"
+
+	// osHasLowResClockInt is osHasLowResClock but in integer form, so it can be used to create
+	// constants conditionally.
+	osHasLowResClockInt = goos.IsWindows
+
+	// osHasLowResClock indicates that timestamps produced by nanotime on the platform have a
+	// low resolution, typically on the order of 1 ms or more.
+	osHasLowResClock = osHasLowResClockInt > 0
+)
+
 var fastrandseed uintptr
 
 func fastrandinit() {
@@ -6599,7 +6615,7 @@ func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool)
						// between different Ps.
						// A sync chan send/recv takes ~50ns as of time of
						// writing, so 3us gives ~50x overshoot.
-						if GOOS != "windows" && GOOS != "openbsd" && GOOS != "netbsd" {
+						if !osHasLowResTimer {
							usleep(3)
						} else {
							// On some platforms system timer granularity is
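
The comment on osHasLowResClockInt is terse, so here is the idea in isolation: Go constant expressions can't branch on a bool, but they can do integer arithmetic, which is what the minTimeForTicksPerSecond constant in the runtime.go hunk below relies on. A minimal sketch of that pattern, using a stand-in constant rather than the real goos.IsWindows:

package main

import "fmt"

// lowResClockInt stands in for goos.IsWindows here: an untyped integer
// constant that is 1 on the low-resolution-clock platform and 0 elsewhere.
// A bool constant couldn't participate in the arithmetic below, which is
// why the runtime keeps both osHasLowResClock and osHasLowResClockInt.
const lowResClockInt = 0

// Select between two constants at compile time: 100 ms on the low-res
// platform, 5 ms everywhere else (mirroring minTimeForTicksPerSecond).
const minWait = 5_000_000*(1-lowResClockInt) + 100_000_000*lowResClockInt

func main() {
	fmt.Println(minWait) // 5000000 (nanoseconds) when lowResClockInt is 0
}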

src/runtime/runtime.go (+85 -19)

@@ -17,34 +17,100 @@ import (
 var ticks ticksType
 
 type ticksType struct {
-	lock mutex
-	val  atomic.Int64
+	// lock protects access to start* and val.
+	lock       mutex
+	startTicks int64
+	startTime  int64
+	val        atomic.Int64
 }
 
-// Note: Called by runtime/pprof in addition to runtime code.
-func tickspersecond() int64 {
+// init initializes ticks to maximize the chance that we have a good ticksPerSecond reference.
+//
+// Must not run concurrently with ticksPerSecond.
+func (t *ticksType) init() {
+	lock(&ticks.lock)
+	t.startTime = nanotime()
+	t.startTicks = cputicks()
+	unlock(&ticks.lock)
+}
+
+// minTimeForTicksPerSecond is the minimum elapsed time we require to consider our ticksPerSecond
+// measurement to be of decent enough quality for profiling.
+//
+// There's a linear relationship here between minimum time and error from the true value.
+// The error from the true ticks-per-second in a linux/amd64 VM seems to be:
+// - 1 ms -> ~0.02% error
+// - 5 ms -> ~0.004% error
+// - 10 ms -> ~0.002% error
+// - 50 ms -> ~0.0003% error
+// - 100 ms -> ~0.0001% error
+//
+// We're willing to take 0.004% error here, because ticksPerSecond is intended to be used for
+// converting durations, not timestamps. Durations are usually going to be much larger, and so
+// the tiny error doesn't matter. The error is definitely going to be a problem when trying to
+// use this for timestamps, as it'll make those timestamps much less likely to line up.
+const minTimeForTicksPerSecond = 5_000_000*(1-osHasLowResClockInt) + 100_000_000*osHasLowResClockInt
+
+// ticksPerSecond returns a conversion rate between the cputicks clock and the nanotime clock.
+//
+// Note: Clocks are hard. Using this as an actual conversion rate for timestamps is ill-advised
+// and should be avoided when possible. Use only for durations, where a tiny error term isn't going
+// to make a meaningful difference in even a 1ms duration. If an accurate timestamp is needed,
+// use nanotime instead. (The entire Windows platform is a broad exception to this rule, where nanotime
+// produces timestamps on such a coarse granularity that the error from this conversion is actually
+// preferable.)
+//
+// The strategy for computing the conversion rate is to write down nanotime and cputicks as
+// early in process startup as possible. From then, we just need to wait until we get values
+// from nanotime that we can use (some platforms have a really coarse system time granularity).
+// We require some amount of time to pass to ensure that the conversion rate is fairly accurate
+// in aggregate. But because we compute this rate lazily, there's a pretty good chance a decent
+// amount of time has passed by the time we get here.
+//
+// Must be called from a normal goroutine context (running regular goroutine with a P).
+//
+// Called by runtime/pprof in addition to runtime code.
+//
+// TODO(mknyszek): This doesn't account for things like CPU frequency scaling. Consider
+// a more sophisticated and general approach in the future.
+func ticksPerSecond() int64 {
+	// Get the conversion rate if we've already computed it.
 	r := ticks.val.Load()
 	if r != 0 {
 		return r
 	}
-	lock(&ticks.lock)
-	r = ticks.val.Load()
-	if r == 0 {
-		t0 := nanotime()
-		c0 := cputicks()
-		usleep(100 * 1000)
-		t1 := nanotime()
-		c1 := cputicks()
-		if t1 == t0 {
-			t1++
+
+	// Compute the conversion rate.
+	for {
+		lock(&ticks.lock)
+		r = ticks.val.Load()
+		if r != 0 {
+			unlock(&ticks.lock)
+			return r
 		}
-		r = (c1 - c0) * 1000 * 1000 * 1000 / (t1 - t0)
-		if r == 0 {
-			r++
+
+		// Grab the current time in both clocks.
+		nowTime := nanotime()
+		nowTicks := cputicks()
+
+		// See if we can use these times.
+		if nowTicks > ticks.startTicks && nowTime-ticks.startTime > minTimeForTicksPerSecond {
+			// Perform the calculation with floats. We don't want to risk overflow.
+			r = int64(float64(nowTicks-ticks.startTicks) * 1e9 / float64(nowTime-ticks.startTime))
+			if r == 0 {
+				// Zero is both a sentinel value and it would be bad if callers used this as
+				// a divisor. We tried out best, so just make it 1.
+				r++
+			}
+			ticks.val.Store(r)
+			unlock(&ticks.lock)
+			break
 		}
-		ticks.val.Store(r)
+		unlock(&ticks.lock)
+
+		// Sleep in one millisecond increments until we have a reliable time.
+		timeSleep(1_000_000)
 	}
-	unlock(&ticks.lock)
 	return r
 }
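
To make the "durations, not timestamps" guidance concrete using the error figures from the new comment above: the relative error of the computed rate multiplies whatever interval it converts, so it stays negligible for short durations but accumulates over a process's lifetime when misused for timestamps. A rough back-of-the-envelope with the documented ~0.004% figure (the scenarios below are illustrative, not from the change itself):

package main

import "fmt"

func main() {
	const relErr = 0.004 / 100 // ~0.004% rate error after a 5 ms measurement window

	// Error on a converted 1 ms duration: proportional to the duration itself.
	fmt.Printf("1ms duration off by ~%.0f ns\n", 1e6*relErr) // ~40 ns

	// Error on a timestamp converted 100 s after the tick reference was taken:
	// proportional to the full elapsed time, so it drifts into the millisecond
	// range and won't line up with other clocks on the system.
	fmt.Printf("100s-old timestamp off by ~%.1f ms\n", 100e9*relErr/1e6) // ~4.0 ms
}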
