Skip to content
This repository was archived by the owner on Nov 9, 2017. It is now read-only.
This repository was archived by the owner on Nov 9, 2017. It is now read-only.

NewHistogram vs Snapshot deadlock #15

Closed
@ksedgwic

Description

@ksedgwic

We observed deadlock in a program which uses codahale/metrics. Here are the relevant backtraces:

goroutine 1549 [semacquire, 828 minutes]:
sync.runtime_Semacquire(0xc878bc)
        /usr/local/go/src/runtime/sema.go:47 +0x30
sync.(*Mutex).Lock(0xc878b8)
        /usr/local/go/src/sync/mutex.go:85 +0xd0
github.com/codahale/metrics.Gauge.SetBatchFunc(0xc421b78940, 0x17, 0x91b5c0, 0xc4263433f0, 0xc4263433e0, 0xc421b78900)
        /home/meng/prod/ghostback/src/github.com/codahale/metrics/metrics.go:122 +0x31
github.com/codahale/metrics.NewHistogram(0xc420c18080, 0x13, 0x0, 0xf4240, 0x3, 0x0)
        /home/meng/prod/ghostback/src/github.com/codahale/metrics/metrics.go:212 +0x6d2
common.(*TelemetryInstance).RecordTime(0xc4203f0dc0, 0xc420c18080, 0x13, 0x11f03447)
        /home/meng/prod/ghostback/src/common/telemetry.go:158 +0xfa
common.TelemetryLag(0xc420c18080, 0x13, 0x11f03447)
        /home/meng/prod/ghostback/src/common/telemetry.go:182 +0x10a
common.TelemetryProcessResponse(0xc4219c2410, 0xc422b41770, 0x1478abbbaeef0ea3)
        /home/meng/prod/ghostback/src/common/telemetry.go:76 +0x81
httpworker.(*Instance).incomingHandler(0xc420077680, 0xc4239587e0, 0x0, 0x0)
        /home/meng/prod/ghostback/src/httpworker/worker.go:172 +0x22b
httpworker.(*Instance).(httpworker.incomingHandler)-fm(0xc4239587e0, 0xc42514f201, 0x101)
        /home/meng/prod/ghostback/src/httpworker/worker.go:91 +0x34
github.com/nsqio/go-nsq.HandlerFunc.HandleMessage(0xc420888040, 0xc4239587e0, 0x94c200, 0xc420888040)
        /home/meng/prod/ghostback/src/github.com/nsqio/go-nsq/consumer.go:42 +0x30
github.com/nsqio/go-nsq.(*Consumer).handlerLoop(0xc420904000, 0xc40160, 0xc420888040)
        /home/meng/prod/ghostback/src/github.com/nsqio/go-nsq/consumer.go:1105 +0x14a
created by github.com/nsqio/go-nsq.(*Consumer).AddConcurrentHandlers
        /home/meng/prod/ghostback/src/github.com/nsqio/go-nsq/consumer.go:1087 +0xaa

goroutine 1606087 [semacquire, 828 minutes]:
sync.runtime_Semacquire(0xc878c4)
        /usr/local/go/src/runtime/sema.go:47 +0x30
sync.(*Mutex).Lock(0xc878c0)
        /usr/local/go/src/sync/mutex.go:85 +0xd0
github.com/codahale/metrics.Snapshot(0x0, 0x0)
        /home/meng/prod/ghostback/src/github.com/codahale/metrics/metrics.go:166 +0xd2
github.com/codahale/metrics.init.1.func1(0xc423aa1800, 0x455d50)
        /home/meng/prod/ghostback/src/github.com/codahale/metrics/metrics.go:313 +0x26
expvar.Func.String(0xa36258, 0xa36210, 0xc426c8aa80)
        /usr/local/go/src/expvar/expvar.go:241 +0x27
fmt.(*pp).handleMethods(0xc426c8aa80, 0x73, 0x1)
        /usr/local/go/src/fmt/print.go:590 +0x2f6
fmt.(*pp).printArg(0xc426c8aa80, 0x946560, 0xa36258, 0x73)
        /usr/local/go/src/fmt/print.go:665 +0x17b
fmt.(*pp).doPrintf(0xc426c8aa80, 0x9f4e7a, 0x6, 0xc42c6b5b60, 0x2, 0x2)
        /usr/local/go/src/fmt/print.go:985 +0x123d
fmt.Fprintf(0xc3f560, 0xc4215b5e10, 0x9f4e7a, 0x6, 0xc42c6b5b60, 0x2, 0x2, 0x2, 0x0, 0x0)
        /usr/local/go/src/fmt/print.go:181 +0x76
expvar.expvarHandler.func1(0x9f6a34, 0x7, 0xc40060, 0xa36258)
        /usr/local/go/src/expvar/expvar.go:320 +0x15b
expvar.Do(0xc42c6b5c48)
        /usr/local/go/src/expvar/expvar.go:307 +0xef
expvar.expvarHandler(0xc43980, 0xc4215b5e10, 0xc420a2be00)
        /usr/local/go/src/expvar/expvar.go:321 +0x12b
net/http.HandlerFunc.ServeHTTP(0xa361f8, 0xc43980, 0xc4215b5e10, 0xc420a2be00)
        /usr/local/go/src/net/http/server.go:1726 +0x44
net/http.(*ServeMux).ServeHTTP(0xc683e0, 0xc43980, 0xc4215b5e10, 0xc420a2be00)
        /usr/local/go/src/net/http/server.go:2022 +0x7f
net/http.serverHandler.ServeHTTP(0xc42085e000, 0xc43980, 0xc4215b5e10, 0xc420a2be00)
        /usr/local/go/src/net/http/server.go:2202 +0x7d
net/http.(*conn).serve(0xc421063380, 0xc44580, 0xc423ebb600)
        /usr/local/go/src/net/http/server.go:1579 +0x4b7
created by net/http.(*Server).Serve
        /usr/local/go/src/net/http/server.go:2293 +0x44d

One thread is creating a NewHistogram, acquires hm and tries for gm:

func NewHistogram(name string, minValue, maxValue int64, sigfigs int) *Histogram {
    hm.Lock()
    defer hm.Unlock()

    if _, ok := histograms[name]; ok {
        panic(name + " already exists")
    }

    hist := &Histogram{
        name: name,
        hist: hdrhistogram.NewWindowed(5, minValue, maxValue, sigfigs),
    }
    histograms[name] = hist

    Gauge(name+".P50").SetBatchFunc(hname(name), hist.merge, hist.valueAt(50))
    Gauge(name+".P75").SetBatchFunc(hname(name), hist.merge, hist.valueAt(75))
    Gauge(name+".P90").SetBatchFunc(hname(name), hist.merge, hist.valueAt(90))
    Gauge(name+".P95").SetBatchFunc(hname(name), hist.merge, hist.valueAt(95))
    Gauge(name+".P99").SetBatchFunc(hname(name), hist.merge, hist.valueAt(99))
    Gauge(name+".P999").SetBatchFunc(hname(name), hist.merge, hist.valueAt(99.9))

    return hist
}

func (g Gauge) SetBatchFunc(key interface{}, init func(), f func() int64) {
    gm.Lock()
    defer gm.Unlock()

    gauges[string(g)] = f
    if _, ok := inits[key]; !ok {
        inits[key] = init
    }
}

The other thread is sampling the data, acquires gm and tries for hm:

func Snapshot() (c map[string]uint64, g map[string]int64) {
    cm.Lock()
    defer cm.Unlock()

    gm.Lock()
    defer gm.Unlock()

    hm.Lock()
    defer hm.Unlock()

    ....

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions