This repository was archived by the owner on Jul 11, 2023. It is now read-only.

Commit 0ef51f2

remove head of line blocking from workerpool
Signed-off-by: Sean Teeling <[email protected]>
1 parent 30e5362 commit 0ef51f2


3 files changed: +34 -172 lines changed


pkg/messaging/broker.go (+10 -80)
@@ -2,6 +2,7 @@ package messaging
 
 import (
 	"fmt"
+	"sync"
 	"sync/atomic"
 	"time"
 
@@ -92,100 +93,29 @@ func (b *Broker) runWorkqueueProcessor(stopCh <-chan struct{}) {
 
 // runProxyUpdateDispatcher runs the dispatcher responsible for batching
 // proxy update events received in close proximity.
-// It batches proxy update events with the use of 2 timers:
-// 1. Sliding window timer that resets when a proxy update event is received
-// 2. Max window timer that caps the max duration a sliding window can be reset to
-// When either of the above timers expire, the proxy update event is published
-// on the dedicated pub-sub instance.
+// It batches proxy update events with the use of a mutex, and TryLock.
 func (b *Broker) runProxyUpdateDispatcher(stopCh <-chan struct{}) {
-	// batchTimer and maxTimer are updated by the dispatcher routine
-	// when events are processed and timeouts expire. They are initialized
-	// with a large timeout (a decade) so they don't time out till an event
-	// is received.
-	noTimeout := 87600 * time.Hour // A decade
-	slidingTimer := time.NewTimer(noTimeout)
-	maxTimer := time.NewTimer(noTimeout)
-
-	// dispatchPending indicates whether a proxy update event is pending
-	// from being published on the pub-sub. A proxy update event will
-	// be held for 'proxyUpdateSlidingWindow' duration to be able to
-	// coalesce multiple proxy update events within that duration, before
-	// it is dispatched on the pub-sub. The 'proxyUpdateSlidingWindow' duration
-	// is a sliding window, which means each event received within a window
-	// slides the window further ahead in time, up to a max of 'proxyUpdateMaxWindow'.
-	//
-	// This mechanism is necessary to avoid triggering proxy update pub-sub events in
-	// a hot loop, which would otherwise result in CPU spikes on the controller.
-	// We want to coalesce as many proxy update events within the 'proxyUpdateMaxWindow'
-	// duration.
-	dispatchPending := false
+	var coalesce sync.Mutex
 	batchCount := 0 // number of proxy update events batched per dispatch
-
-	var event proxyUpdateEvent
 	for {
 		select {
-		case e, ok := <-b.proxyUpdateCh:
+		case event, ok := <-b.proxyUpdateCh:
 			if !ok {
 				log.Warn().Msgf("Proxy update event chan closed, exiting dispatcher")
 				return
 			}
-			event = e
-
-			if !dispatchPending {
-				// No proxy update events are pending send on the pub-sub.
-				// Reset the dispatch timers. The events will be dispatched
-				// when either of the timers expire.
-				if !slidingTimer.Stop() {
-					<-slidingTimer.C
-				}
-				slidingTimer.Reset(proxyUpdateSlidingWindow)
-				if !maxTimer.Stop() {
-					<-maxTimer.C
-				}
-				maxTimer.Reset(proxyUpdateMaxWindow)
-				dispatchPending = true
-				batchCount++
-				log.Trace().Msgf("Pending dispatch of msg kind %s", event.msg.Kind)
-			} else {
-				// A proxy update event is pending dispatch. Update the sliding window.
-				if !slidingTimer.Stop() {
-					<-slidingTimer.C
-				}
-				slidingTimer.Reset(proxyUpdateSlidingWindow)
-				batchCount++
-				log.Trace().Msgf("Reset sliding window for msg kind %s", event.msg.Kind)
+			batchCount++
+			if coalesce.TryLock() {
+				log.Trace().Msgf("Coalescing proxy update event %s", event.msg.Kind)
+				continue
 			}
+			log.Trace().Msgf("Batching, msg kind %s, batch size %d", event.msg.Kind, batchCount)
 
-		case <-slidingTimer.C:
-			slidingTimer.Reset(noTimeout) // 'slidingTimer' drained in this case statement
-			// Stop and drain 'maxTimer' before Reset()
-			if !maxTimer.Stop() {
-				// Drain channel. Refer to Reset() doc for more info.
-				<-maxTimer.C
-			}
-			maxTimer.Reset(noTimeout)
-			b.proxyUpdatePubSub.Pub(event.msg, event.topic)
 			atomic.AddUint64(&b.totalDispatchedProxyEventCount, 1)
 			metricsstore.DefaultMetricsStore.ProxyBroadcastEventCount.Inc()
-			log.Trace().Msgf("Sliding window expired, msg kind %s, batch size %d", event.msg.Kind, batchCount)
-			dispatchPending = false
-			batchCount = 0
-
-		case <-maxTimer.C:
-			maxTimer.Reset(noTimeout) // 'maxTimer' drained in this case statement
-			// Stop and drain 'slidingTimer' before Reset()
-			if !slidingTimer.Stop() {
-				// Drain channel. Refer to Reset() doc for more info.
-				<-slidingTimer.C
-			}
-			slidingTimer.Reset(noTimeout)
 			b.proxyUpdatePubSub.Pub(event.msg, event.topic)
-			atomic.AddUint64(&b.totalDispatchedProxyEventCount, 1)
-			metricsstore.DefaultMetricsStore.ProxyBroadcastEventCount.Inc()
-			log.Trace().Msgf("Max window expired, msg kind %s, batch size %d", event.msg.Kind, batchCount)
-			dispatchPending = false
 			batchCount = 0
-
+			coalesce.Unlock()
 		case <-stopCh:
 			log.Info().Msg("Proxy update dispatcher received stop signal, exiting")
 			return
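For context, a minimal, self-contained sketch of the TryLock-based coalescing pattern the new dispatcher relies on (sync.Mutex.TryLock requires Go 1.18+). The channel, the event strings, and the print-based "publish" are placeholders for illustration, not the broker's real pub-sub API: the first event of a batch takes the lock and is held, and the next event publishes the batch and releases the lock.

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	events := make(chan string)
	stop := make(chan struct{})

	go func() {
		var coalesce sync.Mutex
		batch := 0
		for {
			select {
			case ev, ok := <-events:
				if !ok {
					return
				}
				batch++
				if coalesce.TryLock() {
					// First event of a batch: hold it and wait for more.
					fmt.Printf("coalescing %q\n", ev)
					continue
				}
				// Lock already held: a batch is pending, publish it now.
				fmt.Printf("publishing batch of %d (last event %q)\n", batch, ev)
				batch = 0
				coalesce.Unlock()
			case <-stop:
				return
			}
		}
	}()

	for i := 0; i < 4; i++ {
		events <- fmt.Sprintf("proxy-update-%d", i)
	}
	close(stop)
	time.Sleep(10 * time.Millisecond) // give the dispatcher goroutine a moment to exit (sketch only)
}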

pkg/workerpool/workerpool.go (+24 -53)
@@ -9,30 +9,17 @@ import (
 	"github.com/openservicemesh/osm/pkg/logger"
 )
 
-const (
-	// Size of the job queue per worker
-	maxJobPerWorker = 4096
-)
-
 var (
 	log = logger.New("workerpool")
 )
 
-// worker context for a worker routine
-type worker struct {
-	id int
-	jobs chan Job // Job queue
-	stop chan struct{} // Stop channel
-	wg *sync.WaitGroup // Pointer to WorkerPool wg
-	jobsProcessed uint64 // Jobs processed by this worker
-}
-
 // WorkerPool object representation
 type WorkerPool struct {
 	wg sync.WaitGroup // Sync group, to stop workers if needed
-	workerContext []*worker // Worker contexts
 	nWorkers uint64 // Number of workers. Uint64 for easier mod hash later
-	rRobinCounter uint64 // Used only by the round robin api. Modified atomically on API.
+	jobsProcessed uint64 // Jobs processed by this worker
+	jobs chan Job
+	stop chan struct{} // Stop channel
 }
 
 // Job is a runnable interface to queue jobs on a WorkerPool
@@ -62,72 +49,56 @@ func NewWorkerPool(nWorkers int) *WorkerPool {
 
 	log.Info().Msgf("New worker pool setting up %d workers", nWorkers)
 
-	var workPool WorkerPool
+	workPool := &WorkerPool{
+		nWorkers: uint64(nWorkers),
+		jobs:     make(chan Job, nWorkers),
+		stop:     make(chan struct{}),
+	}
 	for i := 0; i < nWorkers; i++ {
-		workPool.workerContext = append(workPool.workerContext,
-			&worker{
-				id:            i,
-				jobs:          make(chan Job, maxJobPerWorker),
-				stop:          make(chan struct{}, 1),
-				wg:            &workPool.wg,
-				jobsProcessed: 0,
-			},
-		)
-		workPool.wg.Add(1)
-		workPool.nWorkers++
-
-		go (workPool.workerContext[i]).work()
+		i := i
+		go workPool.work(i)
 	}
 
-	return &workPool
+	return workPool
 }
 
 // AddJob posts the job on a worker queue
 // Uses Hash underneath to choose worker to post the job to
 func (wp *WorkerPool) AddJob(job Job) <-chan struct{} {
-	wp.workerContext[job.Hash()%wp.nWorkers].jobs <- job
+	wp.jobs <- job
 	return job.GetDoneCh()
 }
 
-// AddJobRoundRobin adds a job in round robin to the queues
-// Concurrent calls to AddJobRoundRobin are thread safe and fair
-// between each other
-func (wp *WorkerPool) AddJobRoundRobin(jobs Job) {
-	added := atomic.AddUint64(&wp.rRobinCounter, 1)
-	wp.workerContext[added%wp.nWorkers].jobs <- jobs
-}
-
 // GetWorkerNumber get number of queues/workers
 func (wp *WorkerPool) GetWorkerNumber() int {
 	return int(wp.nWorkers)
 }
 
 // Stop stops the workerpool
 func (wp *WorkerPool) Stop() {
-	for _, worker := range wp.workerContext {
-		worker.stop <- struct{}{}
-	}
+	close(wp.stop)
 	wp.wg.Wait()
 }
 
-func (workContext *worker) work() {
-	defer workContext.wg.Done()
+func (wp *WorkerPool) work(id int) {
+	wp.wg.Add(1)
+	defer wp.wg.Done()
+
+	log.Info().Msgf("Worker %d running", id)
 
-	log.Info().Msgf("Worker %d running", workContext.id)
 	for {
 		select {
-		case j := <-workContext.jobs:
+		case j := <-wp.jobs:
 			t := time.Now()
-			log.Debug().Msgf("work[%d]: Starting %v", workContext.id, j.JobName())
+			log.Debug().Msgf("work[%d]: Starting %v", id, j.JobName())
 
 			// Run current job
 			j.Run()
 
-			log.Debug().Msgf("work[%d][%s] : took %v", workContext.id, j.JobName(), time.Since(t))
-			workContext.jobsProcessed++
-
-		case <-workContext.stop:
-			log.Debug().Msgf("work[%d]: Stopped", workContext.id)
+			log.Debug().Msgf("work[%d][%s] : took %v", id, j.JobName(), time.Since(t))
+			atomic.AddUint64(&wp.jobsProcessed, 1)
+		case <-wp.stop:
+			log.Debug().Msgf("work[%d]: Stopped", id)
 			return
 		}
 	}
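The head-of-line blocking removal comes from this structural change: instead of one bounded queue per worker (chosen by job hash or round robin), every worker now pulls from a single shared jobs channel, so a slow job occupies only one worker rather than stalling everything queued behind it on the same per-worker queue. A minimal sketch of that shape, assuming a plain func job type rather than the repository's Job interface:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

type Pool struct {
	wg            sync.WaitGroup
	jobs          chan func()
	stop          chan struct{}
	jobsProcessed uint64
}

func NewPool(nWorkers int) *Pool {
	p := &Pool{
		jobs: make(chan func(), nWorkers),
		stop: make(chan struct{}),
	}
	for i := 0; i < nWorkers; i++ {
		p.wg.Add(1) // register before spawning so Stop cannot race the workers
		go p.work(i)
	}
	return p
}

func (p *Pool) AddJob(job func()) { p.jobs <- job }

func (p *Pool) Stop() {
	close(p.stop)
	p.wg.Wait()
}

func (p *Pool) work(id int) {
	defer p.wg.Done()
	for {
		select {
		case job := <-p.jobs:
			job()
			atomic.AddUint64(&p.jobsProcessed, 1)
		case <-p.stop:
			return
		}
	}
}

func main() {
	p := NewPool(2)
	p.AddJob(func() { time.Sleep(50 * time.Millisecond) }) // slow job holds one worker only
	done := make(chan struct{})
	p.AddJob(func() { close(done) }) // fast job still runs on the other worker
	<-done
	p.Stop()
	fmt.Println("processed:", atomic.LoadUint64(&p.jobsProcessed))
}

One design note on the sketch: it calls wg.Add before spawning each worker so a Stop issued immediately after construction cannot return before the workers have registered, whereas the committed code increments the WaitGroup inside work() itself.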

pkg/workerpool/workerpool_test.go (-39)
@@ -44,8 +44,6 @@ func (tj *testJob) Hash() uint64 {
 
 // Uses AddJob, which relies on job hash for queue assignment
 func TestAddJob(t *testing.T) {
-	assert := tassert.New(t)
-
 	njobs := 10 // also worker routines
 	wp := NewWorkerPool(njobs)
 	joblist := make([]testJob, njobs)
@@ -66,41 +64,4 @@ func TestAddJob(t *testing.T) {
 	}
 
 	wp.Stop()
-
-	// Verify all the workers processed 1 job (as expected by the static hash)
-	for i := 0; i < njobs; i++ {
-		assert.Equal(uint64(1), wp.workerContext[i].jobsProcessed)
-	}
-}
-
-// Uses AddJobRoundRobin, which relies on round robin for queue assignment
-func TestAddJobRoundRobin(t *testing.T) {
-	assert := tassert.New(t)
-
-	njobs := 10 // also worker routines
-	wp := NewWorkerPool(njobs)
-	joblist := make([]testJob, njobs)
-
-	// Create and add jobs
-	for i := 0; i < njobs; i++ {
-		joblist[i] = testJob{
-			jobDone: make(chan struct{}, 1),
-			hash:    uint64(i),
-		}
-
-		wp.AddJobRoundRobin(&joblist[i])
-	}
-
-	// Verify all jobs ran through the workers
-	for i := 0; i < njobs; i++ {
-		<-joblist[i].jobDone
-	}
-
-	wp.Stop()
-
-	// Verify all the workers processed 1 job (round-robbined)
-	assert.Equal(uint64(njobs), wp.rRobinCounter)
-	for i := 0; i < njobs; i++ {
-		assert.Equal(uint64(1), wp.workerContext[i].jobsProcessed)
-	}
 }
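With the per-worker counters and the round-robin API gone, the deleted assertions have no direct equivalent. A hypothetical replacement (not part of this commit) could assert on the pool-wide counter instead, reusing the existing testJob helper and tassert alias, and assuming the test stays in package workerpool with sync/atomic imported so the unexported field is reachable:

// Hypothetical follow-up test, not in this commit.
func TestAllJobsCounted(t *testing.T) {
	assert := tassert.New(t)

	njobs := 10 // also worker routines
	wp := NewWorkerPool(njobs)
	joblist := make([]testJob, njobs)

	// Create and add jobs, then wait for each to signal completion.
	for i := 0; i < njobs; i++ {
		joblist[i] = testJob{
			jobDone: make(chan struct{}, 1),
			hash:    uint64(i),
		}
		wp.AddJob(&joblist[i])
	}
	for i := 0; i < njobs; i++ {
		<-joblist[i].jobDone
	}

	wp.Stop()

	// Stop waits for the workers to exit, so every processed job is reflected here.
	assert.Equal(uint64(njobs), atomic.LoadUint64(&wp.jobsProcessed))
}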
