Skip to content

Commit 0620121

Browse files
committed
fix: prevent race condition when failure is before the end of the mq
Signed-off-by: Matthias Riegler <[email protected]>
1 parent 418bfdd commit 0620121

File tree

3 files changed

+32
-55
lines changed

3 files changed

+32
-55
lines changed

Readme.md

+22-48
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ end
8989
9090
```
9191

92-
<details><summary>Sequence diagram of a failure with a new build coming in right away</summary>
92+
**Sequence diagram of a failure with a new build coming in right away**
9393

9494
> :warning: I see a potential conflict here. It could be that GHA1 or GHA2 causes the failure of GHA3, we might not want to accept new LeaseRequests but handle priority across remaining ones
9595
@@ -126,69 +126,43 @@ sequenceDiagram
126126
GHA2->>+LeaseProvider: Aquire: priority: 2
127127
LeaseProvider-->>-GHA2: priority: 2, status: PENDING
128128
129-
rect rgb(200, 255,200)
129+
rect rgb(255, 200, 200)
130130
note over GHA_NEXT: New GHA run started by GH merge queue after GHA3 failed
131+
loop until lease successful and all request marked COMPLETED
131132
GHA_NEXT->>+LeaseProvider: Aquire: priority:3
132-
note right of LeaseProvider: Full state awareness
133-
LeaseProvider-->>GHA_NEXT: priority: 3, status: AQUIRED
134-
note left of GHA_NEXT: holds lease to access shared resource
135-
136-
GHA_NEXT->>LeaseProvider: Release: priority: 3, status: SUCCESS
137-
note right of LeaseProvider: the lease is marked as completed -> status is available on the next requests
138-
LeaseProvider-->>-GHA_NEXT: priority: 3, status: COMPLETED
133+
note right of LeaseProvider: previous lease failed
134+
LeaseProvider-->>-GHA_NEXT: error, previous lease failed (409 CONFLICT)
135+
end
139136
end
140137
141-
GHA1->>+LeaseProvider: Aquire: priority: 1
142-
LeaseProvider-->>-GHA1: priority: 1, status: COMPLETED
143-
GHA2->>+LeaseProvider: Aquire: priority: 2
144-
LeaseProvider-->>-GHA2: priority: 2, status: COMPLETED
145-
```
146-
</detail>
147-
148-
<details><summary>Sequence Diagram of a failure and passing the lease to the next LeaseReqeust without a new contendor</summary>
149-
150-
```mermaid
151-
sequenceDiagram
152-
participant LeaseProvider
153-
participant GHA1
154-
participant GHA2
155-
participant GHA3
156138
157-
139+
par
140+
rect rgb(200, 255,200)
141+
GHA2->>+LeaseProvider: Aquire: priority: 2
142+
note right of LeaseProvider: GHA2 has the highest priority of remaining badges
143+
LeaseProvider-->>-GHA2: priority: 2, status: AQUIRED
144+
end
145+
loop until lease successful
158146
GHA1->>+LeaseProvider: Aquire: priority: 1
159147
LeaseProvider-->>-GHA1: priority: 1, status: PENDING
160-
GHA2->>+LeaseProvider: Aquire: priority: 2
161-
LeaseProvider-->>-GHA2: priority: 2, status: PENDING
162-
163-
rect rgb(255, 200, 200)
164-
GHA3->>+LeaseProvider: Aquire: priority:3
165-
note right of LeaseProvider: Full state awareness
166-
LeaseProvider-->>GHA3: priority: 3, status: AQUIRED
167-
note left of GHA3: holds lease to access shared resource
168-
169-
GHA3->>LeaseProvider: Release: priority: 3, status: FAILURE
170-
note right of LeaseProvider: the lease is removed since it failed
171-
LeaseProvider-->>-GHA3: priority: 3, status: FAILURE
172148
end
173149
174-
note right of GHA1: Assuming sufficient time has passed for stabilize window
175-
176150
rect rgb(200, 255,200)
177-
178-
GHA2->>+LeaseProvider: Aquire: priority: 2
179-
note right of LeaseProvider: Full state awareness
180-
LeaseProvider-->>GHA2: priority: 2, status: AQUIRED
181-
note left of GHA2: holds lease to access shared resource
182-
183-
GHA2->>LeaseProvider: Release: priority: 2, status: SUCCESS
184-
note right of LeaseProvider: the lease is marked as completed -> status is available on the next requests
151+
GHA2->>+LeaseProvider: Release: priority: 2, status: SUCCESS
152+
note right of LeaseProvider: the lease is marked as completed
185153
LeaseProvider-->>-GHA2: priority: 2, status: COMPLETED
186154
end
155+
end
187156
188157
GHA1->>+LeaseProvider: Aquire: priority: 1
189158
LeaseProvider-->>-GHA1: priority: 1, status: COMPLETED
159+
160+
GHA_NEXT->>+LeaseProvider: Aquire: priority: <>
161+
note left of GHA_NEXT: Priority is recalculated as previous branches were merged
162+
LeaseProvider-->>-GHA_NEXT: priority: <>, status: PENDING
190163
```
191-
</detail>
164+
165+
192166

193167
### GithubAction
194168
> :warning: WIP

internal/lease/leaseprovider.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -148,15 +148,16 @@ func (lp *leaseProviderImpl) insert(leaseRequest *LeaseRequest) (*LeaseRequest,
148148
func (lp *leaseProviderImpl) isWinningLeaseRequest(req *LeaseRequest) *LeaseRequest {
149149
// Prereq: we can expect the arg to be already part of the map!
150150

151-
if lp.aquired != nil {
151+
if lp.aquired != nil && !(pointer.StringDeref(lp.aquired.Status, STATUS_AQUIRED) == STATUS_FAILURE) {
152152
// Lock already aquired
153153
return req
154154
}
155155
// 1st: we reached the time limit -> lastUpdatedAt + StabilizeDuration > now
156156
passedStabilizeDuration := time.Since(lp.lastUpdatedAt) >= lp.opts.StabilizeDuration
157157
// 2nd: we received all requests and can take a decision
158+
// 3rd: there has been no previous failure
158159
reachedExpectedRequestCount := len(lp.known) >= lp.opts.ExpectedRequestCount
159-
if !passedStabilizeDuration && !reachedExpectedRequestCount {
160+
if lp.aquired == nil && (!passedStabilizeDuration && !reachedExpectedRequestCount) {
160161
return req
161162
}
162163

@@ -229,7 +230,6 @@ func (lp *leaseProviderImpl) Release(leaseRequest *LeaseRequest) (*LeaseRequest,
229230
if status == STATUS_FAILURE {
230231
// On failure, drop it. This way the next one can aquire the lease
231232
delete(lp.known, req.HeadSHA)
232-
lp.aquired = nil
233233
return req, nil
234234
}
235235

internal/lease/leaseprovider_test.go

+7-4
Original file line numberDiff line numberDiff line change
@@ -493,11 +493,14 @@ func Test_leaseProviderImpl__FullLoop_ReleaseFailedNewRequest(t *testing.T) {
493493
assert.NoError(t, err)
494494
assert.Equal(t, STATUS_PENDING, *req1.Status)
495495

496-
// A new request is coming in. Since there's no decision on who aquired the lease yet, it can still enter.
497-
// With it, we have full knowledge again and it gets the lease.
498-
reqNext, err = lp.Aquire(reqNext)
496+
// A new request is coming in. Since there has been a previous failure, it should be rejected
497+
_, err = lp.Aquire(reqNext)
498+
assert.Error(t, err)
499+
500+
// Request 2 is the highest one in the batch now
501+
req2, err = lp.Aquire(req2)
499502
assert.NoError(t, err)
500-
assert.Equal(t, STATUS_AQUIRED, *reqNext.Status)
503+
assert.Equal(t, STATUS_AQUIRED, *req2.Status)
501504
}
502505

503506
func Test_leaseProviderImpl__FullLoop_ReleaseWithNoAquiredLease(t *testing.T) {

0 commit comments

Comments
 (0)