Skip to content

Commit 12d02ca

Browse files
wilfred-scraigcondit
authored andcommitted
[YUNIKORN-2953] Placeholder release count incorrect (#991)
While releasing placeholders the allocated placeholders are counted twice in the tracking information. With YUNIKORN-2926 in place this happens if some but not all placeholders are allocated only. Mark unallocated placeholders that timeout as released to prevent scheduling issues. Closes: #991 Signed-off-by: Craig Condit <[email protected]>
1 parent 5003167 commit 12d02ca

File tree

2 files changed

+60
-34
lines changed

2 files changed

+60
-34
lines changed

pkg/scheduler/objects/application.go

+29-19
Original file line numberDiff line numberDiff line change
@@ -416,9 +416,8 @@ func (sa *Application) clearPlaceholderTimer() {
416416
func (sa *Application) timeoutPlaceholderProcessing() {
417417
sa.Lock()
418418
defer sa.Unlock()
419-
switch {
420-
// Case 1: if all app's placeholders are allocated, only part of them gets replaced, just delete the remaining placeholders
421-
case (sa.IsRunning() || sa.IsCompleting()) && !resources.IsZero(sa.allocatedPlaceholder):
419+
if (sa.IsRunning() || sa.IsCompleting()) && !resources.IsZero(sa.allocatedPlaceholder) {
420+
// Case 1: if all app's placeholders are allocated, only part of them gets replaced, just delete the remaining placeholders
422421
var toRelease []*Allocation
423422
replacing := 0
424423
for _, alloc := range sa.getPlaceholderAllocations() {
@@ -430,19 +429,14 @@ func (sa *Application) timeoutPlaceholderProcessing() {
430429
alloc.SetReleased(true)
431430
toRelease = append(toRelease, alloc)
432431
}
433-
log.Log(log.SchedApplication).Info("Placeholder timeout, releasing placeholders",
432+
log.Log(log.SchedApplication).Info("Placeholder timeout, releasing allocated placeholders",
434433
zap.String("AppID", sa.ApplicationID),
435434
zap.Int("placeholders being replaced", replacing),
436435
zap.Int("releasing placeholders", len(toRelease)))
437436
// trigger the release of the placeholders: accounting updates when the release is done
438437
sa.notifyRMAllocationReleased(toRelease, si.TerminationType_TIMEOUT, "releasing allocated placeholders on placeholder timeout")
439-
// Case 2: in every other case progress the application, and notify the context about the expired placeholder asks
440-
default:
441-
log.Log(log.SchedApplication).Info("Placeholder timeout, releasing asks and placeholders",
442-
zap.String("AppID", sa.ApplicationID),
443-
zap.Int("releasing placeholders", len(sa.allocations)),
444-
zap.Int("releasing asks", len(sa.requests)),
445-
zap.String("gang scheduling style", sa.gangSchedulingStyle))
438+
} else {
439+
// Case 2: in every other case progress the application, and notify the context about the expired placeholders
446440
// change the status of the app based on gang style: soft resume normal allocations, hard fail the app
447441
event := ResumeApplication
448442
if sa.gangSchedulingStyle == Hard {
@@ -454,14 +448,30 @@ func (sa *Application) timeoutPlaceholderProcessing() {
454448
zap.String("currentState", sa.CurrentState()),
455449
zap.Error(err))
456450
}
457-
sa.notifyRMAllocationReleased(sa.getAllRequestsInternal(), si.TerminationType_TIMEOUT, "releasing placeholders asks on placeholder timeout")
458-
sa.removeAsksInternal("", si.EventRecord_REQUEST_TIMEOUT)
459-
// all allocations are placeholders but GetAllAllocations is locked and cannot be used
460-
sa.notifyRMAllocationReleased(sa.getPlaceholderAllocations(), si.TerminationType_TIMEOUT, "releasing allocated placeholders on placeholder timeout")
461-
// we are in an accepted or new state so nothing can be replaced yet: mark everything as timedout
462-
for _, phData := range sa.placeholderData {
463-
phData.TimedOut = phData.Count
451+
// all allocations are placeholders release them all
452+
var toRelease, pendingRelease []*Allocation
453+
for _, alloc := range sa.allocations {
454+
alloc.SetReleased(true)
455+
toRelease = append(toRelease, alloc)
456+
}
457+
// get all open requests and remove them all filter out already allocated as they are already released
458+
for _, alloc := range sa.requests {
459+
if !alloc.IsAllocated() {
460+
alloc.SetReleased(true)
461+
pendingRelease = append(pendingRelease, alloc)
462+
sa.placeholderData[alloc.taskGroupName].TimedOut++
463+
}
464464
}
465+
log.Log(log.SchedApplication).Info("Placeholder timeout, releasing allocated and pending placeholders",
466+
zap.String("AppID", sa.ApplicationID),
467+
zap.Int("releasing placeholders", len(toRelease)),
468+
zap.Int("pending placeholders", len(pendingRelease)),
469+
zap.String("gang scheduling style", sa.gangSchedulingStyle))
470+
sa.removeAsksInternal("", si.EventRecord_REQUEST_TIMEOUT)
471+
// trigger the release of the allocated placeholders: accounting updates when the release is done
472+
sa.notifyRMAllocationReleased(toRelease, si.TerminationType_TIMEOUT, "releasing allocated placeholders on placeholder timeout")
473+
// trigger the release of the pending placeholders: accounting has been done
474+
sa.notifyRMAllocationReleased(pendingRelease, si.TerminationType_TIMEOUT, "releasing pending placeholders on placeholder timeout")
465475
}
466476
sa.clearPlaceholderTimer()
467477
}
@@ -1890,7 +1900,7 @@ func (sa *Application) removeAllocationInternal(allocationKey string, releaseTyp
18901900
if sa.hasZeroAllocations() {
18911901
removeApp = true
18921902
event = CompleteApplication
1893-
eventWarning = "Application state not changed to Waiting while removing an allocation"
1903+
eventWarning = "Application state not changed to Completing while removing an allocation"
18941904
}
18951905
sa.decUserResourceUsage(alloc.GetAllocatedResource(), removeApp)
18961906
}

pkg/scheduler/objects/application_test.go

+31-15
Original file line numberDiff line numberDiff line change
@@ -1491,14 +1491,14 @@ func TestReplaceAllocationTracking(t *testing.T) {
14911491
}
14921492

14931493
func TestTimeoutPlaceholderSoft(t *testing.T) {
1494-
runTimeoutPlaceholderTest(t, Resuming.String(), Soft, []int{1, 2})
1494+
runTimeoutPlaceholderTest(t, Resuming.String(), Soft)
14951495
}
14961496

14971497
func TestTimeoutPlaceholderHard(t *testing.T) {
1498-
runTimeoutPlaceholderTest(t, Failing.String(), Hard, []int{1, 2})
1498+
runTimeoutPlaceholderTest(t, Failing.String(), Hard)
14991499
}
15001500

1501-
func runTimeoutPlaceholderTest(t *testing.T, expectedState string, gangSchedulingStyle string, expectedReleases []int) {
1501+
func runTimeoutPlaceholderTest(t *testing.T, expectedState string, gangSchedulingStyle string) {
15021502
const (
15031503
tg1 = "tg-1"
15041504
tg2 = "tg-2"
@@ -1530,16 +1530,16 @@ func runTimeoutPlaceholderTest(t *testing.T, expectedState string, gangSchedulin
15301530
assert.Assert(t, app.IsAccepted(), "Application should be in accepted state")
15311531

15321532
// add the placeholder to the app
1533-
ph := newPlaceholderAlloc(appID1, nodeID1, res, tg2)
1534-
app.AddAllocation(ph)
1535-
app.addPlaceholderDataWithLocking(ph)
1533+
ph1 := newPlaceholderAlloc(appID1, nodeID1, res, tg2)
1534+
app.AddAllocation(ph1)
1535+
app.addPlaceholderDataWithLocking(ph1)
15361536
assertPlaceholderData(t, app, tg2, 1, 0, 0, res)
15371537
assertUserGroupResource(t, getTestUserGroup(), res)
15381538
assert.Assert(t, app.getPlaceholderTimer() != nil, "Placeholder timer should be initiated after the first placeholder allocation")
15391539
// add a second one to check the filter
1540-
ph = newPlaceholderAlloc(appID1, nodeID1, res, tg2)
1541-
app.AddAllocation(ph)
1542-
app.addPlaceholderDataWithLocking(ph)
1540+
ph2 := newPlaceholderAlloc(appID1, nodeID1, res, tg2)
1541+
app.AddAllocation(ph2)
1542+
app.addPlaceholderDataWithLocking(ph2)
15431543
assertPlaceholderData(t, app, tg2, 2, 0, 0, res)
15441544
assertUserGroupResource(t, getTestUserGroup(), resources.Multiply(res, 2))
15451545
assert.Assert(t, app.IsAccepted(), "Application should be in accepted state")
@@ -1549,11 +1549,22 @@ func runTimeoutPlaceholderTest(t *testing.T, expectedState string, gangSchedulin
15491549
return app.placeholderTimer == nil
15501550
})
15511551
assert.NilError(t, err, "Placeholder timeout cleanup did not trigger unexpectedly")
1552-
// When the app was in the accepted state, timeout should equal to count
1552+
// pending updates immediately
15531553
assertPlaceholderData(t, app, tg1, 1, 1, 0, res)
1554-
assertPlaceholderData(t, app, tg2, 2, 2, 0, res)
1554+
// No changes until the removals are confirmed
1555+
assertPlaceholderData(t, app, tg2, 2, 0, 0, res)
1556+
15551557
assert.Equal(t, app.stateMachine.Current(), expectedState, "Application did not progress into expected state")
1558+
log := app.GetStateLog()
1559+
assert.Equal(t, len(log), 2, "wrong number of app events")
1560+
assert.Equal(t, log[0].ApplicationState, Accepted.String())
1561+
assert.Equal(t, log[1].ApplicationState, expectedState)
1562+
15561563
assertUserGroupResource(t, getTestUserGroup(), resources.Multiply(res, 2))
1564+
// ordering of events is based on the order in which we call the release in the application
1565+
// first are the allocated placeholders then the pending placeholders, always same values
1566+
// See the timeoutPlaceholderProcessing() function
1567+
expectedReleases := []int{2, 1}
15571568
events := testHandler.GetEvents()
15581569
var found int
15591570
idx := 0
@@ -1572,10 +1583,15 @@ func runTimeoutPlaceholderTest(t *testing.T, expectedState string, gangSchedulin
15721583
assert.Assert(t, resources.Equals(app.GetPlaceholderResource(), resources.Multiply(res, 2)), "Unexpected placeholder resources for the app")
15731584
assertUserGroupResource(t, getTestUserGroup(), resources.Multiply(res, 2))
15741585

1575-
log := app.GetStateLog()
1576-
assert.Equal(t, len(log), 2, "wrong number of app events")
1577-
assert.Equal(t, log[0].ApplicationState, Accepted.String())
1578-
assert.Equal(t, log[1].ApplicationState, expectedState)
1586+
removed := app.RemoveAllocation(ph1.allocationKey, si.TerminationType_TIMEOUT)
1587+
assert.Assert(t, removed != nil, "expected allocation got nil")
1588+
assert.Equal(t, ph1.allocationKey, removed.allocationKey, "expected placeholder to be returned")
1589+
removed = app.RemoveAllocation(ph2.allocationKey, si.TerminationType_TIMEOUT)
1590+
assert.Assert(t, removed != nil, "expected allocation got nil")
1591+
assert.Equal(t, ph2.allocationKey, removed.allocationKey, "expected placeholder to be returned")
1592+
1593+
// Removals are confirmed: timeout should equal to count
1594+
assertPlaceholderData(t, app, tg2, 2, 2, 0, res)
15791595
}
15801596

15811597
func TestTimeoutPlaceholderAllocReleased(t *testing.T) {

0 commit comments

Comments
 (0)