
Quota check preceding resource check #614


Merged
merged 28 commits on Aug 30, 2023
28 commits
69163df
Quota release when AppWrapper is completed
metalcycling May 15, 2023
79df8d1
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling May 27, 2023
5ef78b6
Quota release testing. Fix in the Fits function to allocate consumer.
metalcycling May 29, 2023
839f23a
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling May 29, 2023
8277a66
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Jun 19, 2023
02364f7
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Jul 14, 2023
0a2b83f
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Jul 18, 2023
cd058bb
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Aug 3, 2023
c4b0eea
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Aug 8, 2023
b2dd8e1
Aggregate all the resource names across all children in the tree to…
metalcycling Aug 8, 2023
6e822e8
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Aug 15, 2023
3c3487d
Reverted changes from previous commits to current version of main
metalcycling Aug 15, 2023
d397d1f
Added GPU as one of the default resources
metalcycling Aug 15, 2023
c95bbae
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Aug 17, 2023
fbad3a4
Formatting fixes
metalcycling Aug 18, 2023
975e731
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Aug 18, 2023
935ca78
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Aug 21, 2023
d155445
Fixed bug in quota management library that didn't preempt jobs when b…
metalcycling Aug 23, 2023
fac47ac
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Aug 23, 2023
185ebf8
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Aug 25, 2023
5dfeb97
Quota check preceding resource check update. This version allows to …
metalcycling Aug 28, 2023
2dcca14
Interface fix
metalcycling Aug 28, 2023
014a5e7
Added some comments and TODOs
metalcycling Aug 29, 2023
a6d35ea
Merge remote-tracking branch 'upstream/main' into quota-management
metalcycling Aug 29, 2023
615757b
Kuttl quota borrowing test
metalcycling Aug 29, 2023
8a4aa10
Separated borrowing tests so that quotas are reset
metalcycling Aug 29, 2023
ed5c924
Added missing files for the borrowing test
metalcycling Aug 29, 2023
34ffda2
Added commands to build quota tree when the test is run
metalcycling Aug 30, 2023
4 changes: 2 additions & 2 deletions pkg/controller/clusterstate/api/node_info.go
@@ -43,8 +43,10 @@ type NodeInfo struct {

// The releasing resource on that node
Releasing *Resource

// The idle resource on that node
Idle *Resource

// The used resource on that node, including running and terminating
// pods
Used *Resource
@@ -104,7 +106,6 @@ func (ni *NodeInfo) Clone() *NodeInfo {
func (ni *NodeInfo) SetNode(node *v1.Node) {
if ni.Node == nil {
ni.Idle = NewResource(node.Status.Allocatable)

}

ni.Name = node.Name
@@ -121,5 +122,4 @@ func (ni NodeInfo) String() string {

return fmt.Sprintf("Node (%s): idle <%v>, used <%v>, releasing <%v>%s",
ni.Name, ni.Idle, ni.Used, ni.Releasing, res)

}
273 changes: 135 additions & 138 deletions pkg/controller/queuejob/queuejob_controller_ex.go

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pkg/controller/quota/quota_manager_interface.go
@@ -21,7 +21,7 @@ import (
)

type QuotaManagerInterface interface {
Fits(aw *arbv1.AppWrapper, resources *clusterstateapi.Resource, proposedPremptions []*arbv1.AppWrapper) (bool, []*arbv1.AppWrapper, string)
Fits(aw *arbv1.AppWrapper, requestedResources *clusterstateapi.Resource, clusterResources *clusterstateapi.Resource, proposedPremptions []*arbv1.AppWrapper) (bool, []*arbv1.AppWrapper, string)
Release(aw *arbv1.AppWrapper) bool
GetValidQuotaLabels() []string
}
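
The signature change above ripples through both quota plugins. Below is a minimal sketch of how a caller might drive the updated interface — the function name, package aliases, and logging are illustrative assumptions; only the types and the `Fits` signature come from the diff:

```go
// dispatchWithQuota is a hypothetical caller of the extended interface.
// Passing clusterCapacity enables the new post-preemption re-check inside
// Fits; passing nil skips it, as loadDispatchedAWs does below.
func dispatchWithQuota(qm quota.QuotaManagerInterface, aw *arbv1.AppWrapper,
	awDemand, clusterCapacity *clusterstateapi.Resource) bool {
	fits, preemptees, msg := qm.Fits(aw, awDemand, clusterCapacity, nil)
	if !fits {
		klog.V(4).Infof("AppWrapper %s/%s rejected by quota: %s", aw.Namespace, aw.Name, msg)
		return false
	}
	// The returned preemptees would then be deleted by the dispatcher.
	_ = preemptees
	return true
}
```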
@@ -32,6 +32,7 @@ import (
qmbackend "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota"
qmbackendutils "github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/quota-forest/quota-manager/quota/utils"
"github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/quotaplugins/util"
"github.com/project-codeflare/multi-cluster-app-dispatcher/pkg/controller/queuejobresources/genericresource"
"k8s.io/client-go/rest"

"math"
@@ -205,7 +206,7 @@ func (qm *QuotaManager) loadDispatchedAWs(dispatchedAWDemands map[string]*cluste
}
aw.SetLabels(newLabels)

doesFit, preemptionIds, errorMessage := qm.Fits(aw, v, nil)
doesFit, preemptionIds, errorMessage := qm.Fits(aw, v, nil, nil)
if !doesFit {
klog.Errorf("[loadDispatchedAWs] Loading of AppWrapper %s/%s failed.",
aw.Namespace, aw.Name)
@@ -509,7 +510,7 @@ func (qm *QuotaManager) buildRequest(aw *arbv1.AppWrapper,

return consumerInfo, err
}
func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi.Resource,
func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi.Resource, clusterResources *clusterstateapi.Resource,
proposedPreemptions []*arbv1.AppWrapper) (bool, []*arbv1.AppWrapper, string) {

// If a Quota Manager Backend instance does not exist then assume quota failed
@@ -555,7 +556,7 @@ func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi

consumerID := consumerInfo.GetID()
klog.V(4).Infof("[Fits] Sending quota allocation request: %#v ", consumerInfo)
allocResponse, err := qm.quotaManagerBackend.AllocateForest(QuotaManagerForestName, consumerID)
allocResponse, err := qm.quotaManagerBackend.TryAllocateForest(QuotaManagerForestName, consumerID)
if err != nil {
qm.removeConsumer(consumerID)
klog.Errorf("[Fits] Error allocating consumer: %s/%s, err=%#v.", aw.Namespace, aw.Name, err)
@@ -569,9 +570,47 @@ func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi
return doesFit, preemptIds, strings.TrimSpace(allocResponse.GetMessage())
}
preemptIds = qm.getAppWrappers(allocResponse.GetPreemptedIds())

// Update cluster resources in the event that preemption happens
// TODO: Potentially move this resource update out to the calling function (would need to come back again to undo the allocation
// if the resources are not enough after preemption)
if clusterResources != nil {
updatedResources := clusterResources

for _, appWrapper := range preemptIds {
updatedResources.Add(qm.getAggregatedResources(appWrapper))
}

// Check if the job fits with the updated resources after preempted AppWrappers are removed
if clusterResources != nil && !awResDemands.LessEqual(updatedResources) {
qm.quotaManagerBackend.UndoAllocateForest(QuotaManagerForestName, consumerID)
qm.removeConsumer(consumerID)
return false, preemptIds, fmt.Sprintf("[Fits] AppWrapper '%s/%s' does not fit in the cluster, even after borrowed quota is freed", aw.Namespace, aw.Name)
}
}

return doesFit, preemptIds, strings.TrimSpace(allocResponse.GetMessage())
}

func (qm *QuotaManager) getAggregatedResources(appWrapper *arbv1.AppWrapper) *clusterstateapi.Resource {
// After quota evaluation, a set of AppWrappers is returned for preemption. Before deciding to delete them,
// we need to make sure enough resources are free for the new AppWrapper after the preemptable list is deleted.
// For this we need to add back the requests consumed by the preemptable AppWrappers to the available resources
// in order to perform a correct resource check with updated values.
allocated := clusterstateapi.EmptyResource()

for _, genericItem := range appWrapper.Spec.AggrResources.GenericItems {
resources, err := genericresource.GetResources(&genericItem)
if err != nil {
klog.V(8).Infof("[GetAggregatedResources] Failure aggregating resources for %s/%s, err=%#v, genericItem=%#v",
appWrapper.Namespace, appWrapper.Name, err, genericItem)
}
allocated = allocated.Add(resources)
}

return allocated
}

func (qm *QuotaManager) getAppWrappers(preemptIds []string) []*arbv1.AppWrapper {
var aws []*arbv1.AppWrapper
if len(preemptIds) <= 0 {
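
The heart of the change is simple resource arithmetic: the demand of every AppWrapper selected for preemption is added back to the free cluster capacity, and the new AppWrapper is admitted only if its demand fits within that updated total. A standalone sketch with simplified types — the real code uses `clusterstateapi.Resource` and its `Add`/`LessEqual` methods:

```go
// resource is a simplified stand-in for clusterstateapi.Resource.
type resource struct{ milliCPU, memory, gpu int64 }

func (r resource) add(o resource) resource {
	return resource{r.milliCPU + o.milliCPU, r.memory + o.memory, r.gpu + o.gpu}
}

func (r resource) lessEqual(o resource) bool {
	return r.milliCPU <= o.milliCPU && r.memory <= o.memory && r.gpu <= o.gpu
}

// fitsAfterPreemption mirrors the re-check in Fits: free capacity plus
// everything the preemptees give back must cover the new demand.
func fitsAfterPreemption(free, demand resource, preempted []resource) bool {
	updated := free
	for _, p := range preempted {
		updated = updated.add(p) // give back what each preemptee held
	}
	return demand.lessEqual(updated)
}
```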
@@ -138,7 +138,7 @@ func (qn *QuotaNode) SlideDown() {
func (qn *QuotaNode) SlideUp(c *Consumer, applyPriority bool, allocationRecovery *AllocationRecovery,
preemptedConsumers *[]string) bool {

if qn.isHard {
if qn.isHard && !qn.IsRoot() {
return false
}

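
The one-line change to SlideUp exempts the root of the quota tree from the hard-limit guard. Read as a predicate, with the semantics inferred from the diff rather than stated by the PR:

```go
// Inferred guard semantics: a node with a hard quota refuses to absorb an
// over-allocation slid up from a child, unless it is the root node, which
// the updated check now exempts.
func canAbsorbSlideUp(isHard, isRoot bool) bool {
	return !isHard || isRoot
}
```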
@@ -494,12 +494,14 @@ func (m *Manager) AllocateForest(forestName string, consumerID string) (response

// TryAllocateForest : allocate a consumer on a forest
func (m *Manager) TryAllocateForest(forestName string, consumerID string) (response *core.AllocationResponse, err error) {
if m.mode != Normal {
response, err = m.AllocateForest(forestName, consumerID)
return response, err
}

m.mutex.Lock()
defer m.mutex.Unlock()

if m.mode != Normal {
return nil, fmt.Errorf("manager is not in normal mode")
}
forestController, forestConsumer, err := m.preAllocateForest(forestName, consumerID)
if err == nil && forestController.IsConsumerAllocated(consumerID) {
err = fmt.Errorf("consumer %s already allocated on forest %s", consumerID, forestName)
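
Fits now calls TryAllocateForest rather than AllocateForest so that a quota allocation can be rolled back when the post-preemption resource check fails. A hedged sketch of the resulting allocate-check-undo flow, reusing the `resource` types from the sketch above — `IsAllocated`, `preemptedDemands`, and `fitsAfterPreemption` are illustrative assumptions, while `TryAllocateForest`, `UndoAllocateForest`, and `GetPreemptedIds` appear in the diffs:

```go
// tryDispatch treats the quota forest allocation as a reversible step:
// commit only if the cluster can physically host the job once the
// preempted borrowers are gone.
func tryDispatch(backend *quota.Manager, forestName, consumerID string,
	free, demand resource) bool {
	allocResponse, err := backend.TryAllocateForest(forestName, consumerID)
	if err != nil || !allocResponse.IsAllocated() {
		return false // quota denied outright
	}
	freed := preemptedDemands(allocResponse.GetPreemptedIds())
	if !fitsAfterPreemption(free, demand, freed) {
		// Quota said yes, but the freed resources still do not cover the
		// demand: undo the allocation and report failure.
		backend.UndoAllocateForest(forestName, consumerID)
		return false
	}
	return true
}
```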
@@ -21,7 +21,7 @@ var (
DefaultTreeName string = "default"

// DefaultResourceNames : the default resource names
DefaultResourceNames []string = []string{"cpu", "memory"}
DefaultResourceNames []string = []string{"cpu", "memory", "nvidia.com/gpu"}

// DefaultTreeKind : the default kind attribute of the tree
DefaultTreeKind string = "QuotaTree"
2 changes: 1 addition & 1 deletion pkg/quotaplugins/quota-simple-rest/quota_rest_manager.go
@@ -257,7 +257,7 @@ func (qm *QuotaManager) getQuotaDesignation(aw *arbv1.AppWrapper) []QuotaGroup {
return groups
}

func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi.Resource,
func (qm *QuotaManager) Fits(aw *arbv1.AppWrapper, awResDemands *clusterstateapi.Resource, clusterResources *clusterstateapi.Resource,
proposedPreemptions []*arbv1.AppWrapper) (bool, []*arbv1.AppWrapper, string) {

// Handle uninitialized quota manager
2 changes: 1 addition & 1 deletion test/e2e-kuttl/quota-forest/09-assert.yaml
@@ -1,4 +1,4 @@
#Verify AppWrappers finished successfully
# Verify AppWrappers finished successfully
---
apiVersion: mcad.ibm.com/v1beta1
kind: AppWrapper