@@ -26,6 +26,8 @@ import (
     "github.com/kubernetes-sigs/kube-batch/cmd/kube-batch/app/options"
     arbcorev1 "github.com/kubernetes-sigs/kube-batch/pkg/apis/scheduling/v1alpha1"
     "github.com/kubernetes-sigs/kube-batch/pkg/apis/utils"
+    "sort"
+    "strings"
 )
 
 type TaskID types.UID
@@ -73,9 +75,8 @@ func NewTaskInfo(pod *v1.Pod) *TaskInfo {
         NodeName: pod.Spec.NodeName,
         Status:   getTaskStatus(pod),
         Priority: 1,
-
-        Pod:    pod,
-        Resreq: req,
+        Pod:      pod,
+        Resreq:   req,
     }
 
     if pod.Spec.Priority != nil {
@@ -109,6 +110,8 @@ type JobID types.UID
 
 type tasksMap map[TaskID]*TaskInfo
 
+type NodeResourceMap map[string]*Resource
+
 type JobInfo struct {
     UID JobID
 
@@ -122,6 +125,8 @@ type JobInfo struct {
     NodeSelector map[string]string
     MinAvailable int32
 
+    NodesFitDelta NodeResourceMap
+
     // All tasks of the Job.
     TaskStatusIndex map[TaskStatus]tasksMap
     Tasks           tasksMap
@@ -140,11 +145,11 @@ func NewJobInfo(uid JobID) *JobInfo {
     return &JobInfo{
         UID: uid,
 
-        MinAvailable: 0,
-        NodeSelector: make(map[string]string),
-
-        Allocated:    EmptyResource(),
-        TotalRequest: EmptyResource(),
+        MinAvailable:  0,
+        NodeSelector:  make(map[string]string),
+        NodesFitDelta: make(NodeResourceMap),
+        Allocated:     EmptyResource(),
+        TotalRequest:  EmptyResource(),
 
         TaskStatusIndex: map[TaskStatus]tasksMap{},
         Tasks:           tasksMap{},
@@ -278,10 +283,11 @@ func (ji *JobInfo) Clone() *JobInfo {
         Namespace: ji.Namespace,
         Queue:     ji.Queue,
 
-        MinAvailable: ji.MinAvailable,
-        NodeSelector: map[string]string{},
-        Allocated:    ji.Allocated.Clone(),
-        TotalRequest: ji.TotalRequest.Clone(),
+        MinAvailable:  ji.MinAvailable,
+        NodeSelector:  map[string]string{},
+        Allocated:     ji.Allocated.Clone(),
+        TotalRequest:  ji.TotalRequest.Clone(),
+        NodesFitDelta: make(NodeResourceMap),
 
         PDB:      ji.PDB,
         PodGroup: ji.PodGroup,
@@ -314,3 +320,36 @@ func (ji JobInfo) String() string {
 
     return fmt.Sprintf("Job (%v): name %v, minAvailable %d", ji.UID, ji.Name, ji.MinAvailable) + res
 }
+
+// FitError returns detailed information on why a job's task failed to fit on
+// each available node.
+func (f *JobInfo) FitError() string {
+    if len(f.NodesFitDelta) == 0 {
+        reasonMsg := fmt.Sprintf("0 nodes are available")
+        return reasonMsg
+    }
+
+    reasons := make(map[string]int)
+    for _, v := range f.NodesFitDelta {
+        if v.Get(v1.ResourceCPU) < 0 {
+            reasons["cpu"]++
+        }
+        if v.Get(v1.ResourceMemory) < 0 {
+            reasons["memory"]++
+        }
+        if v.Get(GPUResourceName) < 0 {
+            reasons["GPU"]++
+        }
+    }
+
+    sortReasonsHistogram := func() []string {
+        reasonStrings := []string{}
+        for k, v := range reasons {
+            reasonStrings = append(reasonStrings, fmt.Sprintf("%v insufficient %v", v, k))
+        }
+        sort.Strings(reasonStrings)
+        return reasonStrings
+    }
+    reasonMsg := fmt.Sprintf("0/%v nodes are available, %v.", len(f.NodesFitDelta), strings.Join(sortReasonsHistogram(), ", "))
+    return reasonMsg
+}
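
The message format is easiest to see with a concrete case. Below is a minimal sketch, not part of this patch, of exercising FitError from a _test.go file in the same api package; the node names and deltas are invented, and it assumes Resource exposes MilliCPU and Memory fields (as in the package's resource_info.go) so that negative values mark a shortfall.

// In job_info_test.go (package api, import "testing"); illustrative only.
func TestFitErrorMessage(t *testing.T) {
    job := NewJobInfo("qj-1")

    // Negative deltas record how far short each node fell of the task's request.
    job.NodesFitDelta["node-1"] = &Resource{MilliCPU: -500}               // 0.5 CPU short
    job.NodesFitDelta["node-2"] = &Resource{MilliCPU: -250, Memory: -1e9} // CPU and ~1GB short

    // Both nodes are short on CPU and one is also short on memory, so the
    // sorted histogram reads "1 insufficient memory, 2 insufficient cpu".
    want := "0/2 nodes are available, 1 insufficient memory, 2 insufficient cpu."
    if got := job.FitError(); got != want {
        t.Errorf("FitError() = %q, want %q", got, want)
    }
}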