Skip to content
This repository was archived by the owner on Sep 12, 2023. It is now read-only.

Commit fdb9739

Browse files
authored
fix job active count; count terminating pod as failed (#214)
Signed-off-by: yowenter <[email protected]>
1 parent 83259a0 commit fdb9739

File tree

2 files changed

+20
-4
lines changed

2 files changed

+20
-4
lines changed

pkg/controller.v1/common/status_test.go

+12-3
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@ package common
22

33
import (
44
"testing"
5+
"time"
56

67
apiv1 "github.com/kubeflow/common/pkg/apis/common/v1"
78
"github.com/stretchr/testify/assert"
89
corev1 "k8s.io/api/core/v1"
10+
metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1"
911
)
1012

1113
func TestUpdateJobReplicaStatuses(t *testing.T) {
@@ -14,13 +16,14 @@ func TestUpdateJobReplicaStatuses(t *testing.T) {
1416
_, ok := jobStatus.ReplicaStatuses["worker"]
1517
// assert ReplicaStatus for "worker" exists
1618
assert.True(t, ok)
17-
setStatusForTest(&jobStatus, "worker", 2, 3, 1)
18-
assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Failed, int32(2))
19+
setStatusForTest(&jobStatus, "worker", 2, 3, 1, 1)
20+
// terminating pod should count as failed.
21+
assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Failed, int32(3))
1922
assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Succeeded, int32(3))
2023
assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Active, int32(1))
2124
}
2225

23-
func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, failed, succeeded, active int32) {
26+
func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, failed, succeeded, active, terminating int32) {
2427
pod := corev1.Pod{
2528
Status: corev1.PodStatus{},
2629
}
@@ -37,4 +40,10 @@ func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, faile
3740
pod.Status.Phase = corev1.PodRunning
3841
updateJobReplicaStatuses(jobStatus, rtype, &pod)
3942
}
43+
for i = 0; i < terminating; i++ {
44+
pod.Status.Phase = corev1.PodRunning
45+
deletionTimestamp := metaV1.NewTime(time.Now())
46+
pod.DeletionTimestamp = &deletionTimestamp
47+
updateJobReplicaStatuses(jobStatus, rtype, &pod)
48+
}
4049
}

pkg/core/status.go

+8-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,14 @@ func InitializeReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaTy
3434
func UpdateJobReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, pod *corev1.Pod) {
3535
switch pod.Status.Phase {
3636
case corev1.PodRunning:
37-
jobStatus.ReplicaStatuses[rtype].Active++
37+
if pod.DeletionTimestamp != nil {
38+
// when node is not ready, the pod will be in terminating state.
39+
// Count deleted Pods as failures to account for orphan Pods that
40+
// never have a chance to reach the Failed phase.
41+
jobStatus.ReplicaStatuses[rtype].Failed++
42+
} else {
43+
jobStatus.ReplicaStatuses[rtype].Active++
44+
}
3845
case corev1.PodSucceeded:
3946
jobStatus.ReplicaStatuses[rtype].Succeeded++
4047
case corev1.PodFailed:

0 commit comments

Comments
 (0)