Skip to content

Commit 83ef59c

Browse files
committed
Fix TFJob status when enableDynamicWorker is set to true
1 parent 4ac55d2 commit 83ef59c

File tree

1 file changed

+7
-0
lines changed

1 file changed

+7
-0
lines changed

pkg/controller.v1/tensorflow/tfjob_controller.go

+7
Original file line number | Diff line number | Diff line change
@@ -513,6 +513,13 @@ func (r *TFJobReconciler) UpdateJobStatus(job interface{}, replicas map[commonv1
513513
// we know it because we update the status condition when reconciling the replicas
514514
trainingoperatorcommon.RestartedJobsCounterInc(tfJob.Namespace, tensorflowv1.FrameworkName)
515515
} else {
516+
if rtype == tensorflowv1.TFReplicaTypeWorker {
517+
if tfJob.Spec.EnableDynamicWorker {
518+
commonutil.LoggerForJob(tfJob).Infof("TFJob %s/%s continues regardless %d Worker replica(s) failed as enableDynamicWorker is set true.",
519+
tfJob.Namespace, tfJob.Name, failed)
520+
continue
521+
}
522+
}
516523
msg := fmt.Sprintf("TFJob %s/%s has failed because %d %s replica(s) failed.",
517524
tfJob.Namespace, tfJob.Name, failed, rtype)
518525
r.recorder.Event(tfJob, corev1.EventTypeNormal, tfJobFailedReason, msg)

0 commit comments

Comments
 (0)