Skip to content

Commit 2669bbf

Browse files
committed
fix: Implement review comments
1 parent: 94e1dfd; commit: 2669bbf

File tree

1 file changed

+15
-6
lines changed

1 file changed

+15
-6
lines changed

controllers/lmes/lmevaljob_controller.go

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -20,14 +20,15 @@ import (
2020
"bytes"
2121
"context"
2222
"fmt"
23-
"github.com/trustyai-explainability/trustyai-service-operator/controllers/utils"
2423
"maps"
2524
"slices"
2625
"strconv"
2726
"strings"
2827
"sync"
2928
"time"
3029

30+
"github.com/trustyai-explainability/trustyai-service-operator/controllers/utils"
31+
3132
corev1 "k8s.io/api/core/v1"
3233
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3334
"k8s.io/apimachinery/pkg/runtime"
@@ -522,7 +523,13 @@ func (r *LMEvalJobReconciler) checkScheduledPod(ctx context.Context, log logr.Lo
522523
// pull status from the driver
523524
if err = r.updateStatus(ctx, log, job); err == nil && job.Status.State == lmesv1alpha1.CompleteJobState {
524525
// Job completed successfully, handle cleanup
525-
return r.handleComplete(ctx, log, job)
526+
result, handleErr := r.handleComplete(ctx, log, job)
527+
if handleErr != nil {
528+
log.Error(handleErr, "failed to handle job completion, will retry")
529+
// If handleComplete fails, we should retry after the polling interval
530+
return r.pullingJobs.addOrUpdate(string(job.GetUID()), Options.PodCheckingInterval), handleErr
531+
}
532+
return result, nil
526533
}
527534
if err != nil {
528535
log.Error(err, "unable to retrieve the status from the job's pod. retry after the pulling interval")
@@ -1405,10 +1412,12 @@ func isContainerFailed(status *corev1.ContainerStatus) (bool, string) {
14051412
status.State.Waiting.Reason != "PodInitializing" {
14061413
return true, status.State.Waiting.Reason
14071414
}
1408-
if status.State.Terminated != nil &&
1409-
status.State.Terminated.Reason != "Completed" &&
1410-
status.State.Terminated.ExitCode != 0 {
1411-
return true, status.State.Terminated.Reason
1415+
if status.State.Terminated != nil {
1416+
// Container is considered failed if it has a non-zero exit code OR an unexpected termination reason
1417+
if status.State.Terminated.ExitCode != 0 ||
1418+
(status.State.Terminated.Reason != "Completed" && status.State.Terminated.Reason != "") {
1419+
return true, status.State.Terminated.Reason
1420+
}
14121421
}
14131422
return false, ""
14141423
}

0 commit comments

Comments
 (0)