@@ -20,14 +20,15 @@ import (
20
20
"bytes"
21
21
"context"
22
22
"fmt"
23
- "github.com/trustyai-explainability/trustyai-service-operator/controllers/utils"
24
23
"maps"
25
24
"slices"
26
25
"strconv"
27
26
"strings"
28
27
"sync"
29
28
"time"
30
29
30
+ "github.com/trustyai-explainability/trustyai-service-operator/controllers/utils"
31
+
31
32
corev1 "k8s.io/api/core/v1"
32
33
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
33
34
"k8s.io/apimachinery/pkg/runtime"
@@ -522,7 +523,13 @@ func (r *LMEvalJobReconciler) checkScheduledPod(ctx context.Context, log logr.Lo
522
523
// pull status from the driver
523
524
if err = r .updateStatus (ctx , log , job ); err == nil && job .Status .State == lmesv1alpha1 .CompleteJobState {
524
525
// Job completed successfully, handle cleanup
525
- return r .handleComplete (ctx , log , job )
526
+ result , handleErr := r .handleComplete (ctx , log , job )
527
+ if handleErr != nil {
528
+ log .Error (handleErr , "failed to handle job completion, will retry" )
529
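+ // If handleComplete fails, surface the error and request a retry after the polling interval. NOTE(review): controller-runtime ignores a non-zero Result when the returned error is non-nil and requeues with backoff instead - confirm which retry behavior is intended.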
530
+ return r .pullingJobs .addOrUpdate (string (job .GetUID ()), Options .PodCheckingInterval ), handleErr
531
+ }
532
+ return result , nil
526
533
}
527
534
if err != nil {
528
535
log .Error (err , "unable to retrieve the status from the job's pod. retry after the pulling interval" )
@@ -1405,10 +1412,12 @@ func isContainerFailed(status *corev1.ContainerStatus) (bool, string) {
1405
1412
status .State .Waiting .Reason != "PodInitializing" {
1406
1413
return true , status .State .Waiting .Reason
1407
1414
}
1408
- if status .State .Terminated != nil &&
1409
- status .State .Terminated .Reason != "Completed" &&
1410
- status .State .Terminated .ExitCode != 0 {
1411
- return true , status .State .Terminated .Reason
1415
+ if status .State .Terminated != nil {
1416
+ // Container is considered failed if it has a non-zero exit code OR an unexpected termination reason
1417
+ if status .State .Terminated .ExitCode != 0 ||
1418
+ (status .State .Terminated .Reason != "Completed" && status .State .Terminated .Reason != "" ) {
1419
+ return true , status .State .Terminated .Reason
1420
+ }
1412
1421
}
1413
1422
return false , ""
1414
1423
}
0 commit comments