
Commit f1b8566

fix: improve healing executions when they are not found in the cluster (#6292)
1 parent b01fbcf commit f1b8566

File tree

1 file changed: +49 −1 lines changed

pkg/runner/service.go

+49 −1
@@ -2,7 +2,9 @@ package runner
 
 import (
 	"context"
+	"time"
 
+	"github.com/pkg/errors"
 	"go.uber.org/zap"
 	"golang.org/x/sync/errgroup"
 
@@ -12,8 +14,11 @@ import (
 	"github.com/kubeshop/testkube/pkg/event"
 	"github.com/kubeshop/testkube/pkg/log"
 	configrepo "github.com/kubeshop/testkube/pkg/repository/config"
+	"github.com/kubeshop/testkube/pkg/testworkflows/executionworker/controller"
 	"github.com/kubeshop/testkube/pkg/testworkflows/executionworker/executionworkertypes"
+	"github.com/kubeshop/testkube/pkg/testworkflows/executionworker/registry"
 	"github.com/kubeshop/testkube/pkg/testworkflows/testworkflowconfig"
+	"github.com/kubeshop/testkube/pkg/testworkflows/testworkflowprocessor/stage"
 )
 
 type Options struct {
@@ -84,8 +89,51 @@ func (s *service) recover(ctx context.Context) (err error) {
 	for _, exec := range executions {
 		go func(environmentId string, executionId string) {
 			err := s.runner.Monitor(ctx, s.proContext.OrgID, environmentId, executionId)
-			if err != nil {
+			if err == nil {
+				return
+			}
+			if !errors.Is(err, registry.ErrResourceNotFound) {
 				s.logger.Errorw("failed to monitor execution", "id", executionId, "error", err)
+				return
+			}
+
+			s.logger.Warnw("execution to monitor not found. recovering.", "id", executionId, "error", err)
+
+			// Get the existing execution
+			execution, err := s.client.GetExecution(ctx, environmentId, executionId)
+			if err != nil {
+				s.logger.Errorw("failed to recover execution: getting execution", "id", executionId, "error", err)
+				return
+			}
+
+			// Ignore if it's still queued - orchestrator will recover it later
+			if execution.Result.IsQueued() {
+				s.logger.Warnw("execution to monitor is still queued: leaving it for orchestrator", "id", executionId)
+				return
+			}
+
+			// Check if there is error message acknowledged
+			sigSequence := stage.MapSignatureListToInternal(stage.MapSignatureToSequence(stage.MapSignatureList(execution.Signature)))
+			errorMessage := execution.Result.Initialization.ErrorMessage
+			if errorMessage == "" {
+				for _, sig := range sigSequence {
+					if execution.Result.Steps[sig.Ref].ErrorMessage != "" {
+						errorMessage = execution.Result.Steps[sig.Ref].ErrorMessage
+						break
+					}
+				}
+			}
+
+			// Finalize and save the result
+			execution.Result.HealAborted(sigSequence, errorMessage, controller.DefaultErrorMessage)
+			execution.Result.HealTimestamps(sigSequence, execution.ScheduledAt, time.Time{}, time.Time{}, true)
+			execution.Result.HealDuration(execution.ScheduledAt)
+			execution.Result.HealMissingPauseStatuses()
+			execution.Result.HealStatus(sigSequence)
+			if err = s.client.FinishExecutionResult(ctx, environmentId, executionId, execution.Result); err != nil {
+				s.logger.Errorw("failed to recover execution: saving execution", "id", executionId, "error", err)
+			} else {
+				s.logger.Infow("recovered execution", "id", executionId, "error", err)
 			}
 		}(exec.EnvironmentId, exec.Id)
 	}

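The healing path also needs an error message to attach to the finalized result. The diff takes the initialization error if present, otherwise the first non-empty step error in signature order, and passes controller.DefaultErrorMessage to HealAborted, presumably as the fallback when neither exists. Here is a small sketch of that selection logic with simplified, hypothetical types in place of the real result structures; the default is folded into the helper for brevity.

package main

import "fmt"

// stepResult is a simplified, hypothetical stand-in for the per-step results
// keyed by signature ref in the real execution result.
type stepResult struct {
	Ref          string
	ErrorMessage string
}

// defaultErrorMessage plays the role of controller.DefaultErrorMessage here;
// the real constant's value is not shown in the diff.
const defaultErrorMessage = "execution aborted"

// pickErrorMessage returns the initialization error if set, otherwise the first
// non-empty step error in signature order, otherwise the default message.
func pickErrorMessage(initError string, orderedSteps []stepResult) string {
	if initError != "" {
		return initError
	}
	for _, s := range orderedSteps {
		if s.ErrorMessage != "" {
			return s.ErrorMessage
		}
	}
	return defaultErrorMessage
}

func main() {
	steps := []stepResult{
		{Ref: "step-1"},
		{Ref: "step-2", ErrorMessage: "container terminated unexpectedly"},
		{Ref: "step-3", ErrorMessage: "skipped"},
	}
	fmt.Println(pickErrorMessage("", steps)) // container terminated unexpectedly
}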