Skip to content

Commit b52d309

Browse files
committed
Retry StopContainer for a limited number of times
1 parent d30da28 commit b52d309

File tree

5 files changed

+68
-27
lines changed

5 files changed

+68
-27
lines changed

agent/dockerclient/dockerapi/errors.go

+6
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,12 @@ func (err *DockerTimeoutError) Error() string {
5050
// ErrorName returns the name of the error
5151
func (err *DockerTimeoutError) ErrorName() string { return DockerTimeoutErrorName }
5252

53+
// IsRetriableError returns a boolean indicating whether the call that
54+
// generated the error can be retried.
55+
func (err DockerTimeoutError) IsRetriableError() bool {
56+
return true
57+
}
58+
5359
// OutOfMemoryError is a type for errors caused by running out of memory
5460
type OutOfMemoryError struct{}
5561

agent/engine/docker_task_engine.go

+24-9
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ package engine
1717
import (
1818
"context"
1919
"fmt"
20+
"github.com/aws/amazon-ecs-agent/agent/logger"
21+
"github.com/aws/amazon-ecs-agent/agent/logger/field"
2022
"os"
2123
"path/filepath"
2224
"regexp"
@@ -102,11 +104,11 @@ const (
102104

103105
defaultMonitorExecAgentsInterval = 15 * time.Minute
104106

105-
stopContainerBackoffMin = time.Second
106-
stopContainerBackoffMax = time.Second * 5
107+
defaultStopContainerBackoffMin = time.Second
108+
defaultStopContainerBackoffMax = time.Second * 5
107109
stopContainerBackoffJitter = 0.2
108110
stopContainerBackoffMultiplier = 1.3
109-
stopContainerMaxRetryCount = 3
111+
stopContainerMaxRetryCount = 5
110112
)
111113

112114
var newExponentialBackoff = retry.NewExponentialBackoff
@@ -176,6 +178,8 @@ type DockerTaskEngine struct {
176178
monitorExecAgentsTicker *time.Ticker
177179
execCmdMgr execcmd.Manager
178180
monitorExecAgentsInterval time.Duration
181+
stopContainerBackoffMin time.Duration
182+
stopContainerBackoffMax time.Duration
179183
}
180184

181185
// NewDockerTaskEngine returns a created, but uninitialized, DockerTaskEngine.
@@ -214,6 +218,8 @@ func NewDockerTaskEngine(cfg *config.Config,
214218
handleDelay: time.Sleep,
215219
execCmdMgr: execCmdMgr,
216220
monitorExecAgentsInterval: defaultMonitorExecAgentsInterval,
221+
stopContainerBackoffMin: defaultStopContainerBackoffMin,
222+
stopContainerBackoffMax: defaultStopContainerBackoffMax,
217223
}
218224

219225
dockerTaskEngine.initializeContainerStatusToTransitionFunction()
@@ -1552,19 +1558,28 @@ func (engine *DockerTaskEngine) stopContainer(task *apitask.Task, container *api
15521558
// for more information, see: https://github.com/moby/moby/issues/41587
15531559
func (engine *DockerTaskEngine) stopDockerContainer(dockerID, containerName string, apiTimeoutStopContainer time.Duration) dockerapi.DockerContainerMetadata {
15541560
var md dockerapi.DockerContainerMetadata
1555-
backoff := newExponentialBackoff(stopContainerBackoffMin, stopContainerBackoffMax, stopContainerBackoffJitter, stopContainerBackoffMultiplier)
1561+
backoff := newExponentialBackoff(engine.stopContainerBackoffMin, engine.stopContainerBackoffMax, stopContainerBackoffJitter, stopContainerBackoffMultiplier)
15561562
for i := 0; i < stopContainerMaxRetryCount; i++ {
15571563
md = engine.client.StopContainer(engine.ctx, dockerID, apiTimeoutStopContainer)
1558-
if md.Error == nil || md.Error.ErrorName() != dockerapi.DockerTimeoutErrorName {
1564+
if md.Error == nil {
15591565
return md
15601566
}
1567+
cannotStopContainerError, ok := md.Error.(cannotStopContainerError)
1568+
if ok && !cannotStopContainerError.IsRetriableError() {
1569+
return md
1570+
}
1571+
15611572
if i < stopContainerMaxRetryCount-1 {
1562-
time.Sleep(backoff.Duration())
1573+
retryIn := backoff.Duration()
1574+
logger.Warn(fmt.Sprintf("Error stopping container, retrying in %v", retryIn), logger.Fields{
1575+
field.Container: containerName,
1576+
field.RuntimeID: dockerID,
1577+
field.Error: md.Error,
1578+
"attempt": i+1,
1579+
})
1580+
time.Sleep(retryIn)
15631581
}
15641582
}
1565-
if md.Error != nil && md.Error.ErrorName() == dockerapi.DockerTimeoutErrorName {
1566-
seelog.Warnf("Unable to stop container (%s) after %d attempts that timed out; giving up and marking container as stopped anyways", containerName, stopContainerMaxRetryCount)
1567-
}
15681583
return md
15691584
}
15701585

agent/engine/docker_task_engine_test.go

+11-4
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,8 @@ func mocks(t *testing.T, ctx context.Context, cfg *config.Config) (*gomock.Contr
175175
imageManager, dockerstate.NewTaskEngineState(), metadataManager, nil, execCmdMgr)
176176
taskEngine.(*DockerTaskEngine)._time = mockTime
177177
taskEngine.(*DockerTaskEngine).ctx = ctx
178-
178+
taskEngine.(*DockerTaskEngine).stopContainerBackoffMin = time.Millisecond
179+
taskEngine.(*DockerTaskEngine).stopContainerBackoffMax = time.Millisecond * 2
179180
return ctrl, client, mockTime, taskEngine, credentialsManager, imageManager, metadataManager
180181
}
181182

@@ -920,7 +921,10 @@ func TestTaskTransitionWhenStopContainerTimesout(t *testing.T) {
920921
// Validate that timeouts are retried exactly 3 times
921922
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).
922923
Return(containerStopTimeoutError).
923-
Times(3),
924+
Times(5),
925+
926+
client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).Return(dockerapi.PingResponse{}).
927+
Times(1),
924928
)
925929
}
926930

@@ -991,7 +995,10 @@ func TestTaskTransitionWhenStopContainerReturnsUnretriableError(t *testing.T) {
991995
// event.
992996
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(dockerapi.DockerContainerMetadata{
993997
Error: dockerapi.CannotStopContainerError{dockerapi.NoSuchContainerError{}},
994-
}).MinTimes(1),
998+
}).MaxTimes(1),
999+
client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).
1000+
Return(dockerapi.PingResponse{}).
1001+
Times(1),
9951002
)
9961003
}
9971004

@@ -1042,7 +1049,7 @@ func TestTaskTransitionWhenStopContainerReturnsTransientErrorBeforeSucceeding(t
10421049
client.EXPECT().StartContainer(gomock.Any(), containerID, defaultConfig.ContainerStartTimeout).Return(
10431050
dockerapi.DockerContainerMetadata{DockerID: containerID}),
10441051
// StopContainer errors out a couple of times
1045-
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(containerStoppingError).Times(2),
1052+
client.EXPECT().StopContainer(gomock.Any(), containerID, gomock.Any()).Return(containerStoppingError).Times(4),
10461053
// Since task is not in steady state, progressContainers causes
10471054
// another invocation of StopContainer. Return the 'succeed' response,
10481055
// which should cause the task engine to stop invoking this again and

agent/engine/task_manager.go

+13-10
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ const (
5151
// credentials from acs, after the timeout it will check the credentials manager
5252
// and start processing the task or start another round of waiting
5353
waitForPullCredentialsTimeout = 1 * time.Minute
54+
systemPingTimeout = 5 * time.Second
5455
defaultTaskSteadyStatePollInterval = 5 * time.Minute
5556
defaultTaskSteadyStatePollIntervalJitter = 30 * time.Second
5657
transitionPollTime = 5 * time.Second
@@ -132,6 +133,7 @@ type managedTask struct {
132133
cfg *config.Config
133134
credentialsManager credentials.Manager
134135
cniClient ecscni.CNIClient
136+
dockerClient dockerapi.DockerClient
135137
taskStopWG *utilsync.SequentialWaitGroup
136138

137139
acsMessages chan acsTransition
@@ -180,6 +182,7 @@ func (engine *DockerTaskEngine) newManagedTask(task *apitask.Task) *managedTask
180182
containerChangeEventStream: engine.containerChangeEventStream,
181183
credentialsManager: engine.credentialsManager,
182184
cniClient: engine.cniClient,
185+
dockerClient: engine.client,
183186
taskStopWG: engine.taskStopGroup,
184187
steadyStatePollInterval: engine.taskSteadyStatePollInterval,
185188
steadyStatePollIntervalJitter: engine.taskSteadyStatePollIntervalJitter,
@@ -930,16 +933,16 @@ func (mtask *managedTask) handleEventError(containerChange dockerContainerChange
930933
func (mtask *managedTask) handleContainerStoppedTransitionError(event dockerapi.DockerContainerChangeEvent,
931934
container *apicontainer.Container,
932935
currentKnownStatus apicontainerstatus.ContainerStatus) bool {
933-
// If docker returned a transient error while trying to stop a container,
934-
// reset the known status to the current status and return
935-
cannotStopContainerError, ok := event.Error.(cannotStopContainerError)
936-
if ok && cannotStopContainerError.IsRetriableError() {
937-
logger.Info("Error stopping the container; ignoring state change", logger.Fields{
938-
field.TaskARN: mtask.Arn,
939-
field.Container: container.Name,
940-
field.RuntimeID: container.GetRuntimeID(),
941-
"ErrorName": event.Error.ErrorName(),
942-
field.Error: cannotStopContainerError.Error(),
936+
937+
pr := mtask.dockerClient.SystemPing(mtask.ctx, systemPingTimeout)
938+
if pr.Error != nil {
939+
logger.Info("Error stopping the container, but docker seems to be unresponsive; ignoring state change", logger.Fields{
940+
field.TaskARN: mtask.Arn,
941+
field.Container: container.Name,
942+
field.RuntimeID: container.GetRuntimeID(),
943+
"ErrorName": event.Error.ErrorName(),
944+
field.Error: event.Error.Error(),
945+
"SystemPingError": pr.Error,
943946
})
944947
container.SetKnownStatus(currentKnownStatus)
945948
return false

agent/engine/task_manager_test.go

+14-4
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ func TestHandleEventError(t *testing.T) {
8484
FromError: errors.New(""),
8585
},
8686
ExpectedContainerKnownStatusSet: true,
87-
ExpectedContainerKnownStatus: apicontainerstatus.ContainerRunning,
88-
ExpectedOK: false,
87+
ExpectedContainerKnownStatus: apicontainerstatus.ContainerStopped,
88+
ExpectedOK: true,
8989
},
9090
{
9191
Name: "Unretriable error with Stop",
@@ -187,6 +187,15 @@ func TestHandleEventError(t *testing.T) {
187187

188188
for _, tc := range testCases {
189189
t.Run(tc.Name, func(t *testing.T) {
190+
ctrl := gomock.NewController(t)
191+
defer ctrl.Finish()
192+
client := mock_dockerapi.NewMockDockerClient(ctrl)
193+
194+
if tc.EventStatus == apicontainerstatus.ContainerStopped {
195+
client.EXPECT().SystemPing(gomock.Any(), gomock.Any()).Return(dockerapi.PingResponse{}).
196+
Times(1)
197+
}
198+
190199
container := &apicontainer.Container{
191200
KnownStatusUnsafe: tc.CurrentContainerKnownStatus,
192201
}
@@ -203,8 +212,9 @@ func TestHandleEventError(t *testing.T) {
203212
Task: &apitask.Task{
204213
Arn: "task1",
205214
},
206-
engine: &DockerTaskEngine{},
207-
cfg: &config.Config{ImagePullBehavior: tc.ImagePullBehavior},
215+
engine: &DockerTaskEngine{},
216+
cfg: &config.Config{ImagePullBehavior: tc.ImagePullBehavior},
217+
dockerClient: client,
208218
}
209219
ok := mtask.handleEventError(containerChange, tc.CurrentContainerKnownStatus)
210220
assert.Equal(t, tc.ExpectedOK, ok, "to proceed")

0 commit comments

Comments
 (0)