Skip to content

Commit f03cbcc

Browse files
authored
Fail all activations when it fails to pull a blackbox image. (#5270)
* Fail all activations if it fails to pull images for blackbox actions. * Apply scalaFmt. * Remove 127 from the case.
1 parent 4016aa8 commit f03cbcc

File tree

8 files changed

+61
-14
lines changed

8 files changed

+61
-14
lines changed

common/scala/src/main/scala/org/apache/openwhisk/http/ErrorResponse.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,8 @@ object Messages {
235235
/** Indicates that the image could not be pulled. */
236236
def imagePullError(image: String) = s"Failed to pull container image '$image'."
237237

238+
def commandNotFoundError = "executable file not found"
239+
238240
/** Indicates that the container for the action could not be started. */
239241
val resourceProvisionError = "Failed to provision resources to run the action."
240242

core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/docker/DockerClient.scala

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -147,11 +147,15 @@ class DockerClient(dockerHost: Option[String] = None,
147147
// Examples:
148148
// - Unrecognized option specified
149149
// - Not enough disk space
150-
case pre: ProcessUnsuccessfulException if pre.exitStatus == ExitStatus(125) =>
150+
// Exit status 127 means an error that container command cannot be found.
151+
// Examples:
152+
// - executable file not found in $PATH": unknown
153+
case pre: ProcessUnsuccessfulException
154+
if pre.exitStatus == ExitStatus(125) || pre.exitStatus == ExitStatus(127) =>
151155
Future.failed(
152156
DockerContainerId
153157
.parse(pre.stdout)
154-
.map(BrokenDockerContainer(_, s"Broken container: ${pre.getMessage}"))
158+
.map(BrokenDockerContainer(_, s"Broken container: ${pre.getMessage}", Some(pre.exitStatus.statusValue)))
155159
.getOrElse(pre))
156160
}
157161
}
@@ -296,4 +300,4 @@ trait DockerApi {
296300
}
297301

298302
/** Indicates any error while starting a container that leaves a broken container behind that needs to be removed */
299-
case class BrokenDockerContainer(id: ContainerId, msg: String) extends Exception(msg)
303+
case class BrokenDockerContainer(id: ContainerId, msg: String, existStatus: Option[Int] = None) extends Exception(msg)

core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/docker/DockerContainer.scala

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,11 +127,14 @@ object DockerContainer {
127127
for {
128128
pullSuccessful <- pulled
129129
id <- docker.run(imageToUse, args).recoverWith {
130-
case BrokenDockerContainer(brokenId, _) =>
130+
case BrokenDockerContainer(brokenId, _, exitStatus) if exitStatus.isEmpty || exitStatus.contains(125) =>
131131
// Remove the broken container - but don't wait or check for the result.
132132
// If the removal fails, there is nothing we could do to recover from the recovery.
133133
docker.rm(brokenId)
134134
Future.failed(WhiskContainerStartupError(Messages.resourceProvisionError))
135+
case BrokenDockerContainer(brokenId, _, exitStatus) if exitStatus.contains(127) =>
136+
docker.rm(brokenId)
137+
Future.failed(BlackboxStartupError(s"${Messages.commandNotFoundError} in image ${imageToUse}"))
135138
case _ =>
136139
// Iff the pull was successful, we assume that the error is not due to an image pull error, otherwise
137140
// the docker run was a backup measure to try and start the container anyway. If it fails again, we assume

core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/kubernetes/KubernetesClient.scala

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ import java.net.SocketTimeoutException
2222
import java.time.format.DateTimeFormatterBuilder
2323
import java.time.temporal.ChronoField
2424
import java.time.{Instant, ZoneId}
25-
2625
import akka.actor.ActorSystem
2726
import akka.event.Logging.ErrorLevel
2827
import akka.event.Logging.InfoLevel
@@ -48,6 +47,7 @@ import org.apache.openwhisk.core.containerpool.docker.ProcessRunner
4847
import org.apache.openwhisk.core.containerpool.{ContainerAddress, ContainerId}
4948
import org.apache.openwhisk.core.entity.ByteSize
5049
import org.apache.openwhisk.core.entity.size._
50+
import org.apache.openwhisk.http.Messages
5151
import pureconfig._
5252
import pureconfig.generic.auto._
5353
import spray.json.DefaultJsonProtocol._
@@ -82,6 +82,16 @@ case class KubernetesEphemeralStorageConfig(limit: ByteSize)
8282
case class KubernetesPodReadyTimeoutException(timeout: FiniteDuration)
8383
extends Exception(s"Pod readiness timed out after ${timeout.toSeconds}s")
8484

85+
/**
86+
* Exception to indicate it failed to pull an image for blackbox actions.
87+
*/
88+
case class KubernetesImagePullFailedException(msg: String) extends Exception(msg)
89+
90+
/**
91+
* Exception to indicate the command for an image is not found.
92+
*/
93+
case class KubernetesImageCommandNotFoundException(msg: String) extends Exception(msg)
94+
8595
/**
8696
* Exception to indicate a pod could not be created at the apiserver.
8797
*/
@@ -132,6 +142,8 @@ class KubernetesClient(
132142
new DefaultKubernetesClient(configBuilder.build())
133143
}
134144

145+
private val imagePullFailedMsgs = Set("ImagePullBackOff", "ErrImagePull")
146+
135147
private val podBuilder = new WhiskPodBuilder(kubeRestClient, config)
136148

137149
def run(name: String,
@@ -321,7 +333,23 @@ class KubernetesClient(
321333
.withName(pod.getMetadata.getName)
322334
val deadline = deadlineOpt.getOrElse((timeout - (System.currentTimeMillis() - start.toEpochMilli).millis).fromNow)
323335
if (!readyPod.isReady) {
324-
if (deadline.isOverdue()) {
336+
// when action pod is failed while pulling images, we need to let users know that
337+
val imagePullErr = Try {
338+
readyPod.get.getStatus.getContainerStatuses.asScala.exists { status =>
339+
imagePullFailedMsgs.contains(status.getState.getWaiting.getReason)
340+
}
341+
} getOrElse false
342+
// when command not found in image, we need to let users know that as well
343+
val commandNotFoundErr = Try {
344+
readyPod.get.getStatus.getContainerStatuses.asScala.exists { status =>
345+
status.getState.getWaiting.getMessage.contains(Messages.commandNotFoundError)
346+
}
347+
} getOrElse false
348+
if (imagePullErr) {
349+
Future.failed(KubernetesImagePullFailedException(s"Failed to pull image for pod ${pod.getMetadata.getName}"))
350+
} else if (commandNotFoundErr) {
351+
Future.failed(KubernetesImageCommandNotFoundException(Messages.commandNotFoundError))
352+
} else if (deadline.isOverdue()) {
325353
Future.failed(KubernetesPodReadyTimeoutException(timeout))
326354
} else {
327355
after(1.seconds, scheduler) {

core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/kubernetes/KubernetesContainer.scala

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ object KubernetesContainer {
7474
case e: KubernetesPodApiException =>
7575
//apiserver call failed - this will expose a different error to users
7676
cleanupFailedPod(e, podName, WhiskContainerStartupError(Messages.resourceProvisionError))
77+
case e: KubernetesImagePullFailedException =>
78+
cleanupFailedPod(e, podName, BlackboxStartupError(Messages.imagePullError(image)))
79+
case e: KubernetesImageCommandNotFoundException =>
80+
cleanupFailedPod(e, podName, BlackboxStartupError(s"${Messages.commandNotFoundError} in image ${image}"))
7781
case e: Throwable =>
7882
cleanupFailedPod(e, podName, WhiskContainerStartupError(s"Failed to run container with image '${image}'."))
7983
}

core/standalone/src/main/scala/org/apache/openwhisk/standalone/StandaloneDockerSupport.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -221,7 +221,7 @@ class StandaloneDockerClient(pullDisabled: Boolean)(implicit log: Logging, as: A
221221
for {
222222
_ <- if (shouldPull) pull(image) else Future.successful(())
223223
id <- run(image, args).recoverWith {
224-
case t @ BrokenDockerContainer(brokenId, _) =>
224+
case t @ BrokenDockerContainer(brokenId, _, _) =>
225225
// Remove the broken container - but don't wait or check for the result.
226226
// If the removal fails, there is nothing we could do to recover from the recovery.
227227
rm(brokenId)

tests/src/test/scala/org/apache/openwhisk/common/etcd/EtcdConfigTests.scala

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,8 @@ import org.scalatest.{FlatSpec, Matchers}
77
import org.scalatest.junit.JUnitRunner
88

99
import scala.concurrent.ExecutionContextExecutor
10-
11-
1210
@RunWith(classOf[JUnitRunner])
13-
class EtcdConfigTests
14-
extends FlatSpec
15-
with Matchers
16-
with WskActorSystem {
11+
class EtcdConfigTests extends FlatSpec with Matchers with WskActorSystem {
1712
behavior of "EtcdConfig"
1813

1914
implicit val ece: ExecutionContextExecutor = actorSystem.dispatcher

tests/src/test/scala/org/apache/openwhisk/core/containerpool/docker/test/DockerClientTests.scala

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,6 @@ class DockerClientTests
358358
}
359359

360360
Seq[(ProcessRunningException, String)](
361-
(ProcessUnsuccessfulException(ExitStatus(127), id.asString, "Unknown command"), "Exit code not 125"),
362361
(ProcessUnsuccessfulException(ExitStatus(125), "", "Unknown flag: --foo"), "No container ID"),
363362
(ProcessUnsuccessfulException(ExitStatus(1), "", ""), "Exit code not 125 and no container ID"),
364363
(ProcessTimeoutException(1.second, ExitStatus(125), id.asString, ""), "Timeout instead of unsuccessful command"))
@@ -367,6 +366,18 @@ class DockerClientTests
367366
}
368367
}
369368

369+
it should "fail with BrokenDockerContainer when run returns with exit status 127 and a container ID" in {
370+
val dc = dockerClient {
371+
Future.failed(
372+
ProcessUnsuccessfulException(
373+
exitStatus = ExitStatus(127),
374+
stdout = id.asString,
375+
stderr = """Error response from daemon: OCI runtime create failed: executable file not found"""))
376+
}
377+
val bdc = the[BrokenDockerContainer] thrownBy await(dc.run("image", Seq.empty))
378+
bdc.id shouldBe id
379+
}
380+
370381
it should "fail with ProcessTimeoutException when command times out" in {
371382
val expectedPTE =
372383
ProcessTimeoutException(timeout = 10.seconds, exitStatus = ExitStatus(143), stdout = "stdout", stderr = "stderr")

0 commit comments

Comments
 (0)