Skip to content

Fail all activations when it fails to pull a blackbox image. #5270

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jul 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,8 @@ object LoggingMarkers {
val SCHEDULER_KAFKA_WAIT_TIME =
LogMarkerToken(scheduler, "kafkaWaitTime", counter)(MeasurementUnit.time.milliseconds)
def SCHEDULER_WAIT_TIME(action: String) =
LogMarkerToken(scheduler, "waitTime", counter, Some(action), Map("action" -> action))(MeasurementUnit.time.milliseconds)
LogMarkerToken(scheduler, "waitTime", counter, Some(action), Map("action" -> action))(
MeasurementUnit.time.milliseconds)

def SCHEDULER_KEEP_ALIVE(leaseId: Long) =
LogMarkerToken(scheduler, "keepAlive", counter, None, Map("leaseId" -> leaseId.toString))(MeasurementUnit.none)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,8 @@ object Messages {
/** Indicates that the image could not be pulled. */
def imagePullError(image: String) = s"Failed to pull container image '$image'."

def commandNotFoundError = "executable file not found"

/** Indicates that the container for the action could not be started. */
val resourceProvisionError = "Failed to provision resources to run the action."

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,11 +147,15 @@ class DockerClient(dockerHost: Option[String] = None,
// Examples:
// - Unrecognized option specified
// - Not enough disk space
case pre: ProcessUnsuccessfulException if pre.exitStatus == ExitStatus(125) =>
// Exit status 127 means an error that container command cannot be found.
// Examples:
// - executable file not found in $PATH": unknown
case pre: ProcessUnsuccessfulException
if pre.exitStatus == ExitStatus(125) || pre.exitStatus == ExitStatus(127) =>
Future.failed(
DockerContainerId
.parse(pre.stdout)
.map(BrokenDockerContainer(_, s"Broken container: ${pre.getMessage}"))
.map(BrokenDockerContainer(_, s"Broken container: ${pre.getMessage}", Some(pre.exitStatus.statusValue)))
.getOrElse(pre))
}
}
Expand Down Expand Up @@ -296,4 +300,4 @@ trait DockerApi {
}

/** Indicates any error while starting a container that leaves a broken container behind that needs to be removed */
case class BrokenDockerContainer(id: ContainerId, msg: String) extends Exception(msg)
case class BrokenDockerContainer(id: ContainerId, msg: String, existStatus: Option[Int] = None) extends Exception(msg)
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,14 @@ object DockerContainer {
for {
pullSuccessful <- pulled
id <- docker.run(imageToUse, args).recoverWith {
case BrokenDockerContainer(brokenId, _) =>
case BrokenDockerContainer(brokenId, _, exitStatus) if exitStatus.isEmpty || exitStatus.contains(125) =>
// Remove the broken container - but don't wait or check for the result.
// If the removal fails, there is nothing we could do to recover from the recovery.
docker.rm(brokenId)
Future.failed(WhiskContainerStartupError(Messages.resourceProvisionError))
case BrokenDockerContainer(brokenId, _, exitStatus) if exitStatus.contains(127) =>
docker.rm(brokenId)
Future.failed(BlackboxStartupError(s"${Messages.commandNotFoundError} in image ${imageToUse}"))
case _ =>
// Iff the pull was successful, we assume that the error is not due to an image pull error, otherwise
// the docker run was a backup measure to try and start the container anyway. If it fails again, we assume
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ import java.net.SocketTimeoutException
import java.time.format.DateTimeFormatterBuilder
import java.time.temporal.ChronoField
import java.time.{Instant, ZoneId}

import akka.actor.ActorSystem
import akka.event.Logging.ErrorLevel
import akka.event.Logging.InfoLevel
Expand All @@ -48,6 +47,7 @@ import org.apache.openwhisk.core.containerpool.docker.ProcessRunner
import org.apache.openwhisk.core.containerpool.{ContainerAddress, ContainerId}
import org.apache.openwhisk.core.entity.ByteSize
import org.apache.openwhisk.core.entity.size._
import org.apache.openwhisk.http.Messages
import pureconfig._
import pureconfig.generic.auto._
import spray.json.DefaultJsonProtocol._
Expand Down Expand Up @@ -82,6 +82,16 @@ case class KubernetesEphemeralStorageConfig(limit: ByteSize)
case class KubernetesPodReadyTimeoutException(timeout: FiniteDuration)
extends Exception(s"Pod readiness timed out after ${timeout.toSeconds}s")

/**
* Exception to indicate it failed to pull an image for blackbox actions.
*/
case class KubernetesImagePullFailedException(msg: String) extends Exception(msg)

/**
* Exception to indicate the command for an image is not found.
*/
case class KubernetesImageCommandNotFoundException(msg: String) extends Exception(msg)

/**
* Exception to indicate a pod could not be created at the apiserver.
*/
Expand Down Expand Up @@ -132,6 +142,8 @@ class KubernetesClient(
new DefaultKubernetesClient(configBuilder.build())
}

private val imagePullFailedMsgs = Set("ImagePullBackOff", "ErrImagePull")

private val podBuilder = new WhiskPodBuilder(kubeRestClient, config)

def run(name: String,
Expand Down Expand Up @@ -321,7 +333,23 @@ class KubernetesClient(
.withName(pod.getMetadata.getName)
val deadline = deadlineOpt.getOrElse((timeout - (System.currentTimeMillis() - start.toEpochMilli).millis).fromNow)
if (!readyPod.isReady) {
if (deadline.isOverdue()) {
// when action pod is failed while pulling images, we need to let users know that
val imagePullErr = Try {
readyPod.get.getStatus.getContainerStatuses.asScala.exists { status =>
imagePullFailedMsgs.contains(status.getState.getWaiting.getReason)
}
} getOrElse false
// when command not found in image, we need to let users know that as well
val commandNotFoundErr = Try {
readyPod.get.getStatus.getContainerStatuses.asScala.exists { status =>
status.getState.getWaiting.getMessage.contains(Messages.commandNotFoundError)
}
} getOrElse false
if (imagePullErr) {
Future.failed(KubernetesImagePullFailedException(s"Failed to pull image for pod ${pod.getMetadata.getName}"))
} else if (commandNotFoundErr) {
Future.failed(KubernetesImageCommandNotFoundException(Messages.commandNotFoundError))
} else if (deadline.isOverdue()) {
Future.failed(KubernetesPodReadyTimeoutException(timeout))
} else {
after(1.seconds, scheduler) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,10 @@ object KubernetesContainer {
case e: KubernetesPodApiException =>
//apiserver call failed - this will expose a different error to users
cleanupFailedPod(e, podName, WhiskContainerStartupError(Messages.resourceProvisionError))
case e: KubernetesImagePullFailedException =>
cleanupFailedPod(e, podName, BlackboxStartupError(Messages.imagePullError(image)))
case e: KubernetesImageCommandNotFoundException =>
cleanupFailedPod(e, podName, BlackboxStartupError(s"${Messages.commandNotFoundError} in image ${image}"))
case e: Throwable =>
cleanupFailedPod(e, podName, WhiskContainerStartupError(s"Failed to run container with image '${image}'."))
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ class StandaloneDockerClient(pullDisabled: Boolean)(implicit log: Logging, as: A
for {
_ <- if (shouldPull) pull(image) else Future.successful(())
id <- run(image, args).recoverWith {
case t @ BrokenDockerContainer(brokenId, _) =>
case t @ BrokenDockerContainer(brokenId, _, _) =>
// Remove the broken container - but don't wait or check for the result.
// If the removal fails, there is nothing we could do to recover from the recovery.
rm(brokenId)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,8 @@ import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.junit.JUnitRunner

import scala.concurrent.ExecutionContextExecutor


@RunWith(classOf[JUnitRunner])
class EtcdConfigTests
extends FlatSpec
with Matchers
with WskActorSystem {
class EtcdConfigTests extends FlatSpec with Matchers with WskActorSystem {
behavior of "EtcdConfig"

implicit val ece: ExecutionContextExecutor = actorSystem.dispatcher
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,6 @@ class DockerClientTests
}

Seq[(ProcessRunningException, String)](
(ProcessUnsuccessfulException(ExitStatus(127), id.asString, "Unknown command"), "Exit code not 125"),
(ProcessUnsuccessfulException(ExitStatus(125), "", "Unknown flag: --foo"), "No container ID"),
(ProcessUnsuccessfulException(ExitStatus(1), "", ""), "Exit code not 125 and no container ID"),
(ProcessTimeoutException(1.second, ExitStatus(125), id.asString, ""), "Timeout instead of unsuccessful command"))
Expand All @@ -367,6 +366,18 @@ class DockerClientTests
}
}

it should "fail with BrokenDockerContainer when run returns with exit status 127 and a container ID" in {
val dc = dockerClient {
Future.failed(
ProcessUnsuccessfulException(
exitStatus = ExitStatus(127),
stdout = id.asString,
stderr = """Error response from daemon: OCI runtime create failed: executable file not found"""))
}
val bdc = the[BrokenDockerContainer] thrownBy await(dc.run("image", Seq.empty))
bdc.id shouldBe id
}

it should "fail with ProcessTimeoutException when command times out" in {
val expectedPTE =
ProcessTimeoutException(timeout = 10.seconds, exitStatus = ExitStatus(143), stdout = "stdout", stderr = "stderr")
Expand Down