Skip to content

Commit 8d31e96

Browse files
authored
[Scheduler Enhancement] Increase the retention timeout for the blackbox action. (#5266)
* Increase the retention timeout for the blackbox action. * Fix test cases. * Apply scalaFmt. * Add GracefulShutdown case back. * Increase the blackbox timeout for test cases. * Access the private method directly. * Replace Thread.sleep with awaitAssert * Add the missing configuration. * Enhance the test code. * Remove thread.sleep. * Fix test cases. * Fix test cases.
1 parent a7fecfb commit 8d31e96

File tree

14 files changed

+395
-227
lines changed

14 files changed

+395
-227
lines changed

ansible/group_vars/all

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,6 +518,7 @@ scheduler:
518518
dataManagementService:
519519
retryInterval: "{{ scheduler_dataManagementService_retryInterval | default('1 second') }}"
520520
inProgressJobRetention: "{{ scheduler_inProgressJobRetention | default('20 seconds') }}"
521+
blackboxMultiple: "{{ scheduler_blackboxMultiple | default(15) }}"
521522
managedFraction: "{{ scheduler_managed_fraction | default(1.0 - (scheduler_blackbox_fraction | default(__scheduler_blackbox_fraction))) }}"
522523
blackboxFraction: "{{ scheduler_blackbox_fraction | default(__scheduler_blackbox_fraction) }}"
523524
scheduling:

ansible/roles/schedulers/tasks/deploy.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@
113113
"CONFIG_whisk_scheduler_maxPeek": "{{ scheduler.maxPeek }}"
114114
"CONFIG_whisk_scheduler_dataManagementService_retryInterval": "{{ scheduler.dataManagementService.retryInterval }}"
115115
"CONFIG_whisk_scheduler_inProgressJobRetention": "{{ scheduler.inProgressJobRetention }}"
116+
"CONFIG_whisk_scheduler_blackboxMultiple": "{{ scheduler.blackboxMultiple }}"
116117
"CONFIG_whisk_scheduler_scheduling_staleThreshold": "{{ scheduler.scheduling.staleThreshold }}"
117118
"CONFIG_whisk_scheduler_scheduling_checkInterval": "{{ scheduler.scheduling.checkInterval }}"
118119
"CONFIG_whisk_scheduler_scheduling_dropInterval": "{{ scheduler.scheduling.dropInterval }}"
@@ -124,6 +125,7 @@
124125
"CONFIG_whisk_scheduler_queue_gracefulShutdownTimeout": "{{ scheduler.queue.gracefulShutdownTimeout }}"
125126
"CONFIG_whisk_scheduler_queue_maxRetentionSize": "{{ scheduler.queue.maxRetentionSize }}"
126127
"CONFIG_whisk_scheduler_queue_maxRetentionMs": "{{ scheduler.queue.maxRetentionMs }}"
128+
"CONFIG_whisk_scheduler_queue_maxBlackboxRetentionMs": "{{ scheduler.queue.maxBlackboxRetentionMs }}"
127129
"CONFIG_whisk_scheduler_queue_throttlingFraction": "{{ scheduler.queue.throttlingFraction }}"
128130
"CONFIG_whisk_scheduler_queue_durationBufferSize": "{{ scheduler.queue.durationBufferSize }}"
129131
"CONFIG_whisk_durationChecker_timeWindow": "{{ durationChecker.timeWindow }}"

common/scala/src/main/scala/org/apache/openwhisk/common/Logging.scala

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,8 @@ object LoggingMarkers {
594594
val SCHEDULER_KAFKA_WAIT_TIME =
595595
LogMarkerToken(scheduler, "kafkaWaitTime", counter)(MeasurementUnit.time.milliseconds)
596596
def SCHEDULER_WAIT_TIME(action: String) =
597-
LogMarkerToken(scheduler, "waitTime", counter, Some(action), Map("action" -> action))(MeasurementUnit.time.milliseconds)
597+
LogMarkerToken(scheduler, "waitTime", counter, Some(action), Map("action" -> action))(
598+
MeasurementUnit.time.milliseconds)
598599

599600
def SCHEDULER_KEEP_ALIVE(leaseId: Long) =
600601
LogMarkerToken(scheduler, "keepAlive", counter, None, Map("leaseId" -> leaseId.toString))(MeasurementUnit.none)

common/scala/src/main/scala/org/apache/openwhisk/core/WhiskConfig.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,7 @@ object ConfigKeys {
305305
val schedulerQueue = "whisk.scheduler.queue"
306306
val schedulerQueueManager = "whisk.scheduler.queue-manager"
307307
val schedulerInProgressJobRetention = "whisk.scheduler.in-progress-job-retention"
308+
val schedulerBlackboxMultiple = "whisk.scheduler.blackbox-multiple"
308309
val schedulerStaleThreshold = "whisk.scheduler.stale-threshold"
309310

310311
val whiskClusterName = "whisk.cluster.name"

core/invoker/src/main/scala/org/apache/openwhisk/core/containerpool/v2/FunctionPullingContainerPool.scala

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,14 @@ class FunctionPullingContainerPool(
8888

8989
implicit val ec = context.system.dispatcher
9090

91-
private var busyPool = immutable.Map.empty[ActorRef, Data]
92-
private var inProgressPool = immutable.Map.empty[ActorRef, Data]
93-
private var warmedPool = immutable.Map.empty[ActorRef, WarmData]
94-
private var prewarmedPool = immutable.Map.empty[ActorRef, PreWarmData]
95-
private var prewarmStartingPool = immutable.Map.empty[ActorRef, (String, ByteSize)]
91+
protected[containerpool] var busyPool = immutable.Map.empty[ActorRef, Data]
92+
protected[containerpool] var inProgressPool = immutable.Map.empty[ActorRef, Data]
93+
protected[containerpool] var warmedPool = immutable.Map.empty[ActorRef, WarmData]
94+
protected[containerpool] var prewarmedPool = immutable.Map.empty[ActorRef, PreWarmData]
95+
protected[containerpool] var prewarmStartingPool = immutable.Map.empty[ActorRef, (String, ByteSize)]
9696

9797
// for shutting down
98-
private var disablingPool = immutable.Set.empty[ActorRef]
98+
protected[containerpool] var disablingPool = immutable.Set.empty[ActorRef]
9999

100100
private var shuttingDown = false
101101

core/scheduler/src/main/resources/application.conf

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ whisk {
7676
graceful-shutdown-timeout = "5 seconds"
7777
max-retention-size = "10000"
7878
max-retention-ms = "60000"
79+
max-blackbox-retention-ms = "300000"
7980
throttling-fraction = "0.9"
8081
duration-buffer-size = "10"
8182
}
@@ -85,6 +86,7 @@ whisk {
8586
}
8687
max-peek = "128"
8788
in-progress-job-retention = "20 seconds"
89+
blackbox-multiple = "15"
8890
data-management-service {
8991
retry-interval = "1 second"
9092
}

core/scheduler/src/main/scala/org/apache/openwhisk/core/scheduler/container/CreationJobManager.scala

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,11 @@ case class JobEntry(action: FullyQualifiedEntityName, timer: Cancellable)
4747

4848
class CreationJobManager(feedFactory: (ActorRefFactory, String, String, Int, Array[Byte] => Future[Unit]) => ActorRef,
4949
schedulerInstanceId: SchedulerInstanceId,
50-
dataManagementService: ActorRef)(implicit actorSystem: ActorSystem, logging: Logging)
50+
dataManagementService: ActorRef,
51+
baseTimeout: FiniteDuration,
52+
blackboxMultiple: Int)(implicit actorSystem: ActorSystem, logging: Logging)
5153
extends Actor {
5254
private implicit val ec: ExecutionContext = actorSystem.dispatcher
53-
private val baseTimeout = loadConfigOrThrow[FiniteDuration](ConfigKeys.schedulerInProgressJobRetention)
5455
private val retryLimit = 5
5556

5657
/**
@@ -152,10 +153,10 @@ class CreationJobManager(feedFactory: (ActorRefFactory, String, String, Int, Arr
152153
// If there is a JobEntry, delete it.
153154
creationJobPool
154155
.remove(creationId)
155-
.foreach(entry => {
156-
sendState(state)
157-
entry.timer.cancel()
158-
})
156+
.map(entry => entry.timer.cancel())
157+
158+
// even if there is no entry because of timeout, we still need to send the state to the queue if the queue exists
159+
sendState(state)
159160

160161
dataManagementService ! UnregisterData(key)
161162
Future.successful({})
@@ -176,7 +177,8 @@ class CreationJobManager(feedFactory: (ActorRefFactory, String, String, Int, Arr
176177
revision: DocRevision,
177178
creationId: CreationId,
178179
isBlackbox: Boolean): Cancellable = {
179-
val timeout = if (isBlackbox) FiniteDuration(baseTimeout.toSeconds * 3, TimeUnit.SECONDS) else baseTimeout
180+
val timeout =
181+
if (isBlackbox) FiniteDuration(baseTimeout.toSeconds * blackboxMultiple, TimeUnit.SECONDS) else baseTimeout
180182
actorSystem.scheduler.scheduleOnce(timeout) {
181183
logging.warn(
182184
this,
@@ -222,8 +224,12 @@ class CreationJobManager(feedFactory: (ActorRefFactory, String, String, Int, Arr
222224
}
223225

224226
object CreationJobManager {
227+
private val baseTimeout = loadConfigOrThrow[Int](ConfigKeys.schedulerInProgressJobRetention).seconds
228+
private val blackboxMultiple = loadConfigOrThrow[Int](ConfigKeys.schedulerBlackboxMultiple)
229+
225230
def props(feedFactory: (ActorRefFactory, String, String, Int, Array[Byte] => Future[Unit]) => ActorRef,
226231
schedulerInstanceId: SchedulerInstanceId,
227232
dataManagementService: ActorRef)(implicit actorSystem: ActorSystem, logging: Logging) =
228-
Props(new CreationJobManager(feedFactory, schedulerInstanceId, dataManagementService))
233+
Props(
234+
new CreationJobManager(feedFactory, schedulerInstanceId, dataManagementService, baseTimeout, blackboxMultiple))
229235
}

core/scheduler/src/main/scala/org/apache/openwhisk/core/scheduler/queue/MemoryQueue.scala

Lines changed: 66 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,6 @@ case class QueueRemoved(invocationNamespace: String, action: DocInfo, leaderKey:
8585
case class QueueReactivated(invocationNamespace: String, action: FullyQualifiedEntityName, docInfo: DocInfo)
8686
case class CancelPoll(promise: Promise[Either[MemoryQueueError, ActivationMessage]])
8787
case object QueueRemovedCompleted
88-
case object FlushPulse
8988

9089
// Events received by the actor
9190
case object Start
@@ -125,7 +124,7 @@ class MemoryQueue(private val etcdClient: EtcdClient,
125124
checkToDropStaleActivation: (Queue[TimeSeriesActivationEntry],
126125
Long,
127126
String,
128-
FullyQualifiedEntityName,
127+
WhiskActionMetaData,
129128
MemoryQueueState,
130129
ActorRef) => Unit,
131130
queueConfig: QueueConfig)(implicit logging: Logging)
@@ -151,6 +150,7 @@ class MemoryQueue(private val etcdClient: EtcdClient,
151150
private val memory = actionMetaData.limits.memory.megabytes.MB
152151
private val queueRemovedMsg = QueueRemoved(invocationNamespace, action.toDocId.asDocInfo(revision), Some(leaderKey))
153152
private val staleQueueRemovedMsg = QueueRemoved(invocationNamespace, action.toDocId.asDocInfo(revision), None)
153+
private val actionRetentionTimeout = MemoryQueue.getRetentionTimeout(actionMetaData, queueConfig)
154154

155155
private[queue] var containers = Set.empty[String]
156156
private[queue] var creationIds = Set.empty[String]
@@ -197,7 +197,9 @@ class MemoryQueue(private val etcdClient: EtcdClient,
197197

198198
when(Uninitialized) {
199199
case Event(Start, _) =>
200-
logging.info(this, s"[$invocationNamespace:$action:$stateName] a new queue is created.")
200+
logging.info(
201+
this,
202+
s"[$invocationNamespace:$action:$stateName] a new queue is created, retentionTimeout: $actionRetentionTimeout, kind: ${actionMetaData.exec.kind}.")
201203
val (schedulerActor, droppingActor) = startMonitoring()
202204
initializeThrottling()
203205

@@ -256,7 +258,9 @@ class MemoryQueue(private val etcdClient: EtcdClient,
256258
// when there is no container, it moves to the Flushing state as no activations can be invoked
257259
if (containers.size <= 0) {
258260
val isWhiskError = ContainerCreationError.whiskErrors.contains(error)
259-
completeAllActivations(message, isWhiskError)
261+
if (!isWhiskError) {
262+
completeAllActivations(message, isWhiskError)
263+
}
260264
logging.error(
261265
this,
262266
s"[$invocationNamespace:$action:$stateName] Failed to create an initial container due to ${if (isWhiskError) "whiskError"
@@ -271,7 +275,11 @@ class MemoryQueue(private val etcdClient: EtcdClient,
271275
// there is no timeout for this state as when there is no further message, it would move to the Running state again.
272276
when(NamespaceThrottled) {
273277
case Event(msg: ActivationMessage, _: ThrottledData) =>
274-
handleActivationMessage(msg)
278+
if (containers.size + creationIds.size == 0) {
279+
completeErrorActivation(msg, tooManyConcurrentRequests, isWhiskError = false)
280+
} else {
281+
handleActivationMessage(msg)
282+
}
275283
stay
276284

277285
case Event(DisableNamespaceThrottling, data: ThrottledData) =>
@@ -328,33 +336,51 @@ class MemoryQueue(private val etcdClient: EtcdClient,
328336
goto(Running) using RunningData(schedulerActor, droppingActor)
329337

330338
// log the failed information
331-
case Event(FailedCreationJob(creationId, _, _, _, _, message), data: FlushingData) =>
339+
case Event(FailedCreationJob(creationId, _, _, _, error, message), data: FlushingData) =>
332340
creationIds -= creationId.asString
333341
logging.info(
334342
this,
335343
s"[$invocationNamespace:$action:$stateName][$creationId] Failed to create a container due to $message")
336344

337345
// keep updating the reason
338-
stay using data.copy(reason = message)
346+
stay using data.copy(error = error, reason = message)
339347

340348
// since there is no container, activations cannot be handled.
341349
case Event(msg: ActivationMessage, data: FlushingData) =>
342-
completeErrorActivation(msg, data.reason, ContainerCreationError.whiskErrors.contains(data.error))
350+
logging.info(this, s"[$invocationNamespace:$action:$stateName] got a new activation message ${msg.activationId}")(
351+
msg.transid)
352+
val whiskError = isWhiskError(data.error)
353+
if (whiskError)
354+
queue = queue.enqueue(TimeSeriesActivationEntry(Instant.now, msg))
355+
else
356+
completeErrorActivation(msg, data.reason, whiskError)
343357
stay() using data.copy(activeDuringFlush = true)
344358

345359
// Since SchedulingDecisionMaker keep sending a message to create a container, this state is not automatically timed out.
346360
// Instead, StateTimeout message will be sent by a timer.
347-
case Event(StateTimeout, data: FlushingData) =>
348-
completeAllActivations(data.reason, ContainerCreationError.whiskErrors.contains(data.error))
349-
if (data.activeDuringFlush)
361+
case Event(StateTimeout | DropOld, data: FlushingData) =>
362+
logging.info(this, s"[$invocationNamespace:$action:$stateName] Received StateTimeout, drop stale messages.")
363+
queue =
364+
MemoryQueue.dropOld(queue, Duration.ofMillis(actionRetentionTimeout), data.reason, completeErrorActivation)
365+
if (data.activeDuringFlush || queue.nonEmpty)
350366
stay using data.copy(activeDuringFlush = false)
351367
else
352368
cleanUpActorsAndGotoRemoved(data)
353369

354370
case Event(GracefulShutdown, data: FlushingData) =>
355-
completeAllActivations(data.reason, ContainerCreationError.whiskErrors.contains(data.error))
371+
completeAllActivations(data.reason, isWhiskError(data.error))
356372
logging.info(this, s"[$invocationNamespace:$action:$stateName] Received GracefulShutdown, stop the queue.")
357373
cleanUpActorsAndGotoRemoved(data)
374+
375+
case Event(StopSchedulingAsOutdated, data: FlushingData) =>
376+
logging.info(this, s"[$invocationNamespace:$action:$stateName] stop further scheduling.")
377+
completeAllActivations(data.reason, isWhiskError(data.error))
378+
// let QueueManager know this queue is no longer in charge.
379+
context.parent ! staleQueueRemovedMsg
380+
cleanUpActors(data)
381+
cleanUpData()
382+
383+
goto(Removed) using NoData()
358384
}
359385

360386
// in case there is any activation in the queue, it waits until all of them are handled.
@@ -399,6 +425,7 @@ class MemoryQueue(private val etcdClient: EtcdClient,
399425

400426
// actors and data are already wiped
401427
case Event(QueueRemovedCompleted, _: NoData) =>
428+
logging.info(this, "stop fsm")
402429
stop()
403430

404431
// This is not supposed to happen. This will ensure the queue does not run forever.
@@ -523,15 +550,19 @@ class MemoryQueue(private val etcdClient: EtcdClient,
523550
case Event(DropOld, _) =>
524551
if (queue.nonEmpty && Duration
525552
.between(queue.head.timestamp, Instant.now)
526-
.compareTo(Duration.ofMillis(queueConfig.maxRetentionMs)) < 0) {
553+
.compareTo(Duration.ofMillis(actionRetentionTimeout)) < 0) {
527554
logging.error(
528555
this,
529556
s"[$invocationNamespace:$action:$stateName] Drop some stale activations for $revision, existing container is ${containers.size}, inProgress container is ${creationIds.size}, state data: $stateData, in is $in, current: ${queue.size}.")
530557
logging.error(
531558
this,
532559
s"[$invocationNamespace:$action:$stateName] the head stale message: ${queue.head.msg.activationId}")
533560
}
534-
queue = MemoryQueue.dropOld(queue, Duration.ofMillis(queueConfig.maxRetentionMs), completeErrorActivation)
561+
queue = MemoryQueue.dropOld(
562+
queue,
563+
Duration.ofMillis(actionRetentionTimeout),
564+
s"Activation processing is not initiated for $actionRetentionTimeout ms",
565+
completeErrorActivation)
535566

536567
stay
537568

@@ -861,7 +892,7 @@ class MemoryQueue(private val etcdClient: EtcdClient,
861892
// these schedulers will run forever and stop when the memory queue stops
862893
private def startMonitoring(): (ActorRef, ActorRef) = {
863894
val droppingScheduler = Scheduler.scheduleWaitAtLeast(schedulingConfig.dropInterval) { () =>
864-
checkToDropStaleActivation(queue, queueConfig.maxRetentionMs, invocationNamespace, action, stateName, self)
895+
checkToDropStaleActivation(queue, actionRetentionTimeout, invocationNamespace, actionMetaData, stateName, self)
865896
Future.successful(())
866897
}
867898

@@ -1055,11 +1086,12 @@ class MemoryQueue(private val etcdClient: EtcdClient,
10551086
causedBy ++ limits ++ binding
10561087
})
10571088
}
1089+
1090+
private def isWhiskError(error: ContainerCreationError): Boolean = ContainerCreationError.whiskErrors.contains(error)
10581091
}
10591092

10601093
object MemoryQueue {
10611094
private[queue] val queueConfig = loadConfigOrThrow[QueueConfig](ConfigKeys.schedulerQueue)
1062-
private[queue] val MaxRetentionTime = queueConfig.maxRetentionMs
10631095

10641096
def props(etcdClient: EtcdClient,
10651097
durationChecker: DurationChecker,
@@ -1105,21 +1137,27 @@ object MemoryQueue {
11051137
def dropOld(
11061138
queue: Queue[TimeSeriesActivationEntry],
11071139
retention: Duration,
1140+
reason: String,
11081141
completeErrorActivation: (ActivationMessage, String, Boolean) => Future[Any]): Queue[TimeSeriesActivationEntry] = {
11091142
if (queue.isEmpty || Duration.between(queue.head.timestamp, Instant.now).compareTo(retention) < 0)
11101143
queue
11111144
else {
1112-
completeErrorActivation(queue.head.msg, s"activation processing is not initiated for $MaxRetentionTime ms", true)
1113-
dropOld(queue.tail, retention, completeErrorActivation)
1145+
completeErrorActivation(queue.head.msg, reason, true)
1146+
dropOld(queue.tail, retention, reason, completeErrorActivation)
11141147
}
11151148
}
11161149

11171150
def checkToDropStaleActivation(queue: Queue[TimeSeriesActivationEntry],
11181151
maxRetentionMs: Long,
11191152
invocationNamespace: String,
1120-
action: FullyQualifiedEntityName,
1153+
actionMetaData: WhiskActionMetaData,
11211154
stateName: MemoryQueueState,
11221155
queueRef: ActorRef)(implicit logging: Logging) = {
1156+
val action = actionMetaData.fullyQualifiedName(true)
1157+
logging.debug(
1158+
this,
1159+
s"[$invocationNamespace:$action:$stateName] use the given retention timeout: $maxRetentionMs for this action kind: ${actionMetaData.exec.kind}.")
1160+
11231161
if (queue.nonEmpty && Duration
11241162
.between(queue.head.timestamp, Instant.now)
11251163
.compareTo(Duration.ofMillis(maxRetentionMs)) >= 0) {
@@ -1130,6 +1168,14 @@ object MemoryQueue {
11301168
queueRef ! DropOld
11311169
}
11321170
}
1171+
1172+
private def getRetentionTimeout(actionMetaData: WhiskActionMetaData, queueConfig: QueueConfig): Long = {
1173+
if (actionMetaData.exec.kind == ExecMetaDataBase.BLACKBOX) {
1174+
queueConfig.maxBlackboxRetentionMs
1175+
} else {
1176+
queueConfig.maxRetentionMs
1177+
}
1178+
}
11331179
}
11341180

11351181
case class QueueSnapshot(initialized: Boolean,
@@ -1151,6 +1197,7 @@ case class QueueConfig(idleGrace: FiniteDuration,
11511197
gracefulShutdownTimeout: FiniteDuration,
11521198
maxRetentionSize: Int,
11531199
maxRetentionMs: Long,
1200+
maxBlackboxRetentionMs: Long,
11541201
throttlingFraction: Double,
11551202
durationBufferSize: Int)
11561203

tests/src/test/resources/application.conf.j2

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ whisk {
152152
graceful-shutdown-timeout = "{{ scheduler.queue.gracefulShutdownTimeout | default('5 seconds') }}"
153153
max-retention-size = "{{ scheduler.queue.maxRetentionSize | default(10000) }}"
154154
max-retention-ms = "{{ scheduler.queue.maxRetentionMs | default(60000) }}"
155+
max-blackbox-retention-ms = "{{ scheduler.queue.maxBlackboxRetentionMs}}"
155156
throttling-fraction = "{{ scheduler.queue.throttlingFraction | default(0.9) }}"
156157
duration-buffer-size = "{{ scheduler.queue.durationBufferSize | default(10) }}"
157158
}

0 commit comments

Comments
 (0)