Skip to content

Commit 426aef4

Browse files
bdoyle0182 and Brendan Doyle authored
add fpc load balancer metrics (#5240)
Co-authored-by: Brendan Doyle <[email protected]>
1 parent 7fdc246 commit 426aef4

File tree

2 files changed

+46
-1
lines changed

2 files changed

+46
-1
lines changed

common/scala/src/main/scala/org/apache/openwhisk/common/Logging.scala

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -568,6 +568,15 @@ object LoggingMarkers {
568568
val OFFLINE_INVOKER_BLACKBOX =
569569
LogMarkerToken(loadbalancer, "totalOfflineInvokerBlackBox", counter)(MeasurementUnit.none)
570570

571+
val HEALTHY_INVOKERS =
572+
LogMarkerToken(loadbalancer, "totalHealthyInvoker", counter)(MeasurementUnit.none)
573+
val UNHEALTHY_INVOKERS =
574+
LogMarkerToken(loadbalancer, "totalUnhealthyInvoker", counter)(MeasurementUnit.none)
575+
val OFFLINE_INVOKERS =
576+
LogMarkerToken(loadbalancer, "totalOfflineInvoker", counter)(MeasurementUnit.none)
577+
578+
val INVOKER_TOTALMEM = LogMarkerToken(loadbalancer, "totalCapacity", counter)(MeasurementUnit.none)
579+
571580
// Kafka related markers
572581
def KAFKA_QUEUE(topic: String) =
573582
if (TransactionId.metricsKamonTags)

core/controller/src/main/scala/org/apache/openwhisk/core/loadBalancer/FPCPoolBalancer.scala

Lines changed: 37 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,12 +3,12 @@ package org.apache.openwhisk.core.loadBalancer
33
import java.nio.charset.StandardCharsets
44
import java.util.concurrent.ThreadLocalRandom
55
import java.util.concurrent.atomic.LongAdder
6-
76
import akka.actor.{Actor, ActorRef, ActorRefFactory, ActorSystem, Cancellable, Props}
87
import akka.event.Logging.InfoLevel
98
import akka.pattern.ask
109
import akka.util.Timeout
1110
import org.apache.openwhisk.common.InvokerState.{Healthy, Offline, Unhealthy}
11+
import org.apache.openwhisk.common.LoggingMarkers._
1212
import org.apache.openwhisk.common._
1313
import org.apache.openwhisk.core.connector._
1414
import org.apache.openwhisk.core.controller.Controller
@@ -335,6 +335,16 @@ class FPCPoolBalancer(config: WhiskConfig,
335335
}
336336
}
337337

338+
// Singletons for counter metrics related to completion acks
339+
protected val LOADBALANCER_COMPLETION_ACK_REGULAR =
340+
LoggingMarkers.LOADBALANCER_COMPLETION_ACK(controllerInstance, RegularCompletionAck)
341+
protected val LOADBALANCER_COMPLETION_ACK_FORCED =
342+
LoggingMarkers.LOADBALANCER_COMPLETION_ACK(controllerInstance, ForcedCompletionAck)
343+
protected val LOADBALANCER_COMPLETION_ACK_REGULAR_AFTER_FORCED =
344+
LoggingMarkers.LOADBALANCER_COMPLETION_ACK(controllerInstance, RegularAfterForcedCompletionAck)
345+
protected val LOADBALANCER_COMPLETION_ACK_FORCED_AFTER_REGULAR =
346+
LoggingMarkers.LOADBALANCER_COMPLETION_ACK(controllerInstance, ForcedAfterRegularCompletionAck)
347+
338348
/** Process the completion ack and update the state */
339349
protected[loadBalancer] def processCompletion(aid: ActivationId,
340350
tid: TransactionId,
@@ -359,8 +369,10 @@ class FPCPoolBalancer(config: WhiskConfig,
359369
// the active ack is received as expected, and processing that message removed the promise
360370
// from the corresponding map
361371
logging.info(this, s"received completion ack for '$aid', system error=$isSystemError")(tid)
372+
MetricEmitter.emitCounterMetric(LOADBALANCER_COMPLETION_ACK_REGULAR)
362373
} else {
363374
logging.error(this, s"Failed to invoke action ${aid.toString}, error: timeout waiting for the active ack")
375+
MetricEmitter.emitCounterMetric(LOADBALANCER_COMPLETION_ACK_FORCED)
364376

365377
// the entry has timed out; if the active ack is still around, remove its entry also
366378
// and complete the promise with a failure if necessary
@@ -378,11 +390,13 @@ class FPCPoolBalancer(config: WhiskConfig,
378390
// Logging this condition as a warning because the invoker processed the activation and sent a completion
379391
// message - but not in time.
380392
logging.warn(this, s"received completion ack for '$aid' which has no entry, system error=$isSystemError")(tid)
393+
MetricEmitter.emitCounterMetric(LOADBALANCER_COMPLETION_ACK_REGULAR_AFTER_FORCED)
381394
case None =>
382395
// The entry has already been removed by a completion ack. This part of the code is reached by the timeout and can
383396
// happen if completion ack and timeout happen roughly at the same time (the timeout was triggered before the completion
384397
// ack canceled the timer). As the completion ack is already processed we don't have to do anything here.
385398
logging.debug(this, s"forced completion ack for '$aid' which has no entry")(tid)
399+
MetricEmitter.emitCounterMetric(LOADBALANCER_COMPLETION_ACK_FORCED_AFTER_REGULAR)
386400
}
387401
}
388402

@@ -600,6 +614,28 @@ class FPCPoolBalancer(config: WhiskConfig,
600614
}
601615
}
602616

617+
def emitMetrics() = {
618+
invokerHealth().map(invokers => {
619+
MetricEmitter.emitGaugeMetric(HEALTHY_INVOKERS, invokers.count(_.status == Healthy))
620+
MetricEmitter.emitGaugeMetric(UNHEALTHY_INVOKERS, invokers.count(_.status == Unhealthy))
621+
MetricEmitter.emitGaugeMetric(OFFLINE_INVOKERS, invokers.count(_.status == Offline))
622+
// Add both user memory and busy memory because user memory represents free memory in this case
623+
MetricEmitter.emitGaugeMetric(
624+
INVOKER_TOTALMEM,
625+
invokers.foldLeft(0L) { (total, curr) =>
626+
if (curr.status.isUsable) {
627+
curr.id.userMemory.toMB + curr.id.busyMemory.getOrElse(ByteSize(0, SizeUnits.BYTE)).toMB + total
628+
} else {
629+
total
630+
}
631+
})
632+
MetricEmitter.emitGaugeMetric(LOADBALANCER_ACTIVATIONS_INFLIGHT(controllerInstance), totalActivations.longValue)
633+
MetricEmitter.emitGaugeMetric(LOADBALANCER_MEMORY_INFLIGHT(controllerInstance, ""), totalActivationMemory.longValue)
634+
})
635+
}
636+
637+
actorSystem.scheduler.scheduleAtFixedRate(10.seconds, 10.seconds)(() => emitMetrics())
638+
603639
/** Gets the number of in-flight activations for a specific user. */
604640
override def activeActivationsFor(namespace: UUID): Future[Int] =
605641
Future.successful(activationsPerNamespace.get(namespace).map(_.intValue()).getOrElse(0))

0 commit comments

Comments
 (0)