Skip to content

Commit d3a6a9a

Browse files
author
Brendan Doyle
committed
add fpc load balancer metrics
1 parent 3e3414c commit d3a6a9a

File tree

2 files changed

+46
-1
lines changed

2 files changed

+46
-1
lines changed

common/scala/src/main/scala/org/apache/openwhisk/common/Logging.scala

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,15 @@ object LoggingMarkers {
568568
val OFFLINE_INVOKER_BLACKBOX =
569569
LogMarkerToken(loadbalancer, "totalOfflineInvokerBlackBox", counter)(MeasurementUnit.none)
570570

571+
// Invoker totals by health state. FPCPoolBalancer.emitMetrics publishes each of these through
// MetricEmitter.emitGaugeMetric with the number of invokers whose status is Healthy, Unhealthy or
// Offline respectively.
// NOTE(review): the tokens are declared with the `counter` tag although they are emitted as gauges —
// confirm this matches the intended metric naming.
val HEALTHY_INVOKERS =
572+
LogMarkerToken(loadbalancer, "totalHealthyInvoker", counter)(MeasurementUnit.none)
573+
val UNHEALTHY_INVOKERS =
574+
LogMarkerToken(loadbalancer, "totalUnhealthyInvoker", counter)(MeasurementUnit.none)
575+
val OFFLINE_INVOKERS =
576+
LogMarkerToken(loadbalancer, "totalOfflineInvoker", counter)(MeasurementUnit.none)
577+
578+
// Published by FPCPoolBalancer.emitMetrics as the sum, in MB, of userMemory plus busyMemory over the
// invokers whose status is usable.
// NOTE(review): the unit is declared as MeasurementUnit.none while the emitted value is in MB — confirm.
val INVOKER_TOTALMEM = LogMarkerToken(loadbalancer, "totalCapacity", counter)(MeasurementUnit.none)
579+
571580
// Kafka related markers
572581
def KAFKA_QUEUE(topic: String) =
573582
if (TransactionId.metricsKamonTags)

core/controller/src/main/scala/org/apache/openwhisk/core/loadBalancer/FPCPoolBalancer.scala

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,13 @@ package org.apache.openwhisk.core.loadBalancer
33
import java.nio.charset.StandardCharsets
44
import java.util.concurrent.ThreadLocalRandom
55
import java.util.concurrent.atomic.LongAdder
6-
76
import akka.actor.{Actor, ActorRef, ActorRefFactory, ActorSystem, Cancellable, Props}
87
import akka.event.Logging.InfoLevel
98
import akka.pattern.ask
109
import akka.util.Timeout
1110
import org.apache.kafka.clients.producer.RecordMetadata
1211
import org.apache.openwhisk.common.InvokerState.{Healthy, Offline, Unhealthy}
12+
import org.apache.openwhisk.common.LoggingMarkers._
1313
import org.apache.openwhisk.common._
1414
import org.apache.openwhisk.core.connector._
1515
import org.apache.openwhisk.core.controller.Controller
@@ -336,6 +336,16 @@ class FPCPoolBalancer(config: WhiskConfig,
336336
}
337337
}
338338

339+
// Singletons for counter metrics related to completion acks
340+
// Each marker below is built once for this controllerInstance and reused on every emission.

// Emitted right after the info log "received completion ack for ..." (the ack arrived as expected).
protected val LOADBALANCER_COMPLETION_ACK_REGULAR =
341+
LoggingMarkers.LOADBALANCER_COMPLETION_ACK(controllerInstance, RegularCompletionAck)
342+
// Emitted right after the error log "... timeout waiting for the active ack" (the entry timed out).
protected val LOADBALANCER_COMPLETION_ACK_FORCED =
343+
LoggingMarkers.LOADBALANCER_COMPLETION_ACK(controllerInstance, ForcedCompletionAck)
344+
// Emitted right after the warning "received completion ack for ... which has no entry", i.e. the
// invoker's completion message arrived but not in time.
protected val LOADBALANCER_COMPLETION_ACK_REGULAR_AFTER_FORCED =
345+
LoggingMarkers.LOADBALANCER_COMPLETION_ACK(controllerInstance, RegularAfterForcedCompletionAck)
346+
// Emitted right after the debug log "forced completion ack for ... which has no entry", i.e. the
// timeout fired although a completion ack had already removed the entry.
protected val LOADBALANCER_COMPLETION_ACK_FORCED_AFTER_REGULAR =
347+
LoggingMarkers.LOADBALANCER_COMPLETION_ACK(controllerInstance, ForcedAfterRegularCompletionAck)
348+
339349
/** Process the completion ack and update the state */
340350
protected[loadBalancer] def processCompletion(aid: ActivationId,
341351
tid: TransactionId,
@@ -360,8 +370,10 @@ class FPCPoolBalancer(config: WhiskConfig,
360370
// the active ack is received as expected, and processing that message removed the promise
361371
// from the corresponding map
362372
logging.info(this, s"received completion ack for '$aid', system error=$isSystemError")(tid)
373+
MetricEmitter.emitCounterMetric(LOADBALANCER_COMPLETION_ACK_REGULAR)
363374
} else {
364375
logging.error(this, s"Failed to invoke action ${aid.toString}, error: timeout waiting for the active ack")
376+
MetricEmitter.emitCounterMetric(LOADBALANCER_COMPLETION_ACK_FORCED)
365377

366378
// the entry has timed out; if the active ack is still around, remove its entry also
367379
// and complete the promise with a failure if necessary
@@ -379,11 +391,13 @@ class FPCPoolBalancer(config: WhiskConfig,
379391
// Logging this condition as a warning because the invoker processed the activation and sent a completion
380392
// message - but not in time.
381393
logging.warn(this, s"received completion ack for '$aid' which has no entry, system error=$isSystemError")(tid)
394+
MetricEmitter.emitCounterMetric(LOADBALANCER_COMPLETION_ACK_REGULAR_AFTER_FORCED)
382395
case None =>
383396
// The entry has already been removed by a completion ack. This part of the code is reached by the timeout and can
384397
// happen if completion ack and timeout happen roughly at the same time (the timeout was triggered before the completion
385398
// ack canceled the timer). As the completion ack is already processed we don't have to do anything here.
386399
logging.debug(this, s"forced completion ack for '$aid' which has no entry")(tid)
400+
MetricEmitter.emitCounterMetric(LOADBALANCER_COMPLETION_ACK_FORCED_AFTER_REGULAR)
387401
}
388402
}
389403

@@ -601,6 +615,28 @@ class FPCPoolBalancer(config: WhiskConfig,
601615
}
602616
}
603617

618+
/**
 * Publishes the load-balancer gauge metrics derived from the current invoker list.
 *
 * Once invokerHealth() yields the invokers, this emits: the number of Healthy, Unhealthy and Offline
 * invokers, the total capacity in MB across usable invokers, and this controller's in-flight
 * activation count and in-flight activation memory.
 *
 * The method returns whatever `invokerHealth().map(...)` returns (no explicit result type is declared).
 * NOTE(review): invokerHealth() is presumably a Future; no failure handling is visible here, so a
 * failed lookup would emit nothing and go unreported — confirm.
 */
def emitMetrics() = {
619+
invokerHealth().map(invokers => {
620+
// One gauge per health state, each counting the invokers currently in that state.
MetricEmitter.emitGaugeMetric(HEALTHY_INVOKERS, invokers.count(_.status == Healthy))
621+
MetricEmitter.emitGaugeMetric(UNHEALTHY_INVOKERS, invokers.count(_.status == Unhealthy))
622+
MetricEmitter.emitGaugeMetric(OFFLINE_INVOKERS, invokers.count(_.status == Offline))
623+
// Add both user memory and busy memory because user memory represents free memory in this case
624+
MetricEmitter.emitGaugeMetric(
625+
INVOKER_TOTALMEM,
626+
invokers.foldLeft(0L) { (total, curr) =>
627+
// Only invokers whose status is usable contribute to the capacity total.
if (curr.status.isUsable) {
628+
// busyMemory is optional; a missing value is counted as zero bytes.
curr.id.userMemory.toMB + curr.id.busyMemory.getOrElse(ByteSize(0, SizeUnits.BYTE)).toMB + total
629+
} else {
630+
total
631+
}
632+
})
633+
// In-flight totals are read from this balancer's own counters, not from the invoker list.
MetricEmitter.emitGaugeMetric(LOADBALANCER_ACTIVATIONS_INFLIGHT(controllerInstance), totalActivations.longValue)
634+
// NOTE(review): the second argument is an empty string; its meaning is defined by
// LOADBALANCER_MEMORY_INFLIGHT, which is not visible here.
MetricEmitter.emitGaugeMetric(LOADBALANCER_MEMORY_INFLIGHT(controllerInstance, ""), totalActivationMemory.longValue)
635+
})
636+
}
637+
638+
// Runs emitMetrics() every 10 seconds, with the first run 10 seconds after this statement executes.
// The value returned by scheduleAtFixedRate is not kept, so nothing visible here cancels the schedule.
actorSystem.scheduler.scheduleAtFixedRate(10.seconds, 10.seconds)(() => emitMetrics())
639+
604640
/**
 * Gets the number of in-flight activations for a specific user.
 *
 * @param namespace the namespace UUID used as the key into activationsPerNamespace
 * @return an already-completed Future holding the counter's int value for that namespace,
 *         or 0 when the namespace has no entry
 */
605641
override def activeActivationsFor(namespace: UUID): Future[Int] =
606642
Future.successful(activationsPerNamespace.get(namespace).map(_.intValue()).getOrElse(0))

0 commit comments

Comments
 (0)