Skip to content

Commit d7b9d24

Browse files
Use GaugeHistogram for Prometheus LongTaskTimer
The Prometheus registry used "classic" cumulative Prometheus Histogram for LongTaskTimer. This was conceptually wrong since _count, _sum, and _bucket are counters (monotonic) in case of "classic" Prometheus Histogram but since LongTaskTimer measures the currently active tasks, these values can increase or decrease so a Prometheus GaugeHistogram is a better fit for it since there _gcount, _gsum, and _bucket are gauges (non-monotonic). See the OpenMetrics specs for more details: https://github.com/OpenObservability/OpenMetrics/blob/1386544931307dff279688f332890c31b6c5de36/specification/OpenMetrics.md#gaugehistogram This also means that the suffix of the time series change: - From _count to _gcount - From _sum to _gsum So the scrape output of a LongTaskTimer (test.ltt) could look like this: # HELP test_ltt_seconds # TYPE test_ltt_seconds histogram test_ltt_seconds_bucket{le="120.0"} 1 [...] test_ltt_seconds_bucket{le="7200.0"} 1 test_ltt_seconds_bucket{le="+Inf"} 1 # HELP test_ltt_seconds_gcount # TYPE test_ltt_seconds_gcount gauge test_ltt_seconds_gcount 1 # HELP test_ltt_seconds_gsum # TYPE test_ltt_seconds_gsum gauge test_ltt_seconds_gsum 10.0 # HELP test_ltt_seconds_max # TYPE test_ltt_seconds_max gauge test_ltt_seconds_max 10.0 Closes gh-4988
1 parent b2d8dc7 commit d7b9d24

File tree

2 files changed

+235
-15
lines changed

2 files changed

+235
-15
lines changed

implementations/micrometer-registry-prometheus/src/main/java/io/micrometer/prometheusmetrics/PrometheusMeterRegistry.java

+9-9
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222
import io.micrometer.core.instrument.distribution.HistogramSnapshot;
2323
import io.micrometer.core.instrument.distribution.*;
2424
import io.micrometer.core.instrument.distribution.pause.PauseDetector;
25-
import io.micrometer.core.instrument.internal.CumulativeHistogramLongTaskTimer;
2625
import io.micrometer.core.instrument.internal.DefaultGauge;
26+
import io.micrometer.core.instrument.internal.DefaultLongTaskTimer;
2727
import io.micrometer.core.instrument.internal.DefaultMeter;
2828
import io.micrometer.core.instrument.util.TimeUtils;
2929
import io.prometheus.metrics.config.PrometheusProperties;
@@ -308,8 +308,8 @@ protected io.micrometer.core.instrument.Timer newTimer(Meter.Id id,
308308
DistributionStatisticConfig distributionStatisticConfig, PauseDetector pauseDetector) {
309309
PrometheusTimer timer = new PrometheusTimer(id, clock, distributionStatisticConfig, pauseDetector,
310310
exemplarSamplerFactory);
311-
applyToCollector(id,
312-
(collector) -> addDistributionStatisticSamples(id, collector, timer, timer::exemplars, tagValues(id)));
311+
applyToCollector(id, (collector) -> addDistributionStatisticSamples(id, collector, timer, timer::exemplars,
312+
tagValues(id), false));
313313
return timer;
314314
}
315315

@@ -339,10 +339,9 @@ protected <T> io.micrometer.core.instrument.Gauge newGauge(Meter.Id id, @Nullabl
339339

340340
@Override
341341
protected LongTaskTimer newLongTaskTimer(Meter.Id id, DistributionStatisticConfig distributionStatisticConfig) {
342-
LongTaskTimer ltt = new CumulativeHistogramLongTaskTimer(id, clock, getBaseTimeUnit(),
343-
distributionStatisticConfig);
342+
LongTaskTimer ltt = new DefaultLongTaskTimer(id, clock, getBaseTimeUnit(), distributionStatisticConfig, true);
344343
applyToCollector(id, (collector) -> addDistributionStatisticSamples(id, collector, ltt, () -> Exemplars.EMPTY,
345-
tagValues(id)));
344+
tagValues(id), true));
346345
return ltt;
347346
}
348347

@@ -449,7 +448,8 @@ public PrometheusRegistry getPrometheusRegistry() {
449448
}
450449

451450
private void addDistributionStatisticSamples(Meter.Id id, MicrometerCollector collector,
452-
HistogramSupport histogramSupport, Supplier<Exemplars> exemplarsSupplier, List<String> tagValues) {
451+
HistogramSupport histogramSupport, Supplier<Exemplars> exemplarsSupplier, List<String> tagValues,
452+
boolean forLongTaskTimer) {
453453
collector.add(tagValues, (conventionName, tagKeys) -> {
454454
Stream.Builder<MicrometerCollector.Family<?>> families = Stream.builder();
455455

@@ -499,8 +499,8 @@ private void addDistributionStatisticSamples(Meter.Id id, MicrometerCollector co
499499

500500
Exemplars exemplars = createExemplarsWithScaledValues(exemplarsSupplier.get());
501501
families.add(new MicrometerCollector.Family<>(conventionName,
502-
family -> new io.prometheus.metrics.model.snapshots.HistogramSnapshot(family.metadata,
503-
family.dataPointSnapshots),
502+
family -> new io.prometheus.metrics.model.snapshots.HistogramSnapshot(forLongTaskTimer,
503+
family.metadata, family.dataPointSnapshots),
504504
getMetadata(id), new HistogramDataPointSnapshot(ClassicHistogramBuckets.of(buckets, counts),
505505
sum, Labels.of(tagKeys, tagValues), exemplars, 0)));
506506

implementations/micrometer-registry-prometheus/src/test/java/io/micrometer/prometheusmetrics/PrometheusMeterRegistryTest.java

+226-6
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
package io.micrometer.prometheusmetrics;
1717

1818
import io.micrometer.core.Issue;
19+
import io.micrometer.core.instrument.LongTaskTimer.Sample;
1920
import io.micrometer.core.instrument.Timer;
2021
import io.micrometer.core.instrument.*;
2122
import io.micrometer.core.instrument.binder.BaseUnits;
@@ -583,12 +584,231 @@ void timerSumAndMaxHaveCorrectBaseUnit_whenPercentileHistogramEnabled() {
583584
}
584585

585586
@Test
586-
void scrapeWithLongTaskTimer() {
587-
LongTaskTimer.builder("my.long.task.timer").register(registry);
588-
assertThat(registry.scrape()).contains("my_long_task_timer_seconds_max")
589-
// since Prometheus client 1.x, suffix _active_count => _count
590-
.contains("my_long_task_timer_seconds_count")
591-
.contains("my_long_task_timer_seconds_sum");
587+
@Issue("#4988")
588+
void longTaskTimerRecordingsSchouldBeCorrect() {
589+
LongTaskTimer ltt = LongTaskTimer.builder("test.ltt").publishPercentileHistogram().register(registry);
590+
591+
String result = registry.scrape();
592+
// since Prometheus client 1.x, suffix _active_count => _gcount and _sum => _gsum
593+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
594+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
595+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
596+
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
597+
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
598+
assertThat(result).contains("test_ltt_seconds_max 0.0\n");
599+
600+
// A task started
601+
Sample sample = ltt.start();
602+
clock.add(150, TimeUnit.SECONDS);
603+
result = registry.scrape();
604+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
605+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"137.438953471\"} 0\n");
606+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"160.345445716\"} 1\n");
607+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
608+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
609+
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
610+
assertThat(result).contains("test_ltt_seconds_gsum 150.0\n");
611+
assertThat(result).contains("test_ltt_seconds_max 150.0\n");
612+
613+
// After a while another scrape happens, the task is still in progress...
614+
clock.add(20, TimeUnit.SECONDS);
615+
result = registry.scrape();
616+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
617+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"137.438953471\"} 0\n");
618+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"160.345445716\"} 0\n");
619+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"183.251937961\"} 1\n");
620+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
621+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
622+
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
623+
assertThat(result).contains("test_ltt_seconds_gsum 170.0\n");
624+
assertThat(result).contains("test_ltt_seconds_max 170.0\n");
625+
626+
// Another task started
627+
Sample sample2 = ltt.start();
628+
clock.add(10, TimeUnit.SECONDS);
629+
result = registry.scrape();
630+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 1\n");
631+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"137.438953471\"} 1\n");
632+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"160.345445716\"} 1\n");
633+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"183.251937961\"} 2\n");
634+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 2\n");
635+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 2\n");
636+
assertThat(result).contains("test_ltt_seconds_gcount 2\n");
637+
assertThat(result).contains("test_ltt_seconds_gsum 190.0\n");
638+
assertThat(result).contains("test_ltt_seconds_max 180.0\n");
639+
640+
sample2.stop();
641+
642+
// After the second task stopped
643+
clock.add(1, TimeUnit.SECONDS);
644+
result = registry.scrape();
645+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
646+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"137.438953471\"} 0\n");
647+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"160.345445716\"} 0\n");
648+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"183.251937961\"} 1\n");
649+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
650+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
651+
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
652+
assertThat(result).contains("test_ltt_seconds_gsum 181.0\n");
653+
assertThat(result).contains("test_ltt_seconds_max 181.0\n");
654+
655+
sample.stop();
656+
657+
// After the first task stopped
658+
clock.add(10, TimeUnit.SECONDS);
659+
result = registry.scrape();
660+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
661+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
662+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
663+
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
664+
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
665+
assertThat(result).contains("test_ltt_seconds_max 0.0\n");
666+
}
667+
668+
@Test
669+
@Issue("#4988")
670+
void longTaskTimerInfBucketShouldBeCorrect() {
671+
LongTaskTimer ltt = LongTaskTimer.builder("test.ltt").publishPercentileHistogram().register(registry);
672+
673+
String result = registry.scrape();
674+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
675+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
676+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
677+
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
678+
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
679+
assertThat(result).contains("test_ltt_seconds_max 0.0\n");
680+
681+
// A task started
682+
Sample sample = ltt.start();
683+
clock.add(7000, TimeUnit.SECONDS);
684+
result = registry.scrape();
685+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
686+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"5864.062014805\"} 0\n");
687+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
688+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
689+
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
690+
assertThat(result).contains("test_ltt_seconds_gsum 7000.0\n");
691+
assertThat(result).contains("test_ltt_seconds_max 7000.0\n");
692+
693+
// After a while another scrape happens, the task is still in progress...
694+
// Now the task is in the +Inf bucket
695+
clock.add(500, TimeUnit.SECONDS);
696+
result = registry.scrape();
697+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
698+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"5864.062014805\"} 0\n");
699+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
700+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
701+
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
702+
assertThat(result).contains("test_ltt_seconds_gsum 7500.0\n");
703+
assertThat(result).contains("test_ltt_seconds_max 7500.0\n");
704+
705+
// Another task started
706+
Sample sample2 = ltt.start();
707+
clock.add(500, TimeUnit.SECONDS);
708+
result = registry.scrape();
709+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
710+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"458.129844906\"} 0\n");
711+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"549.755813887\"} 1\n");
712+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"5864.062014805\"} 1\n");
713+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
714+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 2\n");
715+
assertThat(result).contains("test_ltt_seconds_gcount 2\n");
716+
assertThat(result).contains("test_ltt_seconds_gsum 8500.0\n");
717+
assertThat(result).contains("test_ltt_seconds_max 8000.0\n");
718+
719+
sample2.stop();
720+
721+
// After the second task stopped
722+
clock.add(500, TimeUnit.SECONDS);
723+
result = registry.scrape();
724+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
725+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"458.129844906\"} 0\n");
726+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"549.755813887\"} 0\n");
727+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"5864.062014805\"} 0\n");
728+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
729+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
730+
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
731+
assertThat(result).contains("test_ltt_seconds_gsum 8500.0\n");
732+
assertThat(result).contains("test_ltt_seconds_max 8500.0\n");
733+
734+
sample.stop();
735+
736+
// After the first task stopped
737+
result = registry.scrape();
738+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
739+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
740+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
741+
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
742+
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
743+
assertThat(result).contains("test_ltt_seconds_max 0.0\n");
744+
}
745+
746+
@Test
747+
@Issue("#4988")
748+
void nonTelescopicLongTaskTimerRecordingsShouldBeCorrect() {
749+
LongTaskTimer ltt = LongTaskTimer.builder("test.ltt").publishPercentileHistogram().register(registry);
750+
751+
String result = registry.scrape();
752+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
753+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
754+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
755+
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
756+
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
757+
assertThat(result).contains("test_ltt_seconds_max 0.0\n");
758+
759+
// A task started
760+
Sample sample = ltt.start();
761+
clock.add(200, TimeUnit.SECONDS);
762+
result = registry.scrape();
763+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
764+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"183.251937961\"} 0\n");
765+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"206.158430206\"} 1\n");
766+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
767+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
768+
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
769+
assertThat(result).contains("test_ltt_seconds_gsum 200.0\n");
770+
assertThat(result).contains("test_ltt_seconds_max 200.0\n");
771+
772+
// A second task started before the first stopped
773+
Sample sample2 = ltt.start();
774+
// The first task stopped
775+
sample.stop();
776+
777+
clock.add(100, TimeUnit.SECONDS);
778+
result = registry.scrape();
779+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 1\n");
780+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
781+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
782+
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
783+
assertThat(result).contains("test_ltt_seconds_gsum 100.0\n");
784+
assertThat(result).contains("test_ltt_seconds_max 100.0\n");
785+
786+
// The second task stopped
787+
sample2.stop();
788+
789+
// A third task started after the first and second stopped
790+
Sample sample3 = ltt.start();
791+
clock.add(300, TimeUnit.SECONDS);
792+
result = registry.scrape();
793+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
794+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"274.877906944\"} 0\n");
795+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"366.503875925\"} 1\n");
796+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 1\n");
797+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 1\n");
798+
assertThat(result).contains("test_ltt_seconds_gcount 1\n");
799+
assertThat(result).contains("test_ltt_seconds_gsum 300.0\n");
800+
assertThat(result).contains("test_ltt_seconds_max 300.0\n");
801+
802+
// The third task stopped
803+
sample3.stop();
804+
805+
result = registry.scrape();
806+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"120.0\"} 0\n");
807+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"7200.0\"} 0\n");
808+
assertThat(result).contains("test_ltt_seconds_bucket{le=\"+Inf\"} 0\n");
809+
assertThat(result).contains("test_ltt_seconds_gcount 0\n");
810+
assertThat(result).contains("test_ltt_seconds_gsum 0.0\n");
811+
assertThat(result).contains("test_ltt_seconds_max 0.0\n");
592812
}
593813

594814
@Issue("#2087")

0 commit comments

Comments
 (0)