
Commit 4865ce7

fix: fix histogram metric query (#737)
Fix for the histogram query, based on the late-night session at KubeCon. Ref: HDX-1572
1 parent 08009ac commit 4865ce7

File tree

5 files changed: +216 -98 lines changed


.changeset/old-rules-check.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+---
+"@hyperdx/common-utils": patch
+---
+
+Fixes the histogram query to perform quantile calculation across all data points
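For orientation, the quantile math this patch moves to can be sketched in a few lines of TypeScript: merge the bucket counts of every contributing data point, locate the bucket containing the rank, and linearly interpolate, using the same formula as the generated SQL snapshot further down. `histogramQuantile` is a hypothetical helper for illustration, not an export of `@hyperdx/common-utils`.

```ts
// Illustrative only: a hand-rolled version of the bucket-quantile math the
// fixed query performs. Not part of @hyperdx/common-utils.
function histogramQuantile(
  bounds: number[], // explicit bucket upper bounds, e.g. [1, 2, 5, 8, 13]
  counts: number[], // merged bucket counts, length === bounds.length + 1
  level: number,    // quantile level, e.g. 0.5 for p50
): number {
  const total = counts.reduce((a, b) => a + b, 0);
  const rank = level * total;
  let cum = 0;
  for (let i = 0; i < counts.length; i++) {
    cum += counts[i];
    if (cum > rank) {
      if (i === 0) return bounds[0]; // rank falls in the first bucket
      if (i >= bounds.length) return bounds[bounds.length - 1]; // overflow bucket
      const lo = bounds[i - 1];
      const hi = bounds[i];
      // Same interpolation as the CASE expression in the updated SQL snapshot:
      return lo + ((rank - lo) / hi) * (hi - lo);
    }
  }
  return bounds[bounds.length - 1];
}
```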

packages/api/src/clickhouse/__tests__/__snapshots__/renderChartConfig.test.ts.snap

Lines changed: 19 additions & 10 deletions
@@ -60,6 +60,15 @@ Array [
 ]
 `;

+exports[`renderChartConfig Query Metrics should include multiple data points in percentile computation (p50) 1`] = `
+Array [
+  Object {
+    "__hdx_time_bucket": "2022-01-05T00:00:00Z",
+    "any(toFloat64OrNull(toString(Rate)))": 2.9,
+  },
+]
+`;
+
 exports[`renderChartConfig Query Metrics single avg gauge with group-by 1`] = `
 Array [
   Object {
@@ -192,11 +201,11 @@ exports[`renderChartConfig Query Metrics two_timestamps_bounded histogram (p25)
 Array [
   Object {
     "__hdx_time_bucket": "2022-01-05T00:00:00Z",
-    "sum(toFloat64OrNull(toString(Rate)))": 0,
+    "any(toFloat64OrNull(toString(Rate)))": 0,
   },
   Object {
     "__hdx_time_bucket": "2022-01-05T00:01:00Z",
-    "sum(toFloat64OrNull(toString(Rate)))": 10,
+    "any(toFloat64OrNull(toString(Rate)))": 10,
   },
 ]
 `;
@@ -205,11 +214,11 @@ exports[`renderChartConfig Query Metrics two_timestamps_bounded histogram (p50)
 Array [
   Object {
     "__hdx_time_bucket": "2022-01-05T00:00:00Z",
-    "sum(toFloat64OrNull(toString(Rate)))": 0,
+    "any(toFloat64OrNull(toString(Rate)))": 0,
   },
   Object {
     "__hdx_time_bucket": "2022-01-05T00:01:00Z",
-    "sum(toFloat64OrNull(toString(Rate)))": 20,
+    "any(toFloat64OrNull(toString(Rate)))": 13.333333333333332,
   },
 ]
 `;
@@ -218,11 +227,11 @@ exports[`renderChartConfig Query Metrics two_timestamps_bounded histogram (p90)
 Array [
   Object {
     "__hdx_time_bucket": "2022-01-05T00:00:00Z",
-    "sum(toFloat64OrNull(toString(Rate)))": 0,
+    "any(toFloat64OrNull(toString(Rate)))": 0,
   },
   Object {
     "__hdx_time_bucket": "2022-01-05T00:01:00Z",
-    "sum(toFloat64OrNull(toString(Rate)))": 30,
+    "any(toFloat64OrNull(toString(Rate)))": 30,
   },
 ]
 `;
@@ -231,11 +240,11 @@ exports[`renderChartConfig Query Metrics two_timestamps_lower_bound_inf histogra
 Array [
   Object {
     "__hdx_time_bucket": "2022-01-05T00:00:00Z",
-    "sum(toFloat64OrNull(toString(Rate)))": 0,
+    "any(toFloat64OrNull(toString(Rate)))": 0,
   },
   Object {
     "__hdx_time_bucket": "2022-01-05T00:01:00Z",
-    "sum(toFloat64OrNull(toString(Rate)))": 1,
+    "any(toFloat64OrNull(toString(Rate)))": 1,
   },
 ]
 `;
@@ -244,11 +253,11 @@ exports[`renderChartConfig Query Metrics two_timestamps_upper_bound_inf histogra
 Array [
   Object {
     "__hdx_time_bucket": "2022-01-05T00:00:00Z",
-    "sum(toFloat64OrNull(toString(Rate)))": 0,
+    "any(toFloat64OrNull(toString(Rate)))": 0,
   },
   Object {
     "__hdx_time_bucket": "2022-01-05T00:01:00Z",
-    "sum(toFloat64OrNull(toString(Rate)))": 30,
+    "any(toFloat64OrNull(toString(Rate)))": 30,
   },
 ]
 `;

packages/api/src/clickhouse/__tests__/renderChartConfig.test.ts

Lines changed: 89 additions & 13 deletions
@@ -446,6 +446,36 @@ describe('renderChartConfig', () => {
       AggregationTemporality: 2, // Cumulative
       ...point,
     }));
+    const histPointsE = [
+      {
+        BucketCounts: [1, 1, 1, 1, 1, 1],
+        ExplicitBounds: [1, 2, 5, 8, 13],
+        TimeUnix: new Date(now),
+        ResourceAttributes: { host: 'test-a' },
+      },
+      {
+        BucketCounts: [2, 2, 2, 2, 2, 2],
+        ExplicitBounds: [1, 2, 5, 8, 13],
+        TimeUnix: new Date(now + ms('5s')),
+        ResourceAttributes: { host: 'test-b' },
+      },
+      {
+        BucketCounts: [2, 1, 2, 1, 2, 1],
+        ExplicitBounds: [1, 2, 5, 8, 13],
+        TimeUnix: new Date(now + ms('1m')),
+        ResourceAttributes: { host: 'test-a' },
+      },
+      {
+        BucketCounts: [3, 3, 2, 2, 3, 3],
+        ExplicitBounds: [1, 2, 5, 8, 13],
+        TimeUnix: new Date(now + ms('65s')),
+        ResourceAttributes: { host: 'test-b' },
+      },
+    ].map(point => ({
+      MetricName: 'test.multiple_series',
+      AggregationTemporality: 2, // Cumulative
+      ...point,
+    }));

     await Promise.all([
       bulkInsertMetricsGauge([...gaugePointsA, ...gaugePointsB]),
@@ -462,6 +492,7 @@ describe('renderChartConfig', () => {
         ...histPointsB,
         ...histPointsC,
         ...histPointsD,
+        ...histPointsE,
       ]),
     ]);
   });
@@ -655,6 +686,10 @@ describe('renderChartConfig', () => {
   });

   it('calculates min_rate/max_rate correctly for sum metrics', async () => {
+    // Raw data is:
+    // MIN_VARIANT_0: [0, 1, 8, 0, 7, 7, 15, 17, 0, 42]
+    // MIN_VARIANT_1: [0, 2, 9, 0, 15, 25, 35, 57, 0, 92]
+    //
     // Based on the data inserted in the fixture, the expected stream of values
     // for each series after adjusting for the zero reset should be:
     // MIN_VARIANT_0: [0, 1, 8, 8, 15, 15, 23, 25, 25, 67]
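The zero-reset adjustment this comment describes can be sketched as a running offset that is carried forward whenever the cumulative counter drops. A hypothetical helper, for illustration only (not a function in this codebase):

```ts
// Hypothetical sketch of the zero-reset adjustment described above: when a
// cumulative counter drops, carry the previous running value forward as an
// offset so the adjusted series stays monotonic.
function adjustForZeroReset(values: number[]): number[] {
  let offset = 0;
  let prev = 0;
  return values.map(v => {
    if (v < prev) offset += prev; // counter reset detected
    prev = v;
    return v + offset;
  });
}

// adjustForZeroReset([0, 1, 8, 0, 7, 7, 15, 17, 0, 42])
//   -> [0, 1, 8, 8, 15, 15, 23, 25, 25, 67]  (matches MIN_VARIANT_0 above)
```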
@@ -725,20 +760,10 @@ describe('renderChartConfig', () => {
       Since the AggregationTemporality is 2(cumulative), we need to calculate the delta between the two points:
         delta: [10, 10, 10] - [0, 0, 0] = [10, 10, 10]

-      Total observations: 10 + 10 + 10 = 30
-      Cumulative counts: [10, 20, 30]
-      p50 point:
-        Rank = 0.5 * 30 = 15
-        This falls in the second bucket (since 10 < 15 ≤ 20)
-
       We need to interpolate between the lower and upper bounds of the second bucket:
-        Lower bound: 10
-        Upper bound: 30
-        Position in bucket: (15 - 10) / (20 - 10) = 0.5
-        Interpolated value: 10 + (30 - 10) * 0.5 = 10 + 10 = 20
-
-      Thus the first point value would be 0 since it's at the start of the bounds.
-      The second point value would be 20 since that is the median point value delta from the first point.
+        cum sum = [10, 20, 30]
+        rank = 0.5 * 30 = 15 (between bounds 10 - 30)
+        interpolate: 10 + ((15 - 10) / 30) * (30 - 10) = 13.3333
     */
     const query = await renderChartConfig(
       {
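The revised arithmetic in this hunk (13.3333 rather than the old 20) can be checked with the hypothetical `histogramQuantile` sketch from the changeset section, assuming explicit bounds [10, 30] as the comment implies:

```ts
// Worked check of the p50 comment above, reusing the illustrative
// histogramQuantile sketch from earlier on this page:
// delta counts [10, 10, 10], cum sum [10, 20, 30], rank = 0.5 * 30 = 15;
// rank lands in the bucket bounded by 10 and 30, so
// 10 + ((15 - 10) / 30) * (30 - 10) = 13.333...
console.log(histogramQuantile([10, 30], [10, 10, 10], 0.5)); // ≈ 13.333333333333332
```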
@@ -952,6 +977,57 @@ describe('renderChartConfig', () => {
     expect(res).toMatchSnapshot();
   });

+  it('should include multiple data points in percentile computation (p50)', async () => {
+    /*
+      bounds: [1, 2, 5, 8, 13]
+      host = test-a:
+        p1 = [1, 1, 1, 1, 1, 1]
+        p2 = [2, 1, 2, 1, 2, 1]
+      host = test-b:
+        p1 = [2, 2, 2, 2, 2, 2]
+        p2 = [3, 3, 2, 2, 3, 3]
+
+      Compute the diff between adjacent points for each unique host (deltaSumForEach):
+        host = test-a, diff = [1, 0, 1, 0, 1, 0]
+        host = test-b, diff = [1, 1, 0, 0, 1, 1]
+
+      Sum the diffs together to obtain a combined count for the different series:
+        sum elements(d) = [2, 1, 1, 0, 2, 1]
+
+      Now compute the p50 value:
+        sum(d) = 7
+        cum sum = [2, 3, 4, 4, 6, 7]
+        rank = 0.5 * 7 = 3.5 (between bounds 2 - 5)
+        interpolate: 2 + ((3.5 - 2) / 5) * (5 - 2) = 2.9
+
+      Since all the points fall within a single granularity interval, the result should be
+      a single row with the value 2.9.
+    */
+    const query = await renderChartConfig(
+      {
+        select: [
+          {
+            aggFn: 'quantile',
+            level: 0.5,
+            metricName: 'test.multiple_series',
+            metricType: MetricsDataType.Histogram,
+            valueExpression: 'Value',
+          },
+        ],
+        from: metricSource.from,
+        where: '',
+        metricTables: TEST_METRIC_TABLES,
+        dateRange: [new Date(now), new Date(now + ms('2m'))],
+        granularity: '5 minute',
+        timestampValueExpression: metricSource.timestampValueExpression,
+        connection: connection.id,
+      },
+      metadata,
+    );
+    const res = await queryData(query);
+    expect(res).toMatchSnapshot();
+  });
+
   // HDX-1515: Handle counter reset in histogram metric in the same way that the counter reset
   // is handled for sum metrics.
   it.skip('three_timestamps_bounded histogram with reset (p50)', async () => {
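The 2.9 expectation in the new test can be reproduced end to end with the same illustrative sketch: per-host deltas, an element-wise sum across hosts, then the interpolation.

```ts
// Reproduces the 2.9 expectation from the new test above, reusing the
// hypothetical histogramQuantile sketch; helper names are illustrative.
const bounds = [1, 2, 5, 8, 13];
const testA = [[1, 1, 1, 1, 1, 1], [2, 1, 2, 1, 2, 1]]; // host=test-a, p1/p2
const testB = [[2, 2, 2, 2, 2, 2], [3, 3, 2, 2, 3, 3]]; // host=test-b, p1/p2

// Delta between adjacent cumulative points within each series:
const delta = (pts: number[][]) => pts[1].map((c, i) => c - pts[0][i]);
const dA = delta(testA); // [1, 0, 1, 0, 1, 0]
const dB = delta(testB); // [1, 1, 0, 0, 1, 1]

// Element-wise sum across the two series:
const combined = dA.map((c, i) => c + dB[i]); // [2, 1, 1, 0, 2, 1]

console.log(histogramQuantile(bounds, combined, 0.5)); // 2.9
```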

packages/common-utils/src/__tests__/__snapshots__/renderChartConfig.test.ts.snap

Lines changed: 33 additions & 25 deletions
@@ -36,39 +36,47 @@ exports[`renderChartConfig should generate sql for a single gauge metric 1`] = `
 `;

 exports[`renderChartConfig should generate sql for a single histogram metric 1`] = `
-"WITH HistRate AS (
+"WITH Source AS (
   SELECT
     *,
     cityHash64(mapConcat(ScopeAttributes, ResourceAttributes, Attributes)) AS AttributesHash,
-    length(BucketCounts) as CountLength,
-    any(BucketCounts) OVER (ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS PrevBucketCounts,
-    any(CountLength) OVER (ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS PrevCountLength,
-    any(AttributesHash) OVER (ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS PrevAttributesHash,
     IF(AggregationTemporality = 1,
-      BucketCounts,
-      IF(AttributesHash = PrevAttributesHash AND CountLength = PrevCountLength,
-        arrayMap((prev, curr) -> IF(curr < prev, curr, toUInt64(toInt64(curr) - toInt64(prev))), PrevBucketCounts, BucketCounts),
-        BucketCounts)) as BucketRates
+      sumForEach(BucketCounts) OVER (PARTITION BY AttributesHash ORDER BY AttributesHash, TimeUnix ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW),
+      deltaSumForEach(BucketCounts) OVER (PARTITION BY AttributesHash ORDER BY AttributesHash, TimeUnix ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)
+    ) AS MonoBucketCounts
   FROM default.otel_metrics_histogram
   WHERE (TimeUnix >= fromUnixTimestamp64Milli(1739318400000) AND TimeUnix <= fromUnixTimestamp64Milli(1765670400000)) AND ((MetricName = 'http.server.duration'))
-  ORDER BY Attributes, TimeUnix ASC
-),RawHist AS (
+  ORDER BY AttributesHash ASC
+),Bucketed AS (
   SELECT
-    *,
-    toUInt64( 0.5 * arraySum(BucketRates)) AS Rank,
-    arrayCumSum(BucketRates) as CumRates,
-    arrayFirstIndex(x -> if(x > Rank, 1, 0), CumRates) AS BucketLowIdx,
-    IF(BucketLowIdx = length(BucketRates),
-      arrayElement(ExplicitBounds, length(ExplicitBounds)),
-      IF(BucketLowIdx > 1,
-        arrayElement(ExplicitBounds, BucketLowIdx - 1) + (arrayElement(ExplicitBounds, BucketLowIdx) - arrayElement(ExplicitBounds, BucketLowIdx - 1)) *
-          IF(arrayElement(CumRates, BucketLowIdx) > arrayElement(CumRates, BucketLowIdx - 1),
-            (Rank - arrayElement(CumRates, BucketLowIdx - 1)) / (arrayElement(CumRates, BucketLowIdx) - arrayElement(CumRates, BucketLowIdx - 1)), 0),
-        arrayElement(ExplicitBounds, BucketLowIdx)
-      )) as Rate
-  FROM HistRate) SELECT sum(
+
+    any(Source.ExplicitBounds) AS ExplicitBounds,
+    sumForEach(MonoBucketCounts) AS TotalBucketCounts,
+    arrayCumSum(TotalBucketCounts) AS BucketCumCounts,
+    0.5 * arraySum(TotalBucketCounts) AS Rank,
+    arrayFirstIndex(x -> if(x > Rank, 1, 0), BucketCumCounts) AS BucketLowIdx,
+    CASE
+      WHEN BucketLowIdx = 0 THEN 0
+      WHEN BucketLowIdx = 1 THEN ExplicitBounds[BucketLowIdx]
+      WHEN BucketLowIdx = length(TotalBucketCounts) THEN ExplicitBounds[length(ExplicitBounds)]
+      ELSE
+        ExplicitBounds[BucketLowIdx - 1] +
+        ((Rank - ExplicitBounds[BucketLowIdx - 1]) / (ExplicitBounds[BucketLowIdx])) * (ExplicitBounds[BucketLowIdx] - ExplicitBounds[BucketLowIdx - 1])
+    END AS Value
+  FROM Source
+  GROUP BY
+  ORDER BY ASC
+),Rates AS (
+  SELECT
+    \`__hdx_time_bucket\`,
+    any(Bucketed.Value) AS \`__hdx_value_high\`,
+    any(\`__hdx_value_high\`) OVER (ORDER BY \`__hdx_time_bucket\` ROWS BETWEEN 1 PRECEDING AND 1 PRECEDING) AS \`__hdx_prev_high\`,
+    \`__hdx_value_high\` - \`__hdx_prev_high\` AS Rate
+  FROM Bucketed
+  GROUP BY \`__hdx_time_bucket\`
+) SELECT any(
   toFloat64OrNull(toString(Rate))
-) FROM RawHist WHERE (TimeUnix >= fromUnixTimestamp64Milli(1739318400000) AND TimeUnix <= fromUnixTimestamp64Milli(1765670400000)) LIMIT 10"
+) FROM Rates WHERE (\`__hdx_time_bucket\` >= fromUnixTimestamp64Milli(1739318400000) AND \`__hdx_time_bucket\` <= fromUnixTimestamp64Milli(1765670400000)) LIMIT 10"
 `;

 exports[`renderChartConfig should generate sql for a single sum metric 1`] = `
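The reworked query has three stages: `Source` makes each series' bucket counts monotonic within its `AttributesHash` partition (`sumForEach` over the window for delta temporality, `deltaSumForEach` for cumulative, which absorbs counter resets), `Bucketed` sums the per-series counts element-wise and interpolates the quantile per time bucket, and `Rates` differences adjacent buckets. A rough TypeScript sketch of the `deltaSumForEach` semantics, under the assumption that it mirrors ClickHouse's documented `deltaSum` behavior (summing consecutive positive differences) applied element-wise; not verified against the generated plan:

```ts
// Sketch of deltaSumForEach(BucketCounts) per series: sum consecutive
// positive differences per bucket, ignoring negative deltas (counter resets).
// Assumption: mirrors documented deltaSum + -ForEach combinator semantics.
function deltaSumForEach(points: number[][]): number[] {
  const acc = new Array<number>(points[0]?.length ?? 0).fill(0);
  for (let r = 1; r < points.length; r++) {
    for (let i = 0; i < acc.length; i++) {
      const d = points[r][i] - points[r - 1][i];
      if (d > 0) acc[i] += d; // negative deltas (resets) are ignored
    }
  }
  return acc;
}

// host=test-a from the new fixture:
// deltaSumForEach([[1, 1, 1, 1, 1, 1], [2, 1, 2, 1, 2, 1]]) -> [1, 0, 1, 0, 1, 0]
```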
