Skip to content

Commit 63c3695

Browse files
authored
feat(frontend): support dispatcher metrics for explain analyze (#21227)
1 parent 80ad7df commit 63c3695

File tree

4 files changed

+178
-40
lines changed

4 files changed

+178
-40
lines changed

proto/monitor_service.proto

+6
Original file line numberDiff line numberDiff line change
@@ -94,12 +94,18 @@ message TieredCacheTracingRequest {
9494
message TieredCacheTracingResponse {}
9595

9696
message GetProfileStatsRequest {
97+
// Executors to fetch statistics for.
9798
repeated uint64 executor_ids = 1;
99+
// Dispatchers do not have executors.
100+
// We have to fetch their statistics separately.
101+
repeated uint32 dispatcher_fragment_ids = 2;
98102
}
99103

100104
message GetProfileStatsResponse {
101105
map<uint64, uint64> stream_node_output_row_count = 1;
102106
map<uint64, uint64> stream_node_output_blocking_duration_ms = 2;
107+
map<uint32, uint64> dispatch_fragment_output_row_count = 3;
108+
map<uint32, uint64> dispatch_fragment_output_blocking_duration_ns = 4;
103109
}
104110

105111
service MonitorService {

src/common/metrics/src/lib.rs

+14
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use std::sync::LazyLock;
1919

2020
use hytra::TrAdder;
2121
use prometheus::core::{Atomic, AtomicU64, GenericCounter, GenericGauge};
22+
use prometheus::proto::Metric;
2223
use prometheus::register_int_counter_with_registry;
2324
use tracing::Subscriber;
2425
use tracing_subscriber::Layer;
@@ -142,3 +143,16 @@ impl PartialOrd for MetricLevel {
142143
(*self as u8).partial_cmp(&(*other as u8))
143144
}
144145
}
146+
147+
pub fn get_label<T: std::str::FromStr>(metric: &Metric, label: &str) -> Option<T> {
148+
metric
149+
.get_label()
150+
.iter()
151+
.find(|lp| lp.get_name() == label)
152+
.and_then(|lp| lp.get_value().parse::<T>().ok())
153+
}
154+
155+
// Panics if the label is missing or its value cannot be parsed into `T`; the caller must guarantee both.
156+
pub fn get_label_infallible<T: std::str::FromStr>(metric: &Metric, label: &str) -> T {
157+
get_label(metric, label).expect("label not found or can't be parsed")
158+
}

src/compute/src/rpc/service/monitor_service.rs

+47-28
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15-
use std::collections::{BTreeMap, HashMap};
15+
use std::collections::{BTreeMap, HashMap, HashSet};
1616
use std::ffi::CString;
1717
use std::fs;
1818
use std::path::Path;
@@ -293,16 +293,45 @@ impl MonitorService for MonitorServiceImpl {
293293
request: Request<GetProfileStatsRequest>,
294294
) -> Result<Response<GetProfileStatsResponse>, Status> {
295295
let metrics = global_streaming_metrics(MetricLevel::Info);
296-
let executor_ids = &request.into_inner().executor_ids;
296+
let inner = request.into_inner();
297+
let executor_ids = &inner.executor_ids;
298+
let fragment_ids = HashSet::from_iter(inner.dispatcher_fragment_ids.into_iter());
297299
let stream_node_output_row_count = metrics
298300
.mem_stream_node_output_row_count
299301
.collect(executor_ids);
300302
let stream_node_output_blocking_duration_ms = metrics
301303
.mem_stream_node_output_blocking_duration_ms
302304
.collect(executor_ids);
305+
306+
// Sum counter values per fragment, keeping only the fragments listed in `fragment_ids`.
307+
fn collect_by_fragment_ids<T: Collector>(
308+
m: &T,
309+
fragment_ids: &HashSet<u32>,
310+
) -> HashMap<u32, u64> {
311+
let mut metrics = HashMap::new();
312+
for mut metric_family in m.collect() {
313+
for metric in metric_family.take_metric() {
314+
let fragment_id = get_label_infallible(&metric, "fragment_id");
315+
if fragment_ids.contains(&fragment_id) {
316+
let entry = metrics.entry(fragment_id).or_insert(0);
317+
*entry += metric.get_counter().get_value() as u64;
318+
}
319+
}
320+
}
321+
metrics
322+
}
323+
324+
let dispatch_fragment_output_row_count =
325+
collect_by_fragment_ids(&metrics.actor_out_record_cnt, &fragment_ids);
326+
let dispatch_fragment_output_blocking_duration_ns = collect_by_fragment_ids(
327+
&metrics.actor_output_buffer_blocking_duration_ns,
328+
&fragment_ids,
329+
);
303330
Ok(Response::new(GetProfileStatsResponse {
304331
stream_node_output_row_count,
305332
stream_node_output_blocking_duration_ms,
333+
dispatch_fragment_output_row_count,
334+
dispatch_fragment_output_blocking_duration_ns,
306335
}))
307336
}
308337

@@ -322,27 +351,14 @@ impl MonitorService for MonitorServiceImpl {
322351
.into_vec()
323352
}
324353

325-
// Must ensure the label exists and can be parsed into `T`
326-
fn get_label<T: std::str::FromStr>(metric: &Metric, label: &str) -> T {
327-
metric
328-
.get_label()
329-
.iter()
330-
.find(|lp| lp.get_name() == label)
331-
.unwrap()
332-
.get_value()
333-
.parse::<T>()
334-
.ok()
335-
.unwrap()
336-
}
337-
338354
let actor_output_buffer_blocking_duration_ns =
339355
collect(&metrics.actor_output_buffer_blocking_duration_ns);
340356
let actor_count = collect(&metrics.actor_count);
341357

342358
let actor_count: HashMap<_, _> = actor_count
343359
.iter()
344360
.map(|m| {
345-
let fragment_id: u32 = get_label(m, "fragment_id");
361+
let fragment_id: u32 = get_label_infallible(m, "fragment_id");
346362
let count = m.get_gauge().get_value() as u32;
347363
(fragment_id, count)
348364
})
@@ -361,7 +377,7 @@ impl MonitorService for MonitorServiceImpl {
361377

362378
let actor_current_epoch = collect(&metrics.actor_current_epoch);
363379
for m in &actor_current_epoch {
364-
let fragment_id: u32 = get_label(m, "fragment_id");
380+
let fragment_id: u32 = get_label_infallible(m, "fragment_id");
365381
let epoch = m.get_gauge().get_value() as u64;
366382
if let Some(s) = fragment_stats.get_mut(&fragment_id) {
367383
s.current_epoch = if s.current_epoch == 0 {
@@ -380,7 +396,7 @@ impl MonitorService for MonitorServiceImpl {
380396
let mut relation_stats: HashMap<u32, RelationStats> = HashMap::new();
381397
let mview_current_epoch = collect(&metrics.materialize_current_epoch);
382398
for m in &mview_current_epoch {
383-
let table_id: u32 = get_label(m, "table_id");
399+
let table_id: u32 = get_label_infallible(m, "table_id");
384400
let epoch = m.get_gauge().get_value() as u64;
385401
if let Some(s) = relation_stats.get_mut(&table_id) {
386402
s.current_epoch = if s.current_epoch == 0 {
@@ -403,8 +419,9 @@ impl MonitorService for MonitorServiceImpl {
403419
let mut channel_stats: BTreeMap<String, ChannelStats> = BTreeMap::new();
404420

405421
for metric in actor_output_buffer_blocking_duration_ns {
406-
let fragment_id: u32 = get_label(&metric, "fragment_id");
407-
let downstream_fragment_id: u32 = get_label(&metric, "downstream_fragment_id");
422+
let fragment_id: u32 = get_label_infallible(&metric, "fragment_id");
423+
let downstream_fragment_id: u32 =
424+
get_label_infallible(&metric, "downstream_fragment_id");
408425

409426
let key = format!("{}_{}", fragment_id, downstream_fragment_id);
410427
let channel_stat = channel_stats.entry(key).or_insert_with(|| ChannelStats {
@@ -416,17 +433,18 @@ impl MonitorService for MonitorServiceImpl {
416433

417434
// When metrics level is Debug, `actor_id` will be removed to reduce metrics.
418435
// See `src/common/metrics/src/relabeled_metric.rs`
419-
channel_stat.actor_count += if get_label::<String>(&metric, "actor_id").is_empty() {
420-
actor_count[&fragment_id]
421-
} else {
422-
1
423-
};
436+
channel_stat.actor_count +=
437+
if get_label_infallible::<String>(&metric, "actor_id").is_empty() {
438+
actor_count[&fragment_id]
439+
} else {
440+
1
441+
};
424442
channel_stat.output_blocking_duration += metric.get_counter().get_value();
425443
}
426444

427445
let actor_output_row_count = collect(&metrics.actor_out_record_cnt);
428446
for metric in actor_output_row_count {
429-
let fragment_id: u32 = get_label(&metric, "fragment_id");
447+
let fragment_id: u32 = get_label_infallible(&metric, "fragment_id");
430448

431449
// Find out and write to all downstream channels
432450
let key_prefix = format!("{}_", fragment_id);
@@ -438,8 +456,8 @@ impl MonitorService for MonitorServiceImpl {
438456

439457
let actor_input_row_count = collect(&metrics.actor_in_record_cnt);
440458
for metric in actor_input_row_count {
441-
let upstream_fragment_id: u32 = get_label(&metric, "upstream_fragment_id");
442-
let fragment_id: u32 = get_label(&metric, "fragment_id");
459+
let upstream_fragment_id: u32 = get_label_infallible(&metric, "upstream_fragment_id");
460+
let fragment_id: u32 = get_label_infallible(&metric, "fragment_id");
443461

444462
let key = format!("{}_{}", upstream_fragment_id, fragment_id);
445463
if let Some(s) = channel_stats.get_mut(&key) {
@@ -529,6 +547,7 @@ impl MonitorService for MonitorServiceImpl {
529547
}
530548

531549
pub use grpc_middleware::*;
550+
use risingwave_common::metrics::get_label_infallible;
532551

533552
pub mod grpc_middleware {
534553
use std::sync::Arc;

0 commit comments

Comments
 (0)