
Commit 1b9a45d

BugenZhao authored and Li0k committed
feat(grafana): report local barrier manager progress (#12001)
Signed-off-by: Bugen Zhao <[email protected]>
1 parent a32441b commit 1b9a45d

6 files changed: +85 / -75 lines

docker/dashboards/risingwave-dev-dashboard.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

grafana/common.py

Lines changed: 1 addition & 0 deletions
@@ -96,6 +96,7 @@ def next_one_third_width_graph(self):
 
 
 class Panels:
+    # Common options for timeseries panels
     common_options = {
         "fillOpacity": 10,
         "interval": "1s",

grafana/risingwave-dev-dashboard.dashboard.py

Lines changed: 11 additions & 0 deletions
@@ -839,6 +839,17 @@ def section_streaming(panels):
                 ),
             ],
         ),
+        panels.timeseries_ops(
+            "Earliest In-Flight Barrier Progress",
+            "The number of actors that have processed the earliest in-flight barriers per second. "
+            "This metric helps users to detect potential congestion or stuck in the system.",
+            [
+                panels.target(
+                    f"rate({metric('stream_barrier_manager_progress')}[$__rate_interval])",
+                    "{{instance}}",
+                ),
+            ],
+        ),
     ]
 
 

grafana/risingwave-dev-dashboard.json

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

src/stream/src/executor/monitor/streaming_stats.rs

Lines changed: 11 additions & 0 deletions
@@ -105,6 +105,8 @@ pub struct StreamingMetrics {
     pub barrier_inflight_latency: Histogram,
     /// The duration of sync to storage.
     pub barrier_sync_latency: Histogram,
+    /// The progress made by the earliest in-flight barriers in the local barrier manager.
+    pub barrier_manager_progress: IntCounter,
 
     pub sink_commit_duration: HistogramVec,
 
@@ -606,6 +608,14 @@ impl StreamingMetrics {
             exponential_buckets(0.1, 1.5, 16).unwrap() // max 43s
         );
         let barrier_sync_latency = register_histogram_with_registry!(opts, registry).unwrap();
+
+        let barrier_manager_progress = register_int_counter_with_registry!(
+            "stream_barrier_manager_progress",
+            "The number of actors that have processed the earliest in-flight barriers",
+            registry
+        )
+        .unwrap();
+
         let sink_commit_duration = register_histogram_vec_with_registry!(
             "sink_commit_duration",
             "Duration of commit op in sink",
@@ -769,6 +779,7 @@
             arrangement_backfill_upstream_output_row_count,
             barrier_inflight_latency,
             barrier_sync_latency,
+            barrier_manager_progress,
             sink_commit_duration,
             lru_current_watermark_time_ms,
             lru_physical_now_ms,
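
For context on the registration above: `stream_barrier_manager_progress` is a plain Prometheus `IntCounter`, so the dashboard's `rate(...)` query simply measures how often it is incremented. Below is a minimal standalone sketch of that behavior using the `prometheus` crate; the registry, `main`, and increment sites are illustrative and not RisingWave code.

use prometheus::{register_int_counter_with_registry, Encoder, Registry, TextEncoder};

fn main() {
    // Illustrative registry; RisingWave wires the counter into its own metrics registry.
    let registry = Registry::new();

    // Same name and help text as the counter added in this commit.
    let progress = register_int_counter_with_registry!(
        "stream_barrier_manager_progress",
        "The number of actors that have processed the earliest in-flight barriers",
        registry
    )
    .unwrap();

    // Each collection of the earliest in-flight barrier bumps the counter once.
    progress.inc();
    progress.inc_by(3);

    // Render the registry in the Prometheus text exposition format, as a scrape would see it.
    let mut buf = Vec::new();
    TextEncoder::new()
        .encode(&registry.gather(), &mut buf)
        .unwrap();
    print!("{}", String::from_utf8(buf).unwrap());
}

On the dashboard, `rate(stream_barrier_manager_progress[$__rate_interval])` then reports these increments per second, per instance.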

src/stream/src/task/barrier_manager/managed_state.rs

Lines changed: 60 additions & 73 deletions
@@ -25,6 +25,7 @@ use tokio::sync::oneshot;
 use super::progress::ChainState;
 use super::CollectResult;
 use crate::error::{StreamError, StreamResult};
+use crate::executor::monitor::GLOBAL_STREAMING_METRICS;
 use crate::executor::Barrier;
 use crate::task::ActorId;
 
@@ -84,85 +85,71 @@ impl ManagedBarrierState {
 
     /// Notify if we have collected barriers from all actor ids. The state must be `Issued`.
     fn may_notify(&mut self, curr_epoch: u64) {
-        let to_notify = match self.epoch_barrier_state_map.get(&curr_epoch) {
-            Some(BarrierState {
-                inner:
-                    ManagedBarrierStateInner::Issued {
-                        remaining_actors, ..
-                    },
-                ..
-            }) => remaining_actors.is_empty(),
-            _ => unreachable!(),
-        };
+        // Report if there's progress on the earliest in-flight barrier.
+        if self.epoch_barrier_state_map.keys().next() == Some(&curr_epoch) {
+            GLOBAL_STREAMING_METRICS.barrier_manager_progress.inc();
+        }
 
-        if to_notify {
-            while let Some((
-                _,
-                BarrierState {
-                    inner: barrier_inner,
-                    ..
-                },
-            )) = self.epoch_barrier_state_map.first_key_value()
-            {
-                match barrier_inner {
-                    ManagedBarrierStateInner::Issued {
-                        remaining_actors, ..
-                    } => {
-                        if !remaining_actors.is_empty() {
-                            break;
-                        }
-                    }
-                    _ => break,
-                }
-                let (epoch, barrier_state) = self.epoch_barrier_state_map.pop_first().unwrap();
-                let create_mview_progress = self
-                    .create_mview_progress
-                    .remove(&epoch)
-                    .unwrap_or_default()
-                    .into_iter()
-                    .map(|(actor, state)| CreateMviewProgress {
-                        chain_actor_id: actor,
-                        done: matches!(state, ChainState::Done),
-                        consumed_epoch: match state {
-                            ChainState::ConsumingUpstream(consumed_epoch, _) => consumed_epoch,
-                            ChainState::Done => epoch,
-                        },
-                        consumed_rows: match state {
-                            ChainState::ConsumingUpstream(_, consumed_rows) => consumed_rows,
-                            ChainState::Done => 0,
-                        },
-                    })
-                    .collect();
+        while let Some(entry) = self.epoch_barrier_state_map.first_entry() {
+            let to_notify = matches!(
+                &entry.get().inner,
+                ManagedBarrierStateInner::Issued {
+                    remaining_actors, ..
+                } if remaining_actors.is_empty(),
+            );
 
-                let kind = barrier_state.kind;
-                match kind {
-                    BarrierKind::Unspecified => unreachable!(),
-                    BarrierKind::Initial => tracing::info!(
-                        epoch = barrier_state.prev_epoch,
-                        "ignore sealing data for the first barrier"
-                    ),
-                    BarrierKind::Barrier | BarrierKind::Checkpoint => {
-                        dispatch_state_store!(&self.state_store, state_store, {
-                            state_store.seal_epoch(barrier_state.prev_epoch, kind.is_checkpoint());
-                        });
-                    }
+            if !to_notify {
+                break;
+            }
+
+            let (epoch, barrier_state) = entry.remove_entry();
+            let create_mview_progress = self
+                .create_mview_progress
+                .remove(&epoch)
+                .unwrap_or_default()
+                .into_iter()
+                .map(|(actor, state)| CreateMviewProgress {
+                    chain_actor_id: actor,
+                    done: matches!(state, ChainState::Done),
+                    consumed_epoch: match state {
+                        ChainState::ConsumingUpstream(consumed_epoch, _) => consumed_epoch,
+                        ChainState::Done => epoch,
+                    },
+                    consumed_rows: match state {
+                        ChainState::ConsumingUpstream(_, consumed_rows) => consumed_rows,
+                        ChainState::Done => 0,
+                    },
+                })
+                .collect();
+
+            let kind = barrier_state.kind;
+            match kind {
+                BarrierKind::Unspecified => unreachable!(),
+                BarrierKind::Initial => tracing::info!(
+                    epoch = barrier_state.prev_epoch,
+                    "ignore sealing data for the first barrier"
+                ),
+                BarrierKind::Barrier | BarrierKind::Checkpoint => {
+                    dispatch_state_store!(&self.state_store, state_store, {
+                        state_store.seal_epoch(barrier_state.prev_epoch, kind.is_checkpoint());
+                    });
                 }
+            }
 
-                match barrier_state.inner {
-                    ManagedBarrierStateInner::Issued {
-                        collect_notifier, ..
-                    } => {
-                        // Notify about barrier finishing.
-                        let result = CollectResult {
-                            create_mview_progress,
-                            kind,
-                        };
-                        if collect_notifier.unwrap().send(Ok(result)).is_err() {
-                            warn!("failed to notify barrier collection with epoch {}", epoch)
-                        }
+            match barrier_state.inner {
+                ManagedBarrierStateInner::Issued {
+                    collect_notifier, ..
+                } => {
+                    // Notify about barrier finishing.
+                    let result = CollectResult {
+                        create_mview_progress,
+                        kind,
+                    };
+                    if collect_notifier.unwrap().send(Ok(result)).is_err() {
+                        warn!("failed to notify barrier collection with epoch {}", epoch)
                     }
-                    _ => unreachable!(),
                 }
+                _ => unreachable!(),
             }
         }
     }
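
The behavioral change in this file is small: `may_notify` now reports progress whenever the collected barrier is the earliest in-flight one (the first key of the ordered `epoch_barrier_state_map`), and the notification loop is rewritten around `first_entry()` instead of `first_key_value()` plus `pop_first()`. The sketch below is a simplified model of that check, using a `BTreeMap` and a plain integer in place of `ManagedBarrierState` and the `IntCounter`; the types and names here are illustrative, not the actual RisingWave ones.

use std::collections::BTreeMap;

/// Simplified stand-in for `ManagedBarrierState`: for each in-flight epoch,
/// the set of actors that still have to collect its barrier.
struct InflightBarriers {
    remaining_actors: BTreeMap<u64, Vec<u32>>,
    progress: u64, // stands in for the `barrier_manager_progress` counter
}

impl InflightBarriers {
    fn collect(&mut self, curr_epoch: u64, actor_id: u32) {
        // Report progress only when the collected barrier is the earliest
        // in-flight one, mirroring the check added in `may_notify`.
        if self.remaining_actors.keys().next() == Some(&curr_epoch) {
            self.progress += 1;
        }

        // Mark this actor as having collected the barrier for `curr_epoch`.
        if let Some(actors) = self.remaining_actors.get_mut(&curr_epoch) {
            actors.retain(|a| *a != actor_id);
        }

        // Pop fully collected barriers from the front, like the rewritten
        // `while let Some(entry) = ...first_entry()` loop.
        while let Some(entry) = self.remaining_actors.first_entry() {
            if !entry.get().is_empty() {
                break;
            }
            entry.remove_entry();
        }
    }
}

fn main() {
    let mut state = InflightBarriers {
        remaining_actors: BTreeMap::from([(1, vec![10, 11]), (2, vec![10, 11])]),
        progress: 0,
    };
    state.collect(1, 10); // earliest in-flight epoch: counted as progress
    state.collect(2, 10); // not the earliest: not counted
    assert_eq!(state.progress, 1);
}

With this in place, a `rate(stream_barrier_manager_progress[...])` that stays near zero suggests that even the earliest in-flight barrier is not being collected, which is the kind of congestion the new dashboard panel is meant to surface.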
