Skip to content

feat(metrics): add metrics for the evicted watermark for each executor #10379

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docker/dashboards/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

10 changes: 10 additions & 0 deletions grafana/risingwave-dev-dashboard.dashboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -2617,6 +2617,16 @@ def section_memory_manager(outer_panels):
),
],
),
panels.timeseries_ms(
"LRU manager diff between current watermark and evicted watermark time (ms) for actors",
"",
[
panels.target(
f"{metric('lru_evicted_watermark_time_diff_ms')}",
"table {{table_id}} actor {{actor_id}} desc: {{desc}}",
),
],
),
],
),
]
Expand Down
2 changes: 1 addition & 1 deletion grafana/risingwave-dev-dashboard.json

Large diffs are not rendered by default.

36 changes: 32 additions & 4 deletions src/stream/src/cache/managed_lru.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use std::sync::Arc;
use lru::{DefaultHasher, KeyRef, LruCache};
use prometheus::IntGauge;
use risingwave_common::estimate_size::EstimateSize;
use risingwave_common::util::epoch::Epoch;

use crate::common::metrics::MetricsInfo;

Expand All @@ -40,6 +41,8 @@ pub struct ManagedLruCache<K, V, S = DefaultHasher, A: Clone + Allocator = Globa
kv_heap_size: usize,
/// The metrics of memory usage
memory_usage_metrics: IntGauge,
// The metrics of evicted watermark time
lru_evicted_watermark_time_diff_ms: IntGauge,
// Metrics info
metrics_info: MetricsInfo,
/// The size reported last time
Expand All @@ -55,6 +58,10 @@ impl<K, V, S, A: Clone + Allocator> Drop for ManagedLruCache<K, V, S, A> {
.stream_memory_usage
.remove_label_values(&[&info.table_id, &info.actor_id, &info.desc])
.unwrap();
info.metrics
.lru_evicted_watermark_time_diff_ms
.remove_label_values(&[&info.table_id, &info.actor_id, &info.desc])
.unwrap();
}
}

Expand All @@ -74,27 +81,37 @@ impl<K: Hash + Eq + EstimateSize, V: EstimateSize, S: BuildHasher, A: Clone + Al
&metrics_info.actor_id,
&metrics_info.desc,
]);
memory_usage_metrics.set(0.into());

let lru_evicted_watermark_time_diff_ms = metrics_info
.metrics
.lru_evicted_watermark_time_diff_ms
.with_label_values(&[
&metrics_info.table_id,
&metrics_info.actor_id,
&metrics_info.desc,
]);
lru_evicted_watermark_time_diff_ms.set(watermark_epoch.load(Ordering::Relaxed) as _);

Self {
inner,
watermark_epoch,
kv_heap_size: 0,
memory_usage_metrics,
lru_evicted_watermark_time_diff_ms,
metrics_info,
last_reported_size_bytes: 0,
}
}

/// Evict epochs lower than the watermark
pub fn evict(&mut self) {
let epoch = self.watermark_epoch.load(Ordering::Relaxed);
self.evict_by_epoch(epoch);
self.evict_by_epoch(self.load_cur_epoch());
}

/// Evict epochs lower than the watermark, except those entries which were touched in this epoch
pub fn evict_except_cur_epoch(&mut self) {
let epoch = self.watermark_epoch.load(Ordering::Relaxed);
let epoch = min(epoch, self.inner.current_epoch());
let epoch = min(self.load_cur_epoch(), self.inner.current_epoch());
self.evict_by_epoch(epoch);
}

Expand All @@ -103,6 +120,7 @@ impl<K: Hash + Eq + EstimateSize, V: EstimateSize, S: BuildHasher, A: Clone + Al
while let Some((key, value)) = self.inner.pop_lru_by_epoch(epoch) {
self.kv_heap_size_dec(key.estimated_size() + value.estimated_size());
}
self.report_evicted_watermark_time(epoch);
}

pub fn update_epoch(&mut self, epoch: u64) {
Expand Down Expand Up @@ -225,6 +243,16 @@ impl<K: Hash + Eq + EstimateSize, V: EstimateSize, S: BuildHasher, A: Clone + Al
false
}
}

/// Updates the `lru_evicted_watermark_time_diff_ms` gauge after an eviction.
///
/// The reported value is the physical time of the currently loaded watermark
/// epoch minus the physical time of `epoch`, the epoch that was just used as
/// the eviction bound.
fn report_evicted_watermark_time(&self, epoch: u64) {
    let watermark_time = Epoch(self.load_cur_epoch()).physical_time();
    let evicted_time = Epoch(epoch).physical_time();
    // Clamp at zero instead of underflowing: if `epoch` is ever ahead of the
    // freshly loaded watermark, a plain subtraction would panic in debug
    // builds and wrap to a huge bogus gauge value in release builds.
    self.lru_evicted_watermark_time_diff_ms
        .set(watermark_time.saturating_sub(evicted_time) as _);
}

/// Reads the shared `watermark_epoch` atomic with relaxed ordering.
///
/// Note that, despite the method name, the value returned is the watermark
/// epoch, not the cache's own current epoch.
fn load_cur_epoch(&self) -> u64 {
    let watermark = &self.watermark_epoch;
    watermark.load(Ordering::Relaxed)
}
}

pub fn new_unbounded<K: Hash + Eq + EstimateSize, V: EstimateSize>(
Expand Down
10 changes: 10 additions & 0 deletions src/stream/src/executor/monitor/streaming_stats.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ pub struct StreamingMetrics {
pub lru_physical_now_ms: IntGauge,
pub lru_runtime_loop_count: IntCounter,
pub lru_watermark_step: IntGauge,
pub lru_evicted_watermark_time_diff_ms: GenericGaugeVec<AtomicI64>,
pub jemalloc_allocated_bytes: IntGauge,
pub jemalloc_active_bytes: IntGauge,

Expand Down Expand Up @@ -588,6 +589,14 @@ impl StreamingMetrics {
)
.unwrap();

let lru_evicted_watermark_time_diff_ms = register_int_gauge_vec_with_registry!(
"lru_evicted_watermark_time_diff_ms",
"The diff between current watermark and latest evicted watermark time by actors",
&["table_id", "actor_id", "desc"],
registry
)
.unwrap();

let jemalloc_allocated_bytes = register_int_gauge_with_registry!(
"jemalloc_allocated_bytes",
"The allocated memory jemalloc, got from jemalloc_ctl",
Expand Down Expand Up @@ -695,6 +704,7 @@ impl StreamingMetrics {
lru_physical_now_ms,
lru_runtime_loop_count,
lru_watermark_step,
lru_evicted_watermark_time_diff_ms,
jemalloc_allocated_bytes,
jemalloc_active_bytes,
user_compute_error_count,
Expand Down