Skip to content

Commit 8b2b7b3

Browse files
authored
Fix stalls on ffmpeg based decoder reset (#7998)
### What * Part of #7607 (All perf numbers were measured on Windows, which is well known for slower process handling compared to its competitors.) Makes the ffmpeg listen/write thread shutdown much more rigorous. It worked fine before, but on Windows in particular I often noticed long stalls: ![383056190-113e4efa-3376-4b7b-96cf-dfd79fac8e4d](https://github.com/user-attachments/assets/a31186dd-3971-4e30-b0c3-7bf9c9facffd) This happened mostly due to delays in "noticing" that ffmpeg was shut down already. The shutdown fixes themselves make the problem _almost_ go away. But it still happens that the "listen thread" takes a fair while until it closes: ![image](https://github.com/user-attachments/assets/936372ab-db4c-4c16-adf4-d25e45fe27c6) Occasionally I also still observed 100ms+ for this operation, especially when having several decoders open at the same time (coincidence? did that just make it more likely to get a bad one or is there more to it?). So I decided to have the thread shut down in the background instead - this is safe now, since the `on_output` callback gets disconnected prior to the shutdown. 
No profiler picture for this, since there's nothing left to look at ;-) ### Checklist * [x] I have read and agree to the [Contributor Guide](https://github.com/rerun-io/rerun/blob/main/CONTRIBUTING.md) and the [Code of Conduct](https://github.com/rerun-io/rerun/blob/main/CODE_OF_CONDUCT.md) * [x] I've included a screenshot or gif (if applicable) * [x] I have tested the web demo (if applicable): * Using examples from latest `main` build: [rerun.io/viewer](https://rerun.io/viewer/pr/7998?manifest_url=https://app.rerun.io/version/main/examples_manifest.json) * Using full set of examples from `nightly` build: [rerun.io/viewer](https://rerun.io/viewer/pr/7998?manifest_url=https://app.rerun.io/version/nightly/examples_manifest.json) * [x] The PR title and labels are set such as to maximize their usefulness for the next release's CHANGELOG * [x] If applicable, add a new check to the [release checklist](https://github.com/rerun-io/rerun/blob/main/tests/python/release_checklist)! * [x] I have noted any breaking changes to the log API in `CHANGELOG.md` and the migration guide - [PR Build Summary](https://build.rerun.io/pr/7998) - [Recent benchmark results](https://build.rerun.io/graphs/crates.html) - [Wasm size tracking](https://build.rerun.io/graphs/sizes.html) To run all checks from `main`, comment on the PR with `@rerun-bot full-check`.
1 parent c9d319c commit 8b2b7b3

File tree

1 file changed

+116
-56
lines changed

1 file changed

+116
-56
lines changed

crates/store/re_video/src/decode/ffmpeg.rs

+116-56
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
33
use std::{
44
collections::BTreeMap,
5-
io::Write,
5+
process::ChildStdin,
66
sync::{atomic::AtomicBool, Arc},
77
};
88

@@ -12,6 +12,7 @@ use ffmpeg_sidecar::{
1212
command::FfmpegCommand,
1313
event::{FfmpegEvent, LogLevel},
1414
};
15+
use parking_lot::Mutex;
1516

1617
use crate::Time;
1718

@@ -87,6 +88,36 @@ enum FfmpegFrameData {
8788
EndOfStream,
8889
}
8990

91+
/// Wraps an stdin with a shared shutdown boolean.
92+
struct StdinWithShutdown {
93+
shutdown: Arc<AtomicBool>,
94+
stdin: ChildStdin,
95+
}
96+
97+
impl StdinWithShutdown {
98+
// Don't use `std::io::ErrorKind::Interrupted` because it has special meaning for default implementations of the `Write` trait,
99+
// causing it to continue.
100+
const SHUTDOWN_ERROR_KIND: std::io::ErrorKind = std::io::ErrorKind::Other;
101+
}
102+
103+
impl std::io::Write for StdinWithShutdown {
104+
fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
105+
if self.shutdown.load(std::sync::atomic::Ordering::Acquire) {
106+
Err(std::io::Error::new(Self::SHUTDOWN_ERROR_KIND, "shutdown"))
107+
} else {
108+
self.stdin.write(buf)
109+
}
110+
}
111+
112+
fn flush(&mut self) -> std::io::Result<()> {
113+
if self.shutdown.load(std::sync::atomic::Ordering::Acquire) {
114+
Err(std::io::Error::new(Self::SHUTDOWN_ERROR_KIND, "shutdown"))
115+
} else {
116+
self.stdin.flush()
117+
}
118+
}
119+
}
120+
90121
struct FfmpegProcessAndListener {
91122
ffmpeg: FfmpegChild,
92123

@@ -100,7 +131,10 @@ struct FfmpegProcessAndListener {
100131
write_thread: Option<std::thread::JoinHandle<()>>,
101132

102133
/// If true, the write thread will not report errors. Used upon exit, so the write thread won't log spam on the hung up stdin.
103-
suppress_write_error_reports: Arc<AtomicBool>,
134+
stdin_shutdown: Arc<AtomicBool>,
135+
136+
/// On output instance used by the threads.
137+
on_output: Arc<Mutex<Option<Arc<OutputCallback>>>>,
104138
}
105139

106140
impl FfmpegProcessAndListener {
@@ -151,6 +185,12 @@ impl FfmpegProcessAndListener {
151185
let (frame_info_tx, frame_info_rx) = crossbeam::channel::unbounded();
152186
let (frame_data_tx, frame_data_rx) = crossbeam::channel::unbounded();
153187

188+
let stdin_shutdown = Arc::new(AtomicBool::new(false));
189+
190+
// Mutex protect `on_output` so that we can shut down the threads at a defined point in time at which we
191+
// no longer receive any new frames or errors from this process.
192+
let on_output = Arc::new(Mutex::new(Some(on_output)));
193+
154194
let listen_thread = std::thread::Builder::new()
155195
.name(format!("ffmpeg-reader for {debug_name}"))
156196
.spawn({
@@ -166,20 +206,21 @@ impl FfmpegProcessAndListener {
166206
}
167207
})
168208
.expect("Failed to spawn ffmpeg listener thread");
169-
170-
let suppress_write_error_reports = Arc::new(AtomicBool::new(false));
171209
let write_thread = std::thread::Builder::new()
172210
.name(format!("ffmpeg-writer for {debug_name}"))
173211
.spawn({
212+
let on_output = on_output.clone();
174213
let ffmpeg_stdin = ffmpeg.take_stdin().ok_or(Error::NoStdin)?;
175-
let suppress_write_error_reports = suppress_write_error_reports.clone();
214+
let mut ffmpeg_stdin = StdinWithShutdown {
215+
stdin: ffmpeg_stdin,
216+
shutdown: stdin_shutdown.clone(),
217+
};
176218
move || {
177219
write_ffmpeg_input(
178-
ffmpeg_stdin,
220+
&mut ffmpeg_stdin,
179221
&frame_data_rx,
180222
on_output.as_ref(),
181223
&avcc,
182-
&suppress_write_error_reports,
183224
);
184225
}
185226
})
@@ -191,38 +232,65 @@ impl FfmpegProcessAndListener {
191232
frame_data_tx,
192233
listen_thread: Some(listen_thread),
193234
write_thread: Some(write_thread),
194-
suppress_write_error_reports,
235+
stdin_shutdown,
236+
on_output,
195237
})
196238
}
197239
}
198240

199241
impl Drop for FfmpegProcessAndListener {
200242
fn drop(&mut self) {
201243
re_tracing::profile_function!();
202-
self.suppress_write_error_reports
203-
.store(true, std::sync::atomic::Ordering::Relaxed);
244+
245+
// Stop all outputs from being written to - any attempt from here on out will fail and cause thread shutdown.
246+
// This way, we ensure all ongoing writes are finished and won't get any more on_output callbacks from this process
247+
// before we take any other action on the shutdown sequence.
248+
{
249+
self.on_output.lock().take();
250+
}
251+
252+
// Notify (potentially wake up) the stdin write thread to stop it (it might be sleeping).
204253
self.frame_data_tx.send(FfmpegFrameData::EndOfStream).ok();
254+
// Kill stdin for the write thread. This helps cancelling ongoing stream write operations.
255+
self.stdin_shutdown
256+
.store(true, std::sync::atomic::Ordering::Release);
257+
258+
// Kill the ffmpeg process itself.
259+
// This should wake up the listen thread if it is sleeping, but that may take a while.
205260
self.ffmpeg.kill().ok();
206261

207-
if let Some(write_thread) = self.write_thread.take() {
208-
if write_thread.join().is_err() {
209-
re_log::error!("Failed to join ffmpeg listener thread.");
262+
// Unfortunately, even with the above measures, it can still happen that the listen threads take occasionally 100ms and more to shut down.
263+
// (very much depending on the system & OS, typical times may be low with large outliers)
264+
// It is crucial that the threads come down eventually and rather timely so to avoid leaking resources.
265+
// However, in order to avoid stalls, we'll let them finish in parallel.
266+
//
267+
// Since we disconnected the `on_output` callback from them, they won't influence any new instances.
268+
if false {
269+
{
270+
re_tracing::profile_scope!("shutdown write thread");
271+
if let Some(write_thread) = self.write_thread.take() {
272+
if write_thread.join().is_err() {
273+
re_log::error!("Failed to join ffmpeg listener thread.");
274+
}
275+
}
210276
}
211-
}
212-
if let Some(listen_thread) = self.listen_thread.take() {
213-
if listen_thread.join().is_err() {
214-
re_log::error!("Failed to join ffmpeg listener thread.");
277+
{
278+
re_tracing::profile_scope!("shutdown listen thread");
279+
if let Some(listen_thread) = self.listen_thread.take() {
280+
if listen_thread.join().is_err() {
281+
re_log::error!("Failed to join ffmpeg listener thread.");
282+
}
283+
}
215284
}
216285
}
217286
}
218287
}
219288

220289
fn write_ffmpeg_input(
221-
mut ffmpeg_stdin: std::process::ChildStdin,
290+
ffmpeg_stdin: &mut dyn std::io::Write,
222291
frame_data_rx: &Receiver<FfmpegFrameData>,
223-
on_output: &OutputCallback,
292+
on_output: &Mutex<Option<Arc<OutputCallback>>>,
224293
avcc: &re_mp4::Avc1Box,
225-
suppress_write_error_reports: &AtomicBool,
226294
) {
227295
let mut state = NaluStreamState::default();
228296

@@ -232,19 +300,18 @@ fn write_ffmpeg_input(
232300
FfmpegFrameData::EndOfStream => break,
233301
};
234302

235-
if let Err(err) =
236-
write_avc_chunk_to_nalu_stream(avcc, &mut ffmpeg_stdin, &chunk, &mut state)
237-
{
238-
let write_error = matches!(err, Error::FailedToWriteToFfmpeg(_));
239-
if !write_error
240-
|| !suppress_write_error_reports.load(std::sync::atomic::Ordering::Relaxed)
241-
{
242-
(on_output)(Err(err.into()));
243-
}
303+
if let Err(err) = write_avc_chunk_to_nalu_stream(avcc, ffmpeg_stdin, &chunk, &mut state) {
304+
let on_output = on_output.lock();
305+
if let Some(on_output) = on_output.as_ref() {
306+
let write_error = matches!(err, Error::FailedToWriteToFfmpeg(_));
307+
on_output(Err(err.into()));
244308

245-
// This is unlikely to improve! Ffmpeg process likely died.
246-
// By exiting here we hang up on the channel, making future attempts to push into it fail which should cause a reset eventually.
247-
if write_error {
309+
if write_error {
310+
// This is unlikely to improve! Ffmpeg process likely died.
311+
// By exiting here we hang up on the channel, making future attempts to push into it fail which should cause a reset eventually.
312+
return;
313+
}
314+
} else {
248315
return;
249316
}
250317
} else {
@@ -257,8 +324,8 @@ fn read_ffmpeg_output(
257324
debug_name: &str,
258325
ffmpeg_iterator: ffmpeg_sidecar::iter::FfmpegIterator,
259326
frame_info_rx: &Receiver<FfmpegFrameInfo>,
260-
on_output: &OutputCallback,
261-
) {
327+
on_output: &Mutex<Option<Arc<OutputCallback>>>,
328+
) -> Option<()> {
262329
/// Ignore some common output from ffmpeg:
263330
fn should_ignore_log_msg(msg: &str) -> bool {
264331
let patterns = [
@@ -310,19 +377,18 @@ fn read_ffmpeg_output(
310377
}
311378

312379
FfmpegEvent::Log(LogLevel::Error, msg) => {
313-
on_output(Err(Error::Ffmpeg(msg).into()));
380+
(on_output.lock().as_ref()?)(Err(Error::Ffmpeg(msg).into()));
314381
}
315382

316383
FfmpegEvent::Log(LogLevel::Fatal, msg) => {
317-
on_output(Err(Error::FfmpegFatal(msg).into()));
318-
return;
384+
(on_output.lock().as_ref()?)(Err(Error::FfmpegFatal(msg).into()));
319385
}
320386

321387
FfmpegEvent::Log(LogLevel::Unknown, msg) => {
322388
if msg.contains("system signals, hard exiting") {
323389
// That was probably us, killing the process.
324390
re_log::debug!("FFmpeg process for {debug_name} was killed");
325-
return;
391+
return None;
326392
}
327393
if !should_ignore_log_msg(&msg) {
328394
re_log::warn_once!("{debug_name} decoder: {msg}");
@@ -336,7 +402,7 @@ fn read_ffmpeg_output(
336402

337403
FfmpegEvent::Error(error) => {
338404
// An error in ffmpeg sidecar itself, rather than ffmpeg.
339-
on_output(Err(Error::FfmpegSidecar(error).into()));
405+
(on_output.lock().as_ref()?)(Err(Error::FfmpegSidecar(error).into()));
340406
}
341407

342408
FfmpegEvent::ParsedInput(input) => {
@@ -423,7 +489,7 @@ fn read_ffmpeg_output(
423489
re_log::debug!(
424490
"{debug_name} ffmpeg decoder frame info channel disconnected"
425491
);
426-
return;
492+
return None;
427493
};
428494

429495
// If the decodetimestamp did not increase, we're probably seeking backwards!
@@ -458,7 +524,7 @@ fn read_ffmpeg_output(
458524
debug_assert_eq!(pix_fmt, "rgb24");
459525
debug_assert_eq!(width as usize * height as usize * 3, data.len());
460526

461-
on_output(Ok(super::Frame {
527+
(on_output.lock().as_ref()?)(Ok(super::Frame {
462528
content: super::FrameContent {
463529
data,
464530
width,
@@ -476,7 +542,7 @@ fn read_ffmpeg_output(
476542
FfmpegEvent::Done => {
477543
// This happens on `pkill ffmpeg`, for instance.
478544
re_log::debug!("{debug_name}'s ffmpeg is Done");
479-
return;
545+
return None;
480546
}
481547

482548
FfmpegEvent::ParsedVersion(ffmpeg_version) => {
@@ -497,11 +563,13 @@ fn read_ffmpeg_output(
497563
FfmpegEvent::OutputChunk(_) => {
498564
// Something went seriously wrong if we end up here.
499565
re_log::error!("Unexpected ffmpeg output chunk for {debug_name}");
500-
on_output(Err(Error::UnexpectedFfmpegOutputChunk.into()));
501-
return;
566+
(on_output.lock().as_ref()?)(Err(Error::UnexpectedFfmpegOutputChunk.into()));
567+
return None;
502568
}
503569
}
504570
}
571+
572+
Some(())
505573
}
506574

507575
/// Decode H.264 video via ffmpeg over CLI
@@ -606,20 +674,12 @@ fn write_avc_chunk_to_nalu_stream(
606674
// Otherwise the decoder is not able to get the necessary information about how the video stream is encoded.
607675
if chunk.is_sync && !state.previous_frame_was_idr {
608676
for sps in &avcc.sequence_parameter_sets {
609-
nalu_stream
610-
.write_all(NAL_START_CODE)
611-
.map_err(Error::FailedToWriteToFfmpeg)?;
612-
nalu_stream
613-
.write_all(&sps.bytes)
614-
.map_err(Error::FailedToWriteToFfmpeg)?;
677+
write_bytes(nalu_stream, NAL_START_CODE)?;
678+
write_bytes(nalu_stream, &sps.bytes)?;
615679
}
616680
for pps in &avcc.picture_parameter_sets {
617-
nalu_stream
618-
.write_all(NAL_START_CODE)
619-
.map_err(Error::FailedToWriteToFfmpeg)?;
620-
nalu_stream
621-
.write_all(&pps.bytes)
622-
.map_err(Error::FailedToWriteToFfmpeg)?;
681+
write_bytes(nalu_stream, NAL_START_CODE)?;
682+
write_bytes(nalu_stream, &pps.bytes)?;
623683
}
624684
state.previous_frame_was_idr = true;
625685
} else {

0 commit comments

Comments
 (0)