From 485bd2dc19c25aff90a198f68c586b2fdc6543e1 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Sun, 2 Feb 2025 19:33:29 -0500 Subject: [PATCH] Memoize filesystem operations during unpack --- Cargo.toml | 1 + src/archive.rs | 13 +++++++------ src/entry.rs | 51 +++++++++++++++++++++++++++++++------------------- 3 files changed, 40 insertions(+), 25 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 84418f8..229ff90 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,7 @@ contents are never required to be entirely resident in memory all at once. filetime = "0.2" futures-core = "0.3" portable-atomic = "1" +rustc-hash = "2.1.0" tokio = { version = "1", features = ["fs", "io-util", "rt"] } tokio-stream = "0.1" diff --git a/src/archive.rs b/src/archive.rs index 5d86346..9733cea 100644 --- a/src/archive.rs +++ b/src/archive.rs @@ -1,3 +1,5 @@ +use portable_atomic::{AtomicU64, Ordering}; +use rustc_hash::FxHashSet; use std::{ cmp, collections::VecDeque, @@ -6,10 +8,6 @@ use std::{ sync::Arc, task::{Context, Poll}, }; -use portable_atomic::{ - AtomicU64, - Ordering, -}; use tokio::{ fs, io::{self, AsyncRead as Read, AsyncReadExt}, @@ -244,6 +242,9 @@ impl Archive { .await .unwrap_or_else(|_| dst.to_path_buf()); + // Memoize filesystem calls to canonicalize paths. + let mut targets = FxHashSet::default(); + // Delay any directory entries until the end (they will be created if needed by // descendants), to ensure that directory permissions do not interfer with descendant // extraction. @@ -253,12 +254,12 @@ impl Archive { if file.header().entry_type() == crate::EntryType::Directory { directories.push(file); } else { - file.unpack_in(&dst).await?; + file.fields.unpack_in(&dst, &mut targets).await?; } } for mut dir in directories { - dir.unpack_in(&dst).await?; + dir.fields.unpack_in(&dst, &mut targets).await?; } Ok(()) diff --git a/src/entry.rs b/src/entry.rs index 26f349f..9c1dae2 100644 --- a/src/entry.rs +++ b/src/entry.rs @@ -2,6 +2,7 @@ use crate::{ error::TarError, header::bytes2path, other, pax::pax_extensions, Archive, Header, PaxExtensions, }; use filetime::{self, FileTime}; +use rustc_hash::FxHashSet; use std::{ borrow::Cow, cmp, @@ -26,7 +27,7 @@ use tokio::{ /// be inspected. It acts as a file handle by implementing the Reader trait. An /// entry cannot be rewritten once inserted into an archive. pub struct Entry { - fields: EntryFields, + pub(crate) fields: EntryFields, _ignored: marker::PhantomData>, } @@ -274,7 +275,9 @@ impl Entry { /// # Ok(()) }) } /// ``` pub async fn unpack_in>(&mut self, dst: P) -> io::Result> { - self.fields.unpack_in(dst.as_ref()).await + self.fields + .unpack_in(dst.as_ref(), &mut FxHashSet::default()) + .await } /// Indicate whether extended file attributes (xattrs on Unix) are preserved @@ -426,7 +429,20 @@ impl EntryFields { Ok(Some(pax_extensions(self.pax_extensions.as_ref().unwrap()))) } - async fn unpack_in(&mut self, dst: &Path) -> io::Result> { + /// Unpack the [`Entry`] into the specified destination. + /// + /// It's assumed that `dst` is already canonicalized, and that the memoized set of validated + /// paths are tied to `dst`. + pub(crate) async fn unpack_in( + &mut self, + dst: &Path, + memo: &mut FxHashSet, + ) -> io::Result> { + // It's assumed that `dst` is already canonicalized. + debug_assert!(dst + .canonicalize() + .is_ok_and(|canon_target| canon_target == dst)); + // Notes regarding bsdtar 2.8.3 / libarchive 2.8.3: // * Leading '/'s are trimmed. For example, `///test` is treated as // `test`. @@ -478,13 +494,16 @@ impl EntryFields { None => return Ok(None), }; - self.ensure_dir_created(dst, parent) - .await - .map_err(|e| TarError::new(&format!("failed to create `{}`", parent.display()), e))?; - - let canon_target = self.validate_inside_dst(dst, parent).await?; + // Validate the parent, if we haven't seen it yet. + if !memo.contains(parent) { + self.ensure_dir_created(dst, parent).await.map_err(|e| { + TarError::new(&format!("failed to create `{}`", parent.display()), e) + })?; + self.validate_inside_dst(dst, parent).await?; + memo.insert(parent.to_path_buf()); + } - self.unpack(Some(&canon_target), &file_dst) + self.unpack(Some(&parent), &file_dst) .await .map_err(|e| TarError::new(&format!("failed to unpack `{}`", file_dst.display()), e))?; @@ -890,7 +909,7 @@ impl EntryFields { Ok(()) } - async fn validate_inside_dst(&self, dst: &Path, file_dst: &Path) -> io::Result { + async fn validate_inside_dst(&self, dst: &Path, file_dst: &Path) -> io::Result<()> { // Abort if target (canonical) parent is outside of `dst` let canon_parent = file_dst.canonicalize().map_err(|err| { Error::new( @@ -898,24 +917,18 @@ impl EntryFields { format!("{} while canonicalizing {}", err, file_dst.display()), ) })?; - let canon_target = dst.canonicalize().map_err(|err| { - Error::new( - err.kind(), - format!("{} while canonicalizing {}", err, dst.display()), - ) - })?; - if !canon_parent.starts_with(&canon_target) { + if !canon_parent.starts_with(&dst) { let err = TarError::new( &format!( "trying to unpack outside of destination path: {}", - canon_target.display() + dst.display() ), // TODO: use ErrorKind::InvalidInput here? (minor breaking change) Error::new(ErrorKind::Other, "Invalid argument"), ); return Err(err.into()); } - Ok(canon_target) + Ok(()) } }