Skip to content

Memoize filesystem operations during unpack #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ contents are never required to be entirely resident in memory all at once.
filetime = "0.2"
futures-core = "0.3"
portable-atomic = "1"
rustc-hash = "2.1.0"
tokio = { version = "1", features = ["fs", "io-util", "rt"] }
tokio-stream = "0.1"

Expand Down
13 changes: 7 additions & 6 deletions src/archive.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use portable_atomic::{AtomicU64, Ordering};
use rustc_hash::FxHashSet;
use std::{
cmp,
collections::VecDeque,
Expand All @@ -6,10 +8,6 @@ use std::{
sync::Arc,
task::{Context, Poll},
};
use portable_atomic::{
AtomicU64,
Ordering,
};
use tokio::{
fs,
io::{self, AsyncRead as Read, AsyncReadExt},
Expand Down Expand Up @@ -244,6 +242,9 @@ impl<R: Read + Unpin> Archive<R> {
.await
.unwrap_or_else(|_| dst.to_path_buf());

// Memoize filesystem calls to canonicalize paths.
let mut targets = FxHashSet::default();

// Delay any directory entries until the end (they will be created if needed by
// descendants), to ensure that directory permissions do not interfer with descendant
// extraction.
Expand All @@ -253,12 +254,12 @@ impl<R: Read + Unpin> Archive<R> {
if file.header().entry_type() == crate::EntryType::Directory {
directories.push(file);
} else {
file.unpack_in(&dst).await?;
file.fields.unpack_in(&dst, &mut targets).await?;
}
}

for mut dir in directories {
dir.unpack_in(&dst).await?;
dir.fields.unpack_in(&dst, &mut targets).await?;
}

Ok(())
Expand Down
51 changes: 32 additions & 19 deletions src/entry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use crate::{
error::TarError, header::bytes2path, other, pax::pax_extensions, Archive, Header, PaxExtensions,
};
use filetime::{self, FileTime};
use rustc_hash::FxHashSet;
use std::{
borrow::Cow,
cmp,
Expand All @@ -26,7 +27,7 @@ use tokio::{
/// be inspected. It acts as a file handle by implementing the Reader trait. An
/// entry cannot be rewritten once inserted into an archive.
pub struct Entry<R: Read + Unpin> {
fields: EntryFields<R>,
pub(crate) fields: EntryFields<R>,
_ignored: marker::PhantomData<Archive<R>>,
}

Expand Down Expand Up @@ -274,7 +275,9 @@ impl<R: Read + Unpin> Entry<R> {
/// # Ok(()) }) }
/// ```
pub async fn unpack_in<P: AsRef<Path>>(&mut self, dst: P) -> io::Result<Option<PathBuf>> {
self.fields.unpack_in(dst.as_ref()).await
self.fields
.unpack_in(dst.as_ref(), &mut FxHashSet::default())
.await
}

/// Indicate whether extended file attributes (xattrs on Unix) are preserved
Expand Down Expand Up @@ -426,7 +429,20 @@ impl<R: Read + Unpin> EntryFields<R> {
Ok(Some(pax_extensions(self.pax_extensions.as_ref().unwrap())))
}

async fn unpack_in(&mut self, dst: &Path) -> io::Result<Option<PathBuf>> {
/// Unpack the [`Entry`] into the specified destination.
///
/// It's assumed that `dst` is already canonicalized, and that the memoized set of validated
/// paths are tied to `dst`.
pub(crate) async fn unpack_in(
&mut self,
dst: &Path,
memo: &mut FxHashSet<PathBuf>,
) -> io::Result<Option<PathBuf>> {
// It's assumed that `dst` is already canonicalized.
debug_assert!(dst
.canonicalize()
.is_ok_and(|canon_target| canon_target == dst));

// Notes regarding bsdtar 2.8.3 / libarchive 2.8.3:
// * Leading '/'s are trimmed. For example, `///test` is treated as
// `test`.
Expand Down Expand Up @@ -478,13 +494,16 @@ impl<R: Read + Unpin> EntryFields<R> {
None => return Ok(None),
};

self.ensure_dir_created(dst, parent)
.await
.map_err(|e| TarError::new(&format!("failed to create `{}`", parent.display()), e))?;

let canon_target = self.validate_inside_dst(dst, parent).await?;
// Validate the parent, if we haven't seen it yet.
if !memo.contains(parent) {
self.ensure_dir_created(dst, parent).await.map_err(|e| {
TarError::new(&format!("failed to create `{}`", parent.display()), e)
})?;
self.validate_inside_dst(dst, parent).await?;
memo.insert(parent.to_path_buf());
}

self.unpack(Some(&canon_target), &file_dst)
self.unpack(Some(&parent), &file_dst)
.await
.map_err(|e| TarError::new(&format!("failed to unpack `{}`", file_dst.display()), e))?;

Expand Down Expand Up @@ -890,32 +909,26 @@ impl<R: Read + Unpin> EntryFields<R> {
Ok(())
}

async fn validate_inside_dst(&self, dst: &Path, file_dst: &Path) -> io::Result<PathBuf> {
async fn validate_inside_dst(&self, dst: &Path, file_dst: &Path) -> io::Result<()> {
// Abort if target (canonical) parent is outside of `dst`
let canon_parent = file_dst.canonicalize().map_err(|err| {
Error::new(
err.kind(),
format!("{} while canonicalizing {}", err, file_dst.display()),
)
})?;
let canon_target = dst.canonicalize().map_err(|err| {
Error::new(
err.kind(),
format!("{} while canonicalizing {}", err, dst.display()),
)
})?;
if !canon_parent.starts_with(&canon_target) {
if !canon_parent.starts_with(&dst) {
let err = TarError::new(
&format!(
"trying to unpack outside of destination path: {}",
canon_target.display()
dst.display()
),
// TODO: use ErrorKind::InvalidInput here? (minor breaking change)
Error::new(ErrorKind::Other, "Invalid argument"),
);
return Err(err.into());
}
Ok(canon_target)
Ok(())
}
}

Expand Down