Skip to content

Commit eb5ac69

Browse files
authored
Cleanup ChunkReader (#4118) (#4156)
* Remove length from ChunkReader (#4118)
* Remove ChunkReader::T Send bound
* Remove FileSource
* Tweak docs
1 parent 08dc16c commit eb5ac69

File tree

7 files changed

+88
-334
lines changed

7 files changed

+88
-334
lines changed

parquet/src/arrow/async_reader/mod.rs

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,25 @@ enum ColumnChunkData {
744744
Dense { offset: usize, data: Bytes },
745745
}
746746

747+
impl ColumnChunkData {
748+
fn get(&self, start: u64) -> Result<Bytes> {
749+
match &self {
750+
ColumnChunkData::Sparse { data, .. } => data
751+
.binary_search_by_key(&start, |(offset, _)| *offset as u64)
752+
.map(|idx| data[idx].1.clone())
753+
.map_err(|_| {
754+
ParquetError::General(format!(
755+
"Invalid offset in sparse column chunk data: {start}"
756+
))
757+
}),
758+
ColumnChunkData::Dense { offset, data } => {
759+
let start = start as usize - *offset;
760+
Ok(data.slice(start..))
761+
}
762+
}
763+
}
764+
}
765+
747766
impl Length for ColumnChunkData {
748767
fn len(&self) -> u64 {
749768
match &self {
@@ -756,26 +775,12 @@ impl Length for ColumnChunkData {
756775
impl ChunkReader for ColumnChunkData {
757776
type T = bytes::buf::Reader<Bytes>;
758777

759-
fn get_read(&self, start: u64, length: usize) -> Result<Self::T> {
760-
Ok(self.get_bytes(start, length)?.reader())
778+
fn get_read(&self, start: u64) -> Result<Self::T> {
779+
Ok(self.get(start)?.reader())
761780
}
762781

763782
fn get_bytes(&self, start: u64, length: usize) -> Result<Bytes> {
764-
match &self {
765-
ColumnChunkData::Sparse { data, .. } => data
766-
.binary_search_by_key(&start, |(offset, _)| *offset as u64)
767-
.map(|idx| data[idx].1.slice(0..length))
768-
.map_err(|_| {
769-
ParquetError::General(format!(
770-
"Invalid offset in sparse column chunk data: {start}"
771-
))
772-
}),
773-
ColumnChunkData::Dense { offset, data } => {
774-
let start = start as usize - *offset;
775-
let end = start + length;
776-
Ok(data.slice(start..end))
777-
}
778-
}
783+
Ok(self.get(start)?.slice(..length))
779784
}
780785
}
781786

parquet/src/bin/parquet-layout.rs

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -175,8 +175,7 @@ fn read_page_header<C: ChunkReader>(
175175
}
176176
}
177177

178-
let len = reader.len().checked_sub(offset).unwrap() as usize;
179-
let input = reader.get_read(offset, len)?;
178+
let input = reader.get_read(offset)?;
180179
let mut tracked = TrackedRead(input, 0);
181180
let mut prot = TCompactInputProtocol::new(&mut tracked);
182181
let header = PageHeader::read_from_in_protocol(&mut prot)?;

parquet/src/file/footer.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ pub fn parse_metadata<R: ChunkReader>(chunk_reader: &R) -> Result<ParquetMetaDat
4646

4747
let mut footer = [0_u8; 8];
4848
chunk_reader
49-
.get_read(file_size - 8, 8)?
49+
.get_read(file_size - 8)?
5050
.read_exact(&mut footer)?;
5151

5252
let metadata_len = decode_footer(&footer)?;

parquet/src/file/reader.rs

Lines changed: 59 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
//! readers to read individual column chunks, or access record
2020
//! iterator.
2121
22-
use bytes::Bytes;
22+
use bytes::{Buf, Bytes};
23+
use std::fs::File;
24+
use std::io::{BufReader, Seek, SeekFrom};
2325
use std::{boxed::Box, io::Read, sync::Arc};
2426

2527
use crate::bloom_filter::Sbbf;
@@ -44,19 +46,47 @@ pub trait Length {
4446
}
4547

4648
/// The ChunkReader trait generates readers of chunks of a source.
47-
/// For a file system reader, each chunk might contain a clone of File bounded on a given range.
48-
/// For an object store reader, each read can be mapped to a range request.
49+
///
50+
/// For more information see [`File::try_clone`]
4951
pub trait ChunkReader: Length + Send + Sync {
50-
type T: Read + Send;
51-
/// Get a serially readable slice of the current reader
52-
/// This should fail if the slice exceeds the current bounds
53-
fn get_read(&self, start: u64, length: usize) -> Result<Self::T>;
52+
type T: Read;
53+
54+
/// Get a [`Read`] starting at the provided file offset
55+
///
56+
/// Subsequent or concurrent calls to [`Self::get_read`] or [`Self::get_bytes`] may
57+
/// side-effect on previously returned [`Self::T`]. Care should be taken to avoid this
58+
///
59+
/// See [`File::try_clone`] for more information
60+
fn get_read(&self, start: u64) -> Result<Self::T>;
5461

5562
/// Get a range as bytes
56-
/// This should fail if the exact number of bytes cannot be read
63+
///
64+
/// Concurrent calls to [`Self::get_bytes`] may result in interleaved output
65+
///
66+
/// See [`File::try_clone`] for more information
67+
fn get_bytes(&self, start: u64, length: usize) -> Result<Bytes>;
68+
}
69+
70+
impl Length for File {
71+
fn len(&self) -> u64 {
72+
self.metadata().map(|m| m.len()).unwrap_or(0u64)
73+
}
74+
}
75+
76+
impl ChunkReader for File {
77+
type T = BufReader<File>;
78+
79+
fn get_read(&self, start: u64) -> Result<Self::T> {
80+
let mut reader = self.try_clone()?;
81+
reader.seek(SeekFrom::Start(start))?;
82+
Ok(BufReader::new(self.try_clone()?))
83+
}
84+
5785
fn get_bytes(&self, start: u64, length: usize) -> Result<Bytes> {
5886
let mut buffer = Vec::with_capacity(length);
59-
let read = self.get_read(start, length)?.read_to_end(&mut buffer)?;
87+
let mut reader = self.try_clone()?;
88+
reader.seek(SeekFrom::Start(start))?;
89+
let read = reader.take(length as _).read_to_end(&mut buffer)?;
6090

6191
if read != length {
6292
return Err(eof_err!(
@@ -69,6 +99,26 @@ pub trait ChunkReader: Length + Send + Sync {
6999
}
70100
}
71101

102+
impl Length for Bytes {
    /// Returns the buffer length in bytes, widened to `u64`
    fn len(&self) -> u64 {
        // Call the inherent `len` on the byte slice to avoid recursing into
        // this trait method
        self.as_ref().len() as u64
    }
}
107+
108+
impl ChunkReader for Bytes {
109+
type T = bytes::buf::Reader<Bytes>;
110+
111+
fn get_read(&self, start: u64) -> Result<Self::T> {
112+
let start = start as usize;
113+
Ok(self.slice(start..).reader())
114+
}
115+
116+
fn get_bytes(&self, start: u64, length: usize) -> Result<Bytes> {
117+
let start = start as usize;
118+
Ok(self.slice(start..start + length))
119+
}
120+
}
121+
72122
// ----------------------------------------------------------------------
73123
// APIs for file & row group readers
74124

parquet/src/file/serialized_reader.rs

Lines changed: 5 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -40,60 +40,8 @@ use crate::format::{PageHeader, PageLocation, PageType};
4040
use crate::record::reader::RowIter;
4141
use crate::record::Row;
4242
use crate::schema::types::Type as SchemaType;
43-
use crate::util::{io::TryClone, memory::ByteBufferPtr};
44-
use bytes::{Buf, Bytes};
43+
use crate::util::memory::ByteBufferPtr;
4544
use thrift::protocol::{TCompactInputProtocol, TSerializable};
46-
// export `SliceableCursor` and `FileSource` publicly so clients can
47-
// re-use the logic in their own ParquetFileWriter wrappers
48-
pub use crate::util::io::FileSource;
49-
50-
// ----------------------------------------------------------------------
51-
// Implementations of traits facilitating the creation of a new reader
52-
53-
impl Length for File {
54-
fn len(&self) -> u64 {
55-
self.metadata().map(|m| m.len()).unwrap_or(0u64)
56-
}
57-
}
58-
59-
impl TryClone for File {
60-
fn try_clone(&self) -> std::io::Result<Self> {
61-
self.try_clone()
62-
}
63-
}
64-
65-
impl ChunkReader for File {
66-
type T = FileSource<File>;
67-
68-
fn get_read(&self, start: u64, length: usize) -> Result<Self::T> {
69-
Ok(FileSource::new(self, start, length))
70-
}
71-
}
72-
73-
impl Length for Bytes {
74-
fn len(&self) -> u64 {
75-
self.len() as u64
76-
}
77-
}
78-
79-
impl TryClone for Bytes {
80-
fn try_clone(&self) -> std::io::Result<Self> {
81-
Ok(self.clone())
82-
}
83-
}
84-
85-
impl ChunkReader for Bytes {
86-
type T = bytes::buf::Reader<Bytes>;
87-
88-
fn get_read(&self, start: u64, length: usize) -> Result<Self::T> {
89-
Ok(self.get_bytes(start, length)?.reader())
90-
}
91-
92-
fn get_bytes(&self, start: u64, length: usize) -> Result<Bytes> {
93-
let start = start as usize;
94-
Ok(self.slice(start..start + length))
95-
}
96-
}
9745

9846
impl TryFrom<File> for SerializedFileReader<File> {
9947
type Error = ParquetError;
@@ -662,7 +610,7 @@ impl<R: ChunkReader> PageReader for SerializedPageReader<R> {
662610
return Ok(None);
663611
}
664612

665-
let mut read = self.reader.get_read(*offset as u64, *remaining)?;
613+
let mut read = self.reader.get_read(*offset as u64)?;
666614
let header = if let Some(header) = next_page_header.take() {
667615
*header
668616
} else {
@@ -752,8 +700,7 @@ impl<R: ChunkReader> PageReader for SerializedPageReader<R> {
752700
continue;
753701
}
754702
} else {
755-
let mut read =
756-
self.reader.get_read(*offset as u64, *remaining_bytes)?;
703+
let mut read = self.reader.get_read(*offset as u64)?;
757704
let (header_len, header) = read_page_header_len(&mut read)?;
758705
*offset += header_len;
759706
*remaining_bytes -= header_len;
@@ -807,8 +754,7 @@ impl<R: ChunkReader> PageReader for SerializedPageReader<R> {
807754
*offset += buffered_header.compressed_page_size as usize;
808755
*remaining_bytes -= buffered_header.compressed_page_size as usize;
809756
} else {
810-
let mut read =
811-
self.reader.get_read(*offset as u64, *remaining_bytes)?;
757+
let mut read = self.reader.get_read(*offset as u64)?;
812758
let (header_len, header) = read_page_header_len(&mut read)?;
813759
let data_page_size = header.compressed_page_size as usize;
814760
*offset += header_len + data_page_size;
@@ -827,6 +773,7 @@ impl<R: ChunkReader> PageReader for SerializedPageReader<R> {
827773

828774
#[cfg(test)]
829775
mod tests {
776+
use bytes::Bytes;
830777
use std::sync::Arc;
831778

832779
use crate::format::BoundaryOrder;

0 commit comments

Comments (0)