This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 741121b258 Update documentation for ParquetReader (#7501)
741121b258 is described below
commit 741121b25820c97b16c678c4b38fae67b8762d35
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu May 15 13:10:31 2025 -0400
Update documentation for ParquetReader (#7501)
---
parquet/src/arrow/arrow_reader/mod.rs | 1 +
parquet/src/arrow/async_reader/mod.rs | 18 ++++++++++++++----
2 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 2f670a64e1..6d11902658 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -792,6 +792,7 @@ pub struct ParquetRecordBatchReader {
batch_size: usize,
array_reader: Box<dyn ArrayReader>,
schema: SchemaRef,
+ /// Row ranges to be selected from the data source
selection: Option<VecDeque<RowSelector>>,
}
diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs
index 45df68821c..9466fb9a35 100644
--- a/parquet/src/arrow/async_reader/mod.rs
+++ b/parquet/src/arrow/async_reader/mod.rs
@@ -586,6 +586,7 @@ where
metadata: self.metadata.as_ref(),
};
+ // Update selection based on any filters
if let Some(filter) = self.filter.as_mut() {
for predicate in filter.predicates.iter_mut() {
if !selects_any(selection.as_ref()) {
@@ -865,6 +866,7 @@ where
/// An in-memory collection of column chunks
struct InMemoryRowGroup<'a> {
offset_index: Option<&'a [OffsetIndexMetaData]>,
+ /// Column chunks for this row group
column_chunks: Vec<Option<Arc<ColumnChunkData>>>,
row_count: usize,
row_group_idx: usize,
@@ -872,7 +874,11 @@ struct InMemoryRowGroup<'a> {
}
impl InMemoryRowGroup<'_> {
- /// Fetches the necessary column data into memory
+ /// Fetches any additional column data specified in `projection` that is not already
+ /// present in `self.column_chunks`.
+ ///
+ /// If `selection` is provided, only the pages required for the selection
+ /// are fetched. Otherwise, all pages are fetched.
async fn fetch<T: AsyncFileReader + Send>(
&mut self,
input: &mut T,
@@ -1017,15 +1023,18 @@ enum ColumnChunkData {
Sparse {
/// Length of the full column chunk
length: usize,
- /// Set of data pages included in this sparse chunk. Each element is a tuple
- /// of (page offset, page data)
+ /// Subset of data pages included in this sparse chunk.
+ ///
+ /// Each element is a tuple of (page offset within file, page data).
+ /// Each entry is a complete page and the list is ordered by offset.
data: Vec<(usize, Bytes)>,
},
- /// Full column chunk and its offset
+ /// Full column chunk and the offset within the original file
Dense { offset: usize, data: Bytes },
}
impl ColumnChunkData {
+ /// Return the data for this column chunk at the given offset
fn get(&self, start: u64) -> Result<Bytes> {
match &self {
ColumnChunkData::Sparse { data, .. } => data
@@ -1045,6 +1054,7 @@ impl ColumnChunkData {
}
impl Length for ColumnChunkData {
+ /// Return the total length of the full column chunk
fn len(&self) -> u64 {
match &self {
ColumnChunkData::Sparse { length, .. } => *length as u64,