This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 741121b258 Update documentation for ParquetReader (#7501)
741121b258 is described below
commit 741121b25820c97b16c678c4b38fae67b8762d35
Author: Andrew Lamb <[email protected]>
AuthorDate: Thu May 15 13:10:31 2025 -0400
Update documentation for ParquetReader (#7501)
---
parquet/src/arrow/arrow_reader/mod.rs | 1 +
parquet/src/arrow/async_reader/mod.rs | 18 ++++++++++++++----
2 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 2f670a64e1..6d11902658 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -792,6 +792,7 @@ pub struct ParquetRecordBatchReader {
batch_size: usize,
array_reader: Box<dyn ArrayReader>,
schema: SchemaRef,
+ /// Row ranges to be selected from the data source
selection: Option<VecDeque<RowSelector>>,
}
diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs
index 45df68821c..9466fb9a35 100644
--- a/parquet/src/arrow/async_reader/mod.rs
+++ b/parquet/src/arrow/async_reader/mod.rs
@@ -586,6 +586,7 @@ where
metadata: self.metadata.as_ref(),
};
+ // Update selection based on any filters
if let Some(filter) = self.filter.as_mut() {
for predicate in filter.predicates.iter_mut() {
if !selects_any(selection.as_ref()) {
@@ -865,6 +866,7 @@ where
/// An in-memory collection of column chunks
struct InMemoryRowGroup<'a> {
offset_index: Option<&'a [OffsetIndexMetaData]>,
+ /// Column chunks for this row group
column_chunks: Vec<Option<Arc<ColumnChunkData>>>,
row_count: usize,
row_group_idx: usize,
@@ -872,7 +874,11 @@ struct InMemoryRowGroup<'a> {
}
impl InMemoryRowGroup<'_> {
- /// Fetches the necessary column data into memory
+ /// Fetches any additional column data specified in `projection` that is not already
+ /// present in `self.column_chunks`.
+ ///
+ /// If `selection` is provided, only the pages required for the selection
+ /// are fetched. Otherwise, all pages are fetched.
async fn fetch<T: AsyncFileReader + Send>(
&mut self,
input: &mut T,
@@ -1017,15 +1023,18 @@ enum ColumnChunkData {
Sparse {
/// Length of the full column chunk
length: usize,
- /// Set of data pages included in this sparse chunk. Each element is a tuple
- /// of (page offset, page data)
+ /// Subset of data pages included in this sparse chunk.
+ ///
+ /// Each element is a tuple of (page offset within file, page data).
+ /// Each entry is a complete page and the list is ordered by offset.
data: Vec<(usize, Bytes)>,
},
- /// Full column chunk and its offset
+ /// Full column chunk and the offset within the original file
Dense { offset: usize, data: Bytes },
}
impl ColumnChunkData {
+ /// Return the data for this column chunk at the given offset
fn get(&self, start: u64) -> Result<Bytes> {
match &self {
ColumnChunkData::Sparse { data, .. } => data
@@ -1045,6 +1054,7 @@ impl ColumnChunkData {
}
impl Length for ColumnChunkData {
+ /// Return the total length of the full column chunk
fn len(&self) -> u64 {
match &self {
ColumnChunkData::Sparse { length, .. } => *length as u64,