This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d60b1cd7d6 docs(parquet): move async parquet example into ArrowReaderBuilder docs (#9167)
d60b1cd7d6 is described below
commit d60b1cd7d60007382bf84c58ac7ba2626d887e19
Author: Vignesh <[email protected]>
AuthorDate: Wed Jan 21 21:44:50 2026 +0530
docs(parquet): move async parquet example into ArrowReaderBuilder docs (#9167)
Closes #9154
## Rationale for this change
The async parquet reader example under `parquet/examples` duplicated functionality
that is already exposed through the public `ArrowReaderBuilder::try_new` API.
Moving this example into the API documentation improves discoverability for users
while avoiding the need to maintain duplicate standalone examples.
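For context, a minimal sketch of the synchronous API path the documentation now
demonstrates (the `data.parquet` path and the surrounding `main` are illustrative,
not taken from the moved example):
```rust
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use std::fs::File;

fn main() -> parquet::errors::Result<()> {
    // Open any source that implements `ChunkReader`, such as a `File` or `Bytes`.
    let file = File::open("data.parquet").unwrap();
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    // The builder exposes ParquetMetaData before any row data is decoded.
    println!("row groups: {}", builder.metadata().num_row_groups());
    // Build the reader and iterate over RecordBatches.
    let reader = builder.build()?;
    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```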
## What changes are included in this PR?
- Added an async parquet reading example to the rustdoc comments for `ArrowReaderBuilder::try_new`
- Removed the redundant `async_read_parquet.rs` example file from `parquet/examples`
## Are these changes tested?
Yes. Existing tests were run locally using:
```bash
cargo test -p parquet
```
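Since the example now lives in the rustdoc comments, running the documentation
tests alone should also exercise it (assuming the default features, which include
`arrow`, are enabled):
```bash
# Doc tests compile and run the examples embedded in rustdoc comments
cargo test -p parquet --doc
```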
---------
Co-authored-by: Vignesh S <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
---
.github/workflows/parquet.yml | 1 -
parquet/Cargo.toml | 5 ---
parquet/examples/async_read_parquet.rs | 69 ----------------------------------
parquet/src/arrow/arrow_reader/mod.rs | 18 +++++----
4 files changed, 10 insertions(+), 83 deletions(-)
diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index 8b94efd91f..9ae7d47edd 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -68,7 +68,6 @@ jobs:
run: |
# Test parquet examples
cargo run -p parquet --example read_parquet
- cargo run -p parquet --example async_read_parquet --features="async"
cargo run -p parquet --example read_with_rowgroup --features="async"
# test compilation
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index 454f8455ff..053adc4fca 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -151,11 +151,6 @@ name = "write_parquet"
required-features = ["cli"]
path = "./examples/write_parquet.rs"
-[[example]]
-name = "async_read_parquet"
-required-features = ["arrow", "async"]
-path = "./examples/async_read_parquet.rs"
-
[[example]]
name = "read_with_rowgroup"
required-features = ["arrow", "async"]
diff --git a/parquet/examples/async_read_parquet.rs b/parquet/examples/async_read_parquet.rs
deleted file mode 100644
index 78287fa846..0000000000
--- a/parquet/examples/async_read_parquet.rs
+++ /dev/null
@@ -1,69 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use arrow::compute::kernels::cmp::eq;
-use arrow::util::pretty::print_batches;
-use arrow_array::{Int32Array, Scalar};
-use futures::TryStreamExt;
-use parquet::arrow::arrow_reader::{ArrowPredicateFn, RowFilter};
-use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
-use parquet::errors::Result;
-use std::time::SystemTime;
-use tokio::fs::File;
-
-#[tokio::main(flavor = "current_thread")]
-async fn main() -> Result<()> {
- // Create parquet file that will be read.
- let testdata = arrow::util::test_util::parquet_test_data();
- let path = format!("{testdata}/alltypes_plain.parquet");
- let file = File::open(path).await.unwrap();
-
- // Create a async parquet reader builder with batch_size.
- // batch_size is the number of rows to read up to buffer once from pages, defaults to 1024
- let mut builder = ParquetRecordBatchStreamBuilder::new(file)
- .await
- .unwrap()
- .with_batch_size(8192);
-
- let file_metadata = builder.metadata().file_metadata().clone();
- let mask = ProjectionMask::roots(file_metadata.schema_descr(), [0, 1, 2]);
- // Set projection mask to read only root columns 1 and 2.
- builder = builder.with_projection(mask);
-
- // Highlight: set `RowFilter`, it'll push down filter predicates to skip IO and decode.
- // For more specific usage: please refer to https://github.com/apache/datafusion/blob/main/datafusion/datasource-parquet/src/row_filter.rs.
- let scalar = Int32Array::from(vec![1]);
- let filter = ArrowPredicateFn::new(
- ProjectionMask::roots(file_metadata.schema_descr(), [0]),
- move |record_batch| eq(record_batch.column(0), &Scalar::new(&scalar)),
- );
- let row_filter = RowFilter::new(vec![Box::new(filter)]);
- builder = builder.with_row_filter(row_filter);
-
- // Build a async parquet reader.
- let stream = builder.build().unwrap();
-
- let start = SystemTime::now();
-
- let result = stream.try_collect::<Vec<_>>().await?;
-
- println!("took: {} ms", start.elapsed().unwrap().as_millis());
-
- print_batches(&result).unwrap();
-
- Ok(())
-}
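For reference, the pattern the deleted example demonstrated condenses to the
sketch below; it assumes the `async` feature is enabled and a local
`data.parquet` file (the path is illustrative):
```rust
use futures::TryStreamExt;
use parquet::arrow::ParquetRecordBatchStreamBuilder;
use tokio::fs::File;

#[tokio::main(flavor = "current_thread")]
async fn main() -> parquet::errors::Result<()> {
    let file = File::open("data.parquet").await.unwrap();
    // Build an async stream of RecordBatches; batch_size caps rows per batch.
    let stream = ParquetRecordBatchStreamBuilder::new(file)
        .await?
        .with_batch_size(8192)
        .build()?;
    // Drive the stream to completion, collecting all batches.
    let batches = stream.try_collect::<Vec<_>>().await?;
    println!("read {} batches", batches.len());
    Ok(())
}
```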
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 29346f0b27..cef30961d0 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1039,17 +1039,19 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
/// # writer.write(&batch).unwrap();
/// # writer.close().unwrap();
/// # let file = Bytes::from(file);
- /// #
+ /// // Build the reader from anything that implements `ChunkReader`
+ /// // such as a `File`, or `Bytes`
/// let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
- ///
- /// // Inspect metadata
+ /// // The builder has access to ParquetMetaData such
+ /// // as the number and layout of row groups
/// assert_eq!(builder.metadata().num_row_groups(), 1);
- ///
- /// // Construct reader
- /// let mut reader: ParquetRecordBatchReader = builder.with_row_groups(vec![0]).build().unwrap();
- ///
+ /// // Call build to create the reader
+ /// let mut reader: ParquetRecordBatchReader = builder.build().unwrap();
/// // Read data
- /// let _batch = reader.next().unwrap().unwrap();
+ /// while let Some(batch) = reader.next().transpose()? {
+ /// println!("Read {} rows", batch.num_rows());
+ /// }
+ /// # Ok::<(), parquet::errors::ParquetError>(())
/// ```
pub fn try_new(reader: T) -> Result<Self> {
Self::try_new_with_options(reader, Default::default())