This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new d60b1cd7d6 docs(parquet): move async parquet example into ArrowReaderBuilder docs (#9167)
d60b1cd7d6 is described below
commit d60b1cd7d60007382bf84c58ac7ba2626d887e19
Author: Vignesh <[email protected]>
AuthorDate: Wed Jan 21 21:44:50 2026 +0530
docs(parquet): move async parquet example into ArrowReaderBuilder docs (#9167)
Closes #9154
## Rationale for this change
The async parquet reader example under `parquet/examples` duplicated functionality
that is already exposed through the public `ArrowReaderBuilder::try_new` API.
Moving this example into the API documentation improves discoverability for users
while avoiding the need to maintain duplicate standalone examples.
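For context, a minimal sketch of the synchronous API path the documentation now
demonstrates (the `data.parquet` path and the surrounding `main` are illustrative,
not taken from the moved example):
```rust
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
use std::fs::File;

fn main() -> parquet::errors::Result<()> {
    // Open any source that implements `ChunkReader`, such as a `File` or `Bytes`.
    let file = File::open("data.parquet").unwrap();
    let builder = ParquetRecordBatchReaderBuilder::try_new(file)?;
    // The builder exposes ParquetMetaData before any row data is decoded.
    println!("row groups: {}", builder.metadata().num_row_groups());
    // Build the reader and iterate over RecordBatches.
    let reader = builder.build()?;
    for batch in reader {
        println!("read {} rows", batch?.num_rows());
    }
    Ok(())
}
```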
## What changes are included in this PR?
- Added an async parquet reading example to the rustdoc comments for `ArrowReaderBuilder::try_new`
- Removed the redundant `async_read_parquet.rs` example file from `parquet/examples`
## Are these changes tested?
Yes. Existing tests were run locally using:
```bash
cargo test -p parquet
```
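Since the example now lives in the rustdoc comments, running the documentation
tests alone should also exercise it (assuming the default features, which include
`arrow`, are enabled):
```bash
# Doc tests compile and run the examples embedded in rustdoc comments
cargo test -p parquet --doc
```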
---------
Co-authored-by: Vignesh S <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
---
.github/workflows/parquet.yml | 1 -
parquet/Cargo.toml | 5 ---
parquet/examples/async_read_parquet.rs | 69 ----------------------------------
parquet/src/arrow/arrow_reader/mod.rs | 18 +++++----
4 files changed, 10 insertions(+), 83 deletions(-)
diff --git a/.github/workflows/parquet.yml b/.github/workflows/parquet.yml
index 8b94efd91f..9ae7d47edd 100644
--- a/.github/workflows/parquet.yml
+++ b/.github/workflows/parquet.yml
@@ -68,7 +68,6 @@ jobs:
run: |
# Test parquet examples
cargo run -p parquet --example read_parquet
- cargo run -p parquet --example async_read_parquet --features="async"
cargo run -p parquet --example read_with_rowgroup --features="async"
# test compilation
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index 454f8455ff..053adc4fca 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -151,11 +151,6 @@ name = "write_parquet"
required-features = ["cli"]
path = "./examples/write_parquet.rs"
-[[example]]
-name = "async_read_parquet"
-required-features = ["arrow", "async"]
-path = "./examples/async_read_parquet.rs"
-
[[example]]
name = "read_with_rowgroup"
required-features = ["arrow", "async"]
diff --git a/parquet/examples/async_read_parquet.rs b/parquet/examples/async_read_parquet.rs
deleted file mode 100644
index 78287fa846..0000000000
--- a/parquet/examples/async_read_parquet.rs
+++ /dev/null
@@ -1,69 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-use arrow::compute::kernels::cmp::eq;
-use arrow::util::pretty::print_batches;
-use arrow_array::{Int32Array, Scalar};
-use futures::TryStreamExt;
-use parquet::arrow::arrow_reader::{ArrowPredicateFn, RowFilter};
-use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
-use parquet::errors::Result;
-use std::time::SystemTime;
-use tokio::fs::File;
-
-#[tokio::main(flavor = "current_thread")]
-async fn main() -> Result<()> {
- // Create parquet file that will be read.
- let testdata = arrow::util::test_util::parquet_test_data();
- let path = format!("{testdata}/alltypes_plain.parquet");
- let file = File::open(path).await.unwrap();
-
- // Create a async parquet reader builder with batch_size.
- // batch_size is the number of rows to read up to buffer once from pages, defaults to 1024
- let mut builder = ParquetRecordBatchStreamBuilder::new(file)
- .await
- .unwrap()
- .with_batch_size(8192);
-
- let file_metadata = builder.metadata().file_metadata().clone();
- let mask = ProjectionMask::roots(file_metadata.schema_descr(), [0, 1, 2]);
- // Set projection mask to read only root columns 1 and 2.
- builder = builder.with_projection(mask);
-
- // Highlight: set `RowFilter`, it'll push down filter predicates to skip IO and decode.
- // For more specific usage: please refer to https://github.com/apache/datafusion/blob/main/datafusion/datasource-parquet/src/row_filter.rs.
- let scalar = Int32Array::from(vec![1]);
- let filter = ArrowPredicateFn::new(
- ProjectionMask::roots(file_metadata.schema_descr(), [0]),
- move |record_batch| eq(record_batch.column(0), &Scalar::new(&scalar)),
- );
- let row_filter = RowFilter::new(vec![Box::new(filter)]);
- builder = builder.with_row_filter(row_filter);
-
- // Build a async parquet reader.
- let stream = builder.build().unwrap();
-
- let start = SystemTime::now();
-
- let result = stream.try_collect::<Vec<_>>().await?;
-
- println!("took: {} ms", start.elapsed().unwrap().as_millis());
-
- print_batches(&result).unwrap();
-
- Ok(())
-}
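For reference, the pattern the deleted example demonstrated condenses to the
sketch below; it assumes the `async` feature is enabled and a local
`data.parquet` file (the path is illustrative):
```rust
use futures::TryStreamExt;
use parquet::arrow::ParquetRecordBatchStreamBuilder;
use tokio::fs::File;

#[tokio::main(flavor = "current_thread")]
async fn main() -> parquet::errors::Result<()> {
    let file = File::open("data.parquet").await.unwrap();
    // Build an async stream of RecordBatches; batch_size caps rows per batch.
    let stream = ParquetRecordBatchStreamBuilder::new(file)
        .await?
        .with_batch_size(8192)
        .build()?;
    // Drive the stream to completion, collecting all batches.
    let batches = stream.try_collect::<Vec<_>>().await?;
    println!("read {} batches", batches.len());
    Ok(())
}
```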
diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs
index 29346f0b27..cef30961d0 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -1039,17 +1039,19 @@ impl<T: ChunkReader + 'static> ParquetRecordBatchReaderBuilder<T> {
/// # writer.write(&batch).unwrap();
/// # writer.close().unwrap();
/// # let file = Bytes::from(file);
- /// #
+ /// // Build the reader from anything that implements `ChunkReader`
+ /// // such as a `File`, or `Bytes`
/// let mut builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
- ///
- /// // Inspect metadata
+ /// // The builder has access to ParquetMetaData such
+ /// // as the number and layout of row groups
/// assert_eq!(builder.metadata().num_row_groups(), 1);
- ///
- /// // Construct reader
- /// let mut reader: ParquetRecordBatchReader = builder.with_row_groups(vec![0]).build().unwrap();
- ///
+ /// // Call build to create the reader
+ /// let mut reader: ParquetRecordBatchReader = builder.build().unwrap();
/// // Read data
- /// let _batch = reader.next().unwrap().unwrap();
+ /// while let Some(batch) = reader.next().transpose()? {
+ /// println!("Read {} rows", batch.num_rows());
+ /// }
+ /// # Ok::<(), parquet::errors::ParquetError>(())
/// ```
pub fn try_new(reader: T) -> Result<Self> {
Self::try_new_with_options(reader, Default::default())