alamb commented on code in PR #6948:
URL: https://github.com/apache/arrow-rs/pull/6948#discussion_r1904615500


##########
parquet/src/arrow/async_reader/mod.rs:
##########
@@ -249,53 +197,153 @@ impl ArrowReaderMetadata {
 /// breaking the pre-existing ParquetRecordBatchStreamBuilder API
 pub struct AsyncReader<T>(T);
 
-/// A builder used to construct a [`ParquetRecordBatchStream`] for `async` reading of a parquet file
+/// A builder for reading parquet files from an `async` source as [`ParquetRecordBatchStream`]
 ///
-/// In particular, this handles reading the parquet file metadata, allowing consumers
+/// This builder handles reading the parquet file metadata, allowing consumers
 /// to use this information to select what specific columns, row groups, etc...
 /// they wish to be read by the resulting stream
 ///
+/// See examples on [`ParquetRecordBatchStreamBuilder::new`]
+///
 /// See [`ArrowReaderBuilder`] for additional member functions
 pub type ParquetRecordBatchStreamBuilder<T> = ArrowReaderBuilder<AsyncReader<T>>;
 
 impl<T: AsyncFileReader + Send + 'static> ParquetRecordBatchStreamBuilder<T> {
-    /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided parquet file
+    /// Create a new [`ParquetRecordBatchStreamBuilder`] for reading from the
+    /// specified source.
     ///
     /// # Example
+    /// ```
+    /// # #[tokio::main(flavor="current_thread")]
+    /// # async fn main() {
+    /// #
+    /// # use arrow_array::RecordBatch;
+    /// # use arrow::util::pretty::pretty_format_batches;
+    /// # use futures::TryStreamExt;
+    /// #
+    /// # use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
+    /// #
+    /// # fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
+    /// #     let formatted = pretty_format_batches(batches).unwrap().to_string();
+    /// #     let actual_lines: Vec<_> = formatted.trim().lines().collect();
+    /// #     assert_eq!(
+    /// #          &actual_lines, expected_lines,
+    /// #          "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
+    /// #          expected_lines, actual_lines
+    /// #      );
+    /// #  }
+    /// #
+    /// # let testdata = arrow::util::test_util::parquet_test_data();
+    /// # let path = format!("{}/alltypes_plain.parquet", testdata);
+    /// // use tokio::fs::File to read data using async I/O. This can be replaced with
+    /// // another async I/O reader, such as a reader from an object store.
+    /// let file = tokio::fs::File::open(path).await.unwrap();
+    ///
+    /// // Configure options for reading from the async source
+    /// let builder = ParquetRecordBatchStreamBuilder::new(file)
+    ///     .await
+    ///     .unwrap();
+    /// // Building the stream opens the parquet file (reads metadata, etc) and returns
+    /// // a stream that can be used to incrementally read the data in batches
+    /// let stream = builder.build().unwrap();
+    /// // in this example, we collect the stream into a Vec<RecordBatch>
+    /// // but real applications would likely process the batches as they are read
+    /// let results = stream.try_collect::<Vec<_>>().await.unwrap();
+    /// // demonstrate the results are as expected
+    /// assert_batches_eq(
+    ///     &results,
+    ///     &[
+    ///       "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
+    ///       "| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col  | string_col | timestamp_col       |",
+    ///       "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
+    ///       "| 4  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30332f30312f3039 | 30         | 2009-03-01T00:00:00 |",
+    ///       "| 5  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30332f30312f3039 | 31         | 2009-03-01T00:01:00 |",
+    ///       "| 6  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30342f30312f3039 | 30         | 2009-04-01T00:00:00 |",
+    ///       "| 7  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30342f30312f3039 | 31         | 2009-04-01T00:01:00 |",
+    ///       "| 2  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30322f30312f3039 | 30         | 2009-02-01T00:00:00 |",
+    ///       "| 3  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30322f30312f3039 | 31         | 2009-02-01T00:01:00 |",
+    ///       "| 0  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30312f30312f3039 | 30         | 2009-01-01T00:00:00 |",
+    ///       "| 1  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30312f30312f3039 | 31         | 2009-01-01T00:01:00 |",
+    ///       "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
+    ///      ],
+    ///  );
+    /// # }
+    /// ```
+    ///
+    /// # Example configuring options and reading metadata

Review Comment:
   Made a more advanced example with projection pushdown
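   For reference, a minimal sketch of projection pushdown with this builder (the `data.parquet` path and the column indices below are illustrative, not taken from the PR):

```rust
use futures::TryStreamExt;
use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};

#[tokio::main(flavor = "current_thread")]
async fn main() {
    // Hypothetical input file; any readable Parquet file works here.
    let file = tokio::fs::File::open("data.parquet").await.unwrap();
    let builder = ParquetRecordBatchStreamBuilder::new(file).await.unwrap();

    // Select only the root columns at indices 1 and 2 so the reader skips
    // decoding (and ideally fetching) the other columns.
    let file_metadata = builder.metadata().file_metadata();
    let mask = ProjectionMask::roots(file_metadata.schema_descr(), [1, 2]);

    let stream = builder.with_projection(mask).build().unwrap();
    let batches: Vec<_> = stream.try_collect().await.unwrap();
    println!("read {} batches with only the projected columns", batches.len());
}
```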



##########
parquet/src/arrow/async_reader/mod.rs:
##########
@@ -249,53 +197,153 @@ impl ArrowReaderMetadata {
 /// breaking the pre-existing ParquetRecordBatchStreamBuilder API
 pub struct AsyncReader<T>(T);
 
-/// A builder used to construct a [`ParquetRecordBatchStream`] for `async` reading of a parquet file
+/// A builder for reading parquet files from an `async` source as [`ParquetRecordBatchStream`]
 ///
-/// In particular, this handles reading the parquet file metadata, allowing consumers
+/// This builder handles reading the parquet file metadata, allowing consumers
 /// to use this information to select what specific columns, row groups, etc...
 /// they wish to be read by the resulting stream
 ///
+/// See examples on [`ParquetRecordBatchStreamBuilder::new`]
+///
 /// See [`ArrowReaderBuilder`] for additional member functions
 pub type ParquetRecordBatchStreamBuilder<T> = ArrowReaderBuilder<AsyncReader<T>>;
 
 impl<T: AsyncFileReader + Send + 'static> ParquetRecordBatchStreamBuilder<T> {
-    /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided parquet file
+    /// Create a new [`ParquetRecordBatchStreamBuilder`] for reading from the
+    /// specified source.
     ///
     /// # Example
+    /// ```
+    /// # #[tokio::main(flavor="current_thread")]
+    /// # async fn main() {
+    /// #
+    /// # use arrow_array::RecordBatch;
+    /// # use arrow::util::pretty::pretty_format_batches;
+    /// # use futures::TryStreamExt;
+    /// #
+    /// # use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
+    /// #
+    /// # fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
+    /// #     let formatted = pretty_format_batches(batches).unwrap().to_string();
+    /// #     let actual_lines: Vec<_> = formatted.trim().lines().collect();
+    /// #     assert_eq!(
+    /// #          &actual_lines, expected_lines,
+    /// #          "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
+    /// #          expected_lines, actual_lines
+    /// #      );
+    /// #  }
+    /// #
+    /// # let testdata = arrow::util::test_util::parquet_test_data();
+    /// # let path = format!("{}/alltypes_plain.parquet", testdata);
+    /// // use tokio::fs::File to read data using async I/O. This can be replaced with
+    /// // another async I/O reader, such as a reader from an object store.
+    /// let file = tokio::fs::File::open(path).await.unwrap();
+    ///
+    /// // Configure options for reading from the async source
+    /// let builder = ParquetRecordBatchStreamBuilder::new(file)
+    ///     .await
+    ///     .unwrap();
+    /// // Building the stream opens the parquet file (reads metadata, etc) and returns
+    /// // a stream that can be used to incrementally read the data in batches
+    /// let stream = builder.build().unwrap();
+    /// // in this example, we collect the stream into a Vec<RecordBatch>
+    /// // but real applications would likely process the batches as they are read
+    /// let results = stream.try_collect::<Vec<_>>().await.unwrap();
+    /// // demonstrate the results are as expected
+    /// assert_batches_eq(
+    ///     &results,
+    ///     &[
+    ///       "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",

Review Comment:
   made the first example select all columns to make it simpler to understand



##########
parquet/src/arrow/async_reader/mod.rs:
##########
@@ -249,53 +197,153 @@ impl ArrowReaderMetadata {
 /// breaking the pre-existing ParquetRecordBatchStreamBuilder API
 pub struct AsyncReader<T>(T);
 
-/// A builder used to construct a [`ParquetRecordBatchStream`] for `async` reading of a parquet file
+/// A builder for reading parquet files from an `async` source as [`ParquetRecordBatchStream`]
 ///
-/// In particular, this handles reading the parquet file metadata, allowing consumers
+/// This builder handles reading the parquet file metadata, allowing consumers
 /// to use this information to select what specific columns, row groups, etc...
 /// they wish to be read by the resulting stream
 ///
+/// See examples on [`ParquetRecordBatchStreamBuilder::new`]
+///
 /// See [`ArrowReaderBuilder`] for additional member functions
 pub type ParquetRecordBatchStreamBuilder<T> = ArrowReaderBuilder<AsyncReader<T>>;
 
 impl<T: AsyncFileReader + Send + 'static> ParquetRecordBatchStreamBuilder<T> {
-    /// Create a new [`ParquetRecordBatchStreamBuilder`] with the provided parquet file
+    /// Create a new [`ParquetRecordBatchStreamBuilder`] for reading from the
+    /// specified source.
     ///
     /// # Example
+    /// ```
+    /// # #[tokio::main(flavor="current_thread")]
+    /// # async fn main() {
+    /// #
+    /// # use arrow_array::RecordBatch;
+    /// # use arrow::util::pretty::pretty_format_batches;
+    /// # use futures::TryStreamExt;
+    /// #
+    /// # use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
+    /// #
+    /// # fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
+    /// #     let formatted = pretty_format_batches(batches).unwrap().to_string();
+    /// #     let actual_lines: Vec<_> = formatted.trim().lines().collect();
+    /// #     assert_eq!(
+    /// #          &actual_lines, expected_lines,
+    /// #          "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
+    /// #          expected_lines, actual_lines
+    /// #      );
+    /// #  }
+    /// #
+    /// # let testdata = arrow::util::test_util::parquet_test_data();
+    /// # let path = format!("{}/alltypes_plain.parquet", testdata);
+    /// // use tokio::fs::File to read data using async I/O. This can be replaced with
+    /// // another async I/O reader, such as a reader from an object store.
+    /// let file = tokio::fs::File::open(path).await.unwrap();
+    ///
+    /// // Configure options for reading from the async source
+    /// let builder = ParquetRecordBatchStreamBuilder::new(file)
+    ///     .await
+    ///     .unwrap();
+    /// // Building the stream opens the parquet file (reads metadata, etc) and returns
+    /// // a stream that can be used to incrementally read the data in batches
+    /// let stream = builder.build().unwrap();
+    /// // in this example, we collect the stream into a Vec<RecordBatch>
+    /// // but real applications would likely process the batches as they are read
+    /// let results = stream.try_collect::<Vec<_>>().await.unwrap();
+    /// // demonstrate the results are as expected
+    /// assert_batches_eq(
+    ///     &results,
+    ///     &[
+    ///       "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
+    ///       "| id | bool_col | tinyint_col | smallint_col | int_col | bigint_col | float_col | double_col | date_string_col  | string_col | timestamp_col       |",
+    ///       "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
+    ///       "| 4  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30332f30312f3039 | 30         | 2009-03-01T00:00:00 |",
+    ///       "| 5  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30332f30312f3039 | 31         | 2009-03-01T00:01:00 |",
+    ///       "| 6  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30342f30312f3039 | 30         | 2009-04-01T00:00:00 |",
+    ///       "| 7  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30342f30312f3039 | 31         | 2009-04-01T00:01:00 |",
+    ///       "| 2  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30322f30312f3039 | 30         | 2009-02-01T00:00:00 |",
+    ///       "| 3  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30322f30312f3039 | 31         | 2009-02-01T00:01:00 |",
+    ///       "| 0  | true     | 0           | 0            | 0       | 0          | 0.0       | 0.0        | 30312f30312f3039 | 30         | 2009-01-01T00:00:00 |",
+    ///       "| 1  | false    | 1           | 1            | 1       | 10         | 1.1       | 10.1       | 30312f30312f3039 | 31         | 2009-01-01T00:01:00 |",
+    ///       "+----+----------+-------------+--------------+---------+------------+-----------+------------+------------------+------------+---------------------+",
+    ///      ],
+    ///  );
+    /// # }
+    /// ```
+    ///
+    /// # Example configuring options and reading metadata
+    ///
+    /// There are many options that control the behavior of the reader, such as
+    /// `with_batch_size`, `with_projection`, `with_filter`, etc...
     ///
     /// ```
-    /// # use std::fs::metadata;

Review Comment:
   the existing example is less compelling in my mind than what was on the module-level docs, so I removed the current one
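   For reference, a minimal sketch of what a "configuring options and reading metadata" example could look like (the `data.parquet` path, batch size, and row-group index below are illustrative assumptions, not from the PR):

```rust
use futures::TryStreamExt;
use parquet::arrow::ParquetRecordBatchStreamBuilder;

#[tokio::main(flavor = "current_thread")]
async fn main() {
    // Hypothetical input file; any readable Parquet file works here.
    let file = tokio::fs::File::open("data.parquet").await.unwrap();

    // The builder exposes the parquet metadata before the stream is built,
    // which can be used to decide what to read.
    let builder = ParquetRecordBatchStreamBuilder::new(file).await.unwrap();
    println!("file has {} row groups", builder.metadata().num_row_groups());

    // Configure the reader: smaller batches, and only the first row group.
    let stream = builder
        .with_batch_size(1024)
        .with_row_groups(vec![0])
        .build()
        .unwrap();

    let batches: Vec<_> = stream.try_collect().await.unwrap();
    println!("read {} batches", batches.len());
}
```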



##########
parquet/src/arrow/async_reader/mod.rs:
##########
@@ -15,65 +15,13 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Provides `async` API for reading parquet files as
+//! [`ParquetRecordBatchStreamBuilder`]: `async` API for reading Parquet files as
 //! [`RecordBatch`]es
 //!
-//! ```
-//! # #[tokio::main(flavor="current_thread")]
-//! # async fn main() {
-//! #
-//! # use arrow_array::RecordBatch;
-//! # use arrow::util::pretty::pretty_format_batches;
-//! # use futures::TryStreamExt;
-//! # use tokio::fs::File;
-//! #
-//! # use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
-//! #
-//! # fn assert_batches_eq(batches: &[RecordBatch], expected_lines: &[&str]) {
-//! #     let formatted = pretty_format_batches(batches).unwrap().to_string();
-//! #     let actual_lines: Vec<_> = formatted.trim().lines().collect();
-//! #     assert_eq!(
-//! #          &actual_lines, expected_lines,
-//! #          "\n\nexpected:\n\n{:#?}\nactual:\n\n{:#?}\n\n",
-//! #          expected_lines, actual_lines
-//! #      );
-//! #  }
-//! #
-//! let testdata = arrow::util::test_util::parquet_test_data();
-//! let path = format!("{}/alltypes_plain.parquet", testdata);
-//! let file = File::open(path).await.unwrap();
+//! This can be used to decode a Parquet file in streaming fashion (without
+//! downloading the whole file at once) from a remote source, such as an object store.
 //!
-//! let builder = ParquetRecordBatchStreamBuilder::new(file)
-//!     .await
-//!     .unwrap()
-//!     .with_batch_size(3);
-//!
-//! let file_metadata = builder.metadata().file_metadata();
-//! let mask = ProjectionMask::roots(file_metadata.schema_descr(), [1, 2, 6]);
-//!
-//! let stream = builder.with_projection(mask).build().unwrap();
-//! let results = stream.try_collect::<Vec<_>>().await.unwrap();
-//! assert_eq!(results.len(), 3);
-//!
-//! assert_batches_eq(
-//!     &results,
-//!     &[
-//!         "+----------+-------------+-----------+",
-//!         "| bool_col | tinyint_col | float_col |",
-//!         "+----------+-------------+-----------+",
-//!         "| true     | 0           | 0.0       |",
-//!         "| false    | 1           | 1.1       |",
-//!         "| true     | 0           | 0.0       |",
-//!         "| false    | 1           | 1.1       |",
-//!         "| true     | 0           | 0.0       |",
-//!         "| false    | 1           | 1.1       |",
-//!         "| true     | 0           | 0.0       |",
-//!         "| false    | 1           | 1.1       |",
-//!         "+----------+-------------+-----------+",
-//!      ],
-//!  );
-//! # }
-//! ```
+//! See example on [`ParquetRecordBatchStreamBuilder`]

Review Comment:
   moved examples to `ParquetRecordBatchStreamBuilder`
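   For reference, a minimal sketch of consuming the stream incrementally (rather than collecting it all), which is the pattern the "streaming fashion" wording above describes; the `data.parquet` path is a hypothetical stand-in for a remote source:

```rust
use futures::TryStreamExt;
use parquet::arrow::ParquetRecordBatchStreamBuilder;

#[tokio::main(flavor = "current_thread")]
async fn main() {
    let file = tokio::fs::File::open("data.parquet").await.unwrap();
    let stream = ParquetRecordBatchStreamBuilder::new(file)
        .await
        .unwrap()
        .build()
        .unwrap();

    // Pin the stream and pull batches one at a time; memory use then stays
    // bounded by the batch size instead of the whole file.
    let mut stream = Box::pin(stream);
    while let Some(batch) = stream.try_next().await.unwrap() {
        println!("read a batch with {} rows", batch.num_rows());
    }
}
```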



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]
