(arrow-rs) branch master updated: Expand parquet crate overview doc (#5093)

tustvold Mon, 20 Nov 2023 01:13:27 -0800

This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/master by this push:
     new 6815bf153d Expand parquet crate overview doc (#5093)
6815bf153d is described below

commit 6815bf153d2e2166ce3b63beed8d499aef48c7cc
Author: Matthieu Maitre <[email protected]>
AuthorDate: Mon Nov 20 01:12:16 2023 -0800

    Expand parquet crate overview doc (#5093)
    
    * Expand parquet crate overview doc
    
    * Run `cargo fmt --all`
    
    * Nit: ask to format the code before sending a PR
    
    * Add example reading Parquet files from cloud provider
    
    * Tweak copy
    
    * Fix doctest
    
    ---------
    
    Co-authored-by: Matthieu Maitre <[email protected]>
    Co-authored-by: Raphael Taylor-Davies <[email protected]>
---
 parquet/CONTRIBUTING.md                 |  6 ++--
 parquet/Cargo.toml                      |  1 +
 parquet/README.md                       |  8 -----
 parquet/src/arrow/async_reader/store.rs | 25 +++++++++++++-
 parquet/src/arrow/mod.rs                |  2 +-
 parquet/src/file/mod.rs                 |  2 +-
 parquet/src/file/reader.rs              |  4 +--
 parquet/src/lib.rs                      | 59 ++++++++++++++++++++++++++++-----
 parquet/src/record/reader.rs            | 17 ++++++++--
 9 files changed, 98 insertions(+), 26 deletions(-)

diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md
index 5670eef081..922332b15d 100644
--- a/parquet/CONTRIBUTING.md
+++ b/parquet/CONTRIBUTING.md
@@ -57,8 +57,10 @@ Run `cargo bench` for benchmarks.
 
 ## Docs
 
-To build documentation, run `cargo doc --no-deps`.
-To compile and view in the browser, run `cargo doc --no-deps --open`.
+To build documentation, run `cargo doc --no-deps --all-features`.
+To compile and view in the browser, run `cargo doc --no-deps --all-features 
--open`.
+
+Before submitting a pull request, run `cargo fmt --all` to format the change.
 
 ## Update Parquet Format
 
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index bdcbcb81cf..4cd03c051e 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -81,6 +81,7 @@ serde_json = { version = "1.0", features = ["std"], 
default-features = false }
 arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint", 
"json"] }
 tokio = { version = "1.0", default-features = false, features = ["macros", 
"rt", "io-util", "fs"] }
 rand = { version = "0.8", default-features = false, features = ["std", 
"std_rng"] }
+object_store = { version = "0.8", default-features = false, features = 
["azure"] }
 
 [package.metadata.docs.rs]
 all-features = true
diff --git a/parquet/README.md b/parquet/README.md
index 2e0ab1d52c..9de7aec4e5 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -71,14 +71,6 @@ The `parquet` crate provides the following features which 
may be enabled in your
 - [x] Predicate pushdown
 - [x] Parquet format 4.0.0 support
 
-## Support for `wasm32-unknown-unknown` target
-
-It's possible to build `parquet` for the `wasm32-unknown-unknown` target, 
however not all the compression features are currently unsupported due to 
issues with the upstream crates. In particular, the `zstd` and `lz4` features 
may have compilation issues. See issue 
[#180](https://github.com/apache/arrow-rs/issues/180).
-
-```
-cargo build -p parquet --target wasm32-unknown-unknown --no-default-features 
--features cli,snap,flate2,brotli
-```
-
 ## License
 
 Licensed under the Apache License, Version 2.0: 
http://www.apache.org/licenses/LICENSE-2.0.
diff --git a/parquet/src/arrow/async_reader/store.rs 
b/parquet/src/arrow/async_reader/store.rs
index 3e27a96124..293b91aea3 100644
--- a/parquet/src/arrow/async_reader/store.rs
+++ b/parquet/src/arrow/async_reader/store.rs
@@ -28,7 +28,30 @@ use crate::arrow::async_reader::{AsyncFileReader, 
MetadataLoader};
 use crate::errors::{ParquetError, Result};
 use crate::file::metadata::ParquetMetaData;
 
-/// Implements [`AsyncFileReader`] for a parquet file in object storage
+/// Reads Parquet files in object storage using [`ObjectStore`].
+///
+/// ```no_run
+/// # use std::io::stdout;
+/// # use std::sync::Arc;
+/// # use object_store::azure::MicrosoftAzureBuilder;
+/// # use object_store::ObjectStore;
+/// # use object_store::path::Path;
+/// # use parquet::arrow::async_reader::ParquetObjectReader;
+/// # use parquet::arrow::ParquetRecordBatchStreamBuilder;
+/// # use parquet::schema::printer::print_parquet_metadata;
+/// # async fn run() {
+/// // Populate configuration from environment
+/// let storage_container = 
Arc::new(MicrosoftAzureBuilder::from_env().build().unwrap());
+/// let location = Path::from("path/to/blob.parquet");
+/// let meta = storage_container.head(&location).await.unwrap();
+/// println!("Found Blob with {}B at {}", meta.size, meta.location);
+///
+/// // Show Parquet metadata
+/// let reader = ParquetObjectReader::new(storage_container, meta);
+/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
+/// print_parquet_metadata(&mut stdout(), builder.metadata());
+/// # }
+/// ```
 #[derive(Clone, Debug)]
 pub struct ParquetObjectReader {
     store: Arc<dyn ObjectStore>,
diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index 63885643c0..950226aef7 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Provides API for reading/writing Arrow
+//! High-level API for reading/writing Arrow
 //! [RecordBatch](arrow_array::RecordBatch)es and
 //! [Array](arrow_array::Array)s to/from Parquet Files.
 //!
diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs
index c20fd38c7f..6589d2efaf 100644
--- a/parquet/src/file/mod.rs
+++ b/parquet/src/file/mod.rs
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-//! Main entrypoint for working with Parquet API.
+//! Low-level APIs for reading raw parquet data.
 //!
 //! Provides access to file and row group readers and writers, record API, 
metadata, etc.
 //!
diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs
index 921f9df290..dd6a0fdd23 100644
--- a/parquet/src/file/reader.rs
+++ b/parquet/src/file/reader.rs
@@ -134,7 +134,7 @@ pub trait FileReader: Send + Sync {
     /// Get the `i`th row group reader. Note this doesn't do bound check.
     fn get_row_group(&self, i: usize) -> Result<Box<dyn RowGroupReader + '_>>;
 
-    /// Get full iterator of `Row`s from a file (over all row groups).
+    /// Get an iterator over the rows in this file, see [`RowIter`] for caveats.
     ///
     /// Iterator will automatically load the next row group to advance.
     ///
@@ -194,7 +194,7 @@ pub trait RowGroupReader: Send + Sync {
     /// to read bloom filters.
     fn get_column_bloom_filter(&self, i: usize) -> Option<&Sbbf>;
 
-    /// Get iterator of `Row`s from this row group.
+    /// Get an iterator over the rows in this row group, see [`RowIter`] for caveats.
     ///
     /// Projected schema can be a subset of or equal to the file schema, when 
it is None,
     /// full file schema is assumed.
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 0279bbc382..db5d726343 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -15,24 +15,67 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//!
 //! This crate contains the official Native Rust implementation of
 //! [Apache Parquet](https://parquet.apache.org/), part of
 //! the [Apache Arrow](https://arrow.apache.org/) project.
+//! The crate provides a number of APIs to read and write Parquet files,
+//! covering a range of use cases.
 //!
 //! Please see the [parquet crates.io](https://crates.io/crates/parquet)
 //! page for feature flags and tips to improve performance.
 //!
-//! # Getting Started
-//! Start with some examples:
+//! # Format Overview
+//!
+//! Parquet is a columnar format, which means that unlike row formats like 
[CSV], values are
+//! iterated along columns instead of rows. Parquet is similar in spirit to 
[Arrow], with Parquet
+//! focusing on storage efficiency whereas Arrow prioritizes compute 
efficiency.
+//!
+//! Parquet files are partitioned for scalability. Each file contains metadata,
+//! along with zero or more "row groups", each row group containing one or
+//! more columns. The APIs in this crate reflect this structure.
+//!
+//! Parquet distinguishes between "logical" and "physical" data types.
+//! For instance, strings (logical type) are stored as byte arrays (physical 
type).
+//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
+//! are stored as integers (physical type). This crate exposes both kinds of 
types.
+//!
+//! For more details about the Parquet format, see the
+//! [Parquet 
spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format).
+//!
+//! # APIs
+//!
+//! This crate exposes a number of APIs for different use-cases.
+//!
+//! ## Read/Write Arrow
+//!
+//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow 
`RecordBatch`.
+//! This makes for a simple and performant interface to parquet data, whilst 
allowing workloads
+//! to leverage the wide range of data transforms provided by the [arrow] 
crate, and by the
+//! ecosystem of libraries and services using [Arrow] as an interop format.
+//!
+//! ## Read/Write Arrow Async
+//!
+//! When the `async` feature is enabled, [`arrow::async_reader`] and 
[`arrow::async_writer`]
+//! provide the ability to read and write [`arrow`] data asynchronously. 
Additionally, when the
+//! `object_store` feature is enabled, 
[`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader)
+//! provides efficient integration with object storage services such as S3 via 
the [object_store]
+//! crate, automatically optimizing IO based on any predicates or projections 
provided.
 //!
-//! 1. [mod@file] for reading and writing parquet files using the
-//! [ColumnReader](column::reader::ColumnReader) API.
+//! ## Read/Write Parquet
 //!
-//! 2. [arrow] for reading and writing parquet files to Arrow
-//! `RecordBatch`es
+//! Workloads needing finer-grained control, or looking to not take a 
dependency on arrow,
+//! can use the lower-level APIs in [`mod@file`]. These APIs expose the 
underlying parquet
+//! data model, and therefore require knowledge of the underlying parquet 
format,
+//! including the details of [Dremel] record shredding and [Logical Types]. 
Most workloads
+//! should prefer the arrow interfaces.
 //!
-//! 3. [arrow::async_reader] and [arrow::async_writer] for `async` reading
-//! and writing parquet files to Arrow `RecordBatch`es (requires the `async` 
feature).
+//! [arrow]: https://docs.rs/arrow/latest/arrow/index.html
+//! [Arrow]: https://arrow.apache.org/
+//! [CSV]: https://en.wikipedia.org/wiki/Comma-separated_values
+//! [Dremel]: https://research.google/pubs/pub36632/
+//! [Logical Types]: 
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+//! [object_store]: https://docs.rs/object_store/latest/object_store/
 
 /// Defines an item with an experimental public API
 ///
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index f989397255..feaa8055e2 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -609,9 +609,20 @@ impl<'a> Either<'a> {
     }
 }
 
-/// Iterator of [`Row`]s.
-/// It is used either for a single row group to iterate over data in that row 
group, or
-/// an entire file with auto buffering of all row groups.
+/// Access parquet data as an iterator of [`Row`]
+///
+/// # Caveats
+///
+/// Parquet stores data in a columnar fashion using [Dremel] encoding, and is 
therefore highly
+/// optimised for reading data by column, not row. As a consequence, 
applications concerned with
+/// performance should prefer the columnar arrow or [ColumnReader] APIs.
+///
+/// Additionally the current implementation does not correctly handle repeated 
fields ([#2394]),
+/// and workloads looking to handle such schema should use the other APIs.
+///
+/// [#2394]: https://github.com/apache/arrow-rs/issues/2394
+/// [ColumnReader]: crate::file::reader::RowGroupReader::get_column_reader
+/// [Dremel]: https://research.google/pubs/pub36632/
 pub struct RowIter<'a> {
     descr: SchemaDescPtr,
     tree_builder: TreeBuilder,

(arrow-rs) branch master updated: Expand parquet crate overview doc (#5093)

Reply via email to