This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 6815bf153d Expand parquet crate overview doc (#5093)
6815bf153d is described below
commit 6815bf153d2e2166ce3b63beed8d499aef48c7cc
Author: Matthieu Maitre <[email protected]>
AuthorDate: Mon Nov 20 01:12:16 2023 -0800
Expand parquet crate overview doc (#5093)
* Expand parquet crate overview doc
* Run `cargo fmt --all`
* Nit: ask to format the code before sending a PR
* Add example reading Parquet files from cloud provider
* Tweak copy
* Fix doctest
---------
Co-authored-by: Matthieu Maitre <[email protected]>
Co-authored-by: Raphael Taylor-Davies <[email protected]>
---
parquet/CONTRIBUTING.md | 6 ++--
parquet/Cargo.toml | 1 +
parquet/README.md | 8 -----
parquet/src/arrow/async_reader/store.rs | 25 +++++++++++++-
parquet/src/arrow/mod.rs | 2 +-
parquet/src/file/mod.rs | 2 +-
parquet/src/file/reader.rs | 4 +--
parquet/src/lib.rs | 59 ++++++++++++++++++++++++++++-----
parquet/src/record/reader.rs | 17 ++++++++--
9 files changed, 98 insertions(+), 26 deletions(-)
diff --git a/parquet/CONTRIBUTING.md b/parquet/CONTRIBUTING.md
index 5670eef081..922332b15d 100644
--- a/parquet/CONTRIBUTING.md
+++ b/parquet/CONTRIBUTING.md
@@ -57,8 +57,10 @@ Run `cargo bench` for benchmarks.
## Docs
-To build documentation, run `cargo doc --no-deps`.
-To compile and view in the browser, run `cargo doc --no-deps --open`.
+To build documentation, run `cargo doc --no-deps --all-features`.
+To compile and view in the browser, run `cargo doc --no-deps --all-features
--open`.
+
+Before submitting a pull request, run `cargo fmt --all` to format the change.
## Update Parquet Format
diff --git a/parquet/Cargo.toml b/parquet/Cargo.toml
index bdcbcb81cf..4cd03c051e 100644
--- a/parquet/Cargo.toml
+++ b/parquet/Cargo.toml
@@ -81,6 +81,7 @@ serde_json = { version = "1.0", features = ["std"],
default-features = false }
arrow = { workspace = true, features = ["ipc", "test_utils", "prettyprint",
"json"] }
tokio = { version = "1.0", default-features = false, features = ["macros",
"rt", "io-util", "fs"] }
rand = { version = "0.8", default-features = false, features = ["std",
"std_rng"] }
+object_store = { version = "0.8", default-features = false, features =
["azure"] }
[package.metadata.docs.rs]
all-features = true
diff --git a/parquet/README.md b/parquet/README.md
index 2e0ab1d52c..9de7aec4e5 100644
--- a/parquet/README.md
+++ b/parquet/README.md
@@ -71,14 +71,6 @@ The `parquet` crate provides the following features which
may be enabled in your
- [x] Predicate pushdown
- [x] Parquet format 4.0.0 support
-## Support for `wasm32-unknown-unknown` target
-
-It's possible to build `parquet` for the `wasm32-unknown-unknown` target,
however not all the compression features are currently unsupported due to
issues with the upstream crates. In particular, the `zstd` and `lz4` features
may have compilation issues. See issue
[#180](https://github.com/apache/arrow-rs/issues/180).
-
-```
-cargo build -p parquet --target wasm32-unknown-unknown --no-default-features
--features cli,snap,flate2,brotli
-```
-
## License
Licensed under the Apache License, Version 2.0:
http://www.apache.org/licenses/LICENSE-2.0.
diff --git a/parquet/src/arrow/async_reader/store.rs
b/parquet/src/arrow/async_reader/store.rs
index 3e27a96124..293b91aea3 100644
--- a/parquet/src/arrow/async_reader/store.rs
+++ b/parquet/src/arrow/async_reader/store.rs
@@ -28,7 +28,30 @@ use crate::arrow::async_reader::{AsyncFileReader,
MetadataLoader};
use crate::errors::{ParquetError, Result};
use crate::file::metadata::ParquetMetaData;
-/// Implements [`AsyncFileReader`] for a parquet file in object storage
+/// Reads Parquet files in object storage using [`ObjectStore`].
+///
+/// ```no_run
+/// # use std::io::stdout;
+/// # use std::sync::Arc;
+/// # use object_store::azure::MicrosoftAzureBuilder;
+/// # use object_store::ObjectStore;
+/// # use object_store::path::Path;
+/// # use parquet::arrow::async_reader::ParquetObjectReader;
+/// # use parquet::arrow::ParquetRecordBatchStreamBuilder;
+/// # use parquet::schema::printer::print_parquet_metadata;
+/// # async fn run() {
+/// // Populate configuration from environment
+/// let storage_container =
Arc::new(MicrosoftAzureBuilder::from_env().build().unwrap());
+/// let location = Path::from("path/to/blob.parquet");
+/// let meta = storage_container.head(&location).await.unwrap();
+/// println!("Found Blob with {}B at {}", meta.size, meta.location);
+///
+/// // Show Parquet metadata
+/// let reader = ParquetObjectReader::new(storage_container, meta);
+/// let builder = ParquetRecordBatchStreamBuilder::new(reader).await.unwrap();
+/// print_parquet_metadata(&mut stdout(), builder.metadata());
+/// # }
+/// ```
#[derive(Clone, Debug)]
pub struct ParquetObjectReader {
store: Arc<dyn ObjectStore>,
diff --git a/parquet/src/arrow/mod.rs b/parquet/src/arrow/mod.rs
index 63885643c0..950226aef7 100644
--- a/parquet/src/arrow/mod.rs
+++ b/parquet/src/arrow/mod.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-//! Provides API for reading/writing Arrow
+//! High-level API for reading/writing Arrow
//! [RecordBatch](arrow_array::RecordBatch)es and
//! [Array](arrow_array::Array)s to/from Parquet Files.
//!
diff --git a/parquet/src/file/mod.rs b/parquet/src/file/mod.rs
index c20fd38c7f..6589d2efaf 100644
--- a/parquet/src/file/mod.rs
+++ b/parquet/src/file/mod.rs
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-//! Main entrypoint for working with Parquet API.
+//! Low level APIs for reading raw parquet data.
//!
//! Provides access to file and row group readers and writers, record API,
metadata, etc.
//!
diff --git a/parquet/src/file/reader.rs b/parquet/src/file/reader.rs
index 921f9df290..dd6a0fdd23 100644
--- a/parquet/src/file/reader.rs
+++ b/parquet/src/file/reader.rs
@@ -134,7 +134,7 @@ pub trait FileReader: Send + Sync {
/// Get the `i`th row group reader. Note this doesn't do bound check.
fn get_row_group(&self, i: usize) -> Result<Box<dyn RowGroupReader + '_>>;
- /// Get full iterator of `Row`s from a file (over all row groups).
+ /// Get an iterator over the rows in this file, see [`RowIter`] for caveats.
///
/// Iterator will automatically load the next row group to advance.
///
@@ -194,7 +194,7 @@ pub trait RowGroupReader: Send + Sync {
/// to read bloom filters.
fn get_column_bloom_filter(&self, i: usize) -> Option<&Sbbf>;
- /// Get iterator of `Row`s from this row group.
+ /// Get an iterator over the rows in this row group, see [`RowIter`] for caveats.
///
/// Projected schema can be a subset of or equal to the file schema, when
it is None,
/// full file schema is assumed.
diff --git a/parquet/src/lib.rs b/parquet/src/lib.rs
index 0279bbc382..db5d726343 100644
--- a/parquet/src/lib.rs
+++ b/parquet/src/lib.rs
@@ -15,24 +15,67 @@
// specific language governing permissions and limitations
// under the License.
+//!
//! This crate contains the official Native Rust implementation of
//! [Apache Parquet](https://parquet.apache.org/), part of
//! the [Apache Arrow](https://arrow.apache.org/) project.
+//! The crate provides a number of APIs to read and write Parquet files,
+//! covering a range of use cases.
//!
//! Please see the [parquet crates.io](https://crates.io/crates/parquet)
//! page for feature flags and tips to improve performance.
//!
-//! # Getting Started
-//! Start with some examples:
+//! # Format Overview
+//!
+//! Parquet is a columnar format, which means that unlike row formats like
[CSV], values are
+//! iterated along columns instead of rows. Parquet is similar in spirit to
[Arrow], with Parquet
+//! focusing on storage efficiency whereas Arrow prioritizes compute
efficiency.
+//!
+//! Parquet files are partitioned for scalability. Each file contains metadata,
+//! along with zero or more "row groups", each row group containing one or
+//! more columns. The APIs in this crate reflect this structure.
+//!
+//! Parquet distinguishes between "logical" and "physical" data types.
+//! For instance, strings (logical type) are stored as byte arrays (physical
type).
+//! Likewise, temporal types like dates, times, timestamps, etc. (logical type)
+//! are stored as integers (physical type). This crate exposes both kinds of
types.
+//!
+//! For more details about the Parquet format, see the
+//! [Parquet
spec](https://github.com/apache/parquet-format/blob/master/README.md#file-format).
+//!
+//! # APIs
+//!
+//! This crate exposes a number of APIs for different use-cases.
+//!
+//! ## Read/Write Arrow
+//!
+//! The [`arrow`] module allows reading and writing Parquet data to/from Arrow
`RecordBatch`.
+//! This makes for a simple and performant interface to parquet data, whilst
allowing workloads
+//! to leverage the wide range of data transforms provided by the [arrow]
crate, and by the
+//! ecosystem of libraries and services using [Arrow] as an interop format.
+//!
+//! ## Read/Write Arrow Async
+//!
+//! When the `async` feature is enabled, [`arrow::async_reader`] and
[`arrow::async_writer`]
+//! provide the ability to read and write [`arrow`] data asynchronously.
Additionally, when the
+//! `object_store` feature is enabled,
[`ParquetObjectReader`](arrow::async_reader::ParquetObjectReader)
+//! provides efficient integration with object storage services such as S3 via
the [object_store]
+//! crate, automatically optimizing IO based on any predicates or projections
provided.
//!
-//! 1. [mod@file] for reading and writing parquet files using the
-//! [ColumnReader](column::reader::ColumnReader) API.
+//! ## Read/Write Parquet
//!
-//! 2. [arrow] for reading and writing parquet files to Arrow
-//! `RecordBatch`es
+//! Workloads needing finer-grained control, or looking to not take a
dependency on arrow,
+//! can use the lower-level APIs in [`mod@file`]. These APIs expose the
underlying parquet
+//! data model, and therefore require knowledge of the underlying parquet
format,
+//! including the details of [Dremel] record shredding and [Logical Types].
Most workloads
+//! should prefer the arrow interfaces.
//!
-//! 3. [arrow::async_reader] and [arrow::async_writer] for `async` reading
-//! and writing parquet files to Arrow `RecordBatch`es (requires the `async`
feature).
+//! [arrow]: https://docs.rs/arrow/latest/arrow/index.html
+//! [Arrow]: https://arrow.apache.org/
+//! [CSV]: https://en.wikipedia.org/wiki/Comma-separated_values
+//! [Dremel]: https://research.google/pubs/pub36632/
+//! [Logical Types]:
https://github.com/apache/parquet-format/blob/master/LogicalTypes.md
+//! [object_store]: https://docs.rs/object_store/latest/object_store/
/// Defines an item with an experimental public API
///
diff --git a/parquet/src/record/reader.rs b/parquet/src/record/reader.rs
index f989397255..feaa8055e2 100644
--- a/parquet/src/record/reader.rs
+++ b/parquet/src/record/reader.rs
@@ -609,9 +609,20 @@ impl<'a> Either<'a> {
}
}
-/// Iterator of [`Row`]s.
-/// It is used either for a single row group to iterate over data in that row
group, or
-/// an entire file with auto buffering of all row groups.
+/// Access parquet data as an iterator of [`Row`]
+///
+/// # Caveats
+///
+/// Parquet stores data in a columnar fashion using [Dremel] encoding, and is
therefore highly
+/// optimised for reading data by column, not row. As a consequence
applications concerned with
+/// performance should prefer the columnar arrow or [ColumnReader] APIs.
+///
+/// Additionally the current implementation does not correctly handle repeated
fields ([#2394]),
+/// and workloads looking to handle such schemas should use the other APIs.
+///
+/// [#2394]: https://github.com/apache/arrow-rs/issues/2394
+/// [ColumnReader]: crate::file::reader::RowGroupReader::get_column_reader
+/// [Dremel]: https://research.google/pubs/pub36632/
pub struct RowIter<'a> {
descr: SchemaDescPtr,
tree_builder: TreeBuilder,