This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 8bee08bf3c6 Improve Parquet reader/writer properties docs (#5863)
8bee08bf3c6 is described below
commit 8bee08bf3c68ba6b8cb933b5b230ede2ff6f11ef
Author: Andrew Lamb <[email protected]>
AuthorDate: Wed Jun 12 06:18:59 2024 -0400
Improve Parquet reader/writer properties docs (#5863)
* Improve Parquet reader/writer properties docs
* fix
* Apply suggestions from code review
Co-authored-by: Val Lorentz <[email protected]>
* Apply suggestions from code review
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
---------
Co-authored-by: Val Lorentz <[email protected]>
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
---
parquet/src/arrow/arrow_reader/mod.rs | 18 ++--
parquet/src/arrow/arrow_writer/mod.rs | 8 +-
parquet/src/file/properties.rs | 175 +++++++++++++++++++++++-----------
3 files changed, 132 insertions(+), 69 deletions(-)
diff --git a/parquet/src/arrow/arrow_reader/mod.rs
b/parquet/src/arrow/arrow_reader/mod.rs
index 793f79272c1..6b95324bee3 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -258,12 +258,12 @@ impl ArrowReaderOptions {
Self::default()
}
- /// Parquet files generated by some writers may contain embedded arrow
- /// schema and metadata. This may not be correct or compatible with your
system.
- ///
- /// For
example:[ARROW-16184](https://issues.apache.org/jira/browse/ARROW-16184)
+ /// Skip decoding the embedded arrow metadata (defaults to `false`)
///
- /// Set `skip_arrow_metadata` to true, to skip decoding this
+ /// Parquet files generated by some writers may contain embedded arrow
+ /// schema and metadata.
+ /// This may not be correct or compatible with your system,
+ /// for example:
[ARROW-16184](https://issues.apache.org/jira/browse/ARROW-16184)
pub fn with_skip_arrow_metadata(self, skip_arrow_metadata: bool) -> Self {
Self {
skip_arrow_metadata,
@@ -271,10 +271,12 @@ impl ArrowReaderOptions {
}
}
- /// Set this true to enable decoding of the [PageIndex] if present. This
can be used
- /// to push down predicates to the parquet scan, potentially eliminating
unnecessary IO
+ /// Enable decoding of the [`PageIndex`], if present (defaults to `false`)
+ ///
+ /// Some query engines can use the `PageIndex` to push down predicates
+ /// to the parquet scan, potentially eliminating unnecessary IO.
///
- /// [PageIndex]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
+ /// [`PageIndex`]:
https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub fn with_page_index(self, page_index: bool) -> Self {
Self { page_index, ..self }
}
diff --git a/parquet/src/arrow/arrow_writer/mod.rs
b/parquet/src/arrow/arrow_writer/mod.rs
index fd3f9591718..53287dec572 100644
--- a/parquet/src/arrow/arrow_writer/mod.rs
+++ b/parquet/src/arrow/arrow_writer/mod.rs
@@ -341,10 +341,12 @@ impl ArrowWriterOptions {
Self { properties, ..self }
}
+ /// Skip encoding the embedded arrow metadata (defaults to `false`)
+ ///
/// Parquet files generated by the [`ArrowWriter`] contain embedded arrow
schema
/// by default.
///
- /// Set `skip_arrow_metadata` to true, to skip encoding this.
+ /// Set `skip_arrow_metadata` to true, to skip encoding the embedded
metadata.
pub fn with_skip_arrow_metadata(self, skip_arrow_metadata: bool) -> Self {
Self {
skip_arrow_metadata,
@@ -352,9 +354,7 @@ impl ArrowWriterOptions {
}
}
- /// Overrides the name of the root parquet schema element
- ///
- /// Defaults to `"arrow_schema"`
+ /// Set the name of the root parquet schema element (defaults to
`"arrow_schema"`)
pub fn with_schema_root(self, name: String) -> Self {
Self {
schema_root: Some(name),
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index 59e29440ae0..87d84cef80a 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -91,18 +91,17 @@ pub type WriterPropertiesPtr = Arc<WriterProperties>;
/// Configuration settings for writing parquet files.
///
-/// All properties except the key-value metadata are immutable,
-/// use [`WriterPropertiesBuilder`] to assemble these properties.
+/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change
settings.
///
/// # Example
///
/// ```rust
-/// use parquet::{
-/// basic::{Compression, Encoding},
-/// file::properties::*,
-/// schema::types::ColumnPath,
-/// };
-///
+/// # use parquet::{
+/// # basic::{Compression, Encoding},
+/// # file::properties::*,
+/// # schema::types::ColumnPath,
+/// # };
+/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
@@ -155,7 +154,8 @@ impl WriterProperties {
Self::default()
}
- /// Returns builder for writer properties with default values.
+ /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
+ /// properties.
pub fn builder() -> WriterPropertiesBuilder {
WriterPropertiesBuilder::with_defaults()
}
@@ -299,7 +299,7 @@ impl WriterProperties {
.unwrap_or(DEFAULT_DICTIONARY_ENABLED)
}
- /// Returns `true` if statistics are enabled for a column.
+ /// Returns which statistics are written for a column.
pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
self.column_properties
.get(col)
@@ -329,8 +329,9 @@ impl WriterProperties {
}
}
-/// Builder for parquet file writer configuration. See example on
-/// [`WriterProperties`]
+/// Builder for [`WriterProperties`] parquet writer configuration.
+///
+/// See example on [`WriterProperties`]
pub struct WriterPropertiesBuilder {
data_page_size_limit: usize,
dictionary_page_size_limit: usize,
@@ -389,7 +390,11 @@ impl WriterPropertiesBuilder {
// ----------------------------------------------------------------------
// Writer properties related to a file
- /// Sets writer version.
+ /// Sets the `WriterVersion` written into the parquet metadata (defaults
to [`PARQUET_1_0`])
+ ///
+ /// This value can determine what features some readers will support.
+ ///
+ /// [`PARQUET_1_0`]: WriterVersion::PARQUET_1_0
pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
self.writer_version = value;
self
@@ -405,7 +410,7 @@ impl WriterPropertiesBuilder {
self
}
- /// Sets best effort maximum size of a data page in bytes.
+ /// Sets best effort maximum size of a data page in bytes (defaults to
`1024 * 1024`).
///
/// The parquet writer will attempt to limit the sizes of each
/// `DataPage` to this many bytes. Reducing this value will result
@@ -419,7 +424,7 @@ impl WriterPropertiesBuilder {
self
}
- /// Sets best effort maximum number of rows in a data page.
+ /// Sets best effort maximum number of rows in a data page (defaults to
`usize::MAX`).
///
/// The parquet writer will attempt to limit the number of rows in
/// each `DataPage` to this value. Reducing this value will result
@@ -443,7 +448,7 @@ impl WriterPropertiesBuilder {
self
}
- /// Sets best effort maximum dictionary page size, in bytes.
+ /// Sets best effort maximum dictionary page size, in bytes (defaults to
`1024 * 1024`).
///
/// The parquet writer will attempt to limit the size of each
/// `DataPage` used to store dictionaries to this many
@@ -458,7 +463,7 @@ impl WriterPropertiesBuilder {
self
}
- /// Sets write batch size.
+ /// Sets write batch size (defaults to 1024).
///
/// For performance reasons, data for each column is written in
/// batches of this size.
@@ -472,26 +477,29 @@ impl WriterPropertiesBuilder {
self
}
- /// Sets maximum number of rows in a row group.
+ /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`).
+ ///
+ /// # Panics
+ /// If the value is set to 0.
pub fn set_max_row_group_size(mut self, value: usize) -> Self {
assert!(value > 0, "Cannot have a 0 max row group size");
self.max_row_group_size = value;
self
}
- /// Sets "created by" property.
+ /// Sets "created by" property (defaults to `parquet-rs version
<VERSION>`).
pub fn set_created_by(mut self, value: String) -> Self {
self.created_by = value;
self
}
- /// Sets "key_value_metadata" property.
+ /// Sets "key_value_metadata" property (defaults to `None`).
pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) ->
Self {
self.key_value_metadata = value;
self
}
- /// Sets sorting order of rows in the row group if any
+ /// Sets sorting order of rows in the row group if any (defaults to
`None`).
pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) ->
Self {
self.sorting_columns = value;
self
@@ -500,26 +508,30 @@ impl WriterPropertiesBuilder {
// ----------------------------------------------------------------------
// Setters for any column (global)
- /// Sets encoding for any column.
+ /// Sets default encoding for all columns.
///
/// If dictionary is not enabled, this is treated as a primary encoding
for all
/// columns. In case when dictionary is enabled for any column, this value
is
/// considered to be a fallback encoding for that column.
///
- /// Panics if user tries to set dictionary encoding here, regardless of
dictionary
+ /// # Panics
+ ///
+ /// If dictionary encoding is specified, regardless of dictionary
/// encoding flag being set.
pub fn set_encoding(mut self, value: Encoding) -> Self {
self.default_column_properties.set_encoding(value);
self
}
- /// Sets compression codec for any column.
+ /// Sets default compression codec for all columns (defaults to
[`UNCOMPRESSED`]).
+ ///
+ /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
pub fn set_compression(mut self, value: Compression) -> Self {
self.default_column_properties.set_compression(value);
self
}
- /// Sets flag to enable/disable dictionary encoding for any column.
+ /// Sets default flag to enable/disable dictionary encoding for all
columns (defaults to `true`).
///
/// Use this method to set dictionary encoding, instead of explicitly
specifying
/// encoding in `set_encoding` method.
@@ -528,13 +540,16 @@ impl WriterPropertiesBuilder {
self
}
- /// Sets flag to enable/disable statistics for any column.
+ /// Sets default statistics level for all columns (defaults to [`Page`]).
+ ///
+ /// [`Page`]: EnabledStatistics::Page
pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
self.default_column_properties.set_statistics_enabled(value);
self
}
- /// Sets max statistics size for any column.
+ /// Sets default max statistics size for all columns (defaults to `4096`).
+ ///
/// Applicable only if statistics are enabled.
pub fn set_max_statistics_size(mut self, value: usize) -> Self {
self.default_column_properties
@@ -542,25 +557,43 @@ impl WriterPropertiesBuilder {
self
}
- /// Sets whether bloom filter is enabled for any column.
- /// If the bloom filter is enabled previously then it is a no-op.
- /// If the bloom filter is not yet enabled, a default set of ndv and fpp
value will be used.
- /// You can use [`set_bloom_filter_ndv`](Self::set_bloom_filter_ndv) and
[`set_bloom_filter_fpp`](Self::set_bloom_filter_fpp) to further adjust the ndv
and fpp.
+ /// Sets if bloom filter is enabled by default for all columns (defaults
to `false`).
+ ///
+ /// # Notes
+ ///
+ /// * If the bloom filter is enabled previously then it is a no-op.
+ ///
+ /// * If the bloom filter is not enabled, default values for ndv and fpp
+ /// are used. See [`set_bloom_filter_ndv`] and
+ /// [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
+ ///
+ /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
+ /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
self.default_column_properties
.set_bloom_filter_enabled(value);
self
}
- /// Sets bloom filter false positive probability (fpp) for any column.
- /// Implicitly
[`set_bloom_filter_enabled`](Self::set_bloom_filter_enabled).
+ /// Sets the default target bloom filter false positive probability (fpp)
+ /// for all columns (defaults to `0.05`).
+ ///
+ /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`]
had
+ /// been called.
+ ///
+ /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
self.default_column_properties.set_bloom_filter_fpp(value);
self
}
- /// Sets number of distinct values (ndv) for bloom filter for any column.
- /// Implicitly
[`set_bloom_filter_enabled`](Self::set_bloom_filter_enabled).
+ /// Sets default number of distinct values (ndv) for bloom filter for all
+ /// columns (defaults to `1_000_000`).
+ ///
+ /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`]
had
+ /// been called.
+ ///
+ /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
self.default_column_properties.set_bloom_filter_ndv(value);
self
@@ -575,37 +608,42 @@ impl WriterPropertiesBuilder {
self.column_properties.entry(col).or_default()
}
- /// Sets encoding for a column.
- /// Takes precedence over globally defined settings.
+ /// Sets encoding for a specific column.
+ ///
+ /// Takes precedence over [`Self::set_encoding`].
///
/// If dictionary is not enabled, this is treated as a primary encoding
for this
/// column. In case when dictionary is enabled for this column, either
through
/// global defaults or explicitly, this value is considered to be a
fallback
/// encoding for this column.
///
- /// Panics if user tries to set dictionary encoding here, regardless of
dictionary
+ /// # Panics
+ /// If user tries to set dictionary encoding here, regardless of dictionary
/// encoding flag being set.
pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) ->
Self {
self.get_mut_props(col).set_encoding(value);
self
}
- /// Sets compression codec for a column.
- /// Takes precedence over globally defined settings.
+ /// Sets compression codec for a specific column.
+ ///
+ /// Takes precedence over [`Self::set_compression`].
pub fn set_column_compression(mut self, col: ColumnPath, value:
Compression) -> Self {
self.get_mut_props(col).set_compression(value);
self
}
- /// Sets flag to enable/disable dictionary encoding for a column.
- /// Takes precedence over globally defined settings.
+ /// Sets flag to enable/disable dictionary encoding for a specific column.
+ ///
+ /// Takes precedence over [`Self::set_dictionary_enabled`].
pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value:
bool) -> Self {
self.get_mut_props(col).set_dictionary_enabled(value);
self
}
- /// Sets flag to enable/disable statistics for a column.
- /// Takes precedence over globally defined settings.
+ /// Sets statistics level for a specific column.
+ ///
+ /// Takes precedence over [`Self::set_statistics_enabled`].
pub fn set_column_statistics_enabled(
mut self,
col: ColumnPath,
@@ -615,39 +653,53 @@ impl WriterPropertiesBuilder {
self
}
- /// Sets max size for statistics for a column.
- /// Takes precedence over globally defined settings.
+ /// Sets max size for statistics for a specific column.
+ ///
+ /// Takes precedence over [`Self::set_max_statistics_size`].
pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value:
usize) -> Self {
self.get_mut_props(col).set_max_statistics_size(value);
self
}
- /// Sets whether a bloom filter should be created for a specific column.
- /// The behavior is similar to
[`set_bloom_filter_enabled`](Self::set_bloom_filter_enabled).
- /// Takes precedence over globally defined settings.
+ /// Sets whether a bloom filter should be written for a specific column.
+ ///
+ /// Takes precedence over [`Self::set_bloom_filter_enabled`].
pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value:
bool) -> Self {
self.get_mut_props(col).set_bloom_filter_enabled(value);
self
}
/// Sets the false positive probability for bloom filter for a specific
column.
- /// The behavior is similar to
[`set_bloom_filter_fpp`](Self::set_bloom_filter_fpp) but will
- /// override the default.
+ ///
+ /// Takes precedence over [`Self::set_bloom_filter_fpp`].
pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64)
-> Self {
self.get_mut_props(col).set_bloom_filter_fpp(value);
self
}
/// Sets the number of distinct values for bloom filter for a specific
column.
- /// The behavior is similar to
[`set_bloom_filter_ndv`](Self::set_bloom_filter_ndv) but will
- /// override the default.
+ ///
+ /// Takes precedence over [`Self::set_bloom_filter_ndv`].
pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64)
-> Self {
self.get_mut_props(col).set_bloom_filter_ndv(value);
self
}
- /// Sets the max length of min/max value fields in the column index. Must
be greater than 0.
- /// If set to `None` - there's no effective limit.
+ /// Sets the max length of min/max value fields when writing the column
+ /// [`Index`] (defaults to `None`).
+ ///
+ /// This can be used to prevent columns with very long values (hundreds of
+ /// bytes long) from causing the parquet metadata to become huge.
+ ///
+ /// # Notes
+ ///
+ /// The column [`Index`] is written when [`Self::set_statistics_enabled`]
is
+ /// set to [`EnabledStatistics::Page`].
+ ///
+ /// * If `Some`, must be greater than 0, otherwise will panic
+ /// * If `None`, there's no effective limit.
+ ///
+ /// [`Index`]: crate::file::page_index::index::Index
pub fn set_column_index_truncate_length(mut self, max_length:
Option<usize>) -> Self {
if let Some(value) = max_length {
assert!(value > 0, "Cannot have a 0 column index truncate length.
If you wish to disable min/max value truncation, set it to `None`.");
@@ -657,8 +709,17 @@ impl WriterPropertiesBuilder {
self
}
- /// Sets the max length of min/max value fields in statistics. Must be
greater than 0.
- /// If set to `None` - there's no effective limit.
+ /// Sets the max length of min/max value fields in row group level
+ /// [`Statistics`] (defaults to `None`).
+ ///
+ /// # Notes
+ /// Row group level [`Statistics`] are written when
[`Self::set_statistics_enabled`] is
+ /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`].
+ ///
+ /// * If `Some`, must be greater than 0, otherwise will panic
+ /// * If `None`, there's no effective limit.
+ ///
+ /// [`Statistics`]: crate::file::statistics::Statistics
pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>)
-> Self {
if let Some(value) = max_length {
assert!(value > 0, "Cannot have a 0 statistics truncate length. If
you wish to disable min/max value truncation, set it to `None`.");