This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/main by this push:
     new 354d4ff521 Minor: fix some parquet writer session level defaults (#7295)
354d4ff521 is described below
commit 354d4ff521bf711cbaced480a17626c6f1fac4ba
Author: Devin D'Angelo <[email protected]>
AuthorDate: Thu Aug 17 08:31:16 2023 -0400
Minor: fix some parquet writer session level defaults (#7295)
* fix parquet writer defaults
* update configs.md
* Update datafusion/common/src/config.rs
Co-authored-by: Andrew Lamb <[email protected]>
* Apply suggestions from code review
clarify meaning of null in session default writer settings
Co-authored-by: Andrew Lamb <[email protected]>
* update docs
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/common/src/config.rs | 35 ++++++++------
.../core/src/datasource/file_format/parquet.rs | 55 +++++++++++++++++-----
.../sqllogictest/test_files/information_schema.slt | 14 +++---
docs/source/user-guide/configs.md | 14 +++---
4 files changed, 78 insertions(+), 40 deletions(-)
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index b7a91684bb..38c530f2bd 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -302,22 +302,26 @@ config_namespace! {
/// Sets default parquet compression codec
/// Valid values are: uncompressed, snappy, gzip(level),
/// lzo, brotli(level), lz4, zstd(level), and lz4_raw.
- /// These values are not case sensitive.
- pub compression: String, default = "snappy".into()
+ /// These values are not case sensitive. If NULL, uses
+ /// default parquet writer setting
+ pub compression: Option<String>, default = None
- /// Sets if dictionary encoding is enabled
- pub dictionary_enabled: bool, default = true
+ /// Sets if dictionary encoding is enabled. If NULL, uses
+ /// default parquet writer setting
+ pub dictionary_enabled: Option<bool>, default = None
/// Sets best effort maximum dictionary page size, in bytes
pub dictionary_page_size_limit: usize, default = 1024 * 1024
/// Sets if statistics are enabled for any column
/// Valid values are: "none", "chunk", and "page"
- /// These values are not case sensitive.
- pub statistics_enabled: String, default = "page".into()
+ /// These values are not case sensitive. If NULL, uses
+ /// default parquet writer setting
+ pub statistics_enabled: Option<String>, default = None
- /// Sets max statistics size for any column
- pub max_statistics_size: usize, default = 4096
+ /// Sets max statistics size for any column. If NULL, uses
+ /// default parquet writer setting
+ pub max_statistics_size: Option<usize>, default = None
/// Sets maximum number of rows in a row group
pub max_row_group_size: usize, default = 1024 * 1024
@@ -335,17 +339,20 @@ config_namespace! {
/// Valid values are: plain, plain_dictionary, rle,
/// bit_packed, delta_binary_packed, delta_length_byte_array,
/// delta_byte_array, rle_dictionary, and byte_stream_split.
- /// These values are not case sensitive.
- pub encoding: String, default = "plain".into()
+ /// These values are not case sensitive. If NULL, uses
+ /// default parquet writer setting
+ pub encoding: Option<String>, default = None
/// Sets if bloom filter is enabled for any column
pub bloom_filter_enabled: bool, default = false
- /// Sets bloom filter false positive probability
- pub bloom_filter_fpp: f64, default = 0.05
+ /// Sets bloom filter false positive probability. If NULL, uses
+ /// default parquet writer setting
+ pub bloom_filter_fpp: Option<f64>, default = None
- /// Sets bloom filter number of distinct values
- pub bloom_filter_ndv: u64, default = 1_000_000_u64
+ /// Sets bloom filter number of distinct values. If NULL, uses
+ /// default parquet writer setting
+ pub bloom_filter_ndv: Option<u64>, default = None
}
}
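Note: the settings above are now Option-valued, so a NULL default defers to the parquet crate's own WriterProperties defaults and a value only takes effect when explicitly set. A minimal sketch of overriding one of them programmatically, assuming the `SessionConfig::set_str` / `SessionContext::with_config` API of this DataFusion release (illustrative only, not part of this commit):

    use datafusion::prelude::*;

    // Hypothetical session: pin the compression codec explicitly. Any setting
    // left untouched stays None/NULL and falls back to the parquet writer's
    // built-in default.
    let config = SessionConfig::new()
        .set_str("datafusion.execution.parquet.compression", "zstd(3)");
    let ctx = SessionContext::with_config(config);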
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index 6688d3dd37..58fdb5fc43 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -755,28 +755,59 @@ impl ParquetSink {
context: &Arc<TaskContext>,
) -> Result<WriterProperties> {
let parquet_context =
&context.session_config().options().execution.parquet;
- Ok(WriterProperties::builder()
+ let mut builder = WriterProperties::builder()
.set_data_page_size_limit(parquet_context.data_pagesize_limit)
.set_write_batch_size(parquet_context.write_batch_size)
.set_writer_version(parse_version_string(&parquet_context.writer_version)?)
- .set_compression(parse_compression_string(&parquet_context.compression)?)
- .set_dictionary_enabled(parquet_context.dictionary_enabled)
.set_dictionary_page_size_limit(parquet_context.dictionary_page_size_limit)
- .set_statistics_enabled(parse_statistics_string(
- &parquet_context.statistics_enabled,
- )?)
- .set_max_statistics_size(parquet_context.max_statistics_size)
.set_max_row_group_size(parquet_context.max_row_group_size)
.set_created_by(parquet_context.created_by.clone())
.set_column_index_truncate_length(
parquet_context.column_index_truncate_length,
)
.set_data_page_row_count_limit(parquet_context.data_page_row_count_limit)
- .set_encoding(parse_encoding_string(&parquet_context.encoding)?)
- .set_bloom_filter_enabled(parquet_context.bloom_filter_enabled)
- .set_bloom_filter_fpp(parquet_context.bloom_filter_fpp)
- .set_bloom_filter_ndv(parquet_context.bloom_filter_ndv)
- .build())
+ .set_bloom_filter_enabled(parquet_context.bloom_filter_enabled);
+
+ builder = match &parquet_context.encoding {
+ Some(encoding) => builder.set_encoding(parse_encoding_string(encoding)?),
+ None => builder,
+ };
+
+ builder = match &parquet_context.dictionary_enabled {
+ Some(enabled) => builder.set_dictionary_enabled(*enabled),
+ None => builder,
+ };
+
+ builder = match &parquet_context.compression {
+ Some(compression) => {
+ builder.set_compression(parse_compression_string(compression)?)
+ }
+ None => builder,
+ };
+
+ builder = match &parquet_context.statistics_enabled {
+ Some(statistics) => {
+ builder.set_statistics_enabled(parse_statistics_string(statistics)?)
+ }
+ None => builder,
+ };
+
+ builder = match &parquet_context.max_statistics_size {
+ Some(size) => builder.set_max_statistics_size(*size),
+ None => builder,
+ };
+
+ builder = match &parquet_context.bloom_filter_fpp {
+ Some(fpp) => builder.set_bloom_filter_fpp(*fpp),
+ None => builder,
+ };
+
+ builder = match &parquet_context.bloom_filter_ndv {
+ Some(ndv) => builder.set_bloom_filter_ndv(*ndv),
+ None => builder,
+ };
+
+ Ok(builder.build())
}
// Create a write for parquet files
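Note: the repeated `builder = match ... { Some(v) => ..., None => builder }` blocks above all apply the same rule: a session value only overrides the corresponding WriterProperties setter when it is present. A standalone sketch of that pattern against the parquet crate's builder API (the `apply_optional` helper is hypothetical and uses `if let` rather than the committed `match` form):

    use parquet::basic::Compression;
    use parquet::file::properties::{WriterProperties, WriterPropertiesBuilder};

    // Only call a setter when the optional value is Some; otherwise the
    // parquet default for that property is left untouched.
    fn apply_optional(
        mut builder: WriterPropertiesBuilder,
        compression: Option<Compression>,
        dictionary_enabled: Option<bool>,
    ) -> WriterProperties {
        if let Some(c) = compression {
            builder = builder.set_compression(c);
        }
        if let Some(d) = dictionary_enabled {
            builder = builder.set_dictionary_enabled(d);
        }
        builder.build()
    }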
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
index 1c70419a47..5db305105f 100644
--- a/datafusion/sqllogictest/test_files/information_schema.slt
+++ b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -151,25 +151,25 @@ datafusion.execution.batch_size 8192
datafusion.execution.coalesce_batches true
datafusion.execution.collect_statistics false
datafusion.execution.parquet.bloom_filter_enabled false
-datafusion.execution.parquet.bloom_filter_fpp 0.05
-datafusion.execution.parquet.bloom_filter_ndv 1000000
+datafusion.execution.parquet.bloom_filter_fpp NULL
+datafusion.execution.parquet.bloom_filter_ndv NULL
datafusion.execution.parquet.column_index_truncate_length NULL
-datafusion.execution.parquet.compression snappy
+datafusion.execution.parquet.compression NULL
datafusion.execution.parquet.created_by datafusion
datafusion.execution.parquet.data_page_row_count_limit 18446744073709551615
datafusion.execution.parquet.data_pagesize_limit 1048576
-datafusion.execution.parquet.dictionary_enabled true
+datafusion.execution.parquet.dictionary_enabled NULL
datafusion.execution.parquet.dictionary_page_size_limit 1048576
datafusion.execution.parquet.enable_page_index true
-datafusion.execution.parquet.encoding plain
+datafusion.execution.parquet.encoding NULL
datafusion.execution.parquet.max_row_group_size 1048576
-datafusion.execution.parquet.max_statistics_size 4096
+datafusion.execution.parquet.max_statistics_size NULL
datafusion.execution.parquet.metadata_size_hint NULL
datafusion.execution.parquet.pruning true
datafusion.execution.parquet.pushdown_filters false
datafusion.execution.parquet.reorder_filters false
datafusion.execution.parquet.skip_metadata true
-datafusion.execution.parquet.statistics_enabled page
+datafusion.execution.parquet.statistics_enabled NULL
datafusion.execution.parquet.write_batch_size 1024
datafusion.execution.parquet.writer_version 1.0
datafusion.execution.planning_concurrency 13
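Note: these expected values are read from `information_schema.df_settings`. A sketch of inspecting the same defaults at runtime, assuming an async context and a session built with `with_information_schema(true)` (illustrative only, not part of this commit):

    use datafusion::prelude::*;

    let config = SessionConfig::new().with_information_schema(true);
    let ctx = SessionContext::with_config(config);
    // The parquet writer settings changed in this commit now report NULL
    // until a session-level value is explicitly set.
    ctx.sql(
        "SELECT * FROM information_schema.df_settings \
         WHERE name LIKE 'datafusion.execution.parquet.%'",
    )
    .await?
    .show()
    .await?;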
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index acb0893a70..a81bece2b5 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -58,19 +58,19 @@ Environment variables are read during `SessionConfig` initialisation so they mus
| datafusion.execution.parquet.data_pagesize_limit | 1048576 | Sets best effort maximum size of data page in bytes [...]
| datafusion.execution.parquet.write_batch_size | 1024 | Sets write_batch_size in bytes [...]
| datafusion.execution.parquet.writer_version | 1.0 | Sets parquet writer version valid values are "1.0" and "2.0" [...]
-| datafusion.execution.parquet.compression | snappy | Sets default parquet compression codec Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. [...]
-| datafusion.execution.parquet.dictionary_enabled | true | Sets if dictionary encoding is enabled [...]
+| datafusion.execution.parquet.compression | NULL | Sets default parquet compression codec Valid values are: uncompressed, snappy, gzip(level), lzo, brotli(level), lz4, zstd(level), and lz4_raw. These values are not case sensitive. If NULL, uses default parquet writer setting [...]
+| datafusion.execution.parquet.dictionary_enabled | NULL | Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting [...]
| datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | Sets best effort maximum dictionary page size, in bytes [...]
-| datafusion.execution.parquet.statistics_enabled | page | Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. [...]
-| datafusion.execution.parquet.max_statistics_size | 4096 | Sets max statistics size for any column [...]
+| datafusion.execution.parquet.statistics_enabled | NULL | Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting [...]
+| datafusion.execution.parquet.max_statistics_size | NULL | Sets max statistics size for any column. If NULL, uses default parquet writer setting [...]
| datafusion.execution.parquet.max_row_group_size | 1048576 | Sets maximum number of rows in a row group [...]
| datafusion.execution.parquet.created_by | datafusion version 29.0.0 | Sets "created by" property [...]
| datafusion.execution.parquet.column_index_truncate_length | NULL | Sets column index trucate length [...]
| datafusion.execution.parquet.data_page_row_count_limit | 18446744073709551615 | Sets best effort maximum number of rows in data page [...]
-| datafusion.execution.parquet.encoding | plain | Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. [...]
+| datafusion.execution.parquet.encoding | NULL | Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting [...]
| datafusion.execution.parquet.bloom_filter_enabled | false | Sets if bloom filter is enabled for any column [...]
-| datafusion.execution.parquet.bloom_filter_fpp | 0.05 | Sets bloom filter false positive probability [...]
-| datafusion.execution.parquet.bloom_filter_ndv | 1000000 | Sets bloom filter number of distinct values [...]
+| datafusion.execution.parquet.bloom_filter_fpp | NULL | Sets bloom filter false positive probability. If NULL, uses default parquet writer setting [...]
+| datafusion.execution.parquet.bloom_filter_ndv | NULL | Sets bloom filter number of distinct values. If NULL, uses default parquet writer setting [...]
| datafusion.execution.aggregate.scalar_update_factor | 10 | Specifies the threshold for using `ScalarValue`s to update accumulators during high-cardinality aggregations for each input batch. The aggregation is considered high-cardinality if the number of affected groups is greater than or equal to `batch_size / scalar_update_factor`. In such cases, `ScalarValue`s are utilized for updating accumulators, rather than the default batch-slice approach. This can [...]
| datafusion.execution.planning_concurrency | 0 | Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system [...]
| datafusion.execution.sort_spill_reservation_bytes | 10485760 | Specifies the reserved memory for each spillable sort operation to facilitate an in-memory merge. When a sort operation spills to disk, the in-memory data must be sorted and merged before being written to a file. This setting reserves a specific amount of memory for that in-memory sort/merge process. Note: This setting is irrelevant if the sort operation cannot spill (i.e., if there's no `DiskManag [...]
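Note: as the updated docs state, a NULL default simply means the parquet writer's own setting applies; each option can still be overridden per session. A hedged example of doing so through SQL on an existing `SessionContext` named `ctx`, assuming DataFusion's `SET` statement support (illustrative only, not part of this commit):

    // Override a single writer option for this session; options that are
    // never SET keep their NULL default and use the parquet writer setting.
    ctx.sql("SET datafusion.execution.parquet.statistics_enabled = 'page'")
        .await?
        .collect()
        .await?;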