alamb commented on code in PR #11558: URL: https://github.com/apache/datafusion/pull/11558#discussion_r1688482991
########## datafusion/common/src/config.rs: ########## @@ -314,6 +314,85 @@ config_namespace! { } } +/// When using the parquet feature, +/// use the same default writer settings as the extern parquet. +#[cfg(feature = "parquet")] +mod parquet_defaults { + use parquet::basic::Compression; + use parquet::file::properties as props; + + /// Default value for [`parquet::WriterProperties::data_page_size_limit`] + pub const DEFAULT_PAGE_SIZE: usize = props::DEFAULT_PAGE_SIZE; + /// Default value for [`parquet::WriterProperties::write_batch_size`] + pub const DEFAULT_WRITE_BATCH_SIZE: usize = props::DEFAULT_WRITE_BATCH_SIZE; + /// Default value for [`parquet::WriterProperties::writer_version`] + pub const DEFAULT_WRITER_VERSION: &str = "1.0"; + /// Default value for [`parquet::WriterProperties::dictionary_enabled`] + pub const DEFAULT_DICTIONARY_ENABLED: Option<bool> = + Some(props::DEFAULT_DICTIONARY_ENABLED); + /// Default value for [`parquet::WriterProperties::dictionary_page_size_limit`] + pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = + props::DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT; + /// Default value for [`parquet::WriterProperties::data_page_row_count_limit`] + pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = + props::DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT; + /// Default value for [`parquet::WriterProperties::max_statistics_size`] + pub const DEFAULT_MAX_STATISTICS_SIZE: Option<usize> = + Some(props::DEFAULT_MAX_STATISTICS_SIZE); + /// Default value for [`parquet::WriterProperties::max_row_group_size`] + pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = props::DEFAULT_MAX_ROW_GROUP_SIZE; + /// Default value for [`parquet::WriterProperties::column_index_truncate_length`] + pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = + props::DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH; + + // TODO: discuss if we want datafusion to use these defaults from the extern parquet + // refer to https://github.com/apache/datafusion/issues/11367 + + #[allow(dead_code)] + /// 
Default value for [`parquet::WriterProperties::statistics_enabled`] + pub const DEFAULT_STATISTICS_ENABLED: Option<&str> = Some("page"); + #[allow(dead_code)] + /// Default value for [`parquet::BloomFilterProperties::fpp`] + pub const DEFAULT_BLOOM_FILTER_FPP: Option<f64> = + Some(props::DEFAULT_BLOOM_FILTER_FPP); + #[allow(dead_code)] + /// Default value for [`parquet::BloomFilterProperties::ndv`] + pub const DEFAULT_BLOOM_FILTER_NDV: Option<u64> = + Some(props::DEFAULT_BLOOM_FILTER_NDV); + + #[allow(dead_code)] + /// Default value for [`parquet::WriterProperties::compression`] + pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED; Review Comment: I think there is a tension between raw read performance (uncompressed) and storage efficiency (compressed). I think @tustvold preferred arrow-rs to be on the read-performance side of the equation. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org