This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new 93e6749f7 Add option to disable writing of Parquet offset index (#6797)
93e6749f7 is described below
commit 93e6749f7b05d9e11b8cb6d871bc21744cc8b935
Author: Ed Seidl <[email protected]>
AuthorDate: Wed Nov 27 02:41:13 2024 -0800
Add option to disable writing of Parquet offset index (#6797)
* add option to disable offset indexes
* revert to global setting rather than per-column
* fix docs
* change offset index builder to option
* suggestion from review
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
---------
Co-authored-by: Raphael Taylor-Davies
<[email protected]>
---
parquet/src/column/writer/mod.rs | 74 ++++++++++++++++++++++++++++++++++------
parquet/src/file/properties.rs | 37 ++++++++++++++++++++
parquet/src/file/writer.rs | 1 +
3 files changed, 102 insertions(+), 10 deletions(-)
diff --git a/parquet/src/column/writer/mod.rs b/parquet/src/column/writer/mod.rs
index 9bd79840f..8d0be5f9f 100644
--- a/parquet/src/column/writer/mod.rs
+++ b/parquet/src/column/writer/mod.rs
@@ -347,7 +347,7 @@ pub struct GenericColumnWriter<'a, E: ColumnValueEncoder> {
data_pages: VecDeque<CompressedPage>,
// column index and offset index
column_index_builder: ColumnIndexBuilder,
- offset_index_builder: OffsetIndexBuilder,
+ offset_index_builder: Option<OffsetIndexBuilder>,
// Below fields used to incrementally check boundary order across data
pages.
// We assume they are ascending/descending until proven wrong.
@@ -394,6 +394,12 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E>
{
column_index_builder.to_invalid()
}
+ // Disable offset_index_builder if requested by user.
+ let offset_index_builder = match props.offset_index_disabled() {
+ false => Some(OffsetIndexBuilder::new()),
+ _ => None,
+ };
+
Self {
descr,
props,
@@ -408,7 +414,7 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
page_metrics,
column_metrics,
column_index_builder,
- offset_index_builder: OffsetIndexBuilder::new(),
+ offset_index_builder,
encodings,
data_page_boundary_ascending: true,
data_page_boundary_descending: true,
@@ -613,7 +619,8 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a, E> {
.column_index_builder
.valid()
.then(|| self.column_index_builder.build_to_thrift());
- let offset_index = Some(self.offset_index_builder.build_to_thrift());
+
+ let offset_index = self.offset_index_builder.map(|b|
b.build_to_thrift());
Ok(ColumnCloseResult {
bytes_written: self.column_metrics.total_bytes_written,
@@ -841,11 +848,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a,
E> {
);
// Update the offset index
- self.offset_index_builder
- .append_row_count(self.page_metrics.num_buffered_rows as i64);
-
- self.offset_index_builder
-
.append_unencoded_byte_array_data_bytes(page_variable_length_bytes);
+ if let Some(builder) = self.offset_index_builder.as_mut() {
+ builder.append_row_count(self.page_metrics.num_buffered_rows as
i64);
+
builder.append_unencoded_byte_array_data_bytes(page_variable_length_bytes);
+ }
}
/// Determine if we should allow truncating min/max values for this
column's statistics
@@ -1174,8 +1180,10 @@ impl<'a, E: ColumnValueEncoder> GenericColumnWriter<'a,
E> {
let page_spec = self.page_writer.write_page(page)?;
// update offset index
// compressed_size = header_size + compressed_data_size
- self.offset_index_builder
- .append_offset_and_size(page_spec.offset as i64,
page_spec.compressed_size as i32);
+ if let Some(builder) = self.offset_index_builder.as_mut() {
+ builder
+ .append_offset_and_size(page_spec.offset as i64,
page_spec.compressed_size as i32)
+ }
self.update_metrics_for_page(page_spec);
Ok(())
}
@@ -3215,6 +3223,52 @@ mod tests {
assert!(column_close_result.column_index.is_none());
}
+ #[test]
+ fn test_no_offset_index_when_disabled() {
+ // Test that offset indexes can be disabled
+ let descr = Arc::new(get_test_column_descr::<Int32Type>(1, 0));
+ let props = Arc::new(
+ WriterProperties::builder()
+ .set_statistics_enabled(EnabledStatistics::None)
+ .set_offset_index_disabled(true)
+ .build(),
+ );
+ let column_writer = get_column_writer(descr, props,
get_test_page_writer());
+ let mut writer = get_typed_column_writer::<Int32Type>(column_writer);
+
+ let data = Vec::new();
+ let def_levels = vec![0; 10];
+ writer.write_batch(&data, Some(&def_levels), None).unwrap();
+ writer.flush_data_pages().unwrap();
+
+ let column_close_result = writer.close().unwrap();
+ assert!(column_close_result.offset_index.is_none());
+ assert!(column_close_result.column_index.is_none());
+ }
+
+ #[test]
+ fn test_offset_index_overridden() {
+ // Test that offset indexes are not disabled when gathering page
statistics
+ let descr = Arc::new(get_test_column_descr::<Int32Type>(1, 0));
+ let props = Arc::new(
+ WriterProperties::builder()
+ .set_statistics_enabled(EnabledStatistics::Page)
+ .set_offset_index_disabled(true)
+ .build(),
+ );
+ let column_writer = get_column_writer(descr, props,
get_test_page_writer());
+ let mut writer = get_typed_column_writer::<Int32Type>(column_writer);
+
+ let data = Vec::new();
+ let def_levels = vec![0; 10];
+ writer.write_batch(&data, Some(&def_levels), None).unwrap();
+ writer.flush_data_pages().unwrap();
+
+ let column_close_result = writer.close().unwrap();
+ assert!(column_close_result.offset_index.is_some());
+ assert!(column_close_result.column_index.is_some());
+ }
+
#[test]
fn test_boundary_order() -> Result<()> {
let descr = Arc::new(get_test_column_descr::<Int32Type>(1, 0));
diff --git a/parquet/src/file/properties.rs b/parquet/src/file/properties.rs
index cb07c1f49..1e8a4868d 100644
--- a/parquet/src/file/properties.rs
+++ b/parquet/src/file/properties.rs
@@ -57,6 +57,8 @@ pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default values for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
+/// Default value for [`WriterProperties::offset_index_disabled`]
+pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default values for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
@@ -159,6 +161,7 @@ pub struct WriterProperties {
bloom_filter_position: BloomFilterPosition,
writer_version: WriterVersion,
created_by: String,
+ offset_index_disabled: bool,
pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
default_column_properties: ColumnProperties,
column_properties: HashMap<ColumnPath, ColumnProperties>,
@@ -244,6 +247,22 @@ impl WriterProperties {
&self.created_by
}
+ /// Returns `true` if offset index writing is disabled.
+ pub fn offset_index_disabled(&self) -> bool {
+ // If page statistics are to be collected, then do not disable the
offset indexes.
+ let default_page_stats_enabled =
+ self.default_column_properties.statistics_enabled() ==
Some(EnabledStatistics::Page);
+ let column_page_stats_enabled = self
+ .column_properties
+ .iter()
+ .any(|path_props| path_props.1.statistics_enabled() ==
Some(EnabledStatistics::Page));
+ if default_page_stats_enabled || column_page_stats_enabled {
+ return false;
+ }
+
+ self.offset_index_disabled
+ }
+
/// Returns `key_value_metadata` KeyValue pairs.
pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
self.key_value_metadata.as_ref()
@@ -371,6 +390,7 @@ pub struct WriterPropertiesBuilder {
bloom_filter_position: BloomFilterPosition,
writer_version: WriterVersion,
created_by: String,
+ offset_index_disabled: bool,
key_value_metadata: Option<Vec<KeyValue>>,
default_column_properties: ColumnProperties,
column_properties: HashMap<ColumnPath, ColumnProperties>,
@@ -392,6 +412,7 @@ impl WriterPropertiesBuilder {
bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
writer_version: DEFAULT_WRITER_VERSION,
created_by: DEFAULT_CREATED_BY.to_string(),
+ offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
key_value_metadata: None,
default_column_properties: Default::default(),
column_properties: HashMap::new(),
@@ -413,6 +434,7 @@ impl WriterPropertiesBuilder {
bloom_filter_position: self.bloom_filter_position,
writer_version: self.writer_version,
created_by: self.created_by,
+ offset_index_disabled: self.offset_index_disabled,
key_value_metadata: self.key_value_metadata,
default_column_properties: self.default_column_properties,
column_properties: self.column_properties,
@@ -515,6 +537,21 @@ impl WriterPropertiesBuilder {
self
}
+ /// Sets whether the writing of offset indexes is disabled (defaults to
`false`).
+ ///
+ /// If statistics level is set to [`Page`] this setting will be overridden
with `false`.
+ ///
+ /// Note: As the offset indexes are useful for accessing data by row
number,
+ /// they are always written by default, regardless of whether other
statistics
+ /// are enabled. Disabling this metadata may result in a degradation in
read
+ /// performance, so use this option with care.
+ ///
+ /// [`Page`]: EnabledStatistics::Page
+ pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
+ self.offset_index_disabled = value;
+ self
+ }
+
/// Sets "key_value_metadata" property (defaults to `None`).
pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) ->
Self {
self.key_value_metadata = value;
diff --git a/parquet/src/file/writer.rs b/parquet/src/file/writer.rs
index b84c57a60..6b7707f03 100644
--- a/parquet/src/file/writer.rs
+++ b/parquet/src/file/writer.rs
@@ -1742,6 +1742,7 @@ mod tests {
let props = WriterProperties::builder()
.set_statistics_enabled(EnabledStatistics::None)
.set_column_statistics_enabled("a".into(), EnabledStatistics::Page)
+ .set_offset_index_disabled(true) // this should be ignored because
of the line above
.build();
let mut file = Vec::with_capacity(1024);
let mut file_writer =