tustvold commented on code in PR #2022:
URL: https://github.com/apache/arrow-rs/pull/2022#discussion_r916095152
##########
parquet/src/column/writer.rs:
##########
@@ -302,61 +308,85 @@ impl<'a, T: DataType> ColumnWriterImpl<'a, T> {
// Find out the minimal length to prevent index out of bound errors.
let mut min_len = values.len();
if let Some(levels) = def_levels {
- min_len = cmp::min(min_len, levels.len());
+ min_len = min_len.min(levels.len());
}
if let Some(levels) = rep_levels {
- min_len = cmp::min(min_len, levels.len());
+ min_len = min_len.min(levels.len());
}
// Find out number of batches to process.
let write_batch_size = self.props.write_batch_size();
let num_batches = min_len / write_batch_size;
- // Process pre-calculated statistics
- match (min, max) {
- (Some(min), Some(max)) => {
- if self
- .min_column_value
- .as_ref()
- .map_or(true, |v| self.compare_greater(v, min))
- {
- self.min_column_value = Some(min.clone());
+ if self.statistics_enabled == EnabledStatistics::Chunk {
+ match (min, max) {
+ (Some(min), Some(max)) => {
+ if self
+ .min_column_value
+ .as_ref()
+ .map_or(true, |v| self.compare_greater(v, min))
+ {
+ self.min_column_value = Some(min.clone());
+ }
+
+ if self
+ .max_column_value
+ .as_ref()
+ .map_or(true, |v| self.compare_greater(max, v))
+ {
+ self.max_column_value = Some(max.clone());
+ }
}
- if self
- .max_column_value
- .as_ref()
- .map_or(true, |v| self.compare_greater(max, v))
- {
- self.max_column_value = Some(max.clone());
+ (None, Some(_)) | (Some(_), None) => {
+ panic!("min/max should be both set or both None")
}
- }
- (None, Some(_)) | (Some(_), None) => {
- panic!("min/max should be both set or both None")
- }
- (None, None) => {}
+ (None, None) => {
+ for val in values {
+ if let Type::FLOAT | Type::DOUBLE = T::get_physical_type() {
+ // Skip NaN values
+ if val != val {
+ continue;
+ }
+ }
+
+ if self
+ .min_column_value
+ .as_ref()
+ .map_or(true, |v| self.compare_greater(v, val))
+ {
+ self.min_column_value = Some(val.clone());
+ }
+
+ if self
+ .max_column_value
+ .as_ref()
+ .map_or(true, |v| self.compare_greater(val, v))
+ {
+ self.max_column_value = Some(val.clone());
+ }
+ }
+ }
+ };
}
- if let Some(distinct) = distinct_count {
- self.column_distinct_count =
- Some(self.column_distinct_count.unwrap_or(0) + distinct);
+ // We can only set the distinct count if there are no other writes
Review Comment:
This is the fix for #2016
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]