alamb commented on code in PR #8669:
URL: https://github.com/apache/arrow-datafusion/pull/8669#discussion_r1437874851
##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -276,15 +278,74 @@ impl<'a> PruningStatistics for
RowGroupPruningStatistics<'a> {
scalar.to_array().ok()
}
+ /// The basic idea is to check whether all of the `values` are not within
the min-max boundary.
+ /// If any one value is within the min-max boundary, then this row group
will not be skipped.
+ /// Otherwise, this row group will be able to be skipped.
fn contained(
&self,
- _column: &Column,
- _values: &HashSet<ScalarValue>,
+ column: &Column,
+ values: &HashSet<ScalarValue>,
) -> Option<BooleanArray> {
- None
+ let min_values = self.min_values(column)?;
+ let max_values = self.max_values(column)?;
+ // The boundary should be with length of 1
+ if min_values.len() != max_values.len() || min_values.len() != 1 {
+ return None;
+ }
+ let min_value = ScalarValue::try_from_array(min_values.as_ref(),
0).ok()?;
+ let max_value = ScalarValue::try_from_array(max_values.as_ref(),
0).ok()?;
+
+ // The boundary should be with the same data type
+ if min_value.data_type() != max_value.data_type() {
+ return None;
+ }
+ let target_data_type = min_value.data_type();
+
+ let (c, _) = self.column(&column.name)?;
+ let has_null = c.statistics()?.null_count() > 0;
+ let mut known_not_present = true;
+ for value in values {
+ // If it's null, check whether the null exists from the statistics
+ if has_null && value.is_null() {
+ known_not_present = false;
+ break;
+ }
+ // The filter values should be cast to the boundary's data type
+ if !can_cast_types(&value.data_type(), &target_data_type) {
+ return None;
+ }
+ let value =
+ cast_scalar_value(value, &target_data_type,
&DEFAULT_CAST_OPTIONS)
+ .ok()?;
Review Comment:
I think you could combine these checks:
```suggestion
// The filter values should be cast to the boundary's data type
let Ok(value) = cast_scalar_value(value, &target_data_type,
&DEFAULT_CAST_OPTIONS) else {
return None;
};
```
##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -603,19 +666,38 @@ mod tests {
),
vec![1]
);
+
+ let expr = col("c1").lt(lit(5)).and(col("c2").is_null());
Review Comment:
```suggestion
    // c1 < 5 and c2 IS NULL => c1_min < 5 and bool_null_count > 0
let expr = col("c1").lt(lit(5)).and(col("c2").is_null());
```
##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -276,15 +278,74 @@ impl<'a> PruningStatistics for
RowGroupPruningStatistics<'a> {
scalar.to_array().ok()
}
+ /// The basic idea is to check whether all of the `values` are not within
the min-max boundary.
+ /// If any one value is within the min-max boundary, then this row group
will not be skipped.
+ /// Otherwise, this row group will be able to be skipped.
fn contained(
&self,
- _column: &Column,
- _values: &HashSet<ScalarValue>,
+ column: &Column,
+ values: &HashSet<ScalarValue>,
) -> Option<BooleanArray> {
- None
+ let min_values = self.min_values(column)?;
+ let max_values = self.max_values(column)?;
+ // The boundary should be with length of 1
+ if min_values.len() != max_values.len() || min_values.len() != 1 {
+ return None;
+ }
+ let min_value = ScalarValue::try_from_array(min_values.as_ref(),
0).ok()?;
+ let max_value = ScalarValue::try_from_array(max_values.as_ref(),
0).ok()?;
+
+ // The boundary should be with the same data type
+ if min_value.data_type() != max_value.data_type() {
+ return None;
+ }
+ let target_data_type = min_value.data_type();
+
+ let (c, _) = self.column(&column.name)?;
+ let has_null = c.statistics()?.null_count() > 0;
+ let mut known_not_present = true;
+ for value in values {
+ // If it's null, check whether the null exists from the statistics
Review Comment:
If `value` is null, I think it means that the statistics value is not known.
To the best of my knowledge, `NULL` values on the column are never encoded in
Parquet statistics.
Thus I think this check needs to be something like
```rust
if value.is_null() {
    known_not_present = false;
    break;
}
```
##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -579,12 +640,14 @@ mod tests {
#[test]
fn row_group_pruning_predicate_null_expr() {
use datafusion_expr::{col, lit};
- // int > 1 and IsNull(bool) => c1_max > 1 and bool_null_count > 0
+
let schema = Arc::new(Schema::new(vec![
Field::new("c1", DataType::Int32, false),
Field::new("c2", DataType::Boolean, false),
]));
let schema_descr = arrow_to_parquet_schema(&schema).unwrap();
+
+ // int > 1 and IsNull(bool) => c1_max > 1 and bool_null_count > 0
Review Comment:
```suggestion
// c1 > 15 and c2 IS NULL => c1_max > 15 and bool_null_count > 0
```
##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -276,15 +278,74 @@ impl<'a> PruningStatistics for
RowGroupPruningStatistics<'a> {
scalar.to_array().ok()
}
+ /// The basic idea is to check whether all of the `values` are not within
the min-max boundary.
+ /// If any one value is within the min-max boundary, then this row group
will not be skipped.
+ /// Otherwise, this row group will be able to be skipped.
fn contained(
&self,
- _column: &Column,
- _values: &HashSet<ScalarValue>,
+ column: &Column,
+ values: &HashSet<ScalarValue>,
) -> Option<BooleanArray> {
- None
+ let min_values = self.min_values(column)?;
Review Comment:
It is very unfortunate that we have to use `ScalarValue`s here when the
underlying code uses `ArrayRef`s (though I realize this PR is just following
the same model as the existing code).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]