alamb commented on code in PR #8669:
URL: https://github.com/apache/arrow-datafusion/pull/8669#discussion_r1437874851
##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -276,15 +278,74 @@ impl<'a> PruningStatistics for
RowGroupPruningStatistics<'a> {
scalar.to_array().ok()
}
+ /// The basic idea is to check whether all of the `values` are not within
the min-max boundary.
+ /// If any one value is within the min-max boundary, then this row group
will not be skipped.
+ /// Otherwise, this row group will be able to be skipped.
fn contained(
&self,
- _column: &Column,
- _values: &HashSet<ScalarValue>,
+ column: &Column,
+ values: &HashSet<ScalarValue>,
) -> Option<BooleanArray> {
- None
+ let min_values = self.min_values(column)?;
+ let max_values = self.max_values(column)?;
+ // The boundary should be with length of 1
+ if min_values.len() != max_values.len() || min_values.len() != 1 {
+ return None;
+ }
+ let min_value = ScalarValue::try_from_array(min_values.as_ref(),
0).ok()?;
+ let max_value = ScalarValue::try_from_array(max_values.as_ref(),
0).ok()?;
+
+ // The boundary should be with the same data type
+ if min_value.data_type() != max_value.data_type() {
+ return None;
+ }
+ let target_data_type = min_value.data_type();
+
+ let (c, _) = self.column(&column.name)?;
+ let has_null = c.statistics()?.null_count() > 0;
+ let mut known_not_present = true;
+ for value in values {
+ // If it's null, check whether the null exists from the statistics
+ if has_null && value.is_null() {
+ known_not_present = false;
+ break;
+ }
+ // The filter values should be cast to the boundary's data type
+ if !can_cast_types(&value.data_type(), &target_data_type) {
+ return None;
+ }
+ let value =
+ cast_scalar_value(value, &target_data_type,
&DEFAULT_CAST_OPTIONS)
+ .ok()?;
Review Comment:
I think you could combine these checks:
```suggestion
// The filter values should be cast to the boundary's data type
let Ok(value) = cast_scalar_value(value, &target_data_type,
&DEFAULT_CAST_OPTIONS) else {
return None;
};
```
##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -603,19 +666,38 @@ mod tests {
),
vec![1]
);
+
+ let expr = col("c1").lt(lit(5)).and(col("c2").is_null());
Review Comment:
```suggestion
    // c1 < 5 and c2 IS NULL => c1_min < 5 and bool_null_count > 0
let expr = col("c1").lt(lit(5)).and(col("c2").is_null());
```
##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -276,15 +278,74 @@ impl<'a> PruningStatistics for
RowGroupPruningStatistics<'a> {
scalar.to_array().ok()
}
+ /// The basic idea is to check whether all of the `values` are not within
the min-max boundary.
+ /// If any one value is within the min-max boundary, then this row group
will not be skipped.
+ /// Otherwise, this row group will be able to be skipped.
fn contained(
&self,
- _column: &Column,
- _values: &HashSet<ScalarValue>,
+ column: &Column,
+ values: &HashSet<ScalarValue>,
) -> Option<BooleanArray> {
- None
+ let min_values = self.min_values(column)?;
+ let max_values = self.max_values(column)?;
+ // The boundary should be with length of 1
+ if min_values.len() != max_values.len() || min_values.len() != 1 {
+ return None;
+ }
+ let min_value = ScalarValue::try_from_array(min_values.as_ref(),
0).ok()?;
+ let max_value = ScalarValue::try_from_array(max_values.as_ref(),
0).ok()?;
+
+ // The boundary should be with the same data type
+ if min_value.data_type() != max_value.data_type() {
+ return None;
+ }
+ let target_data_type = min_value.data_type();
+
+ let (c, _) = self.column(&column.name)?;
+ let has_null = c.statistics()?.null_count() > 0;
+ let mut known_not_present = true;
+ for value in values {
+ // If it's null, check whether the null exists from the statistics
Review Comment:
If `value` is null, I think it means that the statistics value is not known.
To the best of my knowledge, `NULL` values on the column are never encoded in
Parquet statistics.
Thus I think this check needs to be something like
```rust
if value.is_null() {
    known_not_present = false;
    break;
}
```
##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -579,12 +640,14 @@ mod tests {
#[test]
fn row_group_pruning_predicate_null_expr() {
use datafusion_expr::{col, lit};
- // int > 1 and IsNull(bool) => c1_max > 1 and bool_null_count > 0
+
let schema = Arc::new(Schema::new(vec![
Field::new("c1", DataType::Int32, false),
Field::new("c2", DataType::Boolean, false),
]));
let schema_descr = arrow_to_parquet_schema(&schema).unwrap();
+
+ // int > 1 and IsNull(bool) => c1_max > 1 and bool_null_count > 0
Review Comment:
```suggestion
// c1 > 15 and c2 IS NULL => c1_max > 15 and bool_null_count > 0
```
##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -276,15 +278,74 @@ impl<'a> PruningStatistics for
RowGroupPruningStatistics<'a> {
scalar.to_array().ok()
}
+ /// The basic idea is to check whether all of the `values` are not within
the min-max boundary.
+ /// If any one value is within the min-max boundary, then this row group
will not be skipped.
+ /// Otherwise, this row group will be able to be skipped.
fn contained(
&self,
- _column: &Column,
- _values: &HashSet<ScalarValue>,
+ column: &Column,
+ values: &HashSet<ScalarValue>,
) -> Option<BooleanArray> {
- None
+ let min_values = self.min_values(column)?;
Review Comment:
It is very unfortunate that we have to use `ScalarValue`s here when the
underlying code uses `ArrayRef`s (though I realize this PR is just following
the same model as the existing code).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]