alamb commented on code in PR #8669:
URL: https://github.com/apache/arrow-datafusion/pull/8669#discussion_r1438176448


##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -598,19 +662,39 @@ mod tests {
             ),
             vec![1]
         );
+

Review Comment:
   I don't think this new test case covers the new code in this PR (as `col IS NULL` doesn't result in a literal guarantee). Is the idea to extend the test coverage?
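
   If the idea is to exercise the new `contained()` path, a predicate that `LiteralGuarantee` does extract (an equality or IN list) might be a better fit. A hypothetical fragment, reusing the helpers already visible in this test module -- the literal values here are illustrative, not taken from the PR:

   ```rust
   // c1 IN (5, 50) -- an IN list yields a literal guarantee, unlike `c2 IS NULL`
   let expr = col("c1").in_list(vec![lit(5), lit(50)], false);
   let expr = logical2physical(&expr, &schema);
   let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap();
   let groups = gen_row_group_meta_data_for_pruning_predicate();
   let metrics = parquet_file_metrics();
   let pruned = prune_row_groups_by_statistics(
       &schema,
       &schema_descr,
       &groups,
       None,
       Some(&pruning_predicate),
       &metrics,
   );
   // assert on `pruned` here; the expected indexes depend on the c1 min/max
   // statistics of the generated row groups
   ```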



##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -854,9 +956,9 @@ mod tests {
         let rgm2 = get_row_group_meta_data(
             &schema_descr,
             vec![ParquetStatistics::fixed_len_byte_array(
-                // 5.00
+                // 10.00

Review Comment:
   can you explain why you changed this value in the test? 
   
   Would it be possible to change this PR to not change existing tests so it is clear that the code change in this PR doesn't cause a regression in existing behavior? Maybe we can add a new test case with the different values?



##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -632,6 +716,29 @@ mod tests {
             ),
             vec![1]
         );
+
+        // c1 < 5 and c2 IS NULL  => c1_min < 5 and bool_null_count > 0

Review Comment:
   ```suggestion
           // c1 < 5 and c2 = NULL  => c1_min < 5 and bool_null_count > 0
   ```
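
   For reference, the two forms build different DataFusion expressions, which is why `= NULL` matches what the test constructs below (an illustrative sketch, not part of the PR):

   ```rust
   // `c2 = NULL` vs `c2 IS NULL` as logical expressions
   let eq_null = col("c2").eq(lit(ScalarValue::Boolean(None))); // always evaluates to NULL
   let is_null = col("c2").is_null();                           // evaluates to true or false
   ```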



##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -632,6 +716,29 @@ mod tests {
             ),
             vec![1]
         );
+
+        // c1 < 5 and c2 IS NULL  => c1_min < 5 and bool_null_count > 0
+        let expr = col("c1")
+            .lt(lit(5))
+            .and(col("c2").eq(lit(ScalarValue::Boolean(None))));
+        let expr = logical2physical(&expr, &schema);
+        let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap();
+        let groups = gen_row_group_meta_data_for_pruning_predicate();
+
+        let metrics = parquet_file_metrics();
+        // bool = NULL always evaluates to NULL (and thus will not

Review Comment:
   I think this comment is out of date -- both row groups are actually pruned, as the vec is empty
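
   One possible rewording, shown next to the (empty) expected result for clarity -- a sketch only, mirroring the assertion quoted further down in this review:

   ```rust
   // c2 = NULL always evaluates to NULL, so the predicate can never be true:
   // both row groups are pruned and the expected result is empty
   assert_eq!(
       prune_row_groups_by_statistics(
           &schema,
           &schema_descr,
           &groups,
           None,
           Some(&pruning_predicate),
           &metrics
       ),
       Vec::<usize>::new()
   );
   ```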



##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -276,15 +278,74 @@ impl<'a> PruningStatistics for RowGroupPruningStatistics<'a> {
         scalar.to_array().ok()
     }
 
+    /// The basic idea is to check whether all of the `values` are not within the min-max boundary.
+    /// If any one value is within the min-max boundary, then this row group will not be skipped.
+    /// Otherwise, this row group will be able to be skipped.
     fn contained(
         &self,
-        _column: &Column,
-        _values: &HashSet<ScalarValue>,
+        column: &Column,
+        values: &HashSet<ScalarValue>,
     ) -> Option<BooleanArray> {
-        None
+        let min_values = self.min_values(column)?;
+        let max_values = self.max_values(column)?;
+        // The boundary should be with length of 1
+        if min_values.len() != max_values.len() || min_values.len() != 1 {
+            return None;
+        }
+        let min_value = ScalarValue::try_from_array(min_values.as_ref(), 0).ok()?;
+        let max_value = ScalarValue::try_from_array(max_values.as_ref(), 0).ok()?;
+
+        // The boundary should be with the same data type
+        if min_value.data_type() != max_value.data_type() {
+            return None;
+        }
+        let target_data_type = min_value.data_type();
+
+        let (c, _) = self.column(&column.name)?;
+        let has_null = c.statistics()?.null_count() > 0;
+        let mut known_not_present = true;
+        for value in values {
+            // If it's null, check whether the null exists from the statistics

Review Comment:
   Thank you for the clarification @yahoNanJing. I am still confused -- I don't think `col IS NULL` is handled by the `LiteralGuarantee` code, so I am not sure how it would result in a `value` of NULL here.
   
   `col IN (NULL)` (as opposed to `col IS NULL`) always evaluates to `NULL` (can never be `true`), which perhaps we should also handle 🤔
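
   One possible shape for that handling, spliced into the loop from the hunk above (a sketch only -- it reuses `values`, `min_value`, `max_value`, and `known_not_present` from that hunk and assumes a NULL literal can simply be skipped because `col = NULL` / `col IN (NULL)` can never be `true`; it is not the PR's actual implementation):

   ```rust
   for value in values {
       if value.is_null() {
           // col = NULL / col IN (NULL) can never evaluate to true, so a NULL
           // literal never makes a value "possibly present" in this row group
           continue;
       }
       // a non-null value inside the min/max range means the row group
       // cannot be skipped
       if *value >= min_value && *value <= max_value {
           known_not_present = false;
           break;
       }
   }
   ```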
   
   



##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -867,25 +969,80 @@ mod tests {
                 false,
             )],
         );
-
         let rgm3 = get_row_group_meta_data(
             &schema_descr,
             vec![ParquetStatistics::fixed_len_byte_array(
                 None, None, None, 0, false,
             )],
         );
+        let rgms = [rgm1, rgm2, rgm3];
+        // cast the type of c1 to decimal(28,3)
+        let left = cast(col("c1"), DataType::Decimal128(28, 3));
+        let expr = left.eq(lit(ScalarValue::Decimal128(Some(100000), 28, 3)));
+        let expr = logical2physical(&expr, &schema);
+        let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap();
         let metrics = parquet_file_metrics();
         assert_eq!(
             prune_row_groups_by_statistics(
                 &schema,
                 &schema_descr,
-                &[rgm1, rgm2, rgm3],
+                &rgms,
                 None,
                 Some(&pruning_predicate),
                 &metrics
             ),
             vec![1, 2]
         );
+        // c1 in (10, 300, 400)

Review Comment:
   Isn't the first value `8.0`?
   
   ```suggestion
           // c1 in (8.0, 300, 400)
   ```
   
   The same comment applies to several other comments below.
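
   A quick check of the scale arithmetic behind that value (illustrative only; the literal in the test is `ScalarValue::Decimal128(Some(8000), 28, 3)`):

   ```rust
   fn main() {
       // Decimal128(Some(unscaled), precision, scale) represents unscaled * 10^-scale
       let unscaled: i128 = 8_000;
       let scale: i32 = 3;
       assert_eq!(unscaled as f64 / 10f64.powi(scale), 8.0); // 8.000, not the 10 in the PR's comment
   }
   ```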



##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -867,25 +969,80 @@ mod tests {
                 false,
             )],
         );
-
         let rgm3 = get_row_group_meta_data(
             &schema_descr,
             vec![ParquetStatistics::fixed_len_byte_array(
                 None, None, None, 0, false,
             )],
         );
+        let rgms = [rgm1, rgm2, rgm3];
+        // cast the type of c1 to decimal(28,3)
+        let left = cast(col("c1"), DataType::Decimal128(28, 3));
+        let expr = left.eq(lit(ScalarValue::Decimal128(Some(100000), 28, 3)));
+        let expr = logical2physical(&expr, &schema);
+        let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap();
         let metrics = parquet_file_metrics();
         assert_eq!(
             prune_row_groups_by_statistics(
                 &schema,
                 &schema_descr,
-                &[rgm1, rgm2, rgm3],
+                &rgms,
                 None,
                 Some(&pruning_predicate),
                 &metrics
             ),
             vec![1, 2]
         );
+        // c1 in (10, 300, 400)
+        // cast the type of c1 to decimal(28,3)
+        let left = cast(col("c1"), DataType::Decimal128(28, 3));
+        let expr = left.in_list(
+            vec![
+                lit(ScalarValue::Decimal128(Some(8000), 28, 3)),
+                lit(ScalarValue::Decimal128(Some(300000), 28, 3)),
+                lit(ScalarValue::Decimal128(Some(400000), 28, 3)),
+            ],
+            false,
+        );
+        let expr = logical2physical(&expr, &schema);
+        let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap();
+        let metrics = parquet_file_metrics();
+        assert_eq!(
+            prune_row_groups_by_statistics(
+                &schema,
+                &schema_descr,
+                &rgms,
+                None,
+                Some(&pruning_predicate),
+                &metrics
+            ),
+            vec![0, 2]

Review Comment:
   ```suggestion
           // rgm2 (index 1) has ranges between 10 and 200. None of the
           // constants are in that range so expect this to be pruned by literals
            vec![0, 2]
   ```



##########
datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs:
##########
@@ -598,19 +662,39 @@ mod tests {
             ),
             vec![1]
         );
+
+        // c1 < 5 and c2 IS NULL  => c1_min < 5 and bool_null_count > 0
+        let expr = col("c1").lt(lit(5)).and(col("c2").is_null());
+        let expr = logical2physical(&expr, &schema);
+        let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap();
+        let groups = gen_row_group_meta_data_for_pruning_predicate();
+
+        let metrics = parquet_file_metrics();
+        // First row group was filtered out because it contains no null value on "c2".
+        assert_eq!(
+            prune_row_groups_by_statistics(
+                &schema,
+                &schema_descr,
+                &groups,
+                None,
+                Some(&pruning_predicate),
+                &metrics
+            ),
+            Vec::<usize>::new()
+        );
     }
 
     #[test]
     fn row_group_pruning_predicate_eq_null_expr() {
         use datafusion_expr::{col, lit};
         // test row group predicate with an unknown (Null) expr
-        //
-        // int > 1 and bool = NULL => c1_max > 1 and null
         let schema = Arc::new(Schema::new(vec![
             Field::new("c1", DataType::Int32, false),
             Field::new("c2", DataType::Boolean, false),
         ]));
         let schema_descr = arrow_to_parquet_schema(&schema).unwrap();
+
+        // c1 > 15 and c2 IS NULL  => c1_max > 15 and bool_null_count > 0

Review Comment:
   ```suggestion
           // c1 > 15 and c2 = NULL  => c1_max > 15 and bool_null_count > 0
   ```


