alamb commented on code in PR #8440:
URL: https://github.com/apache/arrow-datafusion/pull/8440#discussion_r1417686897


##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -276,21 +383,21 @@ fn is_always_true(expr: &Arc<dyn PhysicalExpr>) -> bool {
 /// Handles creating references to the min/max statistics
 /// for columns as well as recording which statistics are needed
 #[derive(Debug, Default, Clone)]
-pub(crate) struct RequiredStatColumns {
+pub(crate) struct RequiredColumns {

Review Comment:
   I renamed this to be more specific; since it is `crate` private, this is
not a breaking API change.
   
   One thing I did try was encoding the columns needed for literal guarantees
in this structure, but I found the code was very specific to `min/max/count`
statistics.
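
   For context, here is a hedged sketch of the kind of shape this structure
takes. The field layout, the `StatisticsType` enum, and the
`RequiredColumnsSketch` name are illustrative assumptions rather than code
from this PR; the point is that every entry is tied to a specific
min/max/null-count statistic, which is why literal guarantees did not fit
naturally.

```rust
use arrow::datatypes::Field;
use datafusion_physical_expr::expressions::Column;

/// Which statistic a column needs (illustrative).
#[derive(Debug, Clone, Copy)]
enum StatisticsType {
    Min,
    Max,
    NullCount,
}

/// Illustrative sketch of a `RequiredColumns`-like structure: each entry
/// records the source column, the statistic required for it, and the field
/// under which that statistic appears in the statistics record batch.
#[derive(Debug, Default, Clone)]
struct RequiredColumnsSketch {
    columns: Vec<(Column, StatisticsType, Field)>,
}
```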



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -93,6 +94,27 @@ pub trait PruningStatistics {
     ///
     /// Note: the returned array must contain [`Self::num_containers`] rows
     fn null_counts(&self, column: &Column) -> Option<ArrayRef>;
+
+    /// Returns an array where each row represents if the value of the
+    /// column is known to contain or not contain any of the set of `values`.
+    ///
+    /// This is used to prune containers using structures such as Bloom
+    /// Filters which can quickly test set membership.
+    ///
+    /// The returned array has one row for each container, with the following:
+    /// * `true` if the value of column CERTAINLY IS one of `values`
+    /// * `false` if the value of column CERTAINLY IS NOT one of `values`
+    /// * `null` if the value of column may or may not be one of `values`
+    ///
+    /// If these statistics cannot determine column membership for any
+    /// container, return `None` (the default).
+    ///
+    /// Note: the returned array must contain [`Self::num_containers`] rows
+    fn contains(

Review Comment:
   This is the new API. It is slightly different from the proposal in that it
takes a `HashSet` rather than a single value, which is necessary to support
`x IN (...)`-style predicates.
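
   As a concrete illustration, here is a minimal sketch of how a statistics
provider could answer this call for a single column. The `KnownValueStats`
name and its fields are hypothetical, and because the hunk is truncated above
the rest of the signature is assumed (a `&HashSet<ScalarValue>` argument and
an `Option<BooleanArray>` return); a real provider would implement the full
`PruningStatistics` trait.

```rust
use std::collections::HashSet;

use arrow::array::BooleanArray;
use datafusion_common::ScalarValue;
use datafusion_physical_expr::expressions::Column;

/// Hypothetical statistics provider that knows the exact set of values
/// stored in each container for a single column.
struct KnownValueStats {
    /// name of the column these statistics describe
    column_name: String,
    /// the values present in each container, one entry per container
    per_container_values: Vec<HashSet<ScalarValue>>,
}

impl KnownValueStats {
    /// One row per container: `true` if any of `values` is present,
    /// `false` if none are. `None` if asked about a different column.
    fn contains(
        &self,
        column: &Column,
        values: &HashSet<ScalarValue>,
    ) -> Option<BooleanArray> {
        if column.name() != self.column_name {
            return None;
        }
        Some(
            self.per_container_values
                .iter()
                .map(|container| Some(!container.is_disjoint(values)))
                .collect(),
        )
    }
}
```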



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -344,11 +451,10 @@ impl RequiredStatColumns {
 
         // only add statistics column if not previously added
         if need_to_insert {
-            let stat_field = Field::new(
-                stat_column.name(),
-                field.data_type().clone(),
-                field.is_nullable(),
-            );
+            // may be null if statistics are not present

Review Comment:
   A non-nullable column may appear as NULL in the statistics if the min or max 
values are not known
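
   Roughly, the replacement builds the field like the sketch below (the rest
of the hunk is truncated above, so the exact body is an assumption): the
statistics field is declared nullable unconditionally so that a missing
min/max can be represented as NULL.

```rust
use arrow::datatypes::{DataType, Field};

/// Sketch only: build the schema field for a statistics column. It is always
/// nullable because statistics may be absent even for non-nullable columns.
fn stat_field(name: &str, data_type: &DataType) -> Field {
    Field::new(name, data_type.clone(), true)
}
```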



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -993,95 +1099,139 @@ mod tests {
     ///
     /// Note All `ArrayRefs` must be the same size.
     struct ContainerStats {
-        min: ArrayRef,
-        max: ArrayRef,
+        min: Option<ArrayRef>,
+        max: Option<ArrayRef>,
         /// Optional values
         null_counts: Option<ArrayRef>,
+        /// Optional known values (e.g. mimicking a bloom filter)
+        /// `(values, contains)` pairs
+        /// If present, all BooleanArrays must be the same size as min/max
+        contains: Vec<(HashSet<ScalarValue>, BooleanArray)>,
     }
 
     impl ContainerStats {
+        fn new() -> Self {
+            Default::default()
+        }
         fn new_decimal128(
             min: impl IntoIterator<Item = Option<i128>>,
             max: impl IntoIterator<Item = Option<i128>>,
             precision: u8,
             scale: i8,
         ) -> Self {
-            Self {
-                min: Arc::new(
+            Self::new()
+                .with_min(Arc::new(
                     min.into_iter()
                         .collect::<Decimal128Array>()
                         .with_precision_and_scale(precision, scale)
                         .unwrap(),
-                ),
-                max: Arc::new(
+                ))
+                .with_max(Arc::new(
                     max.into_iter()
                         .collect::<Decimal128Array>()
                         .with_precision_and_scale(precision, scale)
                         .unwrap(),
-                ),
-                null_counts: None,
-            }
+                ))
         }
 
         fn new_i64(
             min: impl IntoIterator<Item = Option<i64>>,
             max: impl IntoIterator<Item = Option<i64>>,
         ) -> Self {
-            Self {
-                min: Arc::new(min.into_iter().collect::<Int64Array>()),
-                max: Arc::new(max.into_iter().collect::<Int64Array>()),
-                null_counts: None,
-            }
+            Self::new()
+                .with_min(Arc::new(min.into_iter().collect::<Int64Array>()))
+                .with_max(Arc::new(max.into_iter().collect::<Int64Array>()))
         }
 
         fn new_i32(
             min: impl IntoIterator<Item = Option<i32>>,
             max: impl IntoIterator<Item = Option<i32>>,
         ) -> Self {
-            Self {
-                min: Arc::new(min.into_iter().collect::<Int32Array>()),
-                max: Arc::new(max.into_iter().collect::<Int32Array>()),
-                null_counts: None,
-            }
+            Self::new()
+                .with_min(Arc::new(min.into_iter().collect::<Int32Array>()))
+                .with_max(Arc::new(max.into_iter().collect::<Int32Array>()))
         }
 
         fn new_utf8<'a>(
             min: impl IntoIterator<Item = Option<&'a str>>,
             max: impl IntoIterator<Item = Option<&'a str>>,
         ) -> Self {
-            Self {
-                min: Arc::new(min.into_iter().collect::<StringArray>()),
-                max: Arc::new(max.into_iter().collect::<StringArray>()),
-                null_counts: None,
-            }
+            Self::new()
+                .with_min(Arc::new(min.into_iter().collect::<StringArray>()))
+                .with_max(Arc::new(max.into_iter().collect::<StringArray>()))
         }
 
         fn new_bool(
             min: impl IntoIterator<Item = Option<bool>>,
             max: impl IntoIterator<Item = Option<bool>>,
         ) -> Self {
-            Self {
-                min: Arc::new(min.into_iter().collect::<BooleanArray>()),
-                max: Arc::new(max.into_iter().collect::<BooleanArray>()),
-                null_counts: None,
-            }
+            Self::new()
+                .with_min(Arc::new(min.into_iter().collect::<BooleanArray>()))
+                .with_max(Arc::new(max.into_iter().collect::<BooleanArray>()))
         }
 
         fn min(&self) -> Option<ArrayRef> {
-            Some(self.min.clone())
+            self.min.clone()
         }
 
         fn max(&self) -> Option<ArrayRef> {
-            Some(self.max.clone())
+            self.max.clone()
         }
 
         fn null_counts(&self) -> Option<ArrayRef> {
             self.null_counts.clone()
         }
 
+        /// Return a `Vec` of all arrays in these statistics
+        fn arrays(&self) -> Vec<ArrayRef> {

Review Comment:
   This is testing infrastructure.
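
   For illustration, here is a hedged sketch of what such a helper could do;
the actual body is truncated in the diff, and this assumes the test module's
existing `Arc` and `ArrayRef` imports: collect every array the statistics
hold so a test can assert they all have the same number of rows.

```rust
impl ContainerStats {
    /// Sketch only: gather min, max, null_counts and every `contains`
    /// BooleanArray into a single Vec so their lengths can be cross-checked.
    fn arrays(&self) -> Vec<ArrayRef> {
        let contains_arrays = self
            .contains
            .iter()
            .map(|(_values, contained)| Arc::new(contained.clone()) as ArrayRef);

        self.min
            .iter()
            .chain(self.max.iter())
            .chain(self.null_counts.iter())
            .cloned()
            .chain(contains_arrays)
            .collect()
    }
}
```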



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1385,20 +1596,14 @@ mod tests {
 
         // test column on the left
         let expr = col("c1").eq(lit(1));
-        let predicate_expr = test_build_predicate_expression(
-            &expr,
-            &schema,
-            &mut RequiredStatColumns::new(),
-        );
+        let predicate_expr =

Review Comment:
   This is just reformatting that results from the shorter name that replaced
`RequiredStatColumns`.



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -2484,10 +2614,376 @@ mod tests {
         // TODO: add other negative test for other case and op
     }
 
+    #[test]
+    fn prune_with_contains_one_column() {

Review Comment:
   I spent quite a while on these tests and I think they are pretty thorough
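
   To make the expected behavior concrete, here is a small sketch (not the
PR's code, and the function name is made up) of how a per-container
`contains` answer folds into keep/prune decisions for a predicate such as
`c1 = 1`: only a definite `false` (CERTAINLY IS NOT) allows pruning, while
`true` and `null` both keep the container.

```rust
use arrow::array::BooleanArray;

/// Sketch only: map a per-container `contains` result to keep decisions,
/// e.g. [Some(true), Some(false), None] -> [keep, prune, keep].
fn keep_containers(contains: &BooleanArray) -> Vec<bool> {
    (0..contains.len())
        .map(|i| {
            if contains.is_null(i) {
                true // membership unknown: cannot prune
            } else {
                contains.value(i) // true keeps the container, false prunes it
            }
        })
        .collect()
}
```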



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
