alamb commented on code in PR #8772:
URL: https://github.com/apache/arrow-datafusion/pull/8772#discussion_r1445254909


##########
datafusion/core/src/datasource/physical_plan/parquet/mod.rs:
##########
@@ -278,7 +279,17 @@ impl DisplayAs for ParquetExec {
                 let pruning_predicate_string = self
                     .pruning_predicate
                     .as_ref()
-                    .map(|pre| format!(", pruning_predicate={}", 
pre.predicate_expr()))
+                    .map(|pre| {
+                        format!(
+                            ", pruning_predicate={} [{}]",
+                            pre.predicate_expr(),
+                            pre.literal_guarantees()

Review Comment:
   this is a neat idea to display guarantees 👍 



##########
datafusion/core/tests/parquet/row_group_pruning.rs:
##########
@@ -336,16 +369,38 @@ async fn prune_int32_eq_in_list() {
 #[tokio::test]
 async fn prune_int32_eq_in_list_2() {
     // result of sql "SELECT * FROM t where in (1000)", prune all
-    test_prune(
+    // test whether statistics works
+    test_prune_verbose(
         Scenario::Int32,
         "SELECT * FROM t where i in (1000)",
         Some(0),
+        Some(0),
         Some(4),
         0,
     )
     .await;
 }
 
+#[tokio::test]
+async fn prune_int32_eq_large_in_list() {
+    // result of sql "SELECT * FROM t where i in (2050...2582)", prune all
+    // test whether sbbf works
+    test_prune_verbose(
+        Scenario::Int32Range,
+        format!(
+            "SELECT * FROM t where i in ({})",
+            (200050..200082).join(",")
+        )
+        .as_str(),
+        Some(0),
+        Some(1),
+        // we don't support pruning by statistics for in_list with more than 
20 elements currently

Review Comment:
   Also, I think the pruning by bloom filters happens first, so it may not 
even try to prune by statistics
   
   👍 



##########
datafusion/sqllogictest/test_files/repartition_scan.slt:
##########
@@ -61,7 +61,7 @@ Filter: parquet_table.column1 != Int32(42)
 physical_plan
 CoalesceBatchesExec: target_batch_size=8192
 --FilterExec: column1@0 != 42
-----ParquetExec: file_groups={4 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..101],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:101..202],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:202..303],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:303..403]]},
 projection=[column1], predicate=column1@0 != 42, 
pruning_predicate=column1_min@0 != 42 OR 42 != column1_max@1
+----ParquetExec: file_groups={4 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..101],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:101..202],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:202..303],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:303..403]]},
 projection=[column1], predicate=column1@0 != 42, 
pruning_predicate=column1_min@0 != 42 OR 42 != column1_max@1 [column1 not in 
(42)]

Review Comment:
   What do you think about adding a header to the guarantees so that it is more 
explicit to interpret?
   
   ```suggestion
   ----ParquetExec: file_groups={4 groups: 
[[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..101],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:101..202],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:202..303],
 
[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:303..403]]},
 projection=[column1], predicate=column1@0 != 42, 
pruning_predicate=column1_min@0 != 42 OR 42 != column1_max@1 
required_guarantees=[column1 not in (42)]
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to