This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 820843ff59 Removes Bloom filter for Int8/Int16/Uint8/Uint16 (#9969)
820843ff59 is described below

commit 820843ff597161c9cdacd0e79cecf20d05755081
Author: Edmondo Porcu <[email protected]>
AuthorDate: Mon Apr 8 06:31:10 2024 -0400

    Removes Bloom filter for Int8/Int16/Uint8/Uint16 (#9969)
    
    * Removing broken tests
    
    * Simplifying tests / removing support for failed tests
    
    * Revert "Simplifying tests / removing support for failed tests"
    
    This reverts commit 6e50a8064436943d9f42d313cef2c2b017d196f1.
    
    * Fixing tests for real
    
    * Apply suggestions from code review
    
    Thanks @alamb !
    
    Co-authored-by: Andrew Lamb <[email protected]>
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 .../datasource/physical_plan/parquet/row_groups.rs |  4 --
 datafusion/core/tests/parquet/row_group_pruning.rs | 54 +++++++++++-----------
 2 files changed, 26 insertions(+), 32 deletions(-)

diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs 
b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs
index 8df4925fc5..6600dd07d7 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs
@@ -232,12 +232,8 @@ impl PruningStatistics for BloomFilterStatistics {
                     ScalarValue::Float32(Some(v)) => sbbf.check(v),
                     ScalarValue::Int64(Some(v)) => sbbf.check(v),
                     ScalarValue::Int32(Some(v)) => sbbf.check(v),
-                    ScalarValue::Int16(Some(v)) => sbbf.check(v),
-                    ScalarValue::Int8(Some(v)) => sbbf.check(v),
                     ScalarValue::UInt64(Some(v)) => sbbf.check(v),
                     ScalarValue::UInt32(Some(v)) => sbbf.check(v),
-                    ScalarValue::UInt16(Some(v)) => sbbf.check(v),
-                    ScalarValue::UInt8(Some(v)) => sbbf.check(v),
                     ScalarValue::Decimal128(Some(v), p, s) => match 
parquet_type {
                         Type::INT32 => {
                             
//https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/Encodings.md?plain=1#L35-L42
diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs 
b/datafusion/core/tests/parquet/row_group_pruning.rs
index b7b434d1c3..8fc7936552 100644
--- a/datafusion/core/tests/parquet/row_group_pruning.rs
+++ b/datafusion/core/tests/parquet/row_group_pruning.rs
@@ -290,7 +290,7 @@ async fn prune_disabled() {
 // https://github.com/apache/arrow-datafusion/issues/9779 bug so that tests 
pass
 // if and only if Bloom filters on Int8 and Int16 columns are still buggy.
 macro_rules! int_tests {
-    ($bits:expr, correct_bloom_filters: $correct_bloom_filters:expr) => {
+    ($bits:expr) => {
         paste::item! {
             #[tokio::test]
             async fn [<prune_int $bits _lt >]() {
@@ -329,9 +329,9 @@ macro_rules! int_tests {
                     .with_expected_errors(Some(0))
                     .with_matched_by_stats(Some(1))
                     .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(if 
$correct_bloom_filters { 1 } else { 0 }))
-                    .with_pruned_by_bloom_filter(Some(if 
$correct_bloom_filters { 0 } else { 1 }))
-                    .with_expected_rows(if $correct_bloom_filters { 1 } else { 
0 })
+                    .with_matched_by_bloom_filter(Some(1))
+                    .with_pruned_by_bloom_filter(Some(0))
+                    .with_expected_rows(1)
                     .test_row_group_prune()
                     .await;
             }
@@ -343,9 +343,9 @@ macro_rules! int_tests {
                     .with_expected_errors(Some(0))
                     .with_matched_by_stats(Some(1))
                     .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(if 
$correct_bloom_filters { 1 } else { 0 }))
-                    .with_pruned_by_bloom_filter(Some(if 
$correct_bloom_filters { 0 } else { 1 }))
-                    .with_expected_rows(if $correct_bloom_filters { 1 } else { 
0 })
+                    .with_matched_by_bloom_filter(Some(1))
+                    .with_pruned_by_bloom_filter(Some(0))
+                    .with_expected_rows(1)
                     .test_row_group_prune()
                     .await;
             }
@@ -404,9 +404,9 @@ macro_rules! int_tests {
                     .with_expected_errors(Some(0))
                     .with_matched_by_stats(Some(1))
                     .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(if 
$correct_bloom_filters { 1 } else { 0 }))
-                    .with_pruned_by_bloom_filter(Some(if 
$correct_bloom_filters { 0 } else { 1 }))
-                    .with_expected_rows(if $correct_bloom_filters { 1 } else { 
0 })
+                    .with_matched_by_bloom_filter(Some(1))
+                    .with_pruned_by_bloom_filter(Some(0))
+                    .with_expected_rows(1)
                     .test_row_group_prune()
                     .await;
             }
@@ -447,17 +447,16 @@ macro_rules! int_tests {
     };
 }
 
-int_tests!(8, correct_bloom_filters: false);
-int_tests!(16, correct_bloom_filters: false);
-int_tests!(32, correct_bloom_filters: true);
-int_tests!(64, correct_bloom_filters: true);
+// int8/int16 are incorrect: 
https://github.com/apache/arrow-datafusion/issues/9779
+int_tests!(32);
+int_tests!(64);
 
 // $bits: number of bits of the integer to test (8, 16, 32, 64)
 // $correct_bloom_filters: if false, replicates the
 // https://github.com/apache/arrow-datafusion/issues/9779 bug so that tests 
pass
 // if and only if Bloom filters on UInt8 and UInt16 columns are still buggy.
 macro_rules! uint_tests {
-    ($bits:expr, correct_bloom_filters: $correct_bloom_filters:expr) => {
+    ($bits:expr) => {
         paste::item! {
             #[tokio::test]
             async fn [<prune_uint $bits _lt >]() {
@@ -482,9 +481,9 @@ macro_rules! uint_tests {
                     .with_expected_errors(Some(0))
                     .with_matched_by_stats(Some(1))
                     .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(if 
$correct_bloom_filters { 1 } else { 0 }))
-                    .with_pruned_by_bloom_filter(Some(if 
$correct_bloom_filters { 0 } else { 1 }))
-                    .with_expected_rows(if $correct_bloom_filters { 1 } else { 
0 })
+                    .with_matched_by_bloom_filter(Some(1))
+                    .with_pruned_by_bloom_filter(Some(0))
+                    .with_expected_rows(1)
                     .test_row_group_prune()
                     .await;
             }
@@ -496,9 +495,9 @@ macro_rules! uint_tests {
                     .with_expected_errors(Some(0))
                     .with_matched_by_stats(Some(1))
                     .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(if 
$correct_bloom_filters { 1 } else { 0 }))
-                    .with_pruned_by_bloom_filter(Some(if 
$correct_bloom_filters { 0 } else { 1 }))
-                    .with_expected_rows(if $correct_bloom_filters { 1 } else { 
0 })
+                    .with_matched_by_bloom_filter(Some(1))
+                    .with_pruned_by_bloom_filter(Some(0))
+                    .with_expected_rows(1)
                     .test_row_group_prune()
                     .await;
             }
@@ -542,9 +541,9 @@ macro_rules! uint_tests {
                     .with_expected_errors(Some(0))
                     .with_matched_by_stats(Some(1))
                     .with_pruned_by_stats(Some(3))
-                    .with_matched_by_bloom_filter(Some(if 
$correct_bloom_filters { 1 } else { 0 }))
-                    .with_pruned_by_bloom_filter(Some(if 
$correct_bloom_filters { 0 } else { 1 }))
-                    .with_expected_rows(if $correct_bloom_filters { 1 } else { 
0 })
+                    .with_matched_by_bloom_filter(Some(1))
+                    .with_pruned_by_bloom_filter(Some(0))
+                    .with_expected_rows(1)
                     .test_row_group_prune()
                     .await;
             }
@@ -585,10 +584,9 @@ macro_rules! uint_tests {
     };
 }
 
-uint_tests!(8, correct_bloom_filters: false);
-uint_tests!(16, correct_bloom_filters: false);
-uint_tests!(32, correct_bloom_filters: true);
-uint_tests!(64, correct_bloom_filters: true);
+// uint8/uint16 are incorrect: 
https://github.com/apache/arrow-datafusion/issues/9779
+uint_tests!(32);
+uint_tests!(64);
 
 #[tokio::test]
 async fn prune_int32_eq_large_in_list() {

Reply via email to