alamb commented on code in PR #13768:
URL: https://github.com/apache/datafusion/pull/13768#discussion_r1885869652


##########
datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs:
##########
@@ -268,70 +347,7 @@ impl PruningStatistics for BloomFilterStatistics {
 
         let known_not_present = values
             .iter()
-            .map(|value| {
-                match value {
-                    ScalarValue::Utf8(Some(v)) | 
ScalarValue::Utf8View(Some(v)) => {
-                        sbbf.check(&v.as_str())
-                    }
-                    ScalarValue::Binary(Some(v)) | 
ScalarValue::BinaryView(Some(v)) => {
-                        sbbf.check(v)
-                    }
-                    ScalarValue::FixedSizeBinary(_size, Some(v)) => 
sbbf.check(v),
-                    ScalarValue::Boolean(Some(v)) => sbbf.check(v),
-                    ScalarValue::Float64(Some(v)) => sbbf.check(v),
-                    ScalarValue::Float32(Some(v)) => sbbf.check(v),
-                    ScalarValue::Int64(Some(v)) => sbbf.check(v),
-                    ScalarValue::Int32(Some(v)) => sbbf.check(v),
-                    ScalarValue::UInt64(Some(v)) => sbbf.check(v),
-                    ScalarValue::UInt32(Some(v)) => sbbf.check(v),
-                    ScalarValue::Decimal128(Some(v), p, s) => match 
parquet_type {
-                        Type::INT32 => {
-                            
//https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/Encodings.md?plain=1#L35-L42
-                            // All physical type  are little-endian
-                            if *p > 9 {
-                                //DECIMAL can be used to annotate the 
following types:
-                                //
-                                // int32: for 1 <= precision <= 9
-                                // int64: for 1 <= precision <= 18
-                                return true;
-                            }
-                            let b = (*v as i32).to_le_bytes();
-                            // Use Decimal constructor after 
https://github.com/apache/arrow-rs/issues/5325
-                            let decimal = Decimal::Int32 {
-                                value: b,
-                                precision: *p as i32,
-                                scale: *s as i32,
-                            };
-                            sbbf.check(&decimal)
-                        }
-                        Type::INT64 => {
-                            if *p > 18 {
-                                return true;
-                            }
-                            let b = (*v as i64).to_le_bytes();
-                            let decimal = Decimal::Int64 {
-                                value: b,
-                                precision: *p as i32,
-                                scale: *s as i32,
-                            };
-                            sbbf.check(&decimal)
-                        }
-                        Type::FIXED_LEN_BYTE_ARRAY => {
-                            // keep with from_bytes_to_i128
-                            let b = v.to_be_bytes().to_vec();
-                            // Use Decimal constructor after 
https://github.com/apache/arrow-rs/issues/5325
-                            let decimal = Decimal::Bytes {
-                                value: b.into(),
-                                precision: *p as i32,
-                                scale: *s as i32,
-                            };
-                            sbbf.check(&decimal)
-                        }
-                        _ => true,
-                    },
-                    _ => true,
-                }
-            })
+            .map(|value| BloomFilterStatistics::check_scalar(sbbf, value, 
parquet_type))

Review Comment:
   👍 



##########
datafusion/core/tests/parquet/mod.rs:
##########
@@ -740,6 +741,32 @@ fn make_utf8_batch(value: Vec<Option<&str>>) -> 
RecordBatch {
     .unwrap()
 }
 
+fn make_dictionary_batch(strings: Vec<&str>) -> RecordBatch {
+    let keys = Int32Array::from_iter(0..strings.len() as i32);
+
+    let utf8_values = StringArray::from(strings.clone());
+    let utf8_dict = DictionaryArray::new(keys.clone(), Arc::new(utf8_values));
+
+    let large_utf8 = LargeStringArray::from(strings.clone());
+    let large_utf8_dict = DictionaryArray::new(keys.clone(), 
Arc::new(large_utf8));
+
+    let binary =
+        BinaryArray::from_iter_values(strings.iter().cloned().map(|v| 
v.as_bytes()));
+    let binary_dict = DictionaryArray::new(keys.clone(), Arc::new(binary));
+
+    let large_binary =

Review Comment:
   I noticed that this PR adds data for `Binary` / `LargeBinary` as well, but 
not the code / tests
   
   Maybe we could either add that feature in this PR or else file a ticket to 
track adding it in as a follow on PR



##########
datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs:
##########
@@ -228,6 +228,85 @@ struct BloomFilterStatistics {
     column_sbbf: HashMap<String, (Sbbf, Type)>,
 }
 
+impl BloomFilterStatistics {
+    /// Helper function for checking if [`Sbbf`] filter contains 
[`ScalarValue`].
+    ///
+    /// In case the type of scalar is not supported, returns `true`, assuming 
that the
+    /// value may be present.
+    fn check_scalar(sbbf: &Sbbf, value: &ScalarValue, parquet_type: &Type) -> 
bool {
+        match value {
+            ScalarValue::Utf8(Some(v))
+            | ScalarValue::Utf8View(Some(v))
+            | ScalarValue::LargeUtf8(Some(v)) => sbbf.check(&v.as_str()),
+            ScalarValue::Binary(Some(v)) | ScalarValue::BinaryView(Some(v)) => 
{
+                sbbf.check(v)
+            }
+            ScalarValue::FixedSizeBinary(_size, Some(v)) => sbbf.check(v),
+            ScalarValue::Boolean(Some(v)) => sbbf.check(v),
+            ScalarValue::Float64(Some(v)) => sbbf.check(v),
+            ScalarValue::Float32(Some(v)) => sbbf.check(v),
+            ScalarValue::Int64(Some(v)) => sbbf.check(v),
+            ScalarValue::Int32(Some(v)) => sbbf.check(v),
+            ScalarValue::UInt64(Some(v)) => sbbf.check(v),
+            ScalarValue::UInt32(Some(v)) => sbbf.check(v),
+            ScalarValue::Decimal128(Some(v), p, s) => match parquet_type {
+                Type::INT32 => {
+                    
//https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/Encodings.md?plain=1#L35-L42
+                    // All physical type  are little-endian
+                    if *p > 9 {
+                        //DECIMAL can be used to annotate the following types:
+                        //
+                        // int32: for 1 <= precision <= 9
+                        // int64: for 1 <= precision <= 18
+                        return true;
+                    }
+                    let b = (*v as i32).to_le_bytes();
+                    // Use Decimal constructor after 
https://github.com/apache/arrow-rs/issues/5325
+                    let decimal = Decimal::Int32 {
+                        value: b,
+                        precision: *p as i32,
+                        scale: *s as i32,
+                    };
+                    sbbf.check(&decimal)
+                }
+                Type::INT64 => {
+                    if *p > 18 {
+                        return true;
+                    }
+                    let b = (*v as i64).to_le_bytes();
+                    let decimal = Decimal::Int64 {
+                        value: b,
+                        precision: *p as i32,
+                        scale: *s as i32,
+                    };
+                    sbbf.check(&decimal)
+                }
+                Type::FIXED_LEN_BYTE_ARRAY => {
+                    // keep with from_bytes_to_i128
+                    let b = v.to_be_bytes().to_vec();
+                    // Use Decimal constructor after 
https://github.com/apache/arrow-rs/issues/5325
+                    let decimal = Decimal::Bytes {
+                        value: b.into(),
+                        precision: *p as i32,
+                        scale: *s as i32,
+                    };
+                    sbbf.check(&decimal)
+                }
+                _ => true,
+            },
+            // Bloom filter pruning is performed only for Utf8 dictionary 
types since
+            // pruning predicate is not created for Dictionary(Numeric/Binary) 
types
+            ScalarValue::Dictionary(_, inner) => match inner.as_ref() {

Review Comment:
   it isn't clear to me that it is impossible to use Dictionary(Int8, Int64)` 
or something to encode Int64 values, but I think this is the most common example



##########
datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs:
##########
@@ -228,6 +228,85 @@ struct BloomFilterStatistics {
     column_sbbf: HashMap<String, (Sbbf, Type)>,
 }
 
+impl BloomFilterStatistics {
+    /// Helper function for checking if [`Sbbf`] filter contains 
[`ScalarValue`].
+    ///
+    /// In case the type of scalar is not supported, returns `true`, assuming 
that the
+    /// value may be present.
+    fn check_scalar(sbbf: &Sbbf, value: &ScalarValue, parquet_type: &Type) -> 
bool {
+        match value {
+            ScalarValue::Utf8(Some(v))
+            | ScalarValue::Utf8View(Some(v))
+            | ScalarValue::LargeUtf8(Some(v)) => sbbf.check(&v.as_str()),
+            ScalarValue::Binary(Some(v)) | ScalarValue::BinaryView(Some(v)) => 
{
+                sbbf.check(v)
+            }
+            ScalarValue::FixedSizeBinary(_size, Some(v)) => sbbf.check(v),
+            ScalarValue::Boolean(Some(v)) => sbbf.check(v),
+            ScalarValue::Float64(Some(v)) => sbbf.check(v),
+            ScalarValue::Float32(Some(v)) => sbbf.check(v),
+            ScalarValue::Int64(Some(v)) => sbbf.check(v),
+            ScalarValue::Int32(Some(v)) => sbbf.check(v),
+            ScalarValue::UInt64(Some(v)) => sbbf.check(v),
+            ScalarValue::UInt32(Some(v)) => sbbf.check(v),
+            ScalarValue::Decimal128(Some(v), p, s) => match parquet_type {
+                Type::INT32 => {
+                    
//https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/Encodings.md?plain=1#L35-L42
+                    // All physical type  are little-endian
+                    if *p > 9 {
+                        //DECIMAL can be used to annotate the following types:
+                        //
+                        // int32: for 1 <= precision <= 9
+                        // int64: for 1 <= precision <= 18
+                        return true;
+                    }
+                    let b = (*v as i32).to_le_bytes();
+                    // Use Decimal constructor after 
https://github.com/apache/arrow-rs/issues/5325
+                    let decimal = Decimal::Int32 {
+                        value: b,
+                        precision: *p as i32,
+                        scale: *s as i32,
+                    };
+                    sbbf.check(&decimal)
+                }
+                Type::INT64 => {
+                    if *p > 18 {
+                        return true;
+                    }
+                    let b = (*v as i64).to_le_bytes();
+                    let decimal = Decimal::Int64 {
+                        value: b,
+                        precision: *p as i32,
+                        scale: *s as i32,
+                    };
+                    sbbf.check(&decimal)
+                }
+                Type::FIXED_LEN_BYTE_ARRAY => {
+                    // keep with from_bytes_to_i128
+                    let b = v.to_be_bytes().to_vec();
+                    // Use Decimal constructor after 
https://github.com/apache/arrow-rs/issues/5325
+                    let decimal = Decimal::Bytes {
+                        value: b.into(),
+                        precision: *p as i32,
+                        scale: *s as i32,
+                    };
+                    sbbf.check(&decimal)
+                }
+                _ => true,
+            },
+            // Bloom filter pruning is performed only for Utf8 dictionary 
types since
+            // pruning predicate is not created for Dictionary(Numeric/Binary) 
types
+            ScalarValue::Dictionary(_, inner) => match inner.as_ref() {
+                ScalarValue::Utf8(_) | ScalarValue::LargeUtf8(_) => {

Review Comment:
   Did you also mean to check `ScalarValue::{Binary,LargeBinary}` here as well?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to