alamb commented on code in PR #13768:
URL: https://github.com/apache/datafusion/pull/13768#discussion_r1889230183


##########
datafusion/core/src/datasource/physical_plan/parquet/row_group_filter.rs:
##########
@@ -228,6 +228,85 @@ struct BloomFilterStatistics {
     column_sbbf: HashMap<String, (Sbbf, Type)>,
 }
 
+impl BloomFilterStatistics {
+    /// Helper function for checking if [`Sbbf`] filter contains 
[`ScalarValue`].
+    ///
+    /// In case the type of scalar is not supported, returns `true`, assuming 
that the
+    /// value may be present.
+    fn check_scalar(sbbf: &Sbbf, value: &ScalarValue, parquet_type: &Type) -> 
bool {
+        match value {
+            ScalarValue::Utf8(Some(v))
+            | ScalarValue::Utf8View(Some(v))
+            | ScalarValue::LargeUtf8(Some(v)) => sbbf.check(&v.as_str()),
+            ScalarValue::Binary(Some(v)) | ScalarValue::BinaryView(Some(v)) => 
{
+                sbbf.check(v)
+            }
+            ScalarValue::FixedSizeBinary(_size, Some(v)) => sbbf.check(v),
+            ScalarValue::Boolean(Some(v)) => sbbf.check(v),
+            ScalarValue::Float64(Some(v)) => sbbf.check(v),
+            ScalarValue::Float32(Some(v)) => sbbf.check(v),
+            ScalarValue::Int64(Some(v)) => sbbf.check(v),
+            ScalarValue::Int32(Some(v)) => sbbf.check(v),
+            ScalarValue::UInt64(Some(v)) => sbbf.check(v),
+            ScalarValue::UInt32(Some(v)) => sbbf.check(v),
+            ScalarValue::Decimal128(Some(v), p, s) => match parquet_type {
+                Type::INT32 => {
+                    
//https://github.com/apache/parquet-format/blob/eb4b31c1d64a01088d02a2f9aefc6c17c54cc6fc/Encodings.md?plain=1#L35-L42
+                    // All physical type  are little-endian
+                    if *p > 9 {
+                        //DECIMAL can be used to annotate the following types:
+                        //
+                        // int32: for 1 <= precision <= 9
+                        // int64: for 1 <= precision <= 18
+                        return true;
+                    }
+                    let b = (*v as i32).to_le_bytes();
+                    // Use Decimal constructor after 
https://github.com/apache/arrow-rs/issues/5325
+                    let decimal = Decimal::Int32 {
+                        value: b,
+                        precision: *p as i32,
+                        scale: *s as i32,
+                    };
+                    sbbf.check(&decimal)
+                }
+                Type::INT64 => {
+                    if *p > 18 {
+                        return true;
+                    }
+                    let b = (*v as i64).to_le_bytes();
+                    let decimal = Decimal::Int64 {
+                        value: b,
+                        precision: *p as i32,
+                        scale: *s as i32,
+                    };
+                    sbbf.check(&decimal)
+                }
+                Type::FIXED_LEN_BYTE_ARRAY => {
+                    // keep with from_bytes_to_i128
+                    let b = v.to_be_bytes().to_vec();
+                    // Use Decimal constructor after 
https://github.com/apache/arrow-rs/issues/5325
+                    let decimal = Decimal::Bytes {
+                        value: b.into(),
+                        precision: *p as i32,
+                        scale: *s as i32,
+                    };
+                    sbbf.check(&decimal)
+                }
+                _ => true,
+            },
+            // Bloom filter pruning is performed only for Utf8 dictionary 
types since
+            // pruning predicate is not created for Dictionary(Numeric/Binary) 
types
+            ScalarValue::Dictionary(_, inner) => match inner.as_ref() {

Review Comment:
   It seems like we maybe don't even need to chack the inner type explicitly 
(it would be checked by `BloomFilterStatistics::check_scalar` as well). However 
I think this is better than what is on main today and if it is important we can 
add support for other types



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to