alamb commented on code in PR #8551:
URL: https://github.com/apache/arrow-rs/pull/8551#discussion_r2441056001


##########
parquet/src/bloom_filter/mod.rs:
##########
@@ -387,7 +387,7 @@ impl Sbbf {
     }
 
     /// Check if an [AsBytes] value is probably present or definitely absent 
in the filter
-    pub fn check<T: AsBytes>(&self, value: &T) -> bool {
+    pub fn check<T: AsBytes + ?Sized>(&self, value: &T) -> bool {

Review Comment:
   why is this needed?



##########
parquet/src/arrow/bloom_filter.rs:
##########
@@ -0,0 +1,667 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Arrow-aware bloom filter
+//!
+//! This module provides [`ArrowSbbf`], a wrapper around [`Sbbf`] that handles
+//! Arrow-to-Parquet type coercion when checking bloom filters.
+//!
+//! # Overview
+//!
+//! Arrow supports types like `Int8` and `Int16` that have no corresponding 
Parquet type.
+//! Data of these types must be coerced to the appropriate Parquet type when 
writing.
+//!
+//! For example, Arrow small integer types are coerced when writing to Parquet:
+//! - `Int8` → `INT32` (i8 → i32)
+//! - `Int16` → `INT32` (i16 → i32)
+//! - `UInt8` → `INT32` (u8 → u32 → i32)
+//! - `UInt16` → `INT32` (u16 → u32 → i32)
+//!
+//! [`ArrowSbbf`] wraps an [`Sbbf`] and an Arrow [`arrow_schema::DataType`], 
automatically coercing
+//! values to their Parquet representation before checking the bloom filter.
+//!
+//! # Example
+//!
+//! ```ignore
+//! use arrow_schema::DataType;
+//! use parquet::arrow::bloom_filter::ArrowSbbf;
+//!
+//! // Get bloom filter from row group reader
+//! let sbbf = row_group_reader.get_column_bloom_filter(0)?;
+//!
+//! // Create ArrowSbbf with the logical Arrow type
+//! let arrow_sbbf = ArrowSbbf::new(&sbbf, &DataType::Int8);
+//!
+//! // Check with i8 value - automatically coerced to i32
+//! let value: i8 = 42;
+//! if arrow_sbbf.check(&value) {
+//!     println!("Value might be present");
+//! }
+//! ```
+//!
+//! # Date64 with Type Coercion
+//!
+//! `Date64` requires special handling because it can be stored as either 
INT32 or INT64
+//! depending on [`WriterProperties::set_coerce_types`].
+//!
+//! When `coerce_types=true`, Date64 is stored as Date32 (INT32, days since 
epoch).
+//! Users must convert their Date64 values before checking:
+//!
+//! ```ignore
+//! use arrow_array::temporal_conversions::MILLISECONDS_IN_DAY;
+//! use arrow_array::Date64Array;
+//! use arrow_cast::cast;
+//! use arrow_schema::DataType;
+//!
+//! let date64_value = 864_000_000_i64; // milliseconds
+//!
+//! // Check the column's physical type to determine conversion
+//! let column_chunk = row_group_reader.metadata().column(0);
+//! match column_chunk.column_type() {
+//!     ParquetType::INT32 => {
+//!         // Date64 was coerced to Date32 - convert milliseconds to days
+//!         let date32_value = (date64_value / MILLISECONDS_IN_DAY) as i32;
+//!         let arrow_sbbf = ArrowSbbf::new(&sbbf, &DataType::Date32);
+//!         arrow_sbbf.check(&date32_value)
+//!     }
+//!     ParquetType::INT64 => {
+//!         // Date64 stored as-is - check directly
+//!         let arrow_sbbf = ArrowSbbf::new(&sbbf, &DataType::Date64);
+//!         arrow_sbbf.check(&date64_value)
+//!     }
+//!     _ => unreachable!()
+//! }
+//! ```
+//!
+//! Alternatively, use [`arrow_cast::cast`] for the conversion:
+//! ```ignore
+//! let date64_array = Date64Array::from(vec![date64_value]);
+//! let date32_array = cast(&date64_array, &DataType::Date32)?;
+//! let date32_value = date32_array.as_primitive::<Date32Type>().value(0);
+//! ```
+//!
+//! [`WriterProperties::set_coerce_types`]: 
crate::file::properties::WriterPropertiesBuilder::set_coerce_types
+//! [`arrow_cast::cast`]: 
https://docs.rs/arrow-cast/latest/arrow_cast/cast/fn.cast.html
+
+use crate::bloom_filter::Sbbf;
+use crate::data_type::AsBytes;
+use arrow_schema::DataType as ArrowType;
+
+/// Wraps an [`Sbbf`] and provides automatic type coercion based on Arrow 
schema.
+#[derive(Debug, Clone)]
+pub struct ArrowSbbf<'a> {
+    sbbf: &'a Sbbf,
+    arrow_type: &'a ArrowType,
+}
+
+impl<'a> ArrowSbbf<'a> {
+    /// Create a new Arrow-aware bloom filter wrapper
+    ///
+    /// # Arguments
+    /// * `sbbf` - Parquet bloom filter for the column
+    /// * `arrow_type` - Arrow data type for the column
+    pub fn new(sbbf: &'a Sbbf, arrow_type: &'a ArrowType) -> Self {
+        Self { sbbf, arrow_type }
+    }
+
+    /// Check if a value might be present in the bloom filter

Review Comment:
   What is the expected format of the bytes? It appears to be the Arrow 
representation 🤔 
   
   This code looks slightly different from what is in DataFusion. Not sure if 
that is good or bad 🤔 
   
   
https://github.com/apache/datafusion/blob/522403bb44780679109055abca6048d21add0d25/datafusion/datasource-parquet/src/row_group_filter.rs#L239-L298



##########
parquet/benches/bloom_filter.rs:
##########


Review Comment:
   so this means that casting the bloom filter results is slower?



##########
parquet/src/arrow/bloom_filter.rs:
##########
@@ -0,0 +1,667 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Arrow-aware bloom filter
+//!
+//! This module provides [`ArrowSbbf`], a wrapper around [`Sbbf`] that handles
+//! Arrow-to-Parquet type coercion when checking bloom filters.
+//!
+//! # Overview
+//!
+//! Arrow supports types like `Int8` and `Int16` that have no corresponding 
Parquet type.
+//! Data of these types must be coerced to the appropriate Parquet type when 
writing.
+//!
+//! For example, Arrow small integer types are coerced when writing to Parquet:
+//! - `Int8` → `INT32` (i8 → i32)
+//! - `Int16` → `INT32` (i16 → i32)
+//! - `UInt8` → `INT32` (u8 → u32 → i32)
+//! - `UInt16` → `INT32` (u16 → u32 → i32)
+//!
+//! [`ArrowSbbf`] wraps an [`Sbbf`] and an Arrow [`arrow_schema::DataType`], 
automatically coercing
+//! values to their Parquet representation before checking the bloom filter.
+//!
+//! # Example
+//!
+//! ```ignore
+//! use arrow_schema::DataType;
+//! use parquet::arrow::bloom_filter::ArrowSbbf;
+//!
+//! // Get bloom filter from row group reader
+//! let sbbf = row_group_reader.get_column_bloom_filter(0)?;
+//!
+//! // Create ArrowSbbf with the logical Arrow type
+//! let arrow_sbbf = ArrowSbbf::new(&sbbf, &DataType::Int8);
+//!
+//! // Check with i8 value - automatically coerced to i32
+//! let value: i8 = 42;
+//! if arrow_sbbf.check(&value) {
+//!     println!("Value might be present");
+//! }
+//! ```
+//!
+//! # Date64 with Type Coercion
+//!
+//! `Date64` requires special handling because it can be stored as either 
INT32 or INT64
+//! depending on [`WriterProperties::set_coerce_types`].
+//!
+//! When `coerce_types=true`, Date64 is stored as Date32 (INT32, days since 
epoch).
+//! Users must convert their Date64 values before checking:
+//!
+//! ```ignore
+//! use arrow_array::temporal_conversions::MILLISECONDS_IN_DAY;
+//! use arrow_array::Date64Array;
+//! use arrow_cast::cast;
+//! use arrow_schema::DataType;
+//!
+//! let date64_value = 864_000_000_i64; // milliseconds
+//!
+//! // Check the column's physical type to determine conversion
+//! let column_chunk = row_group_reader.metadata().column(0);
+//! match column_chunk.column_type() {
+//!     ParquetType::INT32 => {
+//!         // Date64 was coerced to Date32 - convert milliseconds to days
+//!         let date32_value = (date64_value / MILLISECONDS_IN_DAY) as i32;

Review Comment:
   how do you envision a user getting this `date32_value`? 
   
   I would expect that, for an Arrow use case, they would have a `Date32Array` 🤔 
   
   I wonder if the API would be more cleanly expressed as an array kernel? 
Something like
   
   ```rust
   let boolean_array = ArrowSbbf::check(&date32_array)?;
   ```
   
   Though I suppose that, for the common case where there is a single (constant) 
value, this may be overkill



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to