tustvold commented on code in PR #3284:
URL: https://github.com/apache/arrow-rs/pull/3284#discussion_r1042679999
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -1312,6 +1314,70 @@ mod tests {
values_optional::<A, I>(iter);
}
+ fn check_bloom_filter<T: AsBytes>(
+ files: Vec<File>,
+ file_column: String,
+ positive_values: Vec<T>,
+ negative_values: Vec<T>,
+ ) {
+ files.into_iter().for_each(|file| {
+ let file_reader = SerializedFileReader::new_with_options(
+ file,
+ ReadOptionsBuilder::new()
+ .with_reader_properties(
+ ReaderProperties::builder()
+ .set_read_bloom_filter(true)
+ .build(),
+ )
+ .build(),
+ )
+ .expect("Unable to open file as Parquet");
+ let metadata = file_reader.metadata();
+ for (ri, row_group) in metadata.row_groups().iter().enumerate() {
+ if let Some((column_index, _)) = row_group
+ .columns()
+ .iter()
+ .enumerate()
+ .find(|(_, column)| column.column_path().string() ==
file_column)
+ {
+ let row_group_reader = file_reader
+ .get_row_group(ri)
+ .expect("Unable to read row group");
+ if let Some(sbbf) =
+ row_group_reader.get_column_bloom_filter(column_index)
+ {
+ if row_group.num_rows() >= positive_values.len() as
i64 {
Review Comment:
What is this if statement for?
##########
parquet/src/arrow/arrow_writer/byte_array.rs:
##########
@@ -555,6 +561,13 @@ where
}
}
+ // encode the values into bloom filter if enabled
+ if let Some(bloom_filter) = &mut encoder.bloom_filter {
+ for idx in 0..values.len() {
+ bloom_filter.insert(&values.value(idx));
Review Comment:
I think this needs to take into account nulls? Otherwise it will add
whatever happens to be in the null slot, most likely an empty string, into the
bloom filter?
##########
parquet/src/arrow/arrow_writer/byte_array.rs:
##########
@@ -543,7 +549,7 @@ impl ColumnValueEncoder for ByteArrayEncoder {
fn encode<T>(values: T, indices: &[usize], encoder: &mut ByteArrayEncoder)
where
T: ArrayAccessor + Copy,
- T::Item: Copy + Ord + AsRef<[u8]>,
+ T::Item: Copy + Ord + AsRef<[u8]> + AsBytes,
Review Comment:
I wonder if just adding `.as_ref()` to the call site would be sufficient
instead of adding an additional trait bound? Not sure though. It isn't a major
thing, just a very minor nit
##########
parquet/src/data_type.rs:
##########
@@ -448,6 +448,12 @@ impl AsBytes for [u8] {
}
}
+impl AsBytes for &[u8] {
Review Comment:
I think this is redundant given the implementation above? Just add `&`
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -1528,6 +1594,33 @@ mod tests {
values_required::<BinaryArray, _>(many_vecs_iter);
}
+ #[test]
+ fn i32_column_bloom_filter() {
+ let positive_values: Vec<i32> = (0..SMALL_SIZE as i32).collect();
+ let files = values_required::<Int32Array, _>(positive_values);
+ check_bloom_filter(
+ files,
+ "col".to_string(),
+ (0..SMALL_SIZE as i32).collect(),
+ (SMALL_SIZE as i32 + 1..SMALL_SIZE as i32 + 10).collect(),
+ );
+ }
+
+ #[test]
+ fn binary_column_bloom_filter() {
Review Comment:
Perhaps we could get a test of nulls and empty strings?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]