viirya commented on code in PR #5600:
URL: https://github.com/apache/arrow-rs/pull/5600#discussion_r1555112846
##########
parquet/src/arrow/async_reader/mod.rs:
##########
@@ -1857,4 +1857,92 @@ mod tests {
assert_eq!(total_rows, expected);
}
}
+
+ #[tokio::test]
+ async fn test_row_filter_nested() {
+ let a = StringArray::from_iter_values(["a", "b", "b", "b", "c", "c"]);
+ let b = StructArray::from(vec![
+ (
+ Arc::new(Field::new("aa", DataType::Utf8, true)),
+ Arc::new(StringArray::from(vec!["a", "b", "b", "b", "c",
"c"])) as ArrayRef,
+ ),
+ (
+ Arc::new(Field::new("bb", DataType::Utf8, true)),
+ Arc::new(StringArray::from(vec!["1", "2", "3", "4", "5",
"6"])) as ArrayRef,
+ ),
+ ]);
+ let c = Int32Array::from_iter(0..6);
+ let data = RecordBatch::try_from_iter([
+ ("a", Arc::new(a) as ArrayRef),
+ ("b", Arc::new(b) as ArrayRef),
+ ("c", Arc::new(c) as ArrayRef),
+ ])
+ .unwrap();
+
+ let mut buf = Vec::with_capacity(1024);
+ let mut writer = ArrowWriter::try_new(&mut buf, data.schema(),
None).unwrap();
+ writer.write(&data).unwrap();
+ writer.close().unwrap();
+
+ let data: Bytes = buf.into();
+ let metadata = parse_metadata(&data).unwrap();
+ let parquet_schema = metadata.file_metadata().schema_descr_ptr();
+
+ let test = TestReader {
+ data,
+ metadata: Arc::new(metadata),
+ requests: Default::default(),
+ };
+ let requests = test.requests.clone();
+
+ let a_scalar = StringArray::from_iter_values(["b"]);
+ let a_filter = ArrowPredicateFn::new(
+ ProjectionMask::leaves(&parquet_schema, vec![0]),
+ move |batch| eq(batch.column(0), &Scalar::new(&a_scalar)),
+ );
+
+ let b_scalar = StringArray::from_iter_values(["4"]);
+ let b_filter = ArrowPredicateFn::new(
+ ProjectionMask::leaves(&parquet_schema, vec![2]),
+ move |batch| {
+ // Filter on the second element of the struct.
+ let struct_array = batch
+ .column(0)
+ .as_any()
+ .downcast_ref::<StructArray>()
+ .unwrap();
+ eq(struct_array.column(0), &Scalar::new(&b_scalar))
Review Comment:
Btw, the row filter needs to know what the schema is so it can get the correct
(nested) column to do the filtering. For a general filter implementation like
the one https://github.com/apache/iceberg-rust/pull/295 proposes, is there any
utility we can use to "flatten" nested columns from the batch?
In other words, is there any existing way to flatten projected (nested) columns
in the batch? If we know a leaf column's index, we can determine its position in
the projection mask and in the flattened batch. Then we can simply get the
column by `flatten_batch.column`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]