alamb commented on code in PR #11321:
URL: https://github.com/apache/datafusion/pull/11321#discussion_r1667750918
##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -100,6 +110,49 @@ impl PhysicalExpr for IsNullExpr {
}
}
+pub(crate) fn union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
+ if let Some(offsets) = union_array.offsets() {
+ dense_union_is_null(union_array, offsets)
+ } else {
+ sparce_union_is_null(union_array)
+ }
+}
+
+fn dense_union_is_null(
+ union_array: &UnionArray,
+ offsets: &ScalarBuffer<i32>,
+) -> Result<BooleanArray> {
+ let child_arrays = (0..union_array.type_names().len())
+ .map(|type_id| {
+ compute::is_null(&union_array.child(type_id as
i8)).map_err(Into::into)
+ })
+ .collect::<Result<Vec<BooleanArray>>>()?;
+
+ let buffer: BooleanBuffer = offsets
+ .iter()
+ .zip(union_array.type_ids())
+ .map(|(offset, type_id)| child_arrays[*type_id as usize].value(*offset
as usize))
+ .collect();
+
+ Ok(BooleanArray::new(buffer, None))
+}
+
+fn sparce_union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
+ let type_ids = Int8Array::new(union_array.type_ids().clone(), None);
+
+ let mut union_is_null =
+ BooleanArray::new(BooleanBuffer::new_unset(union_array.len()), None);
+ for type_id in 0..union_array.type_names().len() {
+ let type_id = type_id as i8;
+ let union_is_child = cmp::eq(&type_ids,
&Int8Array::new_scalar(type_id))?;
Review Comment:
I double checked this and it looks good to me
##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -100,6 +110,49 @@ impl PhysicalExpr for IsNullExpr {
}
}
+pub(crate) fn union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
+ if let Some(offsets) = union_array.offsets() {
+ dense_union_is_null(union_array, offsets)
+ } else {
+ sparce_union_is_null(union_array)
+ }
+}
+
+fn dense_union_is_null(
+ union_array: &UnionArray,
+ offsets: &ScalarBuffer<i32>,
+) -> Result<BooleanArray> {
+ let child_arrays = (0..union_array.type_names().len())
+ .map(|type_id| {
+ compute::is_null(&union_array.child(type_id as
i8)).map_err(Into::into)
+ })
+ .collect::<Result<Vec<BooleanArray>>>()?;
+
+ let buffer: BooleanBuffer = offsets
+ .iter()
+ .zip(union_array.type_ids())
+ .map(|(offset, type_id)| child_arrays[*type_id as usize].value(*offset
as usize))
+ .collect();
+
+ Ok(BooleanArray::new(buffer, None))
+}
+
+fn sparce_union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
Review Comment:
Nit: It may be a British vs American thing but I think typically an `s` is
used:
```suggestion
fn sparse_union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
```
This is also consistent with the arrow spec and library
https://docs.rs/arrow/latest/arrow/array/struct.UnionArray.html
##########
datafusion/physical-expr/src/expressions/is_not_null.rs:
##########
@@ -73,9 +74,16 @@ impl PhysicalExpr for IsNotNullExpr {
fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
let arg = self.arg.evaluate(batch)?;
match arg {
- ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new(
- compute::is_not_null(array.as_ref())?,
- ))),
+ ColumnarValue::Array(array) => {
+ let bool_array = if let Some(union_array) =
Review Comment:
I was going to suggest adding using `logical_nulls` here but it seems as if
`compute::is_not_null` already does that:
https://docs.rs/arrow-arith/52.1.0/src/arrow_arith/boolean.rs.html#345-350
Filed https://github.com/apache/arrow-rs/issues/6017 for the upstream bug
```suggestion
// workaround https://github.com/apache/arrow-rs/issues/6017
let bool_array = if let Some(union_array) =
```
##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -74,9 +77,16 @@ impl PhysicalExpr for IsNullExpr {
fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
let arg = self.arg.evaluate(batch)?;
match arg {
- ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new(
- compute::is_null(array.as_ref())?,
- ))),
+ ColumnarValue::Array(array) => {
+ let bool_array = if let Some(union_array) =
+ array.as_any().downcast_ref::<UnionArray>()
+ {
+ union_is_null(union_array)?
+ } else {
+ compute::is_null(array.as_ref())?
+ };
+ Ok(ColumnarValue::Array(Arc::new(bool_array)))
+ }
Review Comment:
instead of replicating the special case for `UnionArray` here and in
`is_not_null` what do you think about making a wrapper for `compute::is_null`
in DataFusion:
```rust
/// wraper around arrow::compute::is_null
fn is_null(datum: &Datum) -> Result<BooleanArray> {
if let Some(union_array) =
array.as_any().downcast_ref::<UnionArray>()
{
union_is_null(union_array)
} else {
compute::is_null(array.as_ref())
}
}
```
The idea being that then when the fix is available in arrow-rs then we can
simple remove the wrapper
##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -74,9 +77,16 @@ impl PhysicalExpr for IsNullExpr {
fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
let arg = self.arg.evaluate(batch)?;
match arg {
- ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new(
- compute::is_null(array.as_ref())?,
- ))),
+ ColumnarValue::Array(array) => {
Review Comment:
```suggestion
// workaround https://github.com/apache/arrow-rs/issues/6017
ColumnarValue::Array(array) => {
```
##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -100,6 +110,49 @@ impl PhysicalExpr for IsNullExpr {
}
}
+pub(crate) fn union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
Review Comment:
```suggestion
// workaround https://github.com/apache/arrow-rs/issues/6017
pub(crate) fn union_is_null(union_array: &UnionArray) ->
Result<BooleanArray> {
```
##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -100,6 +110,49 @@ impl PhysicalExpr for IsNullExpr {
}
}
+pub(crate) fn union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
+ if let Some(offsets) = union_array.offsets() {
+ dense_union_is_null(union_array, offsets)
+ } else {
+ sparce_union_is_null(union_array)
+ }
+}
+
+fn dense_union_is_null(
Review Comment:
I did and it looks good to me
##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -145,4 +201,65 @@ mod tests {
Ok(())
}
+
+ fn union_fields() -> UnionFields {
+ [
+ (0, Arc::new(Field::new("A", DataType::Int32, true))),
+ (1, Arc::new(Field::new("B", DataType::Float64, true))),
+ (2, Arc::new(Field::new("C", DataType::Utf8, true))),
+ ]
+ .into_iter()
+ .collect()
+ }
+
+ #[test]
+ fn sparse_union_is_null() {
+ // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
+ let int_array = Int32Array::from(vec![Some(1), None, None, None, None,
None]);
+ let float_array =
+ Float64Array::from(vec![None, None, Some(3.2), None, None, None]);
+ let str_array = StringArray::from(vec![None, None, None, None,
Some("a"), None]);
+ let type_ids = [0, 0, 1, 1, 2,
2].into_iter().collect::<ScalarBuffer<i8>>();
Review Comment:
I wonder if it makes sense to vary the spacing or something so that the
nullness is not exactly the same for each child
Maybe `[{A=1}, {A=}, {B=3.2}, {B=3.3}, {B=}. ...]` or something
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]