alamb commented on code in PR #11321:
URL: https://github.com/apache/datafusion/pull/11321#discussion_r1667750918


##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -100,6 +110,49 @@ impl PhysicalExpr for IsNullExpr {
     }
 }
 
+pub(crate) fn union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
+    if let Some(offsets) = union_array.offsets() {
+        dense_union_is_null(union_array, offsets)
+    } else {
+        sparce_union_is_null(union_array)
+    }
+}
+
+fn dense_union_is_null(
+    union_array: &UnionArray,
+    offsets: &ScalarBuffer<i32>,
+) -> Result<BooleanArray> {
+    let child_arrays = (0..union_array.type_names().len())
+        .map(|type_id| {
+            compute::is_null(&union_array.child(type_id as 
i8)).map_err(Into::into)
+        })
+        .collect::<Result<Vec<BooleanArray>>>()?;
+
+    let buffer: BooleanBuffer = offsets
+        .iter()
+        .zip(union_array.type_ids())
+        .map(|(offset, type_id)| child_arrays[*type_id as usize].value(*offset 
as usize))
+        .collect();
+
+    Ok(BooleanArray::new(buffer, None))
+}
+
+fn sparce_union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
+    let type_ids = Int8Array::new(union_array.type_ids().clone(), None);
+
+    let mut union_is_null =
+        BooleanArray::new(BooleanBuffer::new_unset(union_array.len()), None);
+    for type_id in 0..union_array.type_names().len() {
+        let type_id = type_id as i8;
+        let union_is_child = cmp::eq(&type_ids, 
&Int8Array::new_scalar(type_id))?;

Review Comment:
   I double checked this and it looks good to me



##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -100,6 +110,49 @@ impl PhysicalExpr for IsNullExpr {
     }
 }
 
+pub(crate) fn union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
+    if let Some(offsets) = union_array.offsets() {
+        dense_union_is_null(union_array, offsets)
+    } else {
+        sparce_union_is_null(union_array)
+    }
+}
+
+fn dense_union_is_null(
+    union_array: &UnionArray,
+    offsets: &ScalarBuffer<i32>,
+) -> Result<BooleanArray> {
+    let child_arrays = (0..union_array.type_names().len())
+        .map(|type_id| {
+            compute::is_null(&union_array.child(type_id as 
i8)).map_err(Into::into)
+        })
+        .collect::<Result<Vec<BooleanArray>>>()?;
+
+    let buffer: BooleanBuffer = offsets
+        .iter()
+        .zip(union_array.type_ids())
+        .map(|(offset, type_id)| child_arrays[*type_id as usize].value(*offset 
as usize))
+        .collect();
+
+    Ok(BooleanArray::new(buffer, None))
+}
+
+fn sparce_union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {

Review Comment:
   Nit: It may be a British vs American thing but I think typically an `s` is 
used: 
   
   ```suggestion
   fn sparse_union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
   ```
   
   This is also consistent with the arrow spec and library 
https://docs.rs/arrow/latest/arrow/array/struct.UnionArray.html



##########
datafusion/physical-expr/src/expressions/is_not_null.rs:
##########
@@ -73,9 +74,16 @@ impl PhysicalExpr for IsNotNullExpr {
     fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
         let arg = self.arg.evaluate(batch)?;
         match arg {
-            ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new(
-                compute::is_not_null(array.as_ref())?,
-            ))),
+            ColumnarValue::Array(array) => {
+                let bool_array = if let Some(union_array) =

Review Comment:
   I was going to suggest adding using `logical_nulls` here but it seems as if 
`compute::is_not_null` already does that: 
https://docs.rs/arrow-arith/52.1.0/src/arrow_arith/boolean.rs.html#345-350
   
   Filed https://github.com/apache/arrow-rs/issues/6017 for the upstream bug
   
   ```suggestion
                   // workaround https://github.com/apache/arrow-rs/issues/6017
                   let bool_array = if let Some(union_array) =
   ```



##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -74,9 +77,16 @@ impl PhysicalExpr for IsNullExpr {
     fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
         let arg = self.arg.evaluate(batch)?;
         match arg {
-            ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new(
-                compute::is_null(array.as_ref())?,
-            ))),
+            ColumnarValue::Array(array) => {
+                let bool_array = if let Some(union_array) =
+                    array.as_any().downcast_ref::<UnionArray>()
+                {
+                    union_is_null(union_array)?
+                } else {
+                    compute::is_null(array.as_ref())?
+                };
+                Ok(ColumnarValue::Array(Arc::new(bool_array)))
+            }

Review Comment:
   instead of replicating the special case for `UnionArray` here and in 
`is_not_null` what do you think about making a wrapper for `compute::is_null` 
in  DataFusion:
   
   ```rust
   /// wraper around arrow::compute::is_null
   fn is_null(datum: &Datum) -> Result<BooleanArray> {
     if let Some(union_array) =
                       array.as_any().downcast_ref::<UnionArray>()
                   {
                       union_is_null(union_array)
                   } else {
                       compute::is_null(array.as_ref())
                   }
     }
   ```
   
   The idea being that then when the fix is available in arrow-rs then we can 
simple remove the wrapper



##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -74,9 +77,16 @@ impl PhysicalExpr for IsNullExpr {
     fn evaluate(&self, batch: &RecordBatch) -> Result<ColumnarValue> {
         let arg = self.arg.evaluate(batch)?;
         match arg {
-            ColumnarValue::Array(array) => Ok(ColumnarValue::Array(Arc::new(
-                compute::is_null(array.as_ref())?,
-            ))),
+            ColumnarValue::Array(array) => {

Review Comment:
   ```suggestion
               // workaround https://github.com/apache/arrow-rs/issues/6017
               ColumnarValue::Array(array) => {
   ```



##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -100,6 +110,49 @@ impl PhysicalExpr for IsNullExpr {
     }
 }
 
+pub(crate) fn union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {

Review Comment:
   ```suggestion
   
   // workaround https://github.com/apache/arrow-rs/issues/6017
   pub(crate) fn union_is_null(union_array: &UnionArray) -> 
Result<BooleanArray> {
   ```



##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -100,6 +110,49 @@ impl PhysicalExpr for IsNullExpr {
     }
 }
 
+pub(crate) fn union_is_null(union_array: &UnionArray) -> Result<BooleanArray> {
+    if let Some(offsets) = union_array.offsets() {
+        dense_union_is_null(union_array, offsets)
+    } else {
+        sparce_union_is_null(union_array)
+    }
+}
+
+fn dense_union_is_null(

Review Comment:
   I did and it looks good to me



##########
datafusion/physical-expr/src/expressions/is_null.rs:
##########
@@ -145,4 +201,65 @@ mod tests {
 
         Ok(())
     }
+
+    fn union_fields() -> UnionFields {
+        [
+            (0, Arc::new(Field::new("A", DataType::Int32, true))),
+            (1, Arc::new(Field::new("B", DataType::Float64, true))),
+            (2, Arc::new(Field::new("C", DataType::Utf8, true))),
+        ]
+        .into_iter()
+        .collect()
+    }
+
+    #[test]
+    fn sparse_union_is_null() {
+        // union of [{A=1}, {A=}, {B=3.2}, {B=}, {C="a"}, {C=}]
+        let int_array = Int32Array::from(vec![Some(1), None, None, None, None, 
None]);
+        let float_array =
+            Float64Array::from(vec![None, None, Some(3.2), None, None, None]);
+        let str_array = StringArray::from(vec![None, None, None, None, 
Some("a"), None]);
+        let type_ids = [0, 0, 1, 1, 2, 
2].into_iter().collect::<ScalarBuffer<i8>>();

Review Comment:
   I wonder if it makes sense to vary the spacing or something so that the 
nullness is not exactly the same for each child
   
   Maybe `[{A=1}, {A=}, {B=3.2}, {B=3.3}, {B=}. ...]` or something



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to