alamb commented on code in PR #4716:
URL: https://github.com/apache/arrow-rs/pull/4716#discussion_r1300051777
##########
arrow-ord/src/partition.rs:
##########
@@ -157,23 +157,7 @@ fn find_boundaries(v: &dyn Array) -> Result<BooleanBuffer,
ArrowError> {
let slice_len = v.len() - 1;
let v1 = v.slice(0, slice_len);
let v2 = v.slice(1, slice_len);
-
- let array_ne = neq(&v1, &v2)?;
- // Set if values have different non-NULL values
- let values_ne = match array_ne.nulls().filter(|n| n.null_count() > 0) {
- Some(n) => n.inner() & array_ne.values(),
- None => array_ne.values().clone(),
- };
-
- Ok(match v.nulls().filter(|x| x.null_count() > 0) {
- Some(n) => {
- let n1 = n.inner().slice(0, slice_len);
- let n2 = n.inner().slice(1, slice_len);
- // Set if values_ne or the nullability has changed
- &(&n1 ^ &n2) | &values_ne
- }
- None => values_ne,
- })
+ Ok(distinct(&v1, &v2)?.values().clone())
Review Comment:
nice
##########
arrow-array/src/array/boolean_array.rs:
##########
@@ -437,6 +437,15 @@ impl<Ptr: std::borrow::Borrow<Option<bool>>>
FromIterator<Ptr> for BooleanArray
}
}
+impl From<BooleanBuffer> for BooleanArray {
+ fn from(values: BooleanBuffer) -> Self {
Review Comment:
this is a nice UX improvement -- I had been looking for that when working on
group by in DataFusion
##########
arrow-ord/src/cmp.rs:
##########
@@ -129,7 +134,35 @@ pub fn gt_eq(lhs: &dyn Datum, rhs: &dyn Datum) ->
Result<BooleanArray, ArrowErro
compare_op(Op::GreaterEqual, lhs, rhs)
}
+/// Perform `left IS DISTINCT FROM right` operation on two [`Datum`]
Review Comment:
```suggestion
/// Perform `left IS DISTINCT FROM right` operation on two [`Datum`]. `IS
DISTINCT`
/// similar to `NotEq`, differing in null handling. Two operands are
considered DISTINCT
/// if they have a different value or if one of them is NULL and the other
isn't.
/// The result of `IS DISTINCT FROM` is never NULL.
```
##########
arrow-ord/src/cmp.rs:
##########
@@ -141,51 +174,114 @@ fn compare_op(
let l_len = l.len();
let r_len = r.len();
- let l_nulls = l.logical_nulls();
- let r_nulls = r.logical_nulls();
- let (len, nulls) = match (l_s, r_s) {
- (true, true) | (false, false) => {
- if l_len != r_len {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Cannot compare arrays of different lengths, got {l_len}
vs {r_len}"
- )));
- }
- (l_len, NullBuffer::union(l_nulls.as_ref(), r_nulls.as_ref()))
- }
- (true, false) => match l_nulls.map(|x| x.null_count() !=
0).unwrap_or_default() {
- true => (r_len, Some(NullBuffer::new_null(r_len))),
- false => (r_len, r_nulls), // Left is scalar and not null
- },
- (false, true) => match r_nulls.map(|x| x.null_count() !=
0).unwrap_or_default() {
- true => (l_len, Some(NullBuffer::new_null(l_len))),
- false => (l_len, l_nulls), // Right is scalar and not null
- },
+ if l_len != r_len && !l_s && !r_s {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Cannot compare arrays of different lengths, got {l_len} vs
{r_len}"
+ )));
+ }
+
+ let len = match l_s {
+ true => r_len,
+ false => l_len,
};
+ let l_nulls = l.logical_nulls();
+ let r_nulls = r.logical_nulls();
+
let l_v = l.as_any_dictionary_opt();
let l = l_v.map(|x| x.values().as_ref()).unwrap_or(l);
+ let l_t = l.data_type();
let r_v = r.as_any_dictionary_opt();
let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r);
+ let r_t = r.data_type();
+
+ if l_t != r_t || l_t.is_nested() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Invalid comparison operation: {l_t} {op} {r_t}"
+ )));
+ }
+
+ // Defer computation as may not be necessary
+ let values = || -> BooleanBuffer {
+ let d = downcast_primitive_array! {
+ (l, r) => apply(op, l.values().as_ref(), l_s, l_v,
r.values().as_ref(), r_s, r_v),
+ (Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v,
r.as_boolean(), r_s, r_v),
+ (Utf8, Utf8) => apply(op, l.as_string::<i32>(), l_s, l_v,
r.as_string::<i32>(), r_s, r_v),
+ (LargeUtf8, LargeUtf8) => apply(op, l.as_string::<i64>(), l_s,
l_v, r.as_string::<i64>(), r_s, r_v),
+ (Binary, Binary) => apply(op, l.as_binary::<i32>(), l_s, l_v,
r.as_binary::<i32>(), r_s, r_v),
+ (LargeBinary, LargeBinary) => apply(op, l.as_binary::<i64>(), l_s,
l_v, r.as_binary::<i64>(), r_s, r_v),
+ (FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op,
l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v),
+ (Null, Null) => None,
+ _ => unreachable!(),
+ };
+ d.unwrap_or_else(|| BooleanBuffer::new_unset(len))
+ };
- let values = downcast_primitive_array! {
- (l, r) => apply(op, l.values().as_ref(), l_s, l_v,
r.values().as_ref(), r_s, r_v),
- (Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v,
r.as_boolean(), r_s, r_v),
- (Utf8, Utf8) => apply(op, l.as_string::<i32>(), l_s, l_v,
r.as_string::<i32>(), r_s, r_v),
- (LargeUtf8, LargeUtf8) => apply(op, l.as_string::<i64>(), l_s, l_v,
r.as_string::<i64>(), r_s, r_v),
- (Binary, Binary) => apply(op, l.as_binary::<i32>(), l_s, l_v,
r.as_binary::<i32>(), r_s, r_v),
- (LargeBinary, LargeBinary) => apply(op, l.as_binary::<i64>(), l_s,
l_v, r.as_binary::<i64>(), r_s, r_v),
- (FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op,
l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v),
- (l_t, r_t) => return
Err(ArrowError::InvalidArgumentError(format!("Invalid comparison operation:
{l_t} {op} {r_t}"))),
- }.unwrap_or_else(|| {
- let count = nulls.as_ref().map(|x| x.null_count()).unwrap_or_default();
- assert_eq!(count, len); // Sanity check
- BooleanBuffer::new_unset(len)
- });
-
- assert_eq!(values.len(), len); // Sanity check
- Ok(BooleanArray::new(values, nulls))
+ let l_nulls = l_nulls.filter(|n| n.null_count() > 0);
+ let r_nulls = r_nulls.filter(|n| n.null_count() > 0);
+ Ok(match (l_nulls, l_s, r_nulls, r_s) {
+ (Some(l), true, Some(r), true) | (Some(l), false, Some(r), false) => {
+ // Either both sides are scalar or neither side is scalar
+ match op {
+ Op::Distinct => {
+ let values = values();
+ let l = l.inner().bit_chunks().iter_padded();
+ let r = r.inner().bit_chunks().iter_padded();
+ let ne = values.bit_chunks().iter_padded();
+
+ let c = |((l, r), n)| ((l ^ r) | (l & r & n));
+ let buffer = l.zip(r).zip(ne).map(c).collect();
+ BooleanBuffer::new(buffer, 0, len).into()
Review Comment:
💯 for no null buffer -- not sure if that is worth calling out in a comment
or not
##########
arrow-ord/src/cmp.rs:
##########
@@ -141,51 +174,114 @@ fn compare_op(
let l_len = l.len();
let r_len = r.len();
- let l_nulls = l.logical_nulls();
- let r_nulls = r.logical_nulls();
- let (len, nulls) = match (l_s, r_s) {
- (true, true) | (false, false) => {
- if l_len != r_len {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Cannot compare arrays of different lengths, got {l_len}
vs {r_len}"
- )));
- }
- (l_len, NullBuffer::union(l_nulls.as_ref(), r_nulls.as_ref()))
- }
- (true, false) => match l_nulls.map(|x| x.null_count() !=
0).unwrap_or_default() {
- true => (r_len, Some(NullBuffer::new_null(r_len))),
- false => (r_len, r_nulls), // Left is scalar and not null
- },
- (false, true) => match r_nulls.map(|x| x.null_count() !=
0).unwrap_or_default() {
- true => (l_len, Some(NullBuffer::new_null(l_len))),
- false => (l_len, l_nulls), // Right is scalar and not null
- },
+ if l_len != r_len && !l_s && !r_s {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Cannot compare arrays of different lengths, got {l_len} vs
{r_len}"
+ )));
+ }
+
+ let len = match l_s {
+ true => r_len,
+ false => l_len,
};
+ let l_nulls = l.logical_nulls();
+ let r_nulls = r.logical_nulls();
+
let l_v = l.as_any_dictionary_opt();
let l = l_v.map(|x| x.values().as_ref()).unwrap_or(l);
+ let l_t = l.data_type();
let r_v = r.as_any_dictionary_opt();
let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r);
+ let r_t = r.data_type();
+
+ if l_t != r_t || l_t.is_nested() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Invalid comparison operation: {l_t} {op} {r_t}"
+ )));
+ }
+
+ // Defer computation as may not be necessary
+ let values = || -> BooleanBuffer {
+ let d = downcast_primitive_array! {
+ (l, r) => apply(op, l.values().as_ref(), l_s, l_v,
r.values().as_ref(), r_s, r_v),
+ (Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v,
r.as_boolean(), r_s, r_v),
+ (Utf8, Utf8) => apply(op, l.as_string::<i32>(), l_s, l_v,
r.as_string::<i32>(), r_s, r_v),
+ (LargeUtf8, LargeUtf8) => apply(op, l.as_string::<i64>(), l_s,
l_v, r.as_string::<i64>(), r_s, r_v),
+ (Binary, Binary) => apply(op, l.as_binary::<i32>(), l_s, l_v,
r.as_binary::<i32>(), r_s, r_v),
+ (LargeBinary, LargeBinary) => apply(op, l.as_binary::<i64>(), l_s,
l_v, r.as_binary::<i64>(), r_s, r_v),
+ (FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op,
l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v),
+ (Null, Null) => None,
+ _ => unreachable!(),
+ };
+ d.unwrap_or_else(|| BooleanBuffer::new_unset(len))
+ };
- let values = downcast_primitive_array! {
- (l, r) => apply(op, l.values().as_ref(), l_s, l_v,
r.values().as_ref(), r_s, r_v),
- (Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v,
r.as_boolean(), r_s, r_v),
- (Utf8, Utf8) => apply(op, l.as_string::<i32>(), l_s, l_v,
r.as_string::<i32>(), r_s, r_v),
- (LargeUtf8, LargeUtf8) => apply(op, l.as_string::<i64>(), l_s, l_v,
r.as_string::<i64>(), r_s, r_v),
- (Binary, Binary) => apply(op, l.as_binary::<i32>(), l_s, l_v,
r.as_binary::<i32>(), r_s, r_v),
- (LargeBinary, LargeBinary) => apply(op, l.as_binary::<i64>(), l_s,
l_v, r.as_binary::<i64>(), r_s, r_v),
- (FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op,
l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v),
- (l_t, r_t) => return
Err(ArrowError::InvalidArgumentError(format!("Invalid comparison operation:
{l_t} {op} {r_t}"))),
- }.unwrap_or_else(|| {
- let count = nulls.as_ref().map(|x| x.null_count()).unwrap_or_default();
- assert_eq!(count, len); // Sanity check
- BooleanBuffer::new_unset(len)
- });
-
- assert_eq!(values.len(), len); // Sanity check
- Ok(BooleanArray::new(values, nulls))
+ let l_nulls = l_nulls.filter(|n| n.null_count() > 0);
+ let r_nulls = r_nulls.filter(|n| n.null_count() > 0);
+ Ok(match (l_nulls, l_s, r_nulls, r_s) {
+ (Some(l), true, Some(r), true) | (Some(l), false, Some(r), false) => {
+ // Either both sides are scalar or neither side is scalar
+ match op {
+ Op::Distinct => {
+ let values = values();
+ let l = l.inner().bit_chunks().iter_padded();
+ let r = r.inner().bit_chunks().iter_padded();
+ let ne = values.bit_chunks().iter_padded();
+
+ let c = |((l, r), n)| ((l ^ r) | (l & r & n));
+ let buffer = l.zip(r).zip(ne).map(c).collect();
+ BooleanBuffer::new(buffer, 0, len).into()
Review Comment:
💯 for no null buffer -- not sure if that is worth calling out in a comment
or not
##########
arrow-ord/src/cmp.rs:
##########
@@ -129,7 +134,35 @@ pub fn gt_eq(lhs: &dyn Datum, rhs: &dyn Datum) ->
Result<BooleanArray, ArrowErro
compare_op(Op::GreaterEqual, lhs, rhs)
}
+/// Perform `left IS DISTINCT FROM right` operation on two [`Datum`]
+///
+/// For floating values like f32 and f64, this comparison produces an ordering
in accordance to
+/// the totalOrder predicate as defined in the IEEE 754 (2008 revision)
floating point standard.
+/// Note that totalOrder treats positive and negative zeros as different. If
it is necessary
+/// to treat them as equal, please normalize zeros before calling this kernel.
+///
+/// Please refer to [`f32::total_cmp`] and [`f64::total_cmp`]
+pub fn distinct(lhs: &dyn Datum, rhs: &dyn Datum) -> Result<BooleanArray,
ArrowError> {
+ compare_op(Op::Distinct, lhs, rhs)
+}
+
+/// Perform `left IS NOT DISTINCT FROM right` operation on two [`Datum`]
Review Comment:
```suggestion
/// Perform `left IS NOT DISTINCT FROM right` operation on two [`Datum`].
`IS NOT DISTINCT`
/// similar to `Eq`, differing in null handling. Two operands are
considered NOT DISTINCT
/// if they have the same value or if both of them are NULL.
/// The result of `IS NOT DISTINCT FROM` is never NULL.
```
##########
arrow-ord/src/cmp.rs:
##########
@@ -141,51 +174,114 @@ fn compare_op(
let l_len = l.len();
let r_len = r.len();
- let l_nulls = l.logical_nulls();
- let r_nulls = r.logical_nulls();
- let (len, nulls) = match (l_s, r_s) {
- (true, true) | (false, false) => {
- if l_len != r_len {
- return Err(ArrowError::InvalidArgumentError(format!(
- "Cannot compare arrays of different lengths, got {l_len}
vs {r_len}"
- )));
- }
- (l_len, NullBuffer::union(l_nulls.as_ref(), r_nulls.as_ref()))
- }
- (true, false) => match l_nulls.map(|x| x.null_count() !=
0).unwrap_or_default() {
- true => (r_len, Some(NullBuffer::new_null(r_len))),
- false => (r_len, r_nulls), // Left is scalar and not null
- },
- (false, true) => match r_nulls.map(|x| x.null_count() !=
0).unwrap_or_default() {
- true => (l_len, Some(NullBuffer::new_null(l_len))),
- false => (l_len, l_nulls), // Right is scalar and not null
- },
+ if l_len != r_len && !l_s && !r_s {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Cannot compare arrays of different lengths, got {l_len} vs
{r_len}"
+ )));
+ }
+
+ let len = match l_s {
+ true => r_len,
+ false => l_len,
};
+ let l_nulls = l.logical_nulls();
+ let r_nulls = r.logical_nulls();
+
let l_v = l.as_any_dictionary_opt();
let l = l_v.map(|x| x.values().as_ref()).unwrap_or(l);
+ let l_t = l.data_type();
let r_v = r.as_any_dictionary_opt();
let r = r_v.map(|x| x.values().as_ref()).unwrap_or(r);
+ let r_t = r.data_type();
+
+ if l_t != r_t || l_t.is_nested() {
+ return Err(ArrowError::InvalidArgumentError(format!(
+ "Invalid comparison operation: {l_t} {op} {r_t}"
+ )));
+ }
+
+ // Defer computation as may not be necessary
+ let values = || -> BooleanBuffer {
+ let d = downcast_primitive_array! {
+ (l, r) => apply(op, l.values().as_ref(), l_s, l_v,
r.values().as_ref(), r_s, r_v),
+ (Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v,
r.as_boolean(), r_s, r_v),
+ (Utf8, Utf8) => apply(op, l.as_string::<i32>(), l_s, l_v,
r.as_string::<i32>(), r_s, r_v),
+ (LargeUtf8, LargeUtf8) => apply(op, l.as_string::<i64>(), l_s,
l_v, r.as_string::<i64>(), r_s, r_v),
+ (Binary, Binary) => apply(op, l.as_binary::<i32>(), l_s, l_v,
r.as_binary::<i32>(), r_s, r_v),
+ (LargeBinary, LargeBinary) => apply(op, l.as_binary::<i64>(), l_s,
l_v, r.as_binary::<i64>(), r_s, r_v),
+ (FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op,
l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v),
+ (Null, Null) => None,
+ _ => unreachable!(),
+ };
+ d.unwrap_or_else(|| BooleanBuffer::new_unset(len))
+ };
- let values = downcast_primitive_array! {
- (l, r) => apply(op, l.values().as_ref(), l_s, l_v,
r.values().as_ref(), r_s, r_v),
- (Boolean, Boolean) => apply(op, l.as_boolean(), l_s, l_v,
r.as_boolean(), r_s, r_v),
- (Utf8, Utf8) => apply(op, l.as_string::<i32>(), l_s, l_v,
r.as_string::<i32>(), r_s, r_v),
- (LargeUtf8, LargeUtf8) => apply(op, l.as_string::<i64>(), l_s, l_v,
r.as_string::<i64>(), r_s, r_v),
- (Binary, Binary) => apply(op, l.as_binary::<i32>(), l_s, l_v,
r.as_binary::<i32>(), r_s, r_v),
- (LargeBinary, LargeBinary) => apply(op, l.as_binary::<i64>(), l_s,
l_v, r.as_binary::<i64>(), r_s, r_v),
- (FixedSizeBinary(_), FixedSizeBinary(_)) => apply(op,
l.as_fixed_size_binary(), l_s, l_v, r.as_fixed_size_binary(), r_s, r_v),
- (l_t, r_t) => return
Err(ArrowError::InvalidArgumentError(format!("Invalid comparison operation:
{l_t} {op} {r_t}"))),
- }.unwrap_or_else(|| {
- let count = nulls.as_ref().map(|x| x.null_count()).unwrap_or_default();
- assert_eq!(count, len); // Sanity check
- BooleanBuffer::new_unset(len)
- });
-
- assert_eq!(values.len(), len); // Sanity check
- Ok(BooleanArray::new(values, nulls))
+ let l_nulls = l_nulls.filter(|n| n.null_count() > 0);
+ let r_nulls = r_nulls.filter(|n| n.null_count() > 0);
+ Ok(match (l_nulls, l_s, r_nulls, r_s) {
+ (Some(l), true, Some(r), true) | (Some(l), false, Some(r), false) => {
+ // Either both sides are scalar or neither side is scalar
+ match op {
+ Op::Distinct => {
+ let values = values();
+ let l = l.inner().bit_chunks().iter_padded();
+ let r = r.inner().bit_chunks().iter_padded();
+ let ne = values.bit_chunks().iter_padded();
+
+ let c = |((l, r), n)| ((l ^ r) | (l & r & n));
+ let buffer = l.zip(r).zip(ne).map(c).collect();
+ BooleanBuffer::new(buffer, 0, len).into()
+ }
+ Op::NotDistinct => {
+ let values = values();
+ let l = l.inner().bit_chunks().iter_padded();
+ let r = r.inner().bit_chunks().iter_padded();
+ let e = values.bit_chunks().iter_padded();
+
+ let c = |((l, r), e)| u64::not(l | r) | (l & r & e);
+ let buffer = l.zip(r).zip(e).map(c).collect();
+ BooleanBuffer::new(buffer, 0, len).into()
+ }
+ _ => BooleanArray::new(values(), NullBuffer::union(Some(&l),
Some(&r))),
+ }
+ }
+ (Some(_), true, Some(a), false) | (Some(a), false, Some(_), true) => {
+ // Scalar is null, other side is non-scalar and nullable
+ match op {
+ Op::Distinct => a.into_inner().into(),
+ Op::NotDistinct => a.into_inner().not().into(),
+ _ => BooleanArray::new_null(len),
+ }
+ }
+ (Some(nulls), is_scalar, None, _) | (None, _, Some(nulls), is_scalar)
=> {
+ // Only one side is nullable
+ match is_scalar {
+ true => match op {
+ // Scalar is null, other side is not nullable
+ Op::Distinct => BooleanBuffer::new_set(len).into(),
+ Op::NotDistinct => BooleanBuffer::new_unset(len).into(),
+ _ => BooleanArray::new_null(len),
+ },
+ false => match op {
+ Op::Distinct => {
+ let values = values();
+ let l = nulls.inner().bit_chunks().iter_padded();
+ let ne = values.bit_chunks().iter_padded();
+ let c = |(l, n)| u64::not(l) | n;
+ let buffer = l.zip(ne).map(c).collect();
Review Comment:
It took me a while to understand why this clause didn't mirror the
`Op::NotDistinct` clause
`NotDistinct` can simply use
https://docs.rs/arrow/latest/arrow/buffer/struct.BooleanBuffer.html#impl-BitOr%3C%26BooleanBuffer%3E-for-%26BooleanBuffer
but It seems it is because there is no equivalent of
https://doc.rust-lang.org/nightly/core/ops/trait.Neg.html for BooleanBuffer and
even if there were that would result in allocating a temporary buffer that
might wasteful
Though now that I write this I wonder if performance could be improved here
by deferring / reusing the buffer allocated by `values()`. Maybe as some future
optimization.
##########
arrow-ord/src/cmp.rs:
##########
@@ -311,7 +411,7 @@ fn apply_op<T: ArrayOrd>(
(Some(l_s), Some(r_s)) => {
let a = l.value(l_s);
let b = r.value(r_s);
- std::iter::once(op(a, b)).collect()
+ std::iter::once(op(a, b) ^ neg).collect()
Review Comment:
Can we also please add a documentation comment explaining how to interpret
the `neg` argument?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]