alamb commented on code in PR #5446:
URL: https://github.com/apache/arrow-datafusion/pull/5446#discussion_r1123643959
##########
datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs:
##########
@@ -87,25 +78,120 @@ pub(crate) fn is_not_distinct_from<T>(
where
T: ArrowNumericType,
{
- let left_data = left.data();
- let right_data = right.data();
- let array_len = left_data.len().min(right_data.len());
+ distinct(
+ left,
+ right,
+ |left_value, right_value, left_isnull, right_isnull| {
+ !(left_isnull != right_isnull || left_value != right_value)
+ },
+ )
+}
+fn distinct<
+ T,
+ F: FnMut(
+ <T as ArrowPrimitiveType>::Native,
+ <T as ArrowPrimitiveType>::Native,
+ bool,
+ bool,
+ ) -> bool,
+>(
+ left: &PrimitiveArray<T>,
+ right: &PrimitiveArray<T>,
+ mut op: F,
+) -> Result<BooleanArray>
+where
+ T: ArrowNumericType,
+{
let left_values = left.values();
let right_values = right.values();
+ let left_data = left.data();
+ let right_data = right.data();
+ let array_len = left_data.len().min(right_data.len());
let distinct = arrow_buffer::MutableBuffer::collect_bool(array_len, |i| {
- !(left_data.is_null(i) != right_data.is_null(i)
- || left_values[i] != right_values[i])
+ op(
+ left_values[i],
+ right_values[i],
+ left_data.is_null(i),
+ right_data.is_null(i),
+ )
});
-
let array_data = ArrayData::builder(arrow_schema::DataType::Boolean)
.len(array_len)
.add_buffer(distinct.into());
Ok(BooleanArray::from(unsafe { array_data.build_unchecked() }))
}
+pub(crate) fn is_distinct_from_f32(
+ left: &Float32Array,
+ right: &Float32Array,
+) -> Result<BooleanArray> {
+ distinct(
+ left,
+ right,
+ |left_value, right_value, left_isnull, right_isnull| {
+ left_isnull != right_isnull
+ || left_value.is_nan() != right_value.is_nan()
+ || (!left_value.is_nan()
+ && !right_value.is_nan()
+ && left_value != right_value)
+ },
+ )
+}
+
+pub(crate) fn is_not_distinct_from_f32(
+ left: &Float32Array,
+ right: &Float32Array,
+) -> Result<BooleanArray> {
+ distinct(
+ left,
+ right,
+ |left_value, right_value, left_isnull, right_isnull| {
+ !(left_isnull != right_isnull
+ || left_value.is_nan() != right_value.is_nan()
+ || (!left_value.is_nan()
+ && !right_value.is_nan()
+ && left_value != right_value))
+ },
Review Comment:
It took me quite some time to see that the only difference here is that
there is a `!` in front of the entire expression. Would it be possible to
factor out the duplication so the difference between the kernels are clearer?
##########
datafusion/core/tests/sqllogictests/test_files/select.slt:
##########
@@ -80,3 +80,29 @@ select '1' from foo order by column1;
# foo distinct order by
statement error DataFusion error: Error during planning: For SELECT DISTINCT,
ORDER BY expressions column1 must appear in select list
select distinct '1' from foo order by column1;
+
+# distincts for float nan
Review Comment:
This might be a good test to add to
https://github.com/apache/arrow-datafusion/tree/main/datafusion/core/tests/sqllogictests/test_files/pg_compat
so they are automatically compared with postgres as part of CI (kudos to
@melgenek for help there)
https://github.com/apache/arrow-datafusion/blob/main/datafusion/core/tests/sqllogictests/test_files/pg_compat/pg_compat_simple.slt
perhaps?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]