kosiew commented on code in PR #22892:
URL: https://github.com/apache/datafusion/pull/22892#discussion_r3408154901
##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -2585,53 +2586,99 @@ impl ScalarValue {
/// distance is greater than [`usize::MAX`]. If the type is a float, then
the distance will be
/// rounded to the nearest integer.
///
- ///
/// Note: the datatype itself must support subtraction.
+ #[deprecated(since = "54.0.0", note = "Use distance_u64 instead")]
pub fn distance(&self, other: &ScalarValue) -> Option<usize> {
+ self.distance_u64(other)
+ .and_then(|d| usize::try_from(d).ok())
+ }
+
+ /// Absolute distance between two numeric values (of the same type). This
method will return
+ /// None if either one of the arguments are null. It might also return
None if the resulting
+ /// distance is greater than [`u64::MAX`]. If the type is a float, then
the distance will be
+ /// rounded to the nearest integer.
+ ///
+ /// Note: the datatype itself must support subtraction.
+ pub fn distance_u64(&self, other: &ScalarValue) -> Option<u64> {
match (self, other) {
- (Self::Int8(Some(l)), Self::Int8(Some(r))) => Some(l.abs_diff(*r)
as _),
- (Self::Int16(Some(l)), Self::Int16(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::Int32(Some(l)), Self::Int32(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::Int64(Some(l)), Self::Int64(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt8(Some(l)), Self::UInt8(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt16(Some(l)), Self::UInt16(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt32(Some(l)), Self::UInt32(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt64(Some(l)), Self::UInt64(Some(r))) =>
Some(l.abs_diff(*r) as _),
+ (Self::Int8(Some(l)), Self::Int8(Some(r))) => Some(l.abs_diff(*r)
as u64),
+ (Self::Int16(Some(l)), Self::Int16(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::Int32(Some(l)), Self::Int32(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::Int64(Some(l)), Self::Int64(Some(r))) =>
Some(l.abs_diff(*r)),
+ (Self::UInt8(Some(l)), Self::UInt8(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::UInt16(Some(l)), Self::UInt16(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::UInt32(Some(l)), Self::UInt32(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::UInt64(Some(l)), Self::UInt64(Some(r))) =>
Some(l.abs_diff(*r)),
// TODO: we might want to look into supporting ceil/floor here for
floats.
(Self::Float16(Some(l)), Self::Float16(Some(r))) => {
- Some((f16::to_f32(*l) - f16::to_f32(*r)).abs().round() as _)
+ let diff = (f16::to_f32(*l) - f16::to_f32(*r)).abs().round();
+ if diff <= u64::MAX as f32 {
+ Some(diff as u64)
+ } else {
+ None
+ }
}
(Self::Float32(Some(l)), Self::Float32(Some(r))) => {
- Some((l - r).abs().round() as _)
+ let diff = (l - r).abs().round();
+ if diff <= u64::MAX as f32 {
+ Some(diff as u64)
+ } else {
+ None
+ }
}
(Self::Float64(Some(l)), Self::Float64(Some(r))) => {
- Some((l - r).abs().round() as _)
+ let diff = (l - r).abs().round();
+ if diff <= u64::MAX as f64 {
Review Comment:
`u64::MAX as f64` rounds up to `2^64`, so a rounded float distance of
exactly `2^64` can pass this guard. After that, the `as u64` conversion
saturates to `u64::MAX`, which means we return `Some(u64::MAX)` even though the
new overflow-aware contract says distances greater than `u64::MAX` should
return `None`.
For example, this shape currently returns `Some(u64::MAX)`:
```rust
ScalarValue::Float64(Some(0.0))
.distance_u64(&ScalarValue::Float64(Some(18446744073709551616.0)))
```
Could we make the upper bound strict, for example `diff.is_finite() && diff
< u64::MAX as f64`, or move this into an explicit helper? It would also be
great to add boundary coverage for exact `2^64`.
##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -2585,53 +2586,99 @@ impl ScalarValue {
/// distance is greater than [`usize::MAX`]. If the type is a float, then
the distance will be
/// rounded to the nearest integer.
///
- ///
/// Note: the datatype itself must support subtraction.
+ #[deprecated(since = "54.0.0", note = "Use distance_u64 instead")]
pub fn distance(&self, other: &ScalarValue) -> Option<usize> {
+ self.distance_u64(other)
+ .and_then(|d| usize::try_from(d).ok())
+ }
+
+ /// Absolute distance between two numeric values (of the same type). This
method will return
+ /// None if either one of the arguments are null. It might also return
None if the resulting
+ /// distance is greater than [`u64::MAX`]. If the type is a float, then
the distance will be
+ /// rounded to the nearest integer.
+ ///
+ /// Note: the datatype itself must support subtraction.
+ pub fn distance_u64(&self, other: &ScalarValue) -> Option<u64> {
match (self, other) {
- (Self::Int8(Some(l)), Self::Int8(Some(r))) => Some(l.abs_diff(*r)
as _),
- (Self::Int16(Some(l)), Self::Int16(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::Int32(Some(l)), Self::Int32(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::Int64(Some(l)), Self::Int64(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt8(Some(l)), Self::UInt8(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt16(Some(l)), Self::UInt16(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt32(Some(l)), Self::UInt32(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt64(Some(l)), Self::UInt64(Some(r))) =>
Some(l.abs_diff(*r) as _),
+ (Self::Int8(Some(l)), Self::Int8(Some(r))) => Some(l.abs_diff(*r)
as u64),
+ (Self::Int16(Some(l)), Self::Int16(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::Int32(Some(l)), Self::Int32(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::Int64(Some(l)), Self::Int64(Some(r))) =>
Some(l.abs_diff(*r)),
+ (Self::UInt8(Some(l)), Self::UInt8(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::UInt16(Some(l)), Self::UInt16(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::UInt32(Some(l)), Self::UInt32(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::UInt64(Some(l)), Self::UInt64(Some(r))) =>
Some(l.abs_diff(*r)),
// TODO: we might want to look into supporting ceil/floor here for
floats.
(Self::Float16(Some(l)), Self::Float16(Some(r))) => {
- Some((f16::to_f32(*l) - f16::to_f32(*r)).abs().round() as _)
+ let diff = (f16::to_f32(*l) - f16::to_f32(*r)).abs().round();
+ if diff <= u64::MAX as f32 {
+ Some(diff as u64)
+ } else {
+ None
+ }
}
(Self::Float32(Some(l)), Self::Float32(Some(r))) => {
- Some((l - r).abs().round() as _)
+ let diff = (l - r).abs().round();
+ if diff <= u64::MAX as f32 {
Review Comment:
Same issue here for `Float32`: `u64::MAX as f32` is `2^64`, so a rounded
`Float32` distance equal to `2^64` is accepted and then saturates to `u64::MAX`
instead of returning `None`.
Could we reject the rounded value at the first float representation above
the `u64` domain and add a boundary test for this case too?
##########
datafusion/common/src/scalar/mod.rs:
##########
@@ -2585,53 +2586,99 @@ impl ScalarValue {
/// distance is greater than [`usize::MAX`]. If the type is a float, then
the distance will be
/// rounded to the nearest integer.
///
- ///
/// Note: the datatype itself must support subtraction.
+ #[deprecated(since = "54.0.0", note = "Use distance_u64 instead")]
pub fn distance(&self, other: &ScalarValue) -> Option<usize> {
+ self.distance_u64(other)
+ .and_then(|d| usize::try_from(d).ok())
+ }
+
+ /// Absolute distance between two numeric values (of the same type). This
method will return
+ /// None if either one of the arguments are null. It might also return
None if the resulting
+ /// distance is greater than [`u64::MAX`]. If the type is a float, then
the distance will be
+ /// rounded to the nearest integer.
+ ///
+ /// Note: the datatype itself must support subtraction.
+ pub fn distance_u64(&self, other: &ScalarValue) -> Option<u64> {
match (self, other) {
- (Self::Int8(Some(l)), Self::Int8(Some(r))) => Some(l.abs_diff(*r)
as _),
- (Self::Int16(Some(l)), Self::Int16(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::Int32(Some(l)), Self::Int32(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::Int64(Some(l)), Self::Int64(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt8(Some(l)), Self::UInt8(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt16(Some(l)), Self::UInt16(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt32(Some(l)), Self::UInt32(Some(r))) =>
Some(l.abs_diff(*r) as _),
- (Self::UInt64(Some(l)), Self::UInt64(Some(r))) =>
Some(l.abs_diff(*r) as _),
+ (Self::Int8(Some(l)), Self::Int8(Some(r))) => Some(l.abs_diff(*r)
as u64),
+ (Self::Int16(Some(l)), Self::Int16(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::Int32(Some(l)), Self::Int32(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::Int64(Some(l)), Self::Int64(Some(r))) =>
Some(l.abs_diff(*r)),
+ (Self::UInt8(Some(l)), Self::UInt8(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::UInt16(Some(l)), Self::UInt16(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::UInt32(Some(l)), Self::UInt32(Some(r))) =>
Some(l.abs_diff(*r) as u64),
+ (Self::UInt64(Some(l)), Self::UInt64(Some(r))) =>
Some(l.abs_diff(*r)),
// TODO: we might want to look into supporting ceil/floor here for
floats.
(Self::Float16(Some(l)), Self::Float16(Some(r))) => {
Review Comment:
Non-blocking thought: the float arms all repeat the same pattern of taking a
rounded absolute diff and then doing the checked conversion. Once the boundary
check is fixed, it might be worth using a small helper like `fn
rounded_float_distance_u64(diff: f64) -> Option<u64>` and calling it from
`Float16`, `Float32`, and `Float64`, with `f16` and `f32` widened to `f64`.
That would keep the overflow invariant in one place and make future drift less
likely.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]