scovich commented on code in PR #9689:
URL: https://github.com/apache/arrow-rs/pull/9689#discussion_r3065514546
##########
arrow-cast/src/cast/decimal.rs:
##########
@@ -826,84 +837,63 @@ where
let mut value_builder = PrimitiveBuilder::<T>::with_capacity(array.len());
- if scale < 0 {
- match cast_options.safe {
- true => {
- for i in 0..array.len() {
- if array.is_null(i) {
- value_builder.append_null();
- } else {
- let v = array
- .value(i)
- .mul_checked(div)
- .ok()
- .and_then(<T::Native as
NumCast>::from::<D::Native>);
- value_builder.append_option(v);
- }
- }
- }
- false => {
- for i in 0..array.len() {
- if array.is_null(i) {
- value_builder.append_null();
- } else {
- let v = array.value(i).mul_checked(div)?;
-
- let value =
- <T::Native as
NumCast>::from::<D::Native>(v).ok_or_else(|| {
- ArrowError::CastError(format!(
- "value of {:?} is out of range {}",
- v,
- T::DATA_TYPE
- ))
- })?;
-
- value_builder.append_value(value);
- }
- }
- }
- }
- } else {
- match cast_options.safe {
- true => {
- for i in 0..array.len() {
- if array.is_null(i) {
- value_builder.append_null();
- } else {
- let v = array
- .value(i)
- .div_checked(div)
- .ok()
- .and_then(<T::Native as
NumCast>::from::<D::Native>);
- value_builder.append_option(v);
- }
+ for i in 0..array.len() {
+ if array.is_null(i) {
+ value_builder.append_null();
+ } else {
+ match cast_options.safe {
+ true => {
+ let v = cast_single_decimal_to_integer::<D, T::Native>(
Review Comment:
The original code hoisted checks for `scale < 0` (`mul_checked` vs.
`div_checked`) and `cast_options.safe` (NULL vs. error) outside the loop,
producing four simple loop bodies. This was presumably done for performance
reasons (minimizing branching inside the loop).
The new code pushes the `cast_options.safe` check inside a single loop and
pushes `scale < 0` check all the way down inside
`cast_single_decimal_to_integer`. That triples the number of branches inside
the loop body (the null check is per-row and so is always stuck inside the
loop). Performance will almost certainly be impacted, possibly significantly.
It would be safer to just preserve the replication (even tho it duplicates
logic with the new helper), and rely on the compiler's inlining and "jump
threading" optimizations to eliminate that redundancy:
<details>
<summary>code snippet</summary>
```rust
if scale < 0 {
if cast_options.safe {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = cast_single_decimal_to_integer::<D, T::Native>(...);
value_builder.append_option(v.ok());
}
}
} else {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = cast_single_decimal_to_integer::<D, T::Native>(...);
value_builder.append_value(v?);
}
}
}
} else {
if cast_options.safe {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = cast_single_decimal_to_integer::<D, T::Native>(...);
value_builder.append_option(v.ok());
}
}
} else {
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let v = cast_single_decimal_to_integer::<D, T::Native>(...);
value_builder.append_value(v?);
}
}
}
}
```
If you wanted to simplify a bit, you could define and use a local macro
inside this function:
```rust
// Helper macro for emitting nearly the same loop every time, so we can
hoist branches out.
// The compiler will specialize the resulting code (inlining and jump
threading)
macro_rules! cast_loop {
(|$v:ident| $body:expr) => {{
for i in 0..array.len() {
if array.is_null(i) {
value_builder.append_null();
} else {
let $v = cast_single_decimal_to_integer::<D, T::Native>(...);
$body
}
}
}};
}
if scale < 0 {
if cast_options.safe {
cast_loop!(|v| value_builder.append_option(v.ok()));
} else {
cast_loop!(|v| value_builder.append_value(v?));
}
} else {
if cast_options.safe {
cast_loop!(|v| value_builder.append_option(v.ok()));
} else {
cast_loop!(|v| value_builder.append_value(v?));
}
}
```
</details>
Note that the four loop bodies are almost syntactically identical --
differing only in whether they `append_option(v.ok())` or `append_value(v?)` --
but the inlined body of `cast_single_decimal_to_integer` inside each loop will
be specialized based on the `scale < 0` check we already performed. Result:
stand-alone calls to the helper function are always safe, but we still get
maximum performance here.
##########
arrow-cast/src/cast/decimal.rs:
##########
@@ -826,84 +837,63 @@ where
let mut value_builder = PrimitiveBuilder::<T>::with_capacity(array.len());
- if scale < 0 {
- match cast_options.safe {
- true => {
- for i in 0..array.len() {
- if array.is_null(i) {
- value_builder.append_null();
- } else {
- let v = array
- .value(i)
- .mul_checked(div)
- .ok()
- .and_then(<T::Native as
NumCast>::from::<D::Native>);
- value_builder.append_option(v);
- }
- }
- }
- false => {
- for i in 0..array.len() {
- if array.is_null(i) {
- value_builder.append_null();
- } else {
- let v = array.value(i).mul_checked(div)?;
-
- let value =
- <T::Native as
NumCast>::from::<D::Native>(v).ok_or_else(|| {
- ArrowError::CastError(format!(
- "value of {:?} is out of range {}",
- v,
- T::DATA_TYPE
- ))
- })?;
-
- value_builder.append_value(value);
- }
- }
- }
- }
- } else {
- match cast_options.safe {
- true => {
- for i in 0..array.len() {
- if array.is_null(i) {
- value_builder.append_null();
- } else {
- let v = array
- .value(i)
- .div_checked(div)
- .ok()
- .and_then(<T::Native as
NumCast>::from::<D::Native>);
- value_builder.append_option(v);
- }
+ for i in 0..array.len() {
+ if array.is_null(i) {
+ value_builder.append_null();
+ } else {
+ match cast_options.safe {
+ true => {
+ let v = cast_single_decimal_to_integer::<D, T::Native>(
+ array.value(i),
+ div,
+ scale as _,
Review Comment:
Why are we casting? Isn't it a trivial i16 -> i16 cast?
(again below)
##########
arrow-cast/src/cast/decimal.rs:
##########
@@ -802,6 +802,17 @@ where
}
}
+/// Cast a single floating point value to a decimal native with the given
multiple.
+/// Returns `None` if the value cannot be represented with the requested
precision.
+#[inline]
+pub fn single_float_to_decimal<D>(input: f64, mul: f64) -> Option<D::Native>
+where
+ D: DecimalType + ArrowPrimitiveType,
+ <D as ArrowPrimitiveType>::Native: DecimalCast,
+{
+ D::Native::from_f64((mul * input).round())
Review Comment:
The original code used `mul_checked` instead of `*` -- do we have some
guarantee that overflow panic is impossible?
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -2314,10 +2331,10 @@ where
Int32 => cast_decimal_to_integer::<D, Int32Type>(array, base, *scale,
cast_options),
Int64 => cast_decimal_to_integer::<D, Int64Type>(array, base, *scale,
cast_options),
Float32 => cast_decimal_to_float::<D, Float32Type, _>(array, |x| {
- (as_float(x) / 10_f64.powi(*scale as i32)) as f32
+ single_decimal_to_float_lossy::<D, F>(&as_float, x, *scale as _)
as f32
}),
Float64 => cast_decimal_to_float::<D, Float64Type, _>(array, |x| {
- as_float(x) / 10_f64.powi(*scale as i32)
+ single_decimal_to_float_lossy::<D, F>(&as_float, x, *scale as _)
Review Comment:
If we're anyway changing the code, `i32::from(*scale)` makes clear that this
is a lossless conversion
(a bunch more similarly lossless `as _` below)
##########
parquet-variant/src/variant.rs:
##########
@@ -1106,6 +1181,25 @@ impl<'m, 'v> Variant<'m, 'v> {
self.as_num()
}
+ fn convert_string_to_decimal<VD, D>(input: &str) -> Option<VD>
Review Comment:
nit: If you swap the template order, I callers would be a tad more readable,
e.g.:
```rust
convert_string_to_decimal::<Decimal32Type, _>
```
##########
parquet-variant/src/variant.rs:
##########
@@ -1106,6 +1181,25 @@ impl<'m, 'v> Variant<'m, 'v> {
self.as_num()
}
+ fn convert_string_to_decimal<VD, D>(input: &str) -> Option<VD>
+ where
+ D: DecimalType,
+ VD: VariantDecimalType<Native = D::Native>,
+ D::Native: NumCast + DecimalCast,
+ {
+ // find the last '.'
+ let scale_usize = input
+ .rsplit_once('.')
+ .map(|(_, frac)| frac.len())
+ .unwrap_or(0);
+
+ let scale = u8::try_from(scale_usize).ok()?;
+
+ parse_string_to_decimal_native::<D>(input, scale_usize)
+ .ok()
+ .and_then(|raw| VD::try_new(raw, scale).ok())
Review Comment:
nit
```suggestion
let raw = parse_string_to_decimal_native::<D>(input,
scale_usize).ok()?;
VD::try_new(raw, scale).ok()
```
##########
parquet-variant/src/variant.rs:
##########
@@ -1106,6 +1181,25 @@ impl<'m, 'v> Variant<'m, 'v> {
self.as_num()
}
+ fn convert_string_to_decimal<VD, D>(input: &str) -> Option<VD>
+ where
+ D: DecimalType,
+ VD: VariantDecimalType<Native = D::Native>,
+ D::Native: NumCast + DecimalCast,
+ {
+ // find the last '.'
+ let scale_usize = input
+ .rsplit_once('.')
+ .map(|(_, frac)| frac.len())
+ .unwrap_or(0);
Review Comment:
nit
```suggestion
.map_or_else(0, |(_, frac)| frac.len());
```
or even
```suggestion
.map_or_default(|(_, frac)| frac.len());
```
##########
parquet-variant/src/variant.rs:
##########
@@ -1199,14 +1317,27 @@ impl<'m, 'v> Variant<'m, 'v> {
/// let v1 = Variant::from(VariantDecimal4::try_new(1234_i32, 2).unwrap());
/// assert_eq!(v1.as_decimal16(), VariantDecimal16::try_new(1234_i128,
2).ok());
///
+ /// // or from a string variant if it can be parsed as decimal
+ /// let v2 = Variant::from("123.45");
+ /// assert_eq!(v2.as_decimal16(), VariantDecimal16::try_new(12345,
2).ok());
+ ///
/// // but not if the variant is not a decimal
- /// let v2 = Variant::from("hello!");
- /// assert_eq!(v2.as_decimal16(), None);
+ /// let v3 = Variant::from("hello!");
+ /// assert_eq!(v3.as_decimal16(), None);
/// ```
pub fn as_decimal16(&self) -> Option<VariantDecimal16> {
match *self {
- Variant::Int8(_) | Variant::Int16(_) | Variant::Int32(_) |
Variant::Int64(_) => {
- self.as_num::<i128>().and_then(|x| x.try_into().ok())
+ Variant::Int8(_) | Variant::Int16(_) | Variant::Int32(_) |
Variant::Int64(_) => self
+ .as_num::<i64>()
+ .map(|x| (x as i128).try_into().ok())
Review Comment:
Out of curiosity, why not just `as_num::<i128>` directly?
But if you must keep the double cast, at least do `i128::from(x)` to make
clear it's lossless.
##########
parquet-variant/src/variant.rs:
##########
@@ -815,9 +878,21 @@ impl<'m, 'v> Variant<'m, 'v> {
Variant::Int64(i) => num_cast(i),
Variant::Float(f) => num_cast(f),
Variant::Double(d) => num_cast(d),
- Variant::Decimal4(d) if d.scale() == 0 => num_cast(d.integer()),
- Variant::Decimal8(d) if d.scale() == 0 => num_cast(d.integer()),
- Variant::Decimal16(d) if d.scale() == 0 => num_cast(d.integer()),
+ Variant::Decimal4(d) => Self::cast_decimal_to_num::<Decimal32Type,
T, _>(
+ d.integer(),
+ d.scale(),
+ |x: i32| x as f64,
+ ),
+ Variant::Decimal8(d) => Self::cast_decimal_to_num::<Decimal64Type,
T, _>(
+ d.integer(),
+ d.scale(),
+ |x: i64| x as f64,
+ ),
+ Variant::Decimal16(d) =>
Self::cast_decimal_to_num::<Decimal128Type, T, _>(
+ d.integer(),
+ d.scale(),
+ |x: i128| x as f64,
Review Comment:
I'm a bit surprised those type annotations are necessary when the first arg
takes `D::Native` and should thus constrain the third arg's `F: fn(D::Native)
-> f64?
##########
arrow-cast/src/cast/mod.rs:
##########
@@ -72,9 +72,26 @@ use arrow_schema::*;
use arrow_select::take::take;
use num_traits::{NumCast, ToPrimitive, cast::AsPrimitive};
-pub use decimal::{DecimalCast, rescale_decimal};
+pub use decimal::{
+ DecimalCast, cast_single_decimal_to_integer,
parse_string_to_decimal_native, rescale_decimal,
+ single_float_to_decimal,
+};
pub use string::cast_single_string_to_boolean_default;
+/// Lossy conversion from decimal to float.
+///
+/// Conversion is lossy and follows standard floating point semantics. Values
+/// that exceed the representable range become `INFINITY` or `-INFINITY`
without
+/// returning an error.
+#[inline]
+pub fn single_decimal_to_float_lossy<D, F>(f: &F, x: D::Native, scale: i32) ->
f64
+where
+ D: DecimalType,
+ F: Fn(D::Native) -> f64,
+{
+ f(x) / 10_f64.powi(scale)
Review Comment:
fmul is drastically cheaper than fdiv on every architecture I know of. As
in, 5-10x higher ops/second.
Any reason we shouldn't switch?
```suggestion
f(x) * 10_f64.powi(-scale)
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]