scovich commented on code in PR #9689:
URL: https://github.com/apache/arrow-rs/pull/9689#discussion_r3065514546


##########
arrow-cast/src/cast/decimal.rs:
##########
@@ -826,84 +837,63 @@ where
 
     let mut value_builder = PrimitiveBuilder::<T>::with_capacity(array.len());
 
-    if scale < 0 {
-        match cast_options.safe {
-            true => {
-                for i in 0..array.len() {
-                    if array.is_null(i) {
-                        value_builder.append_null();
-                    } else {
-                        let v = array
-                            .value(i)
-                            .mul_checked(div)
-                            .ok()
-                            .and_then(<T::Native as 
NumCast>::from::<D::Native>);
-                        value_builder.append_option(v);
-                    }
-                }
-            }
-            false => {
-                for i in 0..array.len() {
-                    if array.is_null(i) {
-                        value_builder.append_null();
-                    } else {
-                        let v = array.value(i).mul_checked(div)?;
-
-                        let value =
-                            <T::Native as 
NumCast>::from::<D::Native>(v).ok_or_else(|| {
-                                ArrowError::CastError(format!(
-                                    "value of {:?} is out of range {}",
-                                    v,
-                                    T::DATA_TYPE
-                                ))
-                            })?;
-
-                        value_builder.append_value(value);
-                    }
-                }
-            }
-        }
-    } else {
-        match cast_options.safe {
-            true => {
-                for i in 0..array.len() {
-                    if array.is_null(i) {
-                        value_builder.append_null();
-                    } else {
-                        let v = array
-                            .value(i)
-                            .div_checked(div)
-                            .ok()
-                            .and_then(<T::Native as 
NumCast>::from::<D::Native>);
-                        value_builder.append_option(v);
-                    }
+    for i in 0..array.len() {
+        if array.is_null(i) {
+            value_builder.append_null();
+        } else {
+            match cast_options.safe {
+                true => {
+                    let v = cast_single_decimal_to_integer::<D, T::Native>(

Review Comment:
   The original code hoisted checks for `scale < 0` (`mul_checked` vs. 
`div_checked`) and `cast_options.safe` (NULL vs. error) outside the loop, 
producing four simple loop bodies. This was presumably done for performance 
reasons (minimizing branching inside the loop).
   
   The new code pushes the `cast_options.safe` check inside a single loop and 
pushes `scale < 0` check all the way down inside 
`cast_single_decimal_to_integer`. That triples the number of branches inside 
the loop body (the null check is per-row and so is always stuck inside the 
loop). Performance will almost certainly be impacted, possibly significantly.
   
   It would be safer to just preserve the replication (even tho it duplicates 
logic with the new helper), and rely on the compiler's inlining and "jump 
threading" optimizations to eliminate that redundancy:
   
   <details>
   <summary>code snippet</summary>
   
   ```rust
   if scale < 0 {
       if cast_options.safe {
           for i in 0..array.len() {
               if array.is_null(i) {
                   value_builder.append_null();
               } else {
                   let v = cast_single_decimal_to_integer::<D, T::Native>(...);
                   value_builder.append_option(v.ok());
               }
           }
       } else {
           for i in 0..array.len() {
               if array.is_null(i) {
                   value_builder.append_null();
               } else {
                   let v = cast_single_decimal_to_integer::<D, T::Native>(...);
                   value_builder.append_value(v?);
               }
           }
       }
   } else {
       if cast_options.safe {
           for i in 0..array.len() {
               if array.is_null(i) {
                   value_builder.append_null();
               } else {
                   let v = cast_single_decimal_to_integer::<D, T::Native>(...);
                   value_builder.append_option(v.ok());
               }
           }
       } else {
           for i in 0..array.len() {
               if array.is_null(i) {
                   value_builder.append_null();
               } else {
                   let v = cast_single_decimal_to_integer::<D, T::Native>(...);
                   value_builder.append_value(v?);
               }
           }
       }
   }
   ```
   
   If you wanted to simplify a bit, you could define and use a local macro 
inside this function:
   ```rust
   // Helper macro for emitting nearly the same loop every time, so we can 
hoist branches out.
   // The compiler will specialize the resulting code (inlining and jump 
threading)
   macro_rules! cast_loop {
       (|$v:ident| $body:expr) => {{
           for i in 0..array.len() {
               if array.is_null(i) {
                   value_builder.append_null();
               } else {
                   let $v = cast_single_decimal_to_integer::<D, T::Native>(...);
                   $body
               }
           }
       }};
   }
   
   if scale < 0 {
       if cast_options.safe {
           cast_loop!(|v| value_builder.append_option(v.ok()));
       } else {
           cast_loop!(|v| value_builder.append_value(v?));
       }
   } else {
       if cast_options.safe {
           cast_loop!(|v| value_builder.append_option(v.ok()));
       } else {
           cast_loop!(|v| value_builder.append_value(v?));
       }
   }
   ```
   
   </details>
   
   Note that the four loop bodies are almost syntactically identical -- 
differing only in whether they `append_option(v.ok())` or `append_value(v?)` -- 
but the inlined body of `cast_single_decimal_to_integer` inside each loop will 
be specialized based on the `scale < 0` check we already performed. Result: 
stand-alone calls to the helper function are always safe, but we still get 
maximum performance here.



##########
arrow-cast/src/cast/decimal.rs:
##########
@@ -826,84 +837,63 @@ where
 
     let mut value_builder = PrimitiveBuilder::<T>::with_capacity(array.len());
 
-    if scale < 0 {
-        match cast_options.safe {
-            true => {
-                for i in 0..array.len() {
-                    if array.is_null(i) {
-                        value_builder.append_null();
-                    } else {
-                        let v = array
-                            .value(i)
-                            .mul_checked(div)
-                            .ok()
-                            .and_then(<T::Native as 
NumCast>::from::<D::Native>);
-                        value_builder.append_option(v);
-                    }
-                }
-            }
-            false => {
-                for i in 0..array.len() {
-                    if array.is_null(i) {
-                        value_builder.append_null();
-                    } else {
-                        let v = array.value(i).mul_checked(div)?;
-
-                        let value =
-                            <T::Native as 
NumCast>::from::<D::Native>(v).ok_or_else(|| {
-                                ArrowError::CastError(format!(
-                                    "value of {:?} is out of range {}",
-                                    v,
-                                    T::DATA_TYPE
-                                ))
-                            })?;
-
-                        value_builder.append_value(value);
-                    }
-                }
-            }
-        }
-    } else {
-        match cast_options.safe {
-            true => {
-                for i in 0..array.len() {
-                    if array.is_null(i) {
-                        value_builder.append_null();
-                    } else {
-                        let v = array
-                            .value(i)
-                            .div_checked(div)
-                            .ok()
-                            .and_then(<T::Native as 
NumCast>::from::<D::Native>);
-                        value_builder.append_option(v);
-                    }
+    for i in 0..array.len() {
+        if array.is_null(i) {
+            value_builder.append_null();
+        } else {
+            match cast_options.safe {
+                true => {
+                    let v = cast_single_decimal_to_integer::<D, T::Native>(
+                        array.value(i),
+                        div,
+                        scale as _,

Review Comment:
   Why are we casting? Isn't it a trivial i16 -> i16 cast?
   (again below)



##########
arrow-cast/src/cast/decimal.rs:
##########
@@ -802,6 +802,17 @@ where
     }
 }
 
+/// Cast a single floating point value to a decimal native with the given 
multiple.
+/// Returns `None` if the value cannot be represented with the requested 
precision.
+#[inline]
+pub fn single_float_to_decimal<D>(input: f64, mul: f64) -> Option<D::Native>
+where
+    D: DecimalType + ArrowPrimitiveType,
+    <D as ArrowPrimitiveType>::Native: DecimalCast,
+{
+    D::Native::from_f64((mul * input).round())

Review Comment:
   The original code used `mul_checked` instead of `*` -- do we have some 
guarantee that overflow panic is impossible?



##########
arrow-cast/src/cast/mod.rs:
##########
@@ -2314,10 +2331,10 @@ where
         Int32 => cast_decimal_to_integer::<D, Int32Type>(array, base, *scale, 
cast_options),
         Int64 => cast_decimal_to_integer::<D, Int64Type>(array, base, *scale, 
cast_options),
         Float32 => cast_decimal_to_float::<D, Float32Type, _>(array, |x| {
-            (as_float(x) / 10_f64.powi(*scale as i32)) as f32
+            single_decimal_to_float_lossy::<D, F>(&as_float, x, *scale as _) 
as f32
         }),
         Float64 => cast_decimal_to_float::<D, Float64Type, _>(array, |x| {
-            as_float(x) / 10_f64.powi(*scale as i32)
+            single_decimal_to_float_lossy::<D, F>(&as_float, x, *scale as _)

Review Comment:
   If we're anyway changing the code, `i32::from(*scale)` makes clear that this 
is a lossless conversion
   
   (a bunch more similarly lossless `as _` below)



##########
parquet-variant/src/variant.rs:
##########
@@ -1106,6 +1181,25 @@ impl<'m, 'v> Variant<'m, 'v> {
         self.as_num()
     }
 
+    fn convert_string_to_decimal<VD, D>(input: &str) -> Option<VD>

Review Comment:
   nit: If you swap the template order, I callers would be a tad more readable, 
e.g.:
   ```rust
   convert_string_to_decimal::<Decimal32Type, _>
   ```



##########
parquet-variant/src/variant.rs:
##########
@@ -1106,6 +1181,25 @@ impl<'m, 'v> Variant<'m, 'v> {
         self.as_num()
     }
 
+    fn convert_string_to_decimal<VD, D>(input: &str) -> Option<VD>
+    where
+        D: DecimalType,
+        VD: VariantDecimalType<Native = D::Native>,
+        D::Native: NumCast + DecimalCast,
+    {
+        // find the last '.'
+        let scale_usize = input
+            .rsplit_once('.')
+            .map(|(_, frac)| frac.len())
+            .unwrap_or(0);
+
+        let scale = u8::try_from(scale_usize).ok()?;
+
+        parse_string_to_decimal_native::<D>(input, scale_usize)
+            .ok()
+            .and_then(|raw| VD::try_new(raw, scale).ok())

Review Comment:
   nit
   ```suggestion
           let raw = parse_string_to_decimal_native::<D>(input, 
scale_usize).ok()?;
           VD::try_new(raw, scale).ok()
   ```



##########
parquet-variant/src/variant.rs:
##########
@@ -1106,6 +1181,25 @@ impl<'m, 'v> Variant<'m, 'v> {
         self.as_num()
     }
 
+    fn convert_string_to_decimal<VD, D>(input: &str) -> Option<VD>
+    where
+        D: DecimalType,
+        VD: VariantDecimalType<Native = D::Native>,
+        D::Native: NumCast + DecimalCast,
+    {
+        // find the last '.'
+        let scale_usize = input
+            .rsplit_once('.')
+            .map(|(_, frac)| frac.len())
+            .unwrap_or(0);

Review Comment:
   nit
   ```suggestion
               .map_or_else(0, |(_, frac)| frac.len());
   ```
   or even
   
   ```suggestion
               .map_or_default(|(_, frac)| frac.len());
   ```



##########
parquet-variant/src/variant.rs:
##########
@@ -1199,14 +1317,27 @@ impl<'m, 'v> Variant<'m, 'v> {
     /// let v1 = Variant::from(VariantDecimal4::try_new(1234_i32, 2).unwrap());
     /// assert_eq!(v1.as_decimal16(), VariantDecimal16::try_new(1234_i128, 
2).ok());
     ///
+    /// // or from a string variant if it can be parsed as decimal
+    /// let v2 = Variant::from("123.45");
+    /// assert_eq!(v2.as_decimal16(), VariantDecimal16::try_new(12345, 
2).ok());
+    ///
     /// // but not if the variant is not a decimal
-    /// let v2 = Variant::from("hello!");
-    /// assert_eq!(v2.as_decimal16(), None);
+    /// let v3 = Variant::from("hello!");
+    /// assert_eq!(v3.as_decimal16(), None);
     /// ```
     pub fn as_decimal16(&self) -> Option<VariantDecimal16> {
         match *self {
-            Variant::Int8(_) | Variant::Int16(_) | Variant::Int32(_) | 
Variant::Int64(_) => {
-                self.as_num::<i128>().and_then(|x| x.try_into().ok())
+            Variant::Int8(_) | Variant::Int16(_) | Variant::Int32(_) | 
Variant::Int64(_) => self
+                .as_num::<i64>()
+                .map(|x| (x as i128).try_into().ok())

Review Comment:
   Out of curiosity, why not just `as_num::<i128>` directly?
   But if you must keep the double cast, at least do `i128::from(x)` to make 
clear it's lossless.



##########
parquet-variant/src/variant.rs:
##########
@@ -815,9 +878,21 @@ impl<'m, 'v> Variant<'m, 'v> {
             Variant::Int64(i) => num_cast(i),
             Variant::Float(f) => num_cast(f),
             Variant::Double(d) => num_cast(d),
-            Variant::Decimal4(d) if d.scale() == 0 => num_cast(d.integer()),
-            Variant::Decimal8(d) if d.scale() == 0 => num_cast(d.integer()),
-            Variant::Decimal16(d) if d.scale() == 0 => num_cast(d.integer()),
+            Variant::Decimal4(d) => Self::cast_decimal_to_num::<Decimal32Type, 
T, _>(
+                d.integer(),
+                d.scale(),
+                |x: i32| x as f64,
+            ),
+            Variant::Decimal8(d) => Self::cast_decimal_to_num::<Decimal64Type, 
T, _>(
+                d.integer(),
+                d.scale(),
+                |x: i64| x as f64,
+            ),
+            Variant::Decimal16(d) => 
Self::cast_decimal_to_num::<Decimal128Type, T, _>(
+                d.integer(),
+                d.scale(),
+                |x: i128| x as f64,

Review Comment:
   I'm a bit surprised those type annotations are necessary when the first arg 
takes `D::Native` and should thus constrain the third arg's `F: fn(D::Native) 
-> f64?



##########
arrow-cast/src/cast/mod.rs:
##########
@@ -72,9 +72,26 @@ use arrow_schema::*;
 use arrow_select::take::take;
 use num_traits::{NumCast, ToPrimitive, cast::AsPrimitive};
 
-pub use decimal::{DecimalCast, rescale_decimal};
+pub use decimal::{
+    DecimalCast, cast_single_decimal_to_integer, 
parse_string_to_decimal_native, rescale_decimal,
+    single_float_to_decimal,
+};
 pub use string::cast_single_string_to_boolean_default;
 
+/// Lossy conversion from decimal to float.
+///
+/// Conversion is lossy and follows standard floating point semantics. Values
+/// that exceed the representable range become `INFINITY` or `-INFINITY` 
without
+/// returning an error.
+#[inline]
+pub fn single_decimal_to_float_lossy<D, F>(f: &F, x: D::Native, scale: i32) -> 
f64
+where
+    D: DecimalType,
+    F: Fn(D::Native) -> f64,
+{
+    f(x) / 10_f64.powi(scale)

Review Comment:
   fmul is drastically cheaper than fdiv on every architecture I know of. As 
in, 5-10x higher ops/second. 
   Any reason we shouldn't switch?
   ```suggestion
       f(x) * 10_f64.powi(-scale)
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to