This is an automated email from the ASF dual-hosted git repository.

nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 0f64726  ARROW-11778: [Rust] Cast from LargeUtf8 to Numerical and 
temporal types
0f64726 is described below

commit 0f647261058892289b1722e5883f8610cc5dd3d9
Author: Ritchie Vink <[email protected]>
AuthorDate: Sat Feb 27 11:05:46 2021 +0200

    ARROW-11778: [Rust] Cast from LargeUtf8 to Numerical and temporal types
    
    Sorry that the PR's are not more clustered, but they occur to me in the 
wild.
    
    This PR allows casting from LargeUtf8 to numerical and temporal types. It 
also modifies the already existing string to temporal casts such that it uses 
the new faster `from_trusted_length` iterator API.
    
    Closes #9571 from ritchie46/large_utf8_to_numerical
    
    Lead-authored-by: Ritchie Vink <[email protected]>
    Co-authored-by: Neville Dipale <[email protected]>
    Signed-off-by: Neville Dipale <[email protected]>
---
 rust/arrow/src/compute/kernels/cast.rs | 227 ++++++++++++++++++++++-----------
 1 file changed, 153 insertions(+), 74 deletions(-)

diff --git a/rust/arrow/src/compute/kernels/cast.rs 
b/rust/arrow/src/compute/kernels/cast.rs
index 9a547bd..0d8dc82 100644
--- a/rust/arrow/src/compute/kernels/cast.rs
+++ b/rust/arrow/src/compute/kernels/cast.rs
@@ -88,6 +88,10 @@ pub fn can_cast_types(from_type: &DataType, to_type: 
&DataType) -> bool {
         (Utf8, Date64) => true,
         (Utf8, Timestamp(TimeUnit::Nanosecond, None)) => true,
         (Utf8, _) => DataType::is_numeric(to_type),
+        (LargeUtf8, Date32) => true,
+        (LargeUtf8, Date64) => true,
+        (LargeUtf8, Timestamp(TimeUnit::Nanosecond, None)) => true,
+        (LargeUtf8, _) => DataType::is_numeric(to_type),
         (_, Utf8) | (_, LargeUtf8) => {
             DataType::is_numeric(from_type) || from_type == &Binary
         }
@@ -366,66 +370,20 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> 
Result<ArrayRef> {
         },
         (Utf8, _) => match to_type {
             LargeUtf8 => cast_str_container::<i32, i64>(&**array),
-            UInt8 => cast_string_to_numeric::<UInt8Type>(array),
-            UInt16 => cast_string_to_numeric::<UInt16Type>(array),
-            UInt32 => cast_string_to_numeric::<UInt32Type>(array),
-            UInt64 => cast_string_to_numeric::<UInt64Type>(array),
-            Int8 => cast_string_to_numeric::<Int8Type>(array),
-            Int16 => cast_string_to_numeric::<Int16Type>(array),
-            Int32 => cast_string_to_numeric::<Int32Type>(array),
-            Int64 => cast_string_to_numeric::<Int64Type>(array),
-            Float32 => cast_string_to_numeric::<Float32Type>(array),
-            Float64 => cast_string_to_numeric::<Float64Type>(array),
-            Date32 => {
-                use chrono::Datelike;
-                let string_array = 
array.as_any().downcast_ref::<StringArray>().unwrap();
-                let mut builder = 
PrimitiveBuilder::<Date32Type>::new(string_array.len());
-                for i in 0..string_array.len() {
-                    if string_array.is_null(i) {
-                        builder.append_null()?;
-                    } else {
-                        match 
string_array.value(i).parse::<chrono::NaiveDate>() {
-                            Ok(date) => builder.append_value(
-                                date.num_days_from_ce() - EPOCH_DAYS_FROM_CE,
-                            )?,
-                            Err(_) => builder.append_null()?, // not a valid 
date
-                        };
-                    }
-                }
-                Ok(Arc::new(builder.finish()) as ArrayRef)
-            }
-            Date64 => {
-                let string_array = 
array.as_any().downcast_ref::<StringArray>().unwrap();
-                let mut builder = 
PrimitiveBuilder::<Date64Type>::new(string_array.len());
-                for i in 0..string_array.len() {
-                    if string_array.is_null(i) {
-                        builder.append_null()?;
-                    } else {
-                        match 
string_array.value(i).parse::<chrono::NaiveDateTime>() {
-                            Ok(date_time) => {
-                                
builder.append_value(date_time.timestamp_millis())?
-                            }
-                            Err(_) => builder.append_null()?, // not a valid 
date
-                        };
-                    }
-                }
-                Ok(Arc::new(builder.finish()) as ArrayRef)
-            }
+            UInt8 => cast_string_to_numeric::<UInt8Type, i32>(array),
+            UInt16 => cast_string_to_numeric::<UInt16Type, i32>(array),
+            UInt32 => cast_string_to_numeric::<UInt32Type, i32>(array),
+            UInt64 => cast_string_to_numeric::<UInt64Type, i32>(array),
+            Int8 => cast_string_to_numeric::<Int8Type, i32>(array),
+            Int16 => cast_string_to_numeric::<Int16Type, i32>(array),
+            Int32 => cast_string_to_numeric::<Int32Type, i32>(array),
+            Int64 => cast_string_to_numeric::<Int64Type, i32>(array),
+            Float32 => cast_string_to_numeric::<Float32Type, i32>(array),
+            Float64 => cast_string_to_numeric::<Float64Type, i32>(array),
+            Date32 => cast_string_to_date32::<i32>(&**array),
+            Date64 => cast_string_to_date64::<i32>(&**array),
             Timestamp(TimeUnit::Nanosecond, None) => {
-                let string_array = 
array.as_any().downcast_ref::<StringArray>().unwrap();
-                let mut builder =
-                    
PrimitiveBuilder::<TimestampNanosecondType>::new(string_array.len());
-                for i in 0..string_array.len() {
-                    if string_array.is_null(i) {
-                        builder.append_null()?;
-                    } else {
-                        match string_to_timestamp_nanos(string_array.value(i)) 
{
-                            Ok(nanos) => builder.append_value(nanos)?,
-                            Err(_) => builder.append_null()?, // not a valid 
date
-                        };
-                    }
-                }
-                Ok(Arc::new(builder.finish()) as ArrayRef)
+                cast_string_to_timestamp_ns::<i32>(&**array)
             }
             _ => Err(ArrowError::ComputeError(format!(
                 "Casting from {:?} to {:?} not supported",
@@ -487,6 +445,27 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) -> 
Result<ArrayRef> {
                 from_type, to_type,
             ))),
         },
+        (LargeUtf8, _) => match to_type {
+            UInt8 => cast_string_to_numeric::<UInt8Type, i64>(array),
+            UInt16 => cast_string_to_numeric::<UInt16Type, i64>(array),
+            UInt32 => cast_string_to_numeric::<UInt32Type, i64>(array),
+            UInt64 => cast_string_to_numeric::<UInt64Type, i64>(array),
+            Int8 => cast_string_to_numeric::<Int8Type, i64>(array),
+            Int16 => cast_string_to_numeric::<Int16Type, i64>(array),
+            Int32 => cast_string_to_numeric::<Int32Type, i64>(array),
+            Int64 => cast_string_to_numeric::<Int64Type, i64>(array),
+            Float32 => cast_string_to_numeric::<Float32Type, i64>(array),
+            Float64 => cast_string_to_numeric::<Float64Type, i64>(array),
+            Date32 => cast_string_to_date32::<i64>(&**array),
+            Date64 => cast_string_to_date64::<i64>(&**array),
+            Timestamp(TimeUnit::Nanosecond, None) => {
+                cast_string_to_timestamp_ns::<i64>(&**array)
+            }
+            _ => Err(ArrowError::ComputeError(format!(
+                "Casting from {:?} to {:?} not supported",
+                from_type, to_type,
+            ))),
+        },
 
         // start numeric casts
         (UInt8, UInt16) => cast_numeric_arrays::<UInt8Type, UInt16Type>(array),
@@ -949,17 +928,23 @@ where
 
 /// Cast numeric types to Utf8
 #[allow(clippy::unnecessary_wraps)]
-fn cast_string_to_numeric<T>(from: &ArrayRef) -> Result<ArrayRef>
+fn cast_string_to_numeric<T, Offset: StringOffsetSizeTrait>(
+    from: &ArrayRef,
+) -> Result<ArrayRef>
 where
     T: ArrowNumericType,
     <T as ArrowPrimitiveType>::Native: lexical_core::FromLexical,
 {
-    Ok(Arc::new(string_to_numeric_cast::<T>(
-        from.as_any().downcast_ref::<StringArray>().unwrap(),
+    Ok(Arc::new(string_to_numeric_cast::<T, Offset>(
+        from.as_any()
+            .downcast_ref::<GenericStringArray<Offset>>()
+            .unwrap(),
     )))
 }
 
-fn string_to_numeric_cast<T>(from: &StringArray) -> PrimitiveArray<T>
+fn string_to_numeric_cast<T, Offset: StringOffsetSizeTrait>(
+    from: &GenericStringArray<Offset>,
+) -> PrimitiveArray<T>
 where
     T: ArrowNumericType,
     <T as ArrowPrimitiveType>::Native: lexical_core::FromLexical,
@@ -978,6 +963,93 @@ where
     unsafe { PrimitiveArray::<T>::from_trusted_len_iter(iter) }
 }
 
+/// Casts generic string arrays to Date32Array
+#[allow(clippy::unnecessary_wraps)]
+fn cast_string_to_date32<Offset: StringOffsetSizeTrait>(
+    array: &dyn Array,
+) -> Result<ArrayRef> {
+    use chrono::Datelike;
+    let string_array = array
+        .as_any()
+        .downcast_ref::<GenericStringArray<Offset>>()
+        .unwrap();
+
+    let iter = (0..string_array.len()).map(|i| {
+        if string_array.is_null(i) {
+            None
+        } else {
+            string_array
+                .value(i)
+                .parse::<chrono::NaiveDate>()
+                .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
+                .ok()
+        }
+    });
+
+    // Benefit:
+    //     20% performance improvement
+    // Soundness:
+    //     The iterator is trustedLen because it comes from an `StringArray`.
+    let array = unsafe { Date32Array::from_trusted_len_iter(iter) };
+    Ok(Arc::new(array) as ArrayRef)
+}
+
+/// Casts generic string arrays to Date64Array
+#[allow(clippy::unnecessary_wraps)]
+fn cast_string_to_date64<Offset: StringOffsetSizeTrait>(
+    array: &dyn Array,
+) -> Result<ArrayRef> {
+    let string_array = array
+        .as_any()
+        .downcast_ref::<GenericStringArray<Offset>>()
+        .unwrap();
+
+    let iter = (0..string_array.len()).map(|i| {
+        if string_array.is_null(i) {
+            None
+        } else {
+            string_array
+                .value(i)
+                .parse::<chrono::NaiveDateTime>()
+                .map(|datetime| datetime.timestamp_millis())
+                .ok()
+        }
+    });
+
+    // Benefit:
+    //     20% performance improvement
+    // Soundness:
+    //     The iterator is trustedLen because it comes from an `StringArray`.
+    let array = unsafe { Date64Array::from_trusted_len_iter(iter) };
+    Ok(Arc::new(array) as ArrayRef)
+}
+
+/// Casts generic string arrays to TimeStampNanosecondArray
+#[allow(clippy::unnecessary_wraps)]
+fn cast_string_to_timestamp_ns<Offset: StringOffsetSizeTrait>(
+    array: &dyn Array,
+) -> Result<ArrayRef> {
+    let string_array = array
+        .as_any()
+        .downcast_ref::<GenericStringArray<Offset>>()
+        .unwrap();
+
+    let iter = (0..string_array.len()).map(|i| {
+        if string_array.is_null(i) {
+            None
+        } else {
+            string_to_timestamp_nanos(string_array.value(i)).ok()
+        }
+    });
+
+    // Benefit:
+    //     20% performance improvement
+    // Soundness:
+    //     The iterator is trustedLen because it comes from an `StringArray`.
+    let array = unsafe { TimestampNanosecondArray::from_trusted_len_iter(iter) 
};
+    Ok(Arc::new(array) as ArrayRef)
+}
+
 /// Cast numeric types to Boolean
 ///
 /// Any zero value returns `false` while non-zero returns `true`
@@ -1719,20 +1791,27 @@ mod tests {
 
     #[test]
     fn test_cast_string_to_timestamp() {
-        let a = StringArray::from(vec![
+        let a1 = Arc::new(StringArray::from(vec![
             Some("2020-09-08T12:00:00+00:00"),
             Some("Not a valid date"),
             None,
-        ]);
-        let array = Arc::new(a) as ArrayRef;
-        let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond, 
None)).unwrap();
-        let c = b
-            .as_any()
-            .downcast_ref::<TimestampNanosecondArray>()
-            .unwrap();
-        assert_eq!(1599566400000000000, c.value(0));
-        assert!(c.is_null(1));
-        assert!(c.is_null(2));
+        ])) as ArrayRef;
+        let a2 = Arc::new(LargeStringArray::from(vec![
+            Some("2020-09-08T12:00:00+00:00"),
+            Some("Not a valid date"),
+            None,
+        ])) as ArrayRef;
+        for array in &[a1, a2] {
+            let b =
+                cast(array, &DataType::Timestamp(TimeUnit::Nanosecond, 
None)).unwrap();
+            let c = b
+                .as_any()
+                .downcast_ref::<TimestampNanosecondArray>()
+                .unwrap();
+            assert_eq!(1599566400000000000, c.value(0));
+            assert!(c.is_null(1));
+            assert!(c.is_null(2));
+        }
     }
 
     #[test]

Reply via email to