This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 9a4b1c99d feat: Implement string cast operations for Time32 and Time64 
(#2251)
9a4b1c99d is described below

commit 9a4b1c99d7e5a3bd3c6e3bce3ba0ee154720827f
Author: Stuart Carnie <[email protected]>
AuthorDate: Tue Aug 2 22:09:55 2022 +1000

    feat: Implement string cast operations for Time32 and Time64 (#2251)
    
    * feat: Implement string cast operations for Time32 and Time64
    
    * chore: Remove unnecessary leap second handling
    
    Remove the unnecessary conditionals to extract the leap second, as it is
    already handled when converting to a time unit relative to midnight 🤦🏻‍♂️
    
    * chore: Inline trivial functions
---
 arrow/src/compute/kernels/cast.rs | 418 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 416 insertions(+), 2 deletions(-)

diff --git a/arrow/src/compute/kernels/cast.rs 
b/arrow/src/compute/kernels/cast.rs
index ea166f921..097b864fc 100644
--- a/arrow/src/compute/kernels/cast.rs
+++ b/arrow/src/compute/kernels/cast.rs
@@ -35,6 +35,7 @@
 //! assert_eq!(7.0, c.value(2));
 //! ```
 
+use chrono::Timelike;
 use std::str;
 use std::sync::Arc;
 
@@ -136,9 +137,25 @@ pub fn can_cast_types(from_type: &DataType, to_type: 
&DataType) -> bool {
 
         (Utf8, LargeUtf8) => true,
         (LargeUtf8, Utf8) => true,
-        (Utf8, Date32 | Date64 | Timestamp(TimeUnit::Nanosecond, None)) => 
true,
+        (Utf8,
+            Date32
+            | Date64
+            | Time32(TimeUnit::Second)
+            | Time32(TimeUnit::Millisecond)
+            | Time64(TimeUnit::Microsecond)
+            | Time64(TimeUnit::Nanosecond)
+            | Timestamp(TimeUnit::Nanosecond, None)
+        ) => true,
         (Utf8, _) => DataType::is_numeric(to_type),
-        (LargeUtf8, Date32 | Date64 | Timestamp(TimeUnit::Nanosecond, None)) 
=> true,
+        (LargeUtf8,
+            Date32
+            | Date64
+            | Time32(TimeUnit::Second)
+            | Time32(TimeUnit::Millisecond)
+            | Time64(TimeUnit::Microsecond)
+            | Time64(TimeUnit::Nanosecond)
+            | Timestamp(TimeUnit::Nanosecond, None)
+        ) => true,
         (LargeUtf8, _) => DataType::is_numeric(to_type),
         (Timestamp(_, _), Utf8) | (Timestamp(_, _), LargeUtf8) => true,
         (Date32, Utf8) | (Date32, LargeUtf8) => true,
@@ -659,6 +676,18 @@ pub fn cast_with_options(
             Float64 => cast_string_to_numeric::<Float64Type, i32>(array, 
cast_options),
             Date32 => cast_string_to_date32::<i32>(&**array, cast_options),
             Date64 => cast_string_to_date64::<i32>(&**array, cast_options),
+            Time32(TimeUnit::Second) => {
+                cast_string_to_time32second::<i32>(&**array, cast_options)
+            }
+            Time32(TimeUnit::Millisecond) => {
+                cast_string_to_time32millisecond::<i32>(&**array, cast_options)
+            }
+            Time64(TimeUnit::Microsecond) => {
+                cast_string_to_time64microsecond::<i32>(&**array, cast_options)
+            }
+            Time64(TimeUnit::Nanosecond) => {
+                cast_string_to_time64nanosecond::<i32>(&**array, cast_options)
+            }
             Timestamp(TimeUnit::Nanosecond, None) => {
                 cast_string_to_timestamp_ns::<i32>(&**array, cast_options)
             }
@@ -793,6 +822,18 @@ pub fn cast_with_options(
             Float64 => cast_string_to_numeric::<Float64Type, i64>(array, 
cast_options),
             Date32 => cast_string_to_date32::<i64>(&**array, cast_options),
             Date64 => cast_string_to_date64::<i64>(&**array, cast_options),
+            Time32(TimeUnit::Second) => {
+                cast_string_to_time32second::<i64>(&**array, cast_options)
+            }
+            Time32(TimeUnit::Millisecond) => {
+                cast_string_to_time32millisecond::<i64>(&**array, cast_options)
+            }
+            Time64(TimeUnit::Microsecond) => {
+                cast_string_to_time64microsecond::<i64>(&**array, cast_options)
+            }
+            Time64(TimeUnit::Nanosecond) => {
+                cast_string_to_time64nanosecond::<i64>(&**array, cast_options)
+            }
             Timestamp(TimeUnit::Nanosecond, None) => {
                 cast_string_to_timestamp_ns::<i64>(&**array, cast_options)
             }
@@ -1584,6 +1625,282 @@ fn cast_string_to_date64<Offset: OffsetSizeTrait>(
     Ok(Arc::new(array) as ArrayRef)
 }
 
+/// Casts generic string arrays to `Time32SecondArray`
+fn cast_string_to_time32second<Offset: OffsetSizeTrait>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef> {
+    /// The number of nanoseconds per millisecond.
+    const NANOS_PER_SEC: u32 = 1_000_000_000;
+
+    let string_array = array
+        .as_any()
+        .downcast_ref::<GenericStringArray<Offset>>()
+        .unwrap();
+
+    let array = if cast_options.safe {
+        let iter = (0..string_array.len()).map(|i| {
+            if string_array.is_null(i) {
+                None
+            } else {
+                string_array
+                    .value(i)
+                    .parse::<chrono::NaiveTime>()
+                    .map(|time| {
+                        (time.num_seconds_from_midnight()
+                            + time.nanosecond() / NANOS_PER_SEC)
+                            as i32
+                    })
+                    .ok()
+            }
+        });
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it comes from an 
`StringArray`.
+        unsafe { Time32SecondArray::from_trusted_len_iter(iter) }
+    } else {
+        let vec = (0..string_array.len())
+            .map(|i| {
+                if string_array.is_null(i) {
+                    Ok(None)
+                } else {
+                    let string = string_array
+                        .value(i);
+                    chrono::Duration::days(3);
+                    let result = string
+                        .parse::<chrono::NaiveTime>()
+                        .map(|time| (time.num_seconds_from_midnight() + 
time.nanosecond() / NANOS_PER_SEC) as i32);
+
+                    Some(result.map_err(|_| {
+                        ArrowError::CastError(
+                            format!("Cannot cast string '{}' to value of 
arrow::datatypes::types::Time32SecondType type", string),
+                        )
+                    }))
+                        .transpose()
+                }
+            })
+            .collect::<Result<Vec<Option<i32>>>>()?;
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it comes from an 
`StringArray`.
+        unsafe { Time32SecondArray::from_trusted_len_iter(vec.iter()) }
+    };
+
+    Ok(Arc::new(array) as ArrayRef)
+}
+
+/// Casts generic string arrays to `Time32MillisecondArray` (milliseconds since midnight)
+fn cast_string_to_time32millisecond<Offset: OffsetSizeTrait>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef> {
+    /// The number of nanoseconds per millisecond.
+    const NANOS_PER_MILLI: u32 = 1_000_000;
+    /// The number of milliseconds per second.
+    const MILLIS_PER_SEC: u32 = 1_000;
+
+    let string_array = array
+        .as_any()
+        .downcast_ref::<GenericStringArray<Offset>>()
+        .unwrap(); // panics if `array` is not a `GenericStringArray<Offset>`
+
+    let array = if cast_options.safe { // safe mode: unparseable strings become null
+        let iter = (0..string_array.len()).map(|i| {
+            if string_array.is_null(i) {
+                None
+            } else {
+                string_array
+                    .value(i)
+                    .parse::<chrono::NaiveTime>()
+                    .map(|time| {
+                        (time.num_seconds_from_midnight() * MILLIS_PER_SEC
+                            + time.nanosecond() / NANOS_PER_MILLI) // sub-second part, truncated to whole milliseconds
+                            as i32
+                    })
+                    .ok() // a parse error is discarded and the slot becomes null
+            }
+        });
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it maps the index range of the 
`StringArray`.
+        unsafe { Time32MillisecondArray::from_trusted_len_iter(iter) }
+    } else { // strict mode: the first unparseable string aborts the cast with a `CastError`
+        let vec = (0..string_array.len())
+            .map(|i| {
+                if string_array.is_null(i) {
+                    Ok(None)
+                } else {
+                    let string = string_array
+                        .value(i);
+
+                    let result = string
+                        .parse::<chrono::NaiveTime>()
+                        .map(|time| (time.num_seconds_from_midnight() * 
MILLIS_PER_SEC
+                            + time.nanosecond() / NANOS_PER_MILLI) as i32);
+
+                    Some(result.map_err(|_| {
+                        ArrowError::CastError(
+                            format!("Cannot cast string '{}' to value of 
arrow::datatypes::types::Time32MillisecondType type", string),
+                        )
+                    }))
+                        .transpose() // Option<Result<_>> -> Result<Option<_>>
+                }
+            })
+            .collect::<Result<Vec<Option<i32>>>>()?; // stops at the first `Err`
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it iterates a `Vec` collected from the 
`StringArray`.
+        unsafe { Time32MillisecondArray::from_trusted_len_iter(vec.iter()) }
+    };
+
+    Ok(Arc::new(array) as ArrayRef)
+}
+
+/// Casts generic string arrays to `Time64MicrosecondArray` (microseconds since midnight)
+fn cast_string_to_time64microsecond<Offset: OffsetSizeTrait>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef> {
+    /// The number of nanoseconds per microsecond.
+    const NANOS_PER_MICRO: i64 = 1_000;
+    /// The number of microseconds per second.
+    const MICROS_PER_SEC: i64 = 1_000_000;
+
+    let string_array = array
+        .as_any()
+        .downcast_ref::<GenericStringArray<Offset>>()
+        .unwrap(); // panics if `array` is not a `GenericStringArray<Offset>`
+
+    let array = if cast_options.safe { // safe mode: unparseable strings become null
+        let iter = (0..string_array.len()).map(|i| {
+            if string_array.is_null(i) {
+                None
+            } else {
+                string_array
+                    .value(i)
+                    .parse::<chrono::NaiveTime>()
+                    .map(|time| {
+                        time.num_seconds_from_midnight() as i64 * 
MICROS_PER_SEC
+                            + time.nanosecond() as i64 / NANOS_PER_MICRO // sub-second part, truncated to whole microseconds
+                    })
+                    .ok() // a parse error is discarded and the slot becomes null
+            }
+        });
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it maps the index range of the 
`StringArray`.
+        unsafe { Time64MicrosecondArray::from_trusted_len_iter(iter) }
+    } else { // strict mode: the first unparseable string aborts the cast with a `CastError`
+        let vec = (0..string_array.len())
+            .map(|i| {
+                if string_array.is_null(i) {
+                    Ok(None)
+                } else {
+                    let string = string_array
+                        .value(i);
+
+                    let result = string
+                        .parse::<chrono::NaiveTime>()
+                        .map(|time| time.num_seconds_from_midnight() as i64 * 
MICROS_PER_SEC
+                            + time.nanosecond() as i64 / NANOS_PER_MICRO);
+
+                    Some(result.map_err(|_| {
+                        ArrowError::CastError(
+                            format!("Cannot cast string '{}' to value of 
arrow::datatypes::types::Time64MicrosecondType type", string),
+                        )
+                    }))
+                        .transpose() // Option<Result<_>> -> Result<Option<_>>
+                }
+            })
+            .collect::<Result<Vec<Option<i64>>>>()?; // stops at the first `Err`
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it iterates a `Vec` collected from the 
`StringArray`.
+        unsafe { Time64MicrosecondArray::from_trusted_len_iter(vec.iter()) }
+    };
+
+    Ok(Arc::new(array) as ArrayRef)
+}
+
+/// Casts generic string arrays to `Time64NanosecondArray` (nanoseconds since midnight)
+fn cast_string_to_time64nanosecond<Offset: OffsetSizeTrait>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef> {
+    /// The number of nanoseconds per second.
+    const NANOS_PER_SEC: i64 = 1_000_000_000;
+
+    let string_array = array
+        .as_any()
+        .downcast_ref::<GenericStringArray<Offset>>()
+        .unwrap(); // panics if `array` is not a `GenericStringArray<Offset>`
+
+    let array = if cast_options.safe { // safe mode: unparseable strings become null
+        let iter = (0..string_array.len()).map(|i| {
+            if string_array.is_null(i) {
+                None
+            } else {
+                string_array
+                    .value(i)
+                    .parse::<chrono::NaiveTime>()
+                    .map(|time| {
+                        time.num_seconds_from_midnight() as i64 * NANOS_PER_SEC
+                            + time.nanosecond() as i64 // sub-second part kept at full nanosecond resolution
+                    })
+                    .ok() // a parse error is discarded and the slot becomes null
+            }
+        });
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it maps the index range of the 
`StringArray`.
+        unsafe { Time64NanosecondArray::from_trusted_len_iter(iter) }
+    } else { // strict mode: the first unparseable string aborts the cast with a `CastError`
+        let vec = (0..string_array.len())
+            .map(|i| {
+                if string_array.is_null(i) {
+                    Ok(None)
+                } else {
+                    let string = string_array
+                        .value(i);
+
+                    let result = string
+                        .parse::<chrono::NaiveTime>()
+                        .map(|time| time.num_seconds_from_midnight() as i64 * 
NANOS_PER_SEC + time.nanosecond() as i64);
+
+                    Some(result.map_err(|_| {
+                        ArrowError::CastError(
+                            format!("Cannot cast string '{}' to value of 
arrow::datatypes::types::Time64NanosecondType type", string),
+                        )
+                    }))
+                        .transpose() // Option<Result<_>> -> Result<Option<_>>
+                }
+            })
+            .collect::<Result<Vec<Option<i64>>>>()?; // stops at the first `Err`
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it iterates a `Vec` collected from the 
`StringArray`.
+        unsafe { Time64NanosecondArray::from_trusted_len_iter(vec.iter()) }
+    };
+
+    Ok(Arc::new(array) as ArrayRef)
+}
+
 /// Casts generic string arrays to TimeStampNanosecondArray
 fn cast_string_to_timestamp_ns<Offset: OffsetSizeTrait>(
     array: &dyn Array,
@@ -2166,6 +2483,7 @@ where
 mod tests {
     use super::*;
     use crate::array::BasicDecimalArray;
+    use crate::datatypes::TimeUnit;
     use crate::util::decimal::Decimal128;
     use crate::{buffer::Buffer, util::display::array_value_to_string};
 
@@ -2854,6 +3172,102 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_cast_string_to_time32second() {
+        let a1 = Arc::new(StringArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("08:08:60.091323414"), // leap second
+            Some("08:08:61.091323414"), // not valid: seconds field above 60
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
+        let a2 = Arc::new(LargeStringArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("08:08:60.091323414"), // leap second
+            Some("08:08:61.091323414"), // not valid: seconds field above 60
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
+        for array in &[a1, a2] { // same expectations for `StringArray` and `LargeStringArray` input
+            let b = cast(array, &DataType::Time32(TimeUnit::Second)).unwrap();
+            let c = b.as_any().downcast_ref::<Time32SecondArray>().unwrap();
+            assert_eq!(29315, c.value(0)); // 8 * 3600 + 8 * 60 + 35; fractional seconds dropped
+            assert_eq!(29340, c.value(1)); // leap second counted as one extra second: 29339 + 1
+            assert!(c.is_null(2)); // out-of-range seconds are expected to cast to null
+            assert!(c.is_null(3)); // unparseable text is expected to cast to null
+            assert!(c.is_null(4)); // null input stays null
+        }
+    }
+
+    #[test]
+    fn test_cast_string_to_time32millisecond() {
+        let a1 = Arc::new(StringArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("08:08:60.091323414"), // leap second
+            Some("08:08:61.091323414"), // not valid: seconds field above 60
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
+        let a2 = Arc::new(LargeStringArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("08:08:60.091323414"), // leap second
+            Some("08:08:61.091323414"), // not valid: seconds field above 60
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
+        for array in &[a1, a2] { // same expectations for `StringArray` and `LargeStringArray` input
+            let b = cast(array, 
&DataType::Time32(TimeUnit::Millisecond)).unwrap();
+            let c = 
b.as_any().downcast_ref::<Time32MillisecondArray>().unwrap();
+            assert_eq!(29315091, c.value(0)); // 29315 s * 1000 + 91 ms; finer digits truncated
+            assert_eq!(29340091, c.value(1)); // leap second: 29339 s * 1000 + 1091 ms
+            assert!(c.is_null(2)); // out-of-range seconds are expected to cast to null
+            assert!(c.is_null(3)); // unparseable text is expected to cast to null
+            assert!(c.is_null(4)); // null input stays null
+        }
+    }
+
+    #[test]
+    fn test_cast_string_to_time64microsecond() {
+        let a1 = Arc::new(StringArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
+        let a2 = Arc::new(LargeStringArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
+        for array in &[a1, a2] { // same expectations for `StringArray` and `LargeStringArray` input
+            let b = cast(array, 
&DataType::Time64(TimeUnit::Microsecond)).unwrap();
+            let c = 
b.as_any().downcast_ref::<Time64MicrosecondArray>().unwrap();
+            assert_eq!(29315091323, c.value(0)); // 29315 s * 1_000_000 + 91_323 us; last three digits truncated
+            assert!(c.is_null(1)); // unparseable text is expected to cast to null
+            assert!(c.is_null(2)); // null input stays null
+        }
+    }
+
+    #[test]
+    fn test_cast_string_to_time64nanosecond() {
+        let a1 = Arc::new(StringArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
+        let a2 = Arc::new(LargeStringArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
+        for array in &[a1, a2] { // same expectations for `StringArray` and `LargeStringArray` input
+            let b = cast(array, 
&DataType::Time64(TimeUnit::Nanosecond)).unwrap();
+            let c = 
b.as_any().downcast_ref::<Time64NanosecondArray>().unwrap();
+            assert_eq!(29315091323414, c.value(0)); // 29315 s * 1_000_000_000 + 91_323_414 ns
+            assert!(c.is_null(1)); // unparseable text is expected to cast to null
+            assert!(c.is_null(2)); // null input stays null
+        }
+    }
+
     #[test]
     fn test_cast_string_to_date64() {
         let a1 = Arc::new(StringArray::from(vec![

Reply via email to