This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 8a5be1330e Enable casting from Utf8View (#6077)
8a5be1330e is described below

commit 8a5be1330e30e6dd7760dba910737550d760e612
Author: Andrew Duffy <[email protected]>
AuthorDate: Fri Jul 19 20:17:13 2024 +0100

    Enable casting from Utf8View (#6077)
    
    * Enable casting from Utf8View -> string or temporal types
    
    * save
    
    * implement casting utf8view -> timestamp/interval types, with tests
    
    * fix clippy
    
    * fmt
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 arrow-cast/src/cast/mod.rs    | 150 ++++++++++++++++++++++++++++-------
 arrow-cast/src/cast/string.rs | 177 +++++++++++++++++++++++++++++++++---------
 2 files changed, 265 insertions(+), 62 deletions(-)

diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index c9de714e7d..1770157bcf 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -210,7 +210,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
         (LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_) | BinaryView) => true,
         (FixedSizeBinary(_), Binary | LargeBinary) => true,
         (
-            Utf8 | LargeUtf8,
+            Utf8 | LargeUtf8 | Utf8View,
             Binary
             | LargeBinary
             | Utf8
@@ -228,7 +228,6 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
             | Interval(_),
         ) => true,
         (Utf8 | LargeUtf8, Utf8View) => true,
-        (Utf8View, Utf8 | LargeUtf8) => true,
         (BinaryView, Binary | LargeBinary) => true,
         (Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
         (_, Utf8 | LargeUtf8) => from_type.is_primitive(),
@@ -1269,6 +1268,56 @@ pub fn cast_with_options(
                 "Casting from {from_type:?} to {to_type:?} not supported",
             ))),
         },
+        (Utf8View, _) => match to_type {
+            UInt8 => parse_string_view::<UInt8Type>(array, cast_options),
+            UInt16 => parse_string_view::<UInt16Type>(array, cast_options),
+            UInt32 => parse_string_view::<UInt32Type>(array, cast_options),
+            UInt64 => parse_string_view::<UInt64Type>(array, cast_options),
+            Int8 => parse_string_view::<Int8Type>(array, cast_options),
+            Int16 => parse_string_view::<Int16Type>(array, cast_options),
+            Int32 => parse_string_view::<Int32Type>(array, cast_options),
+            Int64 => parse_string_view::<Int64Type>(array, cast_options),
+            Float32 => parse_string_view::<Float32Type>(array, cast_options),
+            Float64 => parse_string_view::<Float64Type>(array, cast_options),
+            Date32 => parse_string_view::<Date32Type>(array, cast_options),
+            Date64 => parse_string_view::<Date64Type>(array, cast_options),
+            Binary => cast_view_to_byte::<StringViewType, GenericBinaryType<i32>>(array),
+            LargeBinary => cast_view_to_byte::<StringViewType, GenericBinaryType<i64>>(array),
+            Utf8 => cast_view_to_byte::<StringViewType, GenericStringType<i32>>(array),
+            LargeUtf8 => cast_view_to_byte::<StringViewType, GenericStringType<i64>>(array),
+            Time32(TimeUnit::Second) => parse_string_view::<Time32SecondType>(array, cast_options),
+            Time32(TimeUnit::Millisecond) => {
+                parse_string_view::<Time32MillisecondType>(array, cast_options)
+            }
+            Time64(TimeUnit::Microsecond) => {
+                parse_string_view::<Time64MicrosecondType>(array, cast_options)
+            }
+            Time64(TimeUnit::Nanosecond) => {
+                parse_string_view::<Time64NanosecondType>(array, cast_options)
+            }
+            Timestamp(TimeUnit::Second, to_tz) => {
+                cast_view_to_timestamp::<TimestampSecondType>(array, to_tz, cast_options)
+            }
+            Timestamp(TimeUnit::Millisecond, to_tz) => {
+                cast_view_to_timestamp::<TimestampMillisecondType>(array, to_tz, cast_options)
+            }
+            Timestamp(TimeUnit::Microsecond, to_tz) => {
+                cast_view_to_timestamp::<TimestampMicrosecondType>(array, to_tz, cast_options)
+            }
+            Timestamp(TimeUnit::Nanosecond, to_tz) => {
+                cast_view_to_timestamp::<TimestampNanosecondType>(array, to_tz, cast_options)
+            }
+            Interval(IntervalUnit::YearMonth) => {
+                cast_view_to_year_month_interval(array, cast_options)
+            }
+            Interval(IntervalUnit::DayTime) => cast_view_to_day_time_interval(array, cast_options),
+            Interval(IntervalUnit::MonthDayNano) => {
+                cast_view_to_month_day_nano_interval(array, cast_options)
+            }
+            _ => Err(ArrowError::CastError(format!(
+                "Casting from {from_type:?} to {to_type:?} not supported",
+            ))),
+        },
         (LargeUtf8, _) => match to_type {
             UInt8 => parse_string::<UInt8Type, i64>(array, cast_options),
             UInt16 => parse_string::<UInt16Type, i64>(array, cast_options),
@@ -1365,8 +1414,6 @@ pub fn cast_with_options(
                 "Casting from {from_type:?} to {to_type:?} not supported",
             ))),
         },
-        (Utf8View, Utf8) => cast_view_to_byte::<StringViewType, GenericStringType<i32>>(array),
-        (Utf8View, LargeUtf8) => cast_view_to_byte::<StringViewType, GenericStringType<i64>>(array),
         (BinaryView, Binary) => cast_view_to_byte::<BinaryViewType, GenericBinaryType<i32>>(array),
         (BinaryView, LargeBinary) => {
             cast_view_to_byte::<BinaryViewType, GenericBinaryType<i64>>(array)
@@ -3960,6 +4007,11 @@ mod tests {
 
     #[test]
     fn test_cast_string_to_timestamp() {
+        let a0 = Arc::new(StringViewArray::from(vec![
+            Some("2020-09-08T12:00:00.123456789+00:00"),
+            Some("Not a valid date"),
+            None,
+        ])) as ArrayRef;
         let a1 = Arc::new(StringArray::from(vec![
             Some("2020-09-08T12:00:00.123456789+00:00"),
             Some("Not a valid date"),
@@ -3970,7 +4022,7 @@ mod tests {
             Some("Not a valid date"),
             None,
         ])) as ArrayRef;
-        for array in &[a1, a2] {
+        for array in &[a0, a1, a2] {
             for time_unit in &[
                 TimeUnit::Second,
                 TimeUnit::Millisecond,
@@ -4039,6 +4091,11 @@ mod tests {
 
     #[test]
     fn test_cast_string_to_date32() {
+        let a0 = Arc::new(StringViewArray::from(vec![
+            Some("2018-12-25"),
+            Some("Not a valid date"),
+            None,
+        ])) as ArrayRef;
         let a1 = Arc::new(StringArray::from(vec![
             Some("2018-12-25"),
             Some("Not a valid date"),
@@ -4049,7 +4106,7 @@ mod tests {
             Some("Not a valid date"),
             None,
         ])) as ArrayRef;
-        for array in &[a1, a2] {
+        for array in &[a0, a1, a2] {
             let to_type = DataType::Date32;
             let b = cast(array, &to_type).unwrap();
             let c = b.as_primitive::<Date32Type>();
@@ -4071,30 +4128,47 @@ mod tests {
 
     #[test]
     fn test_cast_string_format_yyyymmdd_to_date32() {
-        let a = Arc::new(StringArray::from(vec![
+        let a0 = Arc::new(StringViewArray::from(vec![
+            Some("2020-12-25"),
+            Some("20201117"),
+        ])) as ArrayRef;
+        let a1 = Arc::new(StringArray::from(vec![
+            Some("2020-12-25"),
+            Some("20201117"),
+        ])) as ArrayRef;
+        let a2 = Arc::new(LargeStringArray::from(vec![
             Some("2020-12-25"),
             Some("20201117"),
         ])) as ArrayRef;
 
-        let to_type = DataType::Date32;
-        let options = CastOptions {
-            safe: false,
-            format_options: FormatOptions::default(),
-        };
-        let result = cast_with_options(&a, &to_type, &options).unwrap();
-        let c = result.as_primitive::<Date32Type>();
-        assert_eq!(
-            chrono::NaiveDate::from_ymd_opt(2020, 12, 25),
-            c.value_as_date(0)
-        );
-        assert_eq!(
-            chrono::NaiveDate::from_ymd_opt(2020, 11, 17),
-            c.value_as_date(1)
-        );
+        for array in &[a0, a1, a2] {
+            let to_type = DataType::Date32;
+            let options = CastOptions {
+                safe: false,
+                format_options: FormatOptions::default(),
+            };
+            let result = cast_with_options(&array, &to_type, &options).unwrap();
+            let c = result.as_primitive::<Date32Type>();
+            assert_eq!(
+                chrono::NaiveDate::from_ymd_opt(2020, 12, 25),
+                c.value_as_date(0)
+            );
+            assert_eq!(
+                chrono::NaiveDate::from_ymd_opt(2020, 11, 17),
+                c.value_as_date(1)
+            );
+        }
     }
 
     #[test]
     fn test_cast_string_to_time32second() {
+        let a0 = Arc::new(StringViewArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("08:08:60.091323414"), // leap second
+            Some("08:08:61.091323414"), // not valid
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
         let a1 = Arc::new(StringArray::from(vec![
             Some("08:08:35.091323414"),
             Some("08:08:60.091323414"), // leap second
@@ -4109,7 +4183,7 @@ mod tests {
             Some("Not a valid time"),
             None,
         ])) as ArrayRef;
-        for array in &[a1, a2] {
+        for array in &[a0, a1, a2] {
             let to_type = DataType::Time32(TimeUnit::Second);
             let b = cast(array, &to_type).unwrap();
             let c = b.as_primitive::<Time32SecondType>();
@@ -4130,6 +4204,13 @@ mod tests {
 
     #[test]
     fn test_cast_string_to_time32millisecond() {
+        let a0 = Arc::new(StringViewArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("08:08:60.091323414"), // leap second
+            Some("08:08:61.091323414"), // not valid
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
         let a1 = Arc::new(StringArray::from(vec![
             Some("08:08:35.091323414"),
             Some("08:08:60.091323414"), // leap second
@@ -4144,7 +4225,7 @@ mod tests {
             Some("Not a valid time"),
             None,
         ])) as ArrayRef;
-        for array in &[a1, a2] {
+        for array in &[a0, a1, a2] {
             let to_type = DataType::Time32(TimeUnit::Millisecond);
             let b = cast(array, &to_type).unwrap();
             let c = b.as_primitive::<Time32MillisecondType>();
@@ -4165,6 +4246,11 @@ mod tests {
 
     #[test]
     fn test_cast_string_to_time64microsecond() {
+        let a0 = Arc::new(StringViewArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
         let a1 = Arc::new(StringArray::from(vec![
             Some("08:08:35.091323414"),
             Some("Not a valid time"),
@@ -4175,7 +4261,7 @@ mod tests {
             Some("Not a valid time"),
             None,
         ])) as ArrayRef;
-        for array in &[a1, a2] {
+        for array in &[a0, a1, a2] {
             let to_type = DataType::Time64(TimeUnit::Microsecond);
             let b = cast(array, &to_type).unwrap();
             let c = b.as_primitive::<Time64MicrosecondType>();
@@ -4194,6 +4280,11 @@ mod tests {
 
     #[test]
     fn test_cast_string_to_time64nanosecond() {
+        let a0 = Arc::new(StringViewArray::from(vec![
+            Some("08:08:35.091323414"),
+            Some("Not a valid time"),
+            None,
+        ])) as ArrayRef;
         let a1 = Arc::new(StringArray::from(vec![
             Some("08:08:35.091323414"),
             Some("Not a valid time"),
@@ -4204,7 +4295,7 @@ mod tests {
             Some("Not a valid time"),
             None,
         ])) as ArrayRef;
-        for array in &[a1, a2] {
+        for array in &[a0, a1, a2] {
             let to_type = DataType::Time64(TimeUnit::Nanosecond);
             let b = cast(array, &to_type).unwrap();
             let c = b.as_primitive::<Time64NanosecondType>();
@@ -4223,6 +4314,11 @@ mod tests {
 
     #[test]
     fn test_cast_string_to_date64() {
+        let a0 = Arc::new(StringViewArray::from(vec![
+            Some("2020-09-08T12:00:00"),
+            Some("Not a valid date"),
+            None,
+        ])) as ArrayRef;
         let a1 = Arc::new(StringArray::from(vec![
             Some("2020-09-08T12:00:00"),
             Some("Not a valid date"),
@@ -4233,7 +4329,7 @@ mod tests {
             Some("Not a valid date"),
             None,
         ])) as ArrayRef;
-        for array in &[a1, a2] {
+        for array in &[a0, a1, a2] {
             let to_type = DataType::Date64;
             let b = cast(array, &to_type).unwrap();
             let c = b.as_primitive::<Date64Type>();
diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs
index 4b83a2a5e7..7d0e7e21c8 100644
--- a/arrow-cast/src/cast/string.rs
+++ b/arrow-cast/src/cast/string.rs
@@ -16,6 +16,7 @@
 // under the License.
 
 use crate::cast::*;
+use arrow_buffer::NullBuffer;
 
 pub(crate) fn value_to_string<O: OffsetSizeTrait>(
     array: &dyn Array,
@@ -43,8 +44,34 @@ pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
     cast_options: &CastOptions,
 ) -> Result<ArrayRef, ArrowError> {
     let string_array = array.as_string::<O>();
+    parse_string_iter::<P, _, _>(string_array.iter(), cast_options, || {
+        string_array.nulls().cloned()
+    })
+}
+
+/// Parse UTF-8 View
+pub(crate) fn parse_string_view<P: Parser>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    let string_view_array = array.as_string_view();
+    parse_string_iter::<P, _, _>(string_view_array.iter(), cast_options, || {
+        string_view_array.nulls().cloned()
+    })
+}
+
+fn parse_string_iter<
+    'a,
+    P: Parser,
+    I: Iterator<Item = Option<&'a str>>,
+    F: FnOnce() -> Option<NullBuffer>,
+>(
+    iter: I,
+    cast_options: &CastOptions,
+    nulls: F,
+) -> Result<ArrayRef, ArrowError> {
     let array = if cast_options.safe {
-        let iter = string_array.iter().map(|x| x.and_then(P::parse));
+        let iter = iter.map(|x| x.and_then(P::parse));
 
         // Benefit:
         //     20% performance improvement
@@ -52,8 +79,7 @@ pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
         //     The iterator is trustedLen because it comes from an `StringArray`.
         unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
     } else {
-        let v = string_array
-            .iter()
+        let v = iter
             .map(|x| match x {
                 Some(v) => P::parse(v).ok_or_else(|| {
                     ArrowError::CastError(format!(
@@ -65,7 +91,7 @@ pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
                 None => Ok(P::Native::default()),
             })
             .collect::<Result<Vec<_>, ArrowError>>()?;
-        PrimitiveArray::new(v.into(), string_array.nulls().cloned())
+        PrimitiveArray::new(v.into(), nulls())
     };
 
     Ok(Arc::new(array) as ArrayRef)
@@ -81,20 +107,42 @@ pub(crate) fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType
     let out: PrimitiveArray<T> = match to_tz {
         Some(tz) => {
             let tz: Tz = tz.as_ref().parse()?;
-            cast_string_to_timestamp_impl(array, &tz, cast_options)?
+            cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
+        }
+        None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?,
+    };
+    Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
+}
+
+/// Casts string view arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
+pub(crate) fn cast_view_to_timestamp<T: ArrowTimestampType>(
+    array: &dyn Array,
+    to_tz: &Option<Arc<str>>,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    let array = array.as_string_view();
+    let out: PrimitiveArray<T> = match to_tz {
+        Some(tz) => {
+            let tz: Tz = tz.as_ref().parse()?;
+            cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
         }
-        None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?,
+        None => cast_string_to_timestamp_impl(array.iter(), &Utc, cast_options)?,
     };
     Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
 }
 
-fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType, Tz: TimeZone>(
-    array: &GenericStringArray<O>,
+fn cast_string_to_timestamp_impl<
+    'a,
+    I: Iterator<Item = Option<&'a str>>,
+    T: ArrowTimestampType,
+    Tz: TimeZone,
+>(
+    iter: I,
     tz: &Tz,
     cast_options: &CastOptions,
 ) -> Result<PrimitiveArray<T>, ArrowError> {
     if cast_options.safe {
-        let iter = array.iter().map(|v| {
+        let iter = iter.map(|v| {
             v.and_then(|v| {
                 let naive = string_to_datetime(tz, v).ok()?.naive_utc();
                 T::make_value(naive)
@@ -107,8 +155,7 @@ fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType, Tz:
 
         Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
     } else {
-        let vec = array
-            .iter()
+        let vec = iter
             .map(|v| {
                 v.map(|v| {
                     let naive = string_to_datetime(tz, v)?.naive_utc();
@@ -122,7 +169,7 @@ fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType, Tz:
                         ))
                     })
                 })
-                .transpose()
+                    .transpose()
             })
             .collect::<Result<Vec<Option<i64>>, _>>()?;
 
@@ -148,29 +195,11 @@ where
         .as_any()
         .downcast_ref::<GenericStringArray<Offset>>()
         .unwrap();
-    let interval_array = if cast_options.safe {
-        let iter = string_array
-            .iter()
-            .map(|v| v.and_then(|v| parse_function(v).ok()));
-
-        // Benefit:
-        //     20% performance improvement
-        // Soundness:
-        //     The iterator is trustedLen because it comes from an `StringArray`.
-        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
-    } else {
-        let vec = string_array
-            .iter()
-            .map(|v| v.map(parse_function).transpose())
-            .collect::<Result<Vec<_>, ArrowError>>()?;
-
-        // Benefit:
-        //     20% performance improvement
-        // Soundness:
-        //     The iterator is trustedLen because it comes from an `StringArray`.
-        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
-    };
-    Ok(Arc::new(interval_array) as ArrayRef)
+    cast_string_to_interval_impl::<_, ArrowType, F>(
+        string_array.iter(),
+        cast_options,
+        parse_function,
+    )
 }
 
 pub(crate) fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
@@ -206,6 +235,84 @@ pub(crate) fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
     )
 }
 
+pub(crate) fn cast_view_to_interval<F, ArrowType>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+    parse_function: F,
+) -> Result<ArrayRef, ArrowError>
+where
+    ArrowType: ArrowPrimitiveType,
+    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
+{
+    let string_view_array = array.as_any().downcast_ref::<StringViewArray>().unwrap();
+    cast_string_to_interval_impl::<_, ArrowType, F>(
+        string_view_array.iter(),
+        cast_options,
+        parse_function,
+    )
+}
+
+pub(crate) fn cast_view_to_year_month_interval(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    cast_view_to_interval::<_, IntervalYearMonthType>(
+        array,
+        cast_options,
+        parse_interval_year_month,
+    )
+}
+
+pub(crate) fn cast_view_to_day_time_interval(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    cast_view_to_interval::<_, IntervalDayTimeType>(array, cast_options, parse_interval_day_time)
+}
+
+pub(crate) fn cast_view_to_month_day_nano_interval(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    cast_view_to_interval::<_, IntervalMonthDayNanoType>(
+        array,
+        cast_options,
+        parse_interval_month_day_nano,
+    )
+}
+
+fn cast_string_to_interval_impl<'a, I, ArrowType, F>(
+    iter: I,
+    cast_options: &CastOptions,
+    parse_function: F,
+) -> Result<ArrayRef, ArrowError>
+where
+    I: Iterator<Item = Option<&'a str>>,
+    ArrowType: ArrowPrimitiveType,
+    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
+{
+    let interval_array = if cast_options.safe {
+        let iter = iter.map(|v| v.and_then(|v| parse_function(v).ok()));
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it comes from an `StringArray`.
+        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
+    } else {
+        let vec = iter
+            .map(|v| v.map(parse_function).transpose())
+            .collect::<Result<Vec<_>, ArrowError>>()?;
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it comes from an `StringArray`.
+        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
+    };
+    Ok(Arc::new(interval_array) as ArrayRef)
+}
+
 /// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same
 /// offset size so re-encoding offset is unnecessary.
 pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(

Reply via email to