This is an automated email from the ASF dual-hosted git repository.

tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new 40409e4038f Split arrow_cast::cast::string into its own submodule 
(#5563)
40409e4038f is described below

commit 40409e4038f4293b60fc768639d06b3e27102b87
Author: Clide S <[email protected]>
AuthorDate: Wed Apr 3 12:56:16 2024 -0400

    Split arrow_cast::cast::string into its own submodule (#5563)
    
    * Split cast::string into a submodule of cast
    
    * Remove duplicate function
    
    * Apply changes
    
    * Format change
    
    ---------
    
    Co-authored-by: Clide Stefani <[email protected]>
---
 arrow-cast/src/cast/mod.rs    | 254 +--------------------------------------
 arrow-cast/src/cast/string.rs | 270 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 272 insertions(+), 252 deletions(-)

diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 52eb0d36727..3e2bf4392ff 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -40,9 +40,11 @@
 mod decimal;
 mod dictionary;
 mod list;
+mod string;
 use crate::cast::decimal::*;
 use crate::cast::dictionary::*;
 use crate::cast::list::*;
+use crate::cast::string::*;
 
 use chrono::{NaiveTime, Offset, TimeZone, Utc};
 use std::cmp::Ordering;
@@ -2001,26 +2003,6 @@ where
     from.unary_opt::<_, R>(num::cast::cast::<T::Native, R::Native>)
 }
 
-fn value_to_string<O: OffsetSizeTrait>(
-    array: &dyn Array,
-    options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
-    let mut builder = GenericStringBuilder::<O>::new();
-    let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
-    let nulls = array.nulls();
-    for i in 0..array.len() {
-        match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
-            true => builder.append_null(),
-            false => {
-                formatter.value(i).write(&mut builder)?;
-                // tell the builder the row is finished
-                builder.append_value("");
-            }
-        }
-    }
-    Ok(Arc::new(builder.finish()))
-}
-
 fn cast_numeric_to_binary<FROM: ArrowPrimitiveType, O: OffsetSizeTrait>(
     array: &dyn Array,
 ) -> Result<ArrayRef, ArrowError> {
@@ -2034,172 +2016,6 @@ fn cast_numeric_to_binary<FROM: ArrowPrimitiveType, O: 
OffsetSizeTrait>(
     )))
 }
 
-/// Parse UTF-8
-fn parse_string<P: Parser, O: OffsetSizeTrait>(
-    array: &dyn Array,
-    cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
-    let string_array = array.as_string::<O>();
-    let array = if cast_options.safe {
-        let iter = string_array.iter().map(|x| x.and_then(P::parse));
-
-        // Benefit:
-        //     20% performance improvement
-        // Soundness:
-        //     The iterator is trustedLen because it comes from an 
`StringArray`.
-        unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
-    } else {
-        let v = string_array
-            .iter()
-            .map(|x| match x {
-                Some(v) => P::parse(v).ok_or_else(|| {
-                    ArrowError::CastError(format!(
-                        "Cannot cast string '{}' to value of {:?} type",
-                        v,
-                        P::DATA_TYPE
-                    ))
-                }),
-                None => Ok(P::Native::default()),
-            })
-            .collect::<Result<Vec<_>, ArrowError>>()?;
-        PrimitiveArray::new(v.into(), string_array.nulls().cloned())
-    };
-
-    Ok(Arc::new(array) as ArrayRef)
-}
-
-/// Casts generic string arrays to an ArrowTimestampType 
(TimeStampNanosecondArray, etc.)
-fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>(
-    array: &dyn Array,
-    to_tz: &Option<Arc<str>>,
-    cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
-    let array = array.as_string::<O>();
-    let out: PrimitiveArray<T> = match to_tz {
-        Some(tz) => {
-            let tz: Tz = tz.as_ref().parse()?;
-            cast_string_to_timestamp_impl(array, &tz, cast_options)?
-        }
-        None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?,
-    };
-    Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
-}
-
-fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType, 
Tz: TimeZone>(
-    array: &GenericStringArray<O>,
-    tz: &Tz,
-    cast_options: &CastOptions,
-) -> Result<PrimitiveArray<T>, ArrowError> {
-    if cast_options.safe {
-        let iter = array.iter().map(|v| {
-            v.and_then(|v| {
-                let naive = string_to_datetime(tz, v).ok()?.naive_utc();
-                T::make_value(naive)
-            })
-        });
-        // Benefit:
-        //     20% performance improvement
-        // Soundness:
-        //     The iterator is trustedLen because it comes from an 
`StringArray`.
-
-        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
-    } else {
-        let vec = array
-            .iter()
-            .map(|v| {
-                v.map(|v| {
-                    let naive = string_to_datetime(tz, v)?.naive_utc();
-                    T::make_value(naive).ok_or_else(|| {
-                        ArrowError::CastError(format!(
-                            "Overflow converting {naive} to {:?}",
-                            T::UNIT
-                        ))
-                    })
-                })
-                .transpose()
-            })
-            .collect::<Result<Vec<Option<i64>>, _>>()?;
-
-        // Benefit:
-        //     20% performance improvement
-        // Soundness:
-        //     The iterator is trustedLen because it comes from an 
`StringArray`.
-        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) })
-    }
-}
-
-fn cast_string_to_interval<Offset, F, ArrowType>(
-    array: &dyn Array,
-    cast_options: &CastOptions,
-    parse_function: F,
-) -> Result<ArrayRef, ArrowError>
-where
-    Offset: OffsetSizeTrait,
-    ArrowType: ArrowPrimitiveType,
-    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
-{
-    let string_array = array
-        .as_any()
-        .downcast_ref::<GenericStringArray<Offset>>()
-        .unwrap();
-    let interval_array = if cast_options.safe {
-        let iter = string_array
-            .iter()
-            .map(|v| v.and_then(|v| parse_function(v).ok()));
-
-        // Benefit:
-        //     20% performance improvement
-        // Soundness:
-        //     The iterator is trustedLen because it comes from an 
`StringArray`.
-        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
-    } else {
-        let vec = string_array
-            .iter()
-            .map(|v| v.map(parse_function).transpose())
-            .collect::<Result<Vec<_>, ArrowError>>()?;
-
-        // Benefit:
-        //     20% performance improvement
-        // Soundness:
-        //     The iterator is trustedLen because it comes from an 
`StringArray`.
-        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
-    };
-    Ok(Arc::new(interval_array) as ArrayRef)
-}
-
-fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
-    array: &dyn Array,
-    cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
-    cast_string_to_interval::<Offset, _, IntervalYearMonthType>(
-        array,
-        cast_options,
-        parse_interval_year_month,
-    )
-}
-
-fn cast_string_to_day_time_interval<Offset: OffsetSizeTrait>(
-    array: &dyn Array,
-    cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
-    cast_string_to_interval::<Offset, _, IntervalDayTimeType>(
-        array,
-        cast_options,
-        parse_interval_day_time,
-    )
-}
-
-fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
-    array: &dyn Array,
-    cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
-    cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>(
-        array,
-        cast_options,
-        parse_interval_month_day_nano,
-    )
-}
-
 fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
     array: PrimitiveArray<Int64Type>,
     to_tz: &Tz,
@@ -2222,41 +2038,6 @@ fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
     Ok(adjusted)
 }
 
-/// Casts Utf8 to Boolean
-fn cast_utf8_to_boolean<OffsetSize>(
-    from: &dyn Array,
-    cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError>
-where
-    OffsetSize: OffsetSizeTrait,
-{
-    let array = from
-        .as_any()
-        .downcast_ref::<GenericStringArray<OffsetSize>>()
-        .unwrap();
-
-    let output_array = array
-        .iter()
-        .map(|value| match value {
-            Some(value) => match value.to_ascii_lowercase().trim() {
-                "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" 
=> Ok(Some(true)),
-                "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | 
"off" | "0" => {
-                    Ok(Some(false))
-                }
-                invalid_value => match cast_options.safe {
-                    true => Ok(None),
-                    false => Err(ArrowError::CastError(format!(
-                        "Cannot cast value '{invalid_value}' to value of 
Boolean type",
-                    ))),
-                },
-            },
-            None => Ok(None),
-        })
-        .collect::<Result<BooleanArray, _>>()?;
-
-    Ok(Arc::new(output_array))
-}
-
 /// Cast numeric types to Boolean
 ///
 /// Any zero value returns `false` while non-zero returns `true`
@@ -2325,37 +2106,6 @@ where
     unsafe { PrimitiveArray::<T>::from_trusted_len_iter(iter) }
 }
 
-/// A specified helper to cast from `GenericBinaryArray` to 
`GenericStringArray` when they have same
-/// offset size so re-encoding offset is unnecessary.
-fn cast_binary_to_string<O: OffsetSizeTrait>(
-    array: &dyn Array,
-    cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
-    let array = array
-        .as_any()
-        .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
-        .unwrap();
-
-    match GenericStringArray::<O>::try_from_binary(array.clone()) {
-        Ok(a) => Ok(Arc::new(a)),
-        Err(e) => match cast_options.safe {
-            true => {
-                // Fallback to slow method to convert invalid sequences to 
nulls
-                let mut builder =
-                    GenericStringBuilder::<O>::with_capacity(array.len(), 
array.value_data().len());
-
-                let iter = array
-                    .iter()
-                    .map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
-
-                builder.extend(iter);
-                Ok(Arc::new(builder.finish()))
-            }
-            false => Err(e),
-        },
-    }
-}
-
 /// Helper function to cast from one `BinaryArray` or 'LargeBinaryArray' to 
'FixedSizeBinaryArray'.
 fn cast_binary_to_fixed_size_binary<O: OffsetSizeTrait>(
     array: &dyn Array,
diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs
new file mode 100644
index 00000000000..e9c1ff58d62
--- /dev/null
+++ b/arrow-cast/src/cast/string.rs
@@ -0,0 +1,270 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::cast::*;
+
+pub(crate) fn value_to_string<O: OffsetSizeTrait>(
+    array: &dyn Array,
+    options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    let mut builder = GenericStringBuilder::<O>::new();
+    let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
+    let nulls = array.nulls();
+    for i in 0..array.len() {
+        match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
+            true => builder.append_null(),
+            false => {
+                formatter.value(i).write(&mut builder)?;
+                // tell the builder the row is finished
+                builder.append_value("");
+            }
+        }
+    }
+    Ok(Arc::new(builder.finish()))
+}
+
+/// Parse UTF-8
+pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    let string_array = array.as_string::<O>();
+    let array = if cast_options.safe {
+        let iter = string_array.iter().map(|x| x.and_then(P::parse));
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it comes from an 
`StringArray`.
+        unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
+    } else {
+        let v = string_array
+            .iter()
+            .map(|x| match x {
+                Some(v) => P::parse(v).ok_or_else(|| {
+                    ArrowError::CastError(format!(
+                        "Cannot cast string '{}' to value of {:?} type",
+                        v,
+                        P::DATA_TYPE
+                    ))
+                }),
+                None => Ok(P::Native::default()),
+            })
+            .collect::<Result<Vec<_>, ArrowError>>()?;
+        PrimitiveArray::new(v.into(), string_array.nulls().cloned())
+    };
+
+    Ok(Arc::new(array) as ArrayRef)
+}
+
+/// Casts generic string arrays to an ArrowTimestampType 
(TimeStampNanosecondArray, etc.)
+pub(crate) fn cast_string_to_timestamp<O: OffsetSizeTrait, T: 
ArrowTimestampType>(
+    array: &dyn Array,
+    to_tz: &Option<Arc<str>>,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    let array = array.as_string::<O>();
+    let out: PrimitiveArray<T> = match to_tz {
+        Some(tz) => {
+            let tz: Tz = tz.as_ref().parse()?;
+            cast_string_to_timestamp_impl(array, &tz, cast_options)?
+        }
+        None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?,
+    };
+    Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
+}
+
+fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType, 
Tz: TimeZone>(
+    array: &GenericStringArray<O>,
+    tz: &Tz,
+    cast_options: &CastOptions,
+) -> Result<PrimitiveArray<T>, ArrowError> {
+    if cast_options.safe {
+        let iter = array.iter().map(|v| {
+            v.and_then(|v| {
+                let naive = string_to_datetime(tz, v).ok()?.naive_utc();
+                T::make_value(naive)
+            })
+        });
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it comes from an 
`StringArray`.
+
+        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
+    } else {
+        let vec = array
+            .iter()
+            .map(|v| {
+                v.map(|v| {
+                    let naive = string_to_datetime(tz, v)?.naive_utc();
+                    T::make_value(naive).ok_or_else(|| {
+                        ArrowError::CastError(format!(
+                            "Overflow converting {naive} to {:?}",
+                            T::UNIT
+                        ))
+                    })
+                })
+                .transpose()
+            })
+            .collect::<Result<Vec<Option<i64>>, _>>()?;
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it comes from an 
`StringArray`.
+        Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) })
+    }
+}
+
+pub(crate) fn cast_string_to_interval<Offset, F, ArrowType>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+    parse_function: F,
+) -> Result<ArrayRef, ArrowError>
+where
+    Offset: OffsetSizeTrait,
+    ArrowType: ArrowPrimitiveType,
+    F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
+{
+    let string_array = array
+        .as_any()
+        .downcast_ref::<GenericStringArray<Offset>>()
+        .unwrap();
+    let interval_array = if cast_options.safe {
+        let iter = string_array
+            .iter()
+            .map(|v| v.and_then(|v| parse_function(v).ok()));
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it comes from an 
`StringArray`.
+        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
+    } else {
+        let vec = string_array
+            .iter()
+            .map(|v| v.map(parse_function).transpose())
+            .collect::<Result<Vec<_>, ArrowError>>()?;
+
+        // Benefit:
+        //     20% performance improvement
+        // Soundness:
+        //     The iterator is trustedLen because it comes from an 
`StringArray`.
+        unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
+    };
+    Ok(Arc::new(interval_array) as ArrayRef)
+}
+
+pub(crate) fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    cast_string_to_interval::<Offset, _, IntervalYearMonthType>(
+        array,
+        cast_options,
+        parse_interval_year_month,
+    )
+}
+
+pub(crate) fn cast_string_to_day_time_interval<Offset: OffsetSizeTrait>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    cast_string_to_interval::<Offset, _, IntervalDayTimeType>(
+        array,
+        cast_options,
+        parse_interval_day_time,
+    )
+}
+
+pub(crate) fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>(
+        array,
+        cast_options,
+        parse_interval_month_day_nano,
+    )
+}
+
+/// A specified helper to cast from `GenericBinaryArray` to 
`GenericStringArray` when they have same
+/// offset size so re-encoding offset is unnecessary.
+pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    let array = array
+        .as_any()
+        .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
+        .unwrap();
+
+    match GenericStringArray::<O>::try_from_binary(array.clone()) {
+        Ok(a) => Ok(Arc::new(a)),
+        Err(e) => match cast_options.safe {
+            true => {
+                // Fallback to slow method to convert invalid sequences to 
nulls
+                let mut builder =
+                    GenericStringBuilder::<O>::with_capacity(array.len(), 
array.value_data().len());
+
+                let iter = array
+                    .iter()
+                    .map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
+
+                builder.extend(iter);
+                Ok(Arc::new(builder.finish()))
+            }
+            false => Err(e),
+        },
+    }
+}
+
+/// Casts Utf8 to Boolean
+pub(crate) fn cast_utf8_to_boolean<OffsetSize>(
+    from: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError>
+where
+    OffsetSize: OffsetSizeTrait,
+{
+    let array = from
+        .as_any()
+        .downcast_ref::<GenericStringArray<OffsetSize>>()
+        .unwrap();
+
+    let output_array = array
+        .iter()
+        .map(|value| match value {
+            Some(value) => match value.to_ascii_lowercase().trim() {
+                "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1" 
=> Ok(Some(true)),
+                "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" | 
"off" | "0" => {
+                    Ok(Some(false))
+                }
+                invalid_value => match cast_options.safe {
+                    true => Ok(None),
+                    false => Err(ArrowError::CastError(format!(
+                        "Cannot cast value '{invalid_value}' to value of 
Boolean type",
+                    ))),
+                },
+            },
+            None => Ok(None),
+        })
+        .collect::<Result<BooleanArray, _>>()?;
+
+    Ok(Arc::new(output_array))
+}

Reply via email to