This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 40409e4038f Split arrow_cast::cast::string into its own submodule
(#5563)
40409e4038f is described below
commit 40409e4038f4293b60fc768639d06b3e27102b87
Author: Clide S <[email protected]>
AuthorDate: Wed Apr 3 12:56:16 2024 -0400
Split arrow_cast::cast::string into its own submodule (#5563)
* Split cast::string into a submodule of cast
* Remove duplicate function
* Apply changes
* Format change
---------
Co-authored-by: Clide Stefani <[email protected]>
---
arrow-cast/src/cast/mod.rs | 254 +--------------------------------------
arrow-cast/src/cast/string.rs | 270 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 272 insertions(+), 252 deletions(-)
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index 52eb0d36727..3e2bf4392ff 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -40,9 +40,11 @@
mod decimal;
mod dictionary;
mod list;
+mod string;
use crate::cast::decimal::*;
use crate::cast::dictionary::*;
use crate::cast::list::*;
+use crate::cast::string::*;
use chrono::{NaiveTime, Offset, TimeZone, Utc};
use std::cmp::Ordering;
@@ -2001,26 +2003,6 @@ where
from.unary_opt::<_, R>(num::cast::cast::<T::Native, R::Native>)
}
-fn value_to_string<O: OffsetSizeTrait>(
- array: &dyn Array,
- options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
- let mut builder = GenericStringBuilder::<O>::new();
- let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
- let nulls = array.nulls();
- for i in 0..array.len() {
- match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
- true => builder.append_null(),
- false => {
- formatter.value(i).write(&mut builder)?;
- // tell the builder the row is finished
- builder.append_value("");
- }
- }
- }
- Ok(Arc::new(builder.finish()))
-}
-
fn cast_numeric_to_binary<FROM: ArrowPrimitiveType, O: OffsetSizeTrait>(
array: &dyn Array,
) -> Result<ArrayRef, ArrowError> {
@@ -2034,172 +2016,6 @@ fn cast_numeric_to_binary<FROM: ArrowPrimitiveType, O:
OffsetSizeTrait>(
)))
}
-/// Parse UTF-8
-fn parse_string<P: Parser, O: OffsetSizeTrait>(
- array: &dyn Array,
- cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
- let string_array = array.as_string::<O>();
- let array = if cast_options.safe {
- let iter = string_array.iter().map(|x| x.and_then(P::parse));
-
- // Benefit:
- // 20% performance improvement
- // Soundness:
- // The iterator is trustedLen because it comes from an
`StringArray`.
- unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
- } else {
- let v = string_array
- .iter()
- .map(|x| match x {
- Some(v) => P::parse(v).ok_or_else(|| {
- ArrowError::CastError(format!(
- "Cannot cast string '{}' to value of {:?} type",
- v,
- P::DATA_TYPE
- ))
- }),
- None => Ok(P::Native::default()),
- })
- .collect::<Result<Vec<_>, ArrowError>>()?;
- PrimitiveArray::new(v.into(), string_array.nulls().cloned())
- };
-
- Ok(Arc::new(array) as ArrayRef)
-}
-
-/// Casts generic string arrays to an ArrowTimestampType
(TimeStampNanosecondArray, etc.)
-fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType>(
- array: &dyn Array,
- to_tz: &Option<Arc<str>>,
- cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
- let array = array.as_string::<O>();
- let out: PrimitiveArray<T> = match to_tz {
- Some(tz) => {
- let tz: Tz = tz.as_ref().parse()?;
- cast_string_to_timestamp_impl(array, &tz, cast_options)?
- }
- None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?,
- };
- Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
-}
-
-fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType,
Tz: TimeZone>(
- array: &GenericStringArray<O>,
- tz: &Tz,
- cast_options: &CastOptions,
-) -> Result<PrimitiveArray<T>, ArrowError> {
- if cast_options.safe {
- let iter = array.iter().map(|v| {
- v.and_then(|v| {
- let naive = string_to_datetime(tz, v).ok()?.naive_utc();
- T::make_value(naive)
- })
- });
- // Benefit:
- // 20% performance improvement
- // Soundness:
- // The iterator is trustedLen because it comes from an
`StringArray`.
-
- Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
- } else {
- let vec = array
- .iter()
- .map(|v| {
- v.map(|v| {
- let naive = string_to_datetime(tz, v)?.naive_utc();
- T::make_value(naive).ok_or_else(|| {
- ArrowError::CastError(format!(
- "Overflow converting {naive} to {:?}",
- T::UNIT
- ))
- })
- })
- .transpose()
- })
- .collect::<Result<Vec<Option<i64>>, _>>()?;
-
- // Benefit:
- // 20% performance improvement
- // Soundness:
- // The iterator is trustedLen because it comes from an
`StringArray`.
- Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) })
- }
-}
-
-fn cast_string_to_interval<Offset, F, ArrowType>(
- array: &dyn Array,
- cast_options: &CastOptions,
- parse_function: F,
-) -> Result<ArrayRef, ArrowError>
-where
- Offset: OffsetSizeTrait,
- ArrowType: ArrowPrimitiveType,
- F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
-{
- let string_array = array
- .as_any()
- .downcast_ref::<GenericStringArray<Offset>>()
- .unwrap();
- let interval_array = if cast_options.safe {
- let iter = string_array
- .iter()
- .map(|v| v.and_then(|v| parse_function(v).ok()));
-
- // Benefit:
- // 20% performance improvement
- // Soundness:
- // The iterator is trustedLen because it comes from an
`StringArray`.
- unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
- } else {
- let vec = string_array
- .iter()
- .map(|v| v.map(parse_function).transpose())
- .collect::<Result<Vec<_>, ArrowError>>()?;
-
- // Benefit:
- // 20% performance improvement
- // Soundness:
- // The iterator is trustedLen because it comes from an
`StringArray`.
- unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
- };
- Ok(Arc::new(interval_array) as ArrayRef)
-}
-
-fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
- array: &dyn Array,
- cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
- cast_string_to_interval::<Offset, _, IntervalYearMonthType>(
- array,
- cast_options,
- parse_interval_year_month,
- )
-}
-
-fn cast_string_to_day_time_interval<Offset: OffsetSizeTrait>(
- array: &dyn Array,
- cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
- cast_string_to_interval::<Offset, _, IntervalDayTimeType>(
- array,
- cast_options,
- parse_interval_day_time,
- )
-}
-
-fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
- array: &dyn Array,
- cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
- cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>(
- array,
- cast_options,
- parse_interval_month_day_nano,
- )
-}
-
fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
array: PrimitiveArray<Int64Type>,
to_tz: &Tz,
@@ -2222,41 +2038,6 @@ fn adjust_timestamp_to_timezone<T: ArrowTimestampType>(
Ok(adjusted)
}
-/// Casts Utf8 to Boolean
-fn cast_utf8_to_boolean<OffsetSize>(
- from: &dyn Array,
- cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError>
-where
- OffsetSize: OffsetSizeTrait,
-{
- let array = from
- .as_any()
- .downcast_ref::<GenericStringArray<OffsetSize>>()
- .unwrap();
-
- let output_array = array
- .iter()
- .map(|value| match value {
- Some(value) => match value.to_ascii_lowercase().trim() {
- "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1"
=> Ok(Some(true)),
- "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" |
"off" | "0" => {
- Ok(Some(false))
- }
- invalid_value => match cast_options.safe {
- true => Ok(None),
- false => Err(ArrowError::CastError(format!(
- "Cannot cast value '{invalid_value}' to value of
Boolean type",
- ))),
- },
- },
- None => Ok(None),
- })
- .collect::<Result<BooleanArray, _>>()?;
-
- Ok(Arc::new(output_array))
-}
-
/// Cast numeric types to Boolean
///
/// Any zero value returns `false` while non-zero returns `true`
@@ -2325,37 +2106,6 @@ where
unsafe { PrimitiveArray::<T>::from_trusted_len_iter(iter) }
}
-/// A specified helper to cast from `GenericBinaryArray` to
`GenericStringArray` when they have same
-/// offset size so re-encoding offset is unnecessary.
-fn cast_binary_to_string<O: OffsetSizeTrait>(
- array: &dyn Array,
- cast_options: &CastOptions,
-) -> Result<ArrayRef, ArrowError> {
- let array = array
- .as_any()
- .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
- .unwrap();
-
- match GenericStringArray::<O>::try_from_binary(array.clone()) {
- Ok(a) => Ok(Arc::new(a)),
- Err(e) => match cast_options.safe {
- true => {
- // Fallback to slow method to convert invalid sequences to
nulls
- let mut builder =
- GenericStringBuilder::<O>::with_capacity(array.len(),
array.value_data().len());
-
- let iter = array
- .iter()
- .map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
-
- builder.extend(iter);
- Ok(Arc::new(builder.finish()))
- }
- false => Err(e),
- },
- }
-}
-
/// Helper function to cast from one `BinaryArray` or 'LargeBinaryArray' to
'FixedSizeBinaryArray'.
fn cast_binary_to_fixed_size_binary<O: OffsetSizeTrait>(
array: &dyn Array,
diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs
new file mode 100644
index 00000000000..e9c1ff58d62
--- /dev/null
+++ b/arrow-cast/src/cast/string.rs
@@ -0,0 +1,270 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use crate::cast::*;
+
+pub(crate) fn value_to_string<O: OffsetSizeTrait>(
+ array: &dyn Array,
+ options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let mut builder = GenericStringBuilder::<O>::new();
+ let formatter = ArrayFormatter::try_new(array, &options.format_options)?;
+ let nulls = array.nulls();
+ for i in 0..array.len() {
+ match nulls.map(|x| x.is_null(i)).unwrap_or_default() {
+ true => builder.append_null(),
+ false => {
+ formatter.value(i).write(&mut builder)?;
+ // tell the builder the row is finished
+ builder.append_value("");
+ }
+ }
+ }
+ Ok(Arc::new(builder.finish()))
+}
+
+/// Parse UTF-8
+pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let string_array = array.as_string::<O>();
+ let array = if cast_options.safe {
+ let iter = string_array.iter().map(|x| x.and_then(P::parse));
+
+ // Benefit:
+ // 20% performance improvement
+ // Soundness:
+ // The iterator is trustedLen because it comes from an
`StringArray`.
+ unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
+ } else {
+ let v = string_array
+ .iter()
+ .map(|x| match x {
+ Some(v) => P::parse(v).ok_or_else(|| {
+ ArrowError::CastError(format!(
+ "Cannot cast string '{}' to value of {:?} type",
+ v,
+ P::DATA_TYPE
+ ))
+ }),
+ None => Ok(P::Native::default()),
+ })
+ .collect::<Result<Vec<_>, ArrowError>>()?;
+ PrimitiveArray::new(v.into(), string_array.nulls().cloned())
+ };
+
+ Ok(Arc::new(array) as ArrayRef)
+}
+
+/// Casts generic string arrays to an ArrowTimestampType
(TimeStampNanosecondArray, etc.)
+pub(crate) fn cast_string_to_timestamp<O: OffsetSizeTrait, T:
ArrowTimestampType>(
+ array: &dyn Array,
+ to_tz: &Option<Arc<str>>,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let array = array.as_string::<O>();
+ let out: PrimitiveArray<T> = match to_tz {
+ Some(tz) => {
+ let tz: Tz = tz.as_ref().parse()?;
+ cast_string_to_timestamp_impl(array, &tz, cast_options)?
+ }
+ None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?,
+ };
+ Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
+}
+
+fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType,
Tz: TimeZone>(
+ array: &GenericStringArray<O>,
+ tz: &Tz,
+ cast_options: &CastOptions,
+) -> Result<PrimitiveArray<T>, ArrowError> {
+ if cast_options.safe {
+ let iter = array.iter().map(|v| {
+ v.and_then(|v| {
+ let naive = string_to_datetime(tz, v).ok()?.naive_utc();
+ T::make_value(naive)
+ })
+ });
+ // Benefit:
+ // 20% performance improvement
+ // Soundness:
+ // The iterator is trustedLen because it comes from an
`StringArray`.
+
+ Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
+ } else {
+ let vec = array
+ .iter()
+ .map(|v| {
+ v.map(|v| {
+ let naive = string_to_datetime(tz, v)?.naive_utc();
+ T::make_value(naive).ok_or_else(|| {
+ ArrowError::CastError(format!(
+ "Overflow converting {naive} to {:?}",
+ T::UNIT
+ ))
+ })
+ })
+ .transpose()
+ })
+ .collect::<Result<Vec<Option<i64>>, _>>()?;
+
+ // Benefit:
+ // 20% performance improvement
+ // Soundness:
+ // The iterator is trustedLen because it comes from an
`StringArray`.
+ Ok(unsafe { PrimitiveArray::from_trusted_len_iter(vec.iter()) })
+ }
+}
+
+pub(crate) fn cast_string_to_interval<Offset, F, ArrowType>(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+ parse_function: F,
+) -> Result<ArrayRef, ArrowError>
+where
+ Offset: OffsetSizeTrait,
+ ArrowType: ArrowPrimitiveType,
+ F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
+{
+ let string_array = array
+ .as_any()
+ .downcast_ref::<GenericStringArray<Offset>>()
+ .unwrap();
+ let interval_array = if cast_options.safe {
+ let iter = string_array
+ .iter()
+ .map(|v| v.and_then(|v| parse_function(v).ok()));
+
+ // Benefit:
+ // 20% performance improvement
+ // Soundness:
+ // The iterator is trustedLen because it comes from an
`StringArray`.
+ unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
+ } else {
+ let vec = string_array
+ .iter()
+ .map(|v| v.map(parse_function).transpose())
+ .collect::<Result<Vec<_>, ArrowError>>()?;
+
+ // Benefit:
+ // 20% performance improvement
+ // Soundness:
+ // The iterator is trustedLen because it comes from an
`StringArray`.
+ unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
+ };
+ Ok(Arc::new(interval_array) as ArrayRef)
+}
+
+pub(crate) fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ cast_string_to_interval::<Offset, _, IntervalYearMonthType>(
+ array,
+ cast_options,
+ parse_interval_year_month,
+ )
+}
+
+pub(crate) fn cast_string_to_day_time_interval<Offset: OffsetSizeTrait>(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ cast_string_to_interval::<Offset, _, IntervalDayTimeType>(
+ array,
+ cast_options,
+ parse_interval_day_time,
+ )
+}
+
+pub(crate) fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ cast_string_to_interval::<Offset, _, IntervalMonthDayNanoType>(
+ array,
+ cast_options,
+ parse_interval_month_day_nano,
+ )
+}
+
+/// A specified helper to cast from `GenericBinaryArray` to
`GenericStringArray` when they have same
+/// offset size so re-encoding offset is unnecessary.
+pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let array = array
+ .as_any()
+ .downcast_ref::<GenericByteArray<GenericBinaryType<O>>>()
+ .unwrap();
+
+ match GenericStringArray::<O>::try_from_binary(array.clone()) {
+ Ok(a) => Ok(Arc::new(a)),
+ Err(e) => match cast_options.safe {
+ true => {
+ // Fallback to slow method to convert invalid sequences to
nulls
+ let mut builder =
+ GenericStringBuilder::<O>::with_capacity(array.len(),
array.value_data().len());
+
+ let iter = array
+ .iter()
+ .map(|v| v.and_then(|v| std::str::from_utf8(v).ok()));
+
+ builder.extend(iter);
+ Ok(Arc::new(builder.finish()))
+ }
+ false => Err(e),
+ },
+ }
+}
+
+/// Casts Utf8 to Boolean
+pub(crate) fn cast_utf8_to_boolean<OffsetSize>(
+ from: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError>
+where
+ OffsetSize: OffsetSizeTrait,
+{
+ let array = from
+ .as_any()
+ .downcast_ref::<GenericStringArray<OffsetSize>>()
+ .unwrap();
+
+ let output_array = array
+ .iter()
+ .map(|value| match value {
+ Some(value) => match value.to_ascii_lowercase().trim() {
+ "t" | "tr" | "tru" | "true" | "y" | "ye" | "yes" | "on" | "1"
=> Ok(Some(true)),
+ "f" | "fa" | "fal" | "fals" | "false" | "n" | "no" | "of" |
"off" | "0" => {
+ Ok(Some(false))
+ }
+ invalid_value => match cast_options.safe {
+ true => Ok(None),
+ false => Err(ArrowError::CastError(format!(
+ "Cannot cast value '{invalid_value}' to value of
Boolean type",
+ ))),
+ },
+ },
+ None => Ok(None),
+ })
+ .collect::<Result<BooleanArray, _>>()?;
+
+ Ok(Arc::new(output_array))
+}