This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 8a5be1330e Enable casting from Utf8View (#6077)
8a5be1330e is described below
commit 8a5be1330e30e6dd7760dba910737550d760e612
Author: Andrew Duffy <[email protected]>
AuthorDate: Fri Jul 19 20:17:13 2024 +0100
Enable casting from Utf8View (#6077)
* Enable casting from Utf8View -> string or temporal types
* save
* implement casting utf8view -> timestamp/interval types, with tests
* fix clippy
* fmt
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
arrow-cast/src/cast/mod.rs | 150 ++++++++++++++++++++++++++++-------
arrow-cast/src/cast/string.rs | 177 +++++++++++++++++++++++++++++++++---------
2 files changed, 265 insertions(+), 62 deletions(-)
diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs
index c9de714e7d..1770157bcf 100644
--- a/arrow-cast/src/cast/mod.rs
+++ b/arrow-cast/src/cast/mod.rs
@@ -210,7 +210,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
(LargeBinary, Binary | Utf8 | LargeUtf8 | FixedSizeBinary(_) |
BinaryView) => true,
(FixedSizeBinary(_), Binary | LargeBinary) => true,
(
- Utf8 | LargeUtf8,
+ Utf8 | LargeUtf8 | Utf8View,
Binary
| LargeBinary
| Utf8
@@ -228,7 +228,6 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Interval(_),
) => true,
(Utf8 | LargeUtf8, Utf8View) => true,
- (Utf8View, Utf8 | LargeUtf8) => true,
(BinaryView, Binary | LargeBinary) => true,
(Utf8 | LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
(_, Utf8 | LargeUtf8) => from_type.is_primitive(),
@@ -1269,6 +1268,56 @@ pub fn cast_with_options(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
},
+ (Utf8View, _) => match to_type {
+ UInt8 => parse_string_view::<UInt8Type>(array, cast_options),
+ UInt16 => parse_string_view::<UInt16Type>(array, cast_options),
+ UInt32 => parse_string_view::<UInt32Type>(array, cast_options),
+ UInt64 => parse_string_view::<UInt64Type>(array, cast_options),
+ Int8 => parse_string_view::<Int8Type>(array, cast_options),
+ Int16 => parse_string_view::<Int16Type>(array, cast_options),
+ Int32 => parse_string_view::<Int32Type>(array, cast_options),
+ Int64 => parse_string_view::<Int64Type>(array, cast_options),
+ Float32 => parse_string_view::<Float32Type>(array, cast_options),
+ Float64 => parse_string_view::<Float64Type>(array, cast_options),
+ Date32 => parse_string_view::<Date32Type>(array, cast_options),
+ Date64 => parse_string_view::<Date64Type>(array, cast_options),
+ Binary => cast_view_to_byte::<StringViewType,
GenericBinaryType<i32>>(array),
+ LargeBinary => cast_view_to_byte::<StringViewType,
GenericBinaryType<i64>>(array),
+ Utf8 => cast_view_to_byte::<StringViewType,
GenericStringType<i32>>(array),
+ LargeUtf8 => cast_view_to_byte::<StringViewType,
GenericStringType<i64>>(array),
+ Time32(TimeUnit::Second) =>
parse_string_view::<Time32SecondType>(array, cast_options),
+ Time32(TimeUnit::Millisecond) => {
+ parse_string_view::<Time32MillisecondType>(array, cast_options)
+ }
+ Time64(TimeUnit::Microsecond) => {
+ parse_string_view::<Time64MicrosecondType>(array, cast_options)
+ }
+ Time64(TimeUnit::Nanosecond) => {
+ parse_string_view::<Time64NanosecondType>(array, cast_options)
+ }
+ Timestamp(TimeUnit::Second, to_tz) => {
+ cast_view_to_timestamp::<TimestampSecondType>(array, to_tz,
cast_options)
+ }
+ Timestamp(TimeUnit::Millisecond, to_tz) => {
+ cast_view_to_timestamp::<TimestampMillisecondType>(array,
to_tz, cast_options)
+ }
+ Timestamp(TimeUnit::Microsecond, to_tz) => {
+ cast_view_to_timestamp::<TimestampMicrosecondType>(array,
to_tz, cast_options)
+ }
+ Timestamp(TimeUnit::Nanosecond, to_tz) => {
+ cast_view_to_timestamp::<TimestampNanosecondType>(array,
to_tz, cast_options)
+ }
+ Interval(IntervalUnit::YearMonth) => {
+ cast_view_to_year_month_interval(array, cast_options)
+ }
+ Interval(IntervalUnit::DayTime) =>
cast_view_to_day_time_interval(array, cast_options),
+ Interval(IntervalUnit::MonthDayNano) => {
+ cast_view_to_month_day_nano_interval(array, cast_options)
+ }
+ _ => Err(ArrowError::CastError(format!(
+ "Casting from {from_type:?} to {to_type:?} not supported",
+ ))),
+ },
(LargeUtf8, _) => match to_type {
UInt8 => parse_string::<UInt8Type, i64>(array, cast_options),
UInt16 => parse_string::<UInt16Type, i64>(array, cast_options),
@@ -1365,8 +1414,6 @@ pub fn cast_with_options(
"Casting from {from_type:?} to {to_type:?} not supported",
))),
},
- (Utf8View, Utf8) => cast_view_to_byte::<StringViewType,
GenericStringType<i32>>(array),
- (Utf8View, LargeUtf8) => cast_view_to_byte::<StringViewType,
GenericStringType<i64>>(array),
(BinaryView, Binary) => cast_view_to_byte::<BinaryViewType,
GenericBinaryType<i32>>(array),
(BinaryView, LargeBinary) => {
cast_view_to_byte::<BinaryViewType, GenericBinaryType<i64>>(array)
@@ -3960,6 +4007,11 @@ mod tests {
#[test]
fn test_cast_string_to_timestamp() {
+ let a0 = Arc::new(StringViewArray::from(vec![
+ Some("2020-09-08T12:00:00.123456789+00:00"),
+ Some("Not a valid date"),
+ None,
+ ])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("2020-09-08T12:00:00.123456789+00:00"),
Some("Not a valid date"),
@@ -3970,7 +4022,7 @@ mod tests {
Some("Not a valid date"),
None,
])) as ArrayRef;
- for array in &[a1, a2] {
+ for array in &[a0, a1, a2] {
for time_unit in &[
TimeUnit::Second,
TimeUnit::Millisecond,
@@ -4039,6 +4091,11 @@ mod tests {
#[test]
fn test_cast_string_to_date32() {
+ let a0 = Arc::new(StringViewArray::from(vec![
+ Some("2018-12-25"),
+ Some("Not a valid date"),
+ None,
+ ])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("2018-12-25"),
Some("Not a valid date"),
@@ -4049,7 +4106,7 @@ mod tests {
Some("Not a valid date"),
None,
])) as ArrayRef;
- for array in &[a1, a2] {
+ for array in &[a0, a1, a2] {
let to_type = DataType::Date32;
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Date32Type>();
@@ -4071,30 +4128,47 @@ mod tests {
#[test]
fn test_cast_string_format_yyyymmdd_to_date32() {
- let a = Arc::new(StringArray::from(vec![
+ let a0 = Arc::new(StringViewArray::from(vec![
+ Some("2020-12-25"),
+ Some("20201117"),
+ ])) as ArrayRef;
+ let a1 = Arc::new(StringArray::from(vec![
+ Some("2020-12-25"),
+ Some("20201117"),
+ ])) as ArrayRef;
+ let a2 = Arc::new(LargeStringArray::from(vec![
Some("2020-12-25"),
Some("20201117"),
])) as ArrayRef;
- let to_type = DataType::Date32;
- let options = CastOptions {
- safe: false,
- format_options: FormatOptions::default(),
- };
- let result = cast_with_options(&a, &to_type, &options).unwrap();
- let c = result.as_primitive::<Date32Type>();
- assert_eq!(
- chrono::NaiveDate::from_ymd_opt(2020, 12, 25),
- c.value_as_date(0)
- );
- assert_eq!(
- chrono::NaiveDate::from_ymd_opt(2020, 11, 17),
- c.value_as_date(1)
- );
+ for array in &[a0, a1, a2] {
+ let to_type = DataType::Date32;
+ let options = CastOptions {
+ safe: false,
+ format_options: FormatOptions::default(),
+ };
+ let result = cast_with_options(&array, &to_type,
&options).unwrap();
+ let c = result.as_primitive::<Date32Type>();
+ assert_eq!(
+ chrono::NaiveDate::from_ymd_opt(2020, 12, 25),
+ c.value_as_date(0)
+ );
+ assert_eq!(
+ chrono::NaiveDate::from_ymd_opt(2020, 11, 17),
+ c.value_as_date(1)
+ );
+ }
}
#[test]
fn test_cast_string_to_time32second() {
+ let a0 = Arc::new(StringViewArray::from(vec![
+ Some("08:08:35.091323414"),
+ Some("08:08:60.091323414"), // leap second
+ Some("08:08:61.091323414"), // not valid
+ Some("Not a valid time"),
+ None,
+ ])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("08:08:35.091323414"),
Some("08:08:60.091323414"), // leap second
@@ -4109,7 +4183,7 @@ mod tests {
Some("Not a valid time"),
None,
])) as ArrayRef;
- for array in &[a1, a2] {
+ for array in &[a0, a1, a2] {
let to_type = DataType::Time32(TimeUnit::Second);
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Time32SecondType>();
@@ -4130,6 +4204,13 @@ mod tests {
#[test]
fn test_cast_string_to_time32millisecond() {
+ let a0 = Arc::new(StringViewArray::from(vec![
+ Some("08:08:35.091323414"),
+ Some("08:08:60.091323414"), // leap second
+ Some("08:08:61.091323414"), // not valid
+ Some("Not a valid time"),
+ None,
+ ])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("08:08:35.091323414"),
Some("08:08:60.091323414"), // leap second
@@ -4144,7 +4225,7 @@ mod tests {
Some("Not a valid time"),
None,
])) as ArrayRef;
- for array in &[a1, a2] {
+ for array in &[a0, a1, a2] {
let to_type = DataType::Time32(TimeUnit::Millisecond);
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Time32MillisecondType>();
@@ -4165,6 +4246,11 @@ mod tests {
#[test]
fn test_cast_string_to_time64microsecond() {
+ let a0 = Arc::new(StringViewArray::from(vec![
+ Some("08:08:35.091323414"),
+ Some("Not a valid time"),
+ None,
+ ])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("08:08:35.091323414"),
Some("Not a valid time"),
@@ -4175,7 +4261,7 @@ mod tests {
Some("Not a valid time"),
None,
])) as ArrayRef;
- for array in &[a1, a2] {
+ for array in &[a0, a1, a2] {
let to_type = DataType::Time64(TimeUnit::Microsecond);
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Time64MicrosecondType>();
@@ -4194,6 +4280,11 @@ mod tests {
#[test]
fn test_cast_string_to_time64nanosecond() {
+ let a0 = Arc::new(StringViewArray::from(vec![
+ Some("08:08:35.091323414"),
+ Some("Not a valid time"),
+ None,
+ ])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("08:08:35.091323414"),
Some("Not a valid time"),
@@ -4204,7 +4295,7 @@ mod tests {
Some("Not a valid time"),
None,
])) as ArrayRef;
- for array in &[a1, a2] {
+ for array in &[a0, a1, a2] {
let to_type = DataType::Time64(TimeUnit::Nanosecond);
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Time64NanosecondType>();
@@ -4223,6 +4314,11 @@ mod tests {
#[test]
fn test_cast_string_to_date64() {
+ let a0 = Arc::new(StringViewArray::from(vec![
+ Some("2020-09-08T12:00:00"),
+ Some("Not a valid date"),
+ None,
+ ])) as ArrayRef;
let a1 = Arc::new(StringArray::from(vec![
Some("2020-09-08T12:00:00"),
Some("Not a valid date"),
@@ -4233,7 +4329,7 @@ mod tests {
Some("Not a valid date"),
None,
])) as ArrayRef;
- for array in &[a1, a2] {
+ for array in &[a0, a1, a2] {
let to_type = DataType::Date64;
let b = cast(array, &to_type).unwrap();
let c = b.as_primitive::<Date64Type>();
diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs
index 4b83a2a5e7..7d0e7e21c8 100644
--- a/arrow-cast/src/cast/string.rs
+++ b/arrow-cast/src/cast/string.rs
@@ -16,6 +16,7 @@
// under the License.
use crate::cast::*;
+use arrow_buffer::NullBuffer;
pub(crate) fn value_to_string<O: OffsetSizeTrait>(
array: &dyn Array,
@@ -43,8 +44,34 @@ pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
let string_array = array.as_string::<O>();
+ parse_string_iter::<P, _, _>(string_array.iter(), cast_options, || {
+ string_array.nulls().cloned()
+ })
+}
+
+/// Parse UTF-8 View
+pub(crate) fn parse_string_view<P: Parser>(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let string_view_array = array.as_string_view();
+ parse_string_iter::<P, _, _>(string_view_array.iter(), cast_options, || {
+ string_view_array.nulls().cloned()
+ })
+}
+
+fn parse_string_iter<
+ 'a,
+ P: Parser,
+ I: Iterator<Item = Option<&'a str>>,
+ F: FnOnce() -> Option<NullBuffer>,
+>(
+ iter: I,
+ cast_options: &CastOptions,
+ nulls: F,
+) -> Result<ArrayRef, ArrowError> {
let array = if cast_options.safe {
- let iter = string_array.iter().map(|x| x.and_then(P::parse));
+ let iter = iter.map(|x| x.and_then(P::parse));
// Benefit:
// 20% performance improvement
@@ -52,8 +79,7 @@ pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
// The iterator is trustedLen because it comes from an `StringArray`.
unsafe { PrimitiveArray::<P>::from_trusted_len_iter(iter) }
} else {
- let v = string_array
- .iter()
+ let v = iter
.map(|x| match x {
Some(v) => P::parse(v).ok_or_else(|| {
ArrowError::CastError(format!(
@@ -65,7 +91,7 @@ pub(crate) fn parse_string<P: Parser, O: OffsetSizeTrait>(
None => Ok(P::Native::default()),
})
.collect::<Result<Vec<_>, ArrowError>>()?;
- PrimitiveArray::new(v.into(), string_array.nulls().cloned())
+ PrimitiveArray::new(v.into(), nulls())
};
Ok(Arc::new(array) as ArrayRef)
@@ -81,20 +107,42 @@ pub(crate) fn cast_string_to_timestamp<O: OffsetSizeTrait, T: ArrowTimestampType
let out: PrimitiveArray<T> = match to_tz {
Some(tz) => {
let tz: Tz = tz.as_ref().parse()?;
- cast_string_to_timestamp_impl(array, &tz, cast_options)?
+ cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
+ }
+ None => cast_string_to_timestamp_impl(array.iter(), &Utc,
cast_options)?,
+ };
+ Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
+}
+
+/// Casts string view arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
+pub(crate) fn cast_view_to_timestamp<T: ArrowTimestampType>(
+ array: &dyn Array,
+ to_tz: &Option<Arc<str>>,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ let array = array.as_string_view();
+ let out: PrimitiveArray<T> = match to_tz {
+ Some(tz) => {
+ let tz: Tz = tz.as_ref().parse()?;
+ cast_string_to_timestamp_impl(array.iter(), &tz, cast_options)?
}
- None => cast_string_to_timestamp_impl(array, &Utc, cast_options)?,
+ None => cast_string_to_timestamp_impl(array.iter(), &Utc,
cast_options)?,
};
Ok(Arc::new(out.with_timezone_opt(to_tz.clone())))
}
-fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType,
Tz: TimeZone>(
- array: &GenericStringArray<O>,
+fn cast_string_to_timestamp_impl<
+ 'a,
+ I: Iterator<Item = Option<&'a str>>,
+ T: ArrowTimestampType,
+ Tz: TimeZone,
+>(
+ iter: I,
tz: &Tz,
cast_options: &CastOptions,
) -> Result<PrimitiveArray<T>, ArrowError> {
if cast_options.safe {
- let iter = array.iter().map(|v| {
+ let iter = iter.map(|v| {
v.and_then(|v| {
let naive = string_to_datetime(tz, v).ok()?.naive_utc();
T::make_value(naive)
@@ -107,8 +155,7 @@ fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType, Tz:
Ok(unsafe { PrimitiveArray::from_trusted_len_iter(iter) })
} else {
- let vec = array
- .iter()
+ let vec = iter
.map(|v| {
v.map(|v| {
let naive = string_to_datetime(tz, v)?.naive_utc();
@@ -122,7 +169,7 @@ fn cast_string_to_timestamp_impl<O: OffsetSizeTrait, T: ArrowTimestampType, Tz:
))
})
})
- .transpose()
+ .transpose()
})
.collect::<Result<Vec<Option<i64>>, _>>()?;
@@ -148,29 +195,11 @@ where
.as_any()
.downcast_ref::<GenericStringArray<Offset>>()
.unwrap();
- let interval_array = if cast_options.safe {
- let iter = string_array
- .iter()
- .map(|v| v.and_then(|v| parse_function(v).ok()));
-
- // Benefit:
- // 20% performance improvement
- // Soundness:
- // The iterator is trustedLen because it comes from an `StringArray`.
- unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
- } else {
- let vec = string_array
- .iter()
- .map(|v| v.map(parse_function).transpose())
- .collect::<Result<Vec<_>, ArrowError>>()?;
-
- // Benefit:
- // 20% performance improvement
- // Soundness:
- // The iterator is trustedLen because it comes from an `StringArray`.
- unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
- };
- Ok(Arc::new(interval_array) as ArrayRef)
+ cast_string_to_interval_impl::<_, ArrowType, F>(
+ string_array.iter(),
+ cast_options,
+ parse_function,
+ )
}
pub(crate) fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
@@ -206,6 +235,84 @@ pub(crate) fn cast_string_to_month_day_nano_interval<Offset: OffsetSizeTrait>(
)
}
+pub(crate) fn cast_view_to_interval<F, ArrowType>(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+ parse_function: F,
+) -> Result<ArrayRef, ArrowError>
+where
+ ArrowType: ArrowPrimitiveType,
+ F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
+{
+ let string_view_array =
array.as_any().downcast_ref::<StringViewArray>().unwrap();
+ cast_string_to_interval_impl::<_, ArrowType, F>(
+ string_view_array.iter(),
+ cast_options,
+ parse_function,
+ )
+}
+
+pub(crate) fn cast_view_to_year_month_interval(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ cast_view_to_interval::<_, IntervalYearMonthType>(
+ array,
+ cast_options,
+ parse_interval_year_month,
+ )
+}
+
+pub(crate) fn cast_view_to_day_time_interval(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ cast_view_to_interval::<_, IntervalDayTimeType>(array, cast_options,
parse_interval_day_time)
+}
+
+pub(crate) fn cast_view_to_month_day_nano_interval(
+ array: &dyn Array,
+ cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+ cast_view_to_interval::<_, IntervalMonthDayNanoType>(
+ array,
+ cast_options,
+ parse_interval_month_day_nano,
+ )
+}
+
+fn cast_string_to_interval_impl<'a, I, ArrowType, F>(
+ iter: I,
+ cast_options: &CastOptions,
+ parse_function: F,
+) -> Result<ArrayRef, ArrowError>
+where
+ I: Iterator<Item = Option<&'a str>>,
+ ArrowType: ArrowPrimitiveType,
+ F: Fn(&str) -> Result<ArrowType::Native, ArrowError> + Copy,
+{
+ let interval_array = if cast_options.safe {
+ let iter = iter.map(|v| v.and_then(|v| parse_function(v).ok()));
+
+ // Benefit:
+ // 20% performance improvement
+ // Soundness:
+ // The iterator is trustedLen because it comes from an `StringArray`.
+ unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(iter) }
+ } else {
+ let vec = iter
+ .map(|v| v.map(parse_function).transpose())
+ .collect::<Result<Vec<_>, ArrowError>>()?;
+
+ // Benefit:
+ // 20% performance improvement
+ // Soundness:
+ // The iterator is trustedLen because it comes from an `StringArray`.
+ unsafe { PrimitiveArray::<ArrowType>::from_trusted_len_iter(vec) }
+ };
+ Ok(Arc::new(interval_array) as ArrayRef)
+}
+
/// A specified helper to cast from `GenericBinaryArray` to `GenericStringArray` when they have same
/// offset size so re-encoding offset is unnecessary.
pub(crate) fn cast_binary_to_string<O: OffsetSizeTrait>(