This is an automated email from the ASF dual-hosted git repository.
nevime pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 0f64726 ARROW-11778: [Rust] Cast from LargeUtf8 to Numerical and
temporal types
0f64726 is described below
commit 0f647261058892289b1722e5883f8610cc5dd3d9
Author: Ritchie Vink <[email protected]>
AuthorDate: Sat Feb 27 11:05:46 2021 +0200
ARROW-11778: [Rust] Cast from LargeUtf8 to Numerical and temporal types
Sorry that the PRs are not more clustered, but they occur to me in the
wild.
This PR allows casting from LargeUtf8 to numerical and temporal types. It
also modifies the already existing string-to-temporal casts so that they use
the new, faster `from_trusted_len_iter` iterator API.
Closes #9571 from ritchie46/large_utf8_to_numerical
Lead-authored-by: Ritchie Vink <[email protected]>
Co-authored-by: Neville Dipale <[email protected]>
Signed-off-by: Neville Dipale <[email protected]>
---
rust/arrow/src/compute/kernels/cast.rs | 227 ++++++++++++++++++++++-----------
1 file changed, 153 insertions(+), 74 deletions(-)
diff --git a/rust/arrow/src/compute/kernels/cast.rs
b/rust/arrow/src/compute/kernels/cast.rs
index 9a547bd..0d8dc82 100644
--- a/rust/arrow/src/compute/kernels/cast.rs
+++ b/rust/arrow/src/compute/kernels/cast.rs
@@ -88,6 +88,10 @@ pub fn can_cast_types(from_type: &DataType, to_type:
&DataType) -> bool {
(Utf8, Date64) => true,
(Utf8, Timestamp(TimeUnit::Nanosecond, None)) => true,
(Utf8, _) => DataType::is_numeric(to_type),
+ (LargeUtf8, Date32) => true,
+ (LargeUtf8, Date64) => true,
+ (LargeUtf8, Timestamp(TimeUnit::Nanosecond, None)) => true,
+ (LargeUtf8, _) => DataType::is_numeric(to_type),
(_, Utf8) | (_, LargeUtf8) => {
DataType::is_numeric(from_type) || from_type == &Binary
}
@@ -366,66 +370,20 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) ->
Result<ArrayRef> {
},
(Utf8, _) => match to_type {
LargeUtf8 => cast_str_container::<i32, i64>(&**array),
- UInt8 => cast_string_to_numeric::<UInt8Type>(array),
- UInt16 => cast_string_to_numeric::<UInt16Type>(array),
- UInt32 => cast_string_to_numeric::<UInt32Type>(array),
- UInt64 => cast_string_to_numeric::<UInt64Type>(array),
- Int8 => cast_string_to_numeric::<Int8Type>(array),
- Int16 => cast_string_to_numeric::<Int16Type>(array),
- Int32 => cast_string_to_numeric::<Int32Type>(array),
- Int64 => cast_string_to_numeric::<Int64Type>(array),
- Float32 => cast_string_to_numeric::<Float32Type>(array),
- Float64 => cast_string_to_numeric::<Float64Type>(array),
- Date32 => {
- use chrono::Datelike;
- let string_array =
array.as_any().downcast_ref::<StringArray>().unwrap();
- let mut builder =
PrimitiveBuilder::<Date32Type>::new(string_array.len());
- for i in 0..string_array.len() {
- if string_array.is_null(i) {
- builder.append_null()?;
- } else {
- match
string_array.value(i).parse::<chrono::NaiveDate>() {
- Ok(date) => builder.append_value(
- date.num_days_from_ce() - EPOCH_DAYS_FROM_CE,
- )?,
- Err(_) => builder.append_null()?, // not a valid
date
- };
- }
- }
- Ok(Arc::new(builder.finish()) as ArrayRef)
- }
- Date64 => {
- let string_array =
array.as_any().downcast_ref::<StringArray>().unwrap();
- let mut builder =
PrimitiveBuilder::<Date64Type>::new(string_array.len());
- for i in 0..string_array.len() {
- if string_array.is_null(i) {
- builder.append_null()?;
- } else {
- match
string_array.value(i).parse::<chrono::NaiveDateTime>() {
- Ok(date_time) => {
-
builder.append_value(date_time.timestamp_millis())?
- }
- Err(_) => builder.append_null()?, // not a valid
date
- };
- }
- }
- Ok(Arc::new(builder.finish()) as ArrayRef)
- }
+ UInt8 => cast_string_to_numeric::<UInt8Type, i32>(array),
+ UInt16 => cast_string_to_numeric::<UInt16Type, i32>(array),
+ UInt32 => cast_string_to_numeric::<UInt32Type, i32>(array),
+ UInt64 => cast_string_to_numeric::<UInt64Type, i32>(array),
+ Int8 => cast_string_to_numeric::<Int8Type, i32>(array),
+ Int16 => cast_string_to_numeric::<Int16Type, i32>(array),
+ Int32 => cast_string_to_numeric::<Int32Type, i32>(array),
+ Int64 => cast_string_to_numeric::<Int64Type, i32>(array),
+ Float32 => cast_string_to_numeric::<Float32Type, i32>(array),
+ Float64 => cast_string_to_numeric::<Float64Type, i32>(array),
+ Date32 => cast_string_to_date32::<i32>(&**array),
+ Date64 => cast_string_to_date64::<i32>(&**array),
Timestamp(TimeUnit::Nanosecond, None) => {
- let string_array =
array.as_any().downcast_ref::<StringArray>().unwrap();
- let mut builder =
-
PrimitiveBuilder::<TimestampNanosecondType>::new(string_array.len());
- for i in 0..string_array.len() {
- if string_array.is_null(i) {
- builder.append_null()?;
- } else {
- match string_to_timestamp_nanos(string_array.value(i))
{
- Ok(nanos) => builder.append_value(nanos)?,
- Err(_) => builder.append_null()?, // not a valid
date
- };
- }
- }
- Ok(Arc::new(builder.finish()) as ArrayRef)
+ cast_string_to_timestamp_ns::<i32>(&**array)
}
_ => Err(ArrowError::ComputeError(format!(
"Casting from {:?} to {:?} not supported",
@@ -487,6 +445,27 @@ pub fn cast(array: &ArrayRef, to_type: &DataType) ->
Result<ArrayRef> {
from_type, to_type,
))),
},
+ (LargeUtf8, _) => match to_type {
+ UInt8 => cast_string_to_numeric::<UInt8Type, i64>(array),
+ UInt16 => cast_string_to_numeric::<UInt16Type, i64>(array),
+ UInt32 => cast_string_to_numeric::<UInt32Type, i64>(array),
+ UInt64 => cast_string_to_numeric::<UInt64Type, i64>(array),
+ Int8 => cast_string_to_numeric::<Int8Type, i64>(array),
+ Int16 => cast_string_to_numeric::<Int16Type, i64>(array),
+ Int32 => cast_string_to_numeric::<Int32Type, i64>(array),
+ Int64 => cast_string_to_numeric::<Int64Type, i64>(array),
+ Float32 => cast_string_to_numeric::<Float32Type, i64>(array),
+ Float64 => cast_string_to_numeric::<Float64Type, i64>(array),
+ Date32 => cast_string_to_date32::<i64>(&**array),
+ Date64 => cast_string_to_date64::<i64>(&**array),
+ Timestamp(TimeUnit::Nanosecond, None) => {
+ cast_string_to_timestamp_ns::<i64>(&**array)
+ }
+ _ => Err(ArrowError::ComputeError(format!(
+ "Casting from {:?} to {:?} not supported",
+ from_type, to_type,
+ ))),
+ },
// start numeric casts
(UInt8, UInt16) => cast_numeric_arrays::<UInt8Type, UInt16Type>(array),
@@ -949,17 +928,23 @@ where
/// Cast numeric types to Utf8
#[allow(clippy::unnecessary_wraps)]
-fn cast_string_to_numeric<T>(from: &ArrayRef) -> Result<ArrayRef>
+fn cast_string_to_numeric<T, Offset: StringOffsetSizeTrait>(
+ from: &ArrayRef,
+) -> Result<ArrayRef>
where
T: ArrowNumericType,
<T as ArrowPrimitiveType>::Native: lexical_core::FromLexical,
{
- Ok(Arc::new(string_to_numeric_cast::<T>(
- from.as_any().downcast_ref::<StringArray>().unwrap(),
+ Ok(Arc::new(string_to_numeric_cast::<T, Offset>(
+ from.as_any()
+ .downcast_ref::<GenericStringArray<Offset>>()
+ .unwrap(),
)))
}
-fn string_to_numeric_cast<T>(from: &StringArray) -> PrimitiveArray<T>
+fn string_to_numeric_cast<T, Offset: StringOffsetSizeTrait>(
+ from: &GenericStringArray<Offset>,
+) -> PrimitiveArray<T>
where
T: ArrowNumericType,
<T as ArrowPrimitiveType>::Native: lexical_core::FromLexical,
@@ -978,6 +963,93 @@ where
unsafe { PrimitiveArray::<T>::from_trusted_len_iter(iter) }
}
+/// Casts generic string arrays to Date32Array
+#[allow(clippy::unnecessary_wraps)]
+fn cast_string_to_date32<Offset: StringOffsetSizeTrait>(
+ array: &dyn Array,
+) -> Result<ArrayRef> {
+ use chrono::Datelike;
+ let string_array = array
+ .as_any()
+ .downcast_ref::<GenericStringArray<Offset>>()
+ .unwrap();
+
+ let iter = (0..string_array.len()).map(|i| {
+ if string_array.is_null(i) {
+ None
+ } else {
+ string_array
+ .value(i)
+ .parse::<chrono::NaiveDate>()
+ .map(|date| date.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
+ .ok()
+ }
+ });
+
+ // Benefit:
+ // 20% performance improvement
+ // Soundness:
+ // The iterator is trustedLen because it comes from an `StringArray`.
+ let array = unsafe { Date32Array::from_trusted_len_iter(iter) };
+ Ok(Arc::new(array) as ArrayRef)
+}
+
+/// Casts generic string arrays to Date64Array
+#[allow(clippy::unnecessary_wraps)]
+fn cast_string_to_date64<Offset: StringOffsetSizeTrait>(
+ array: &dyn Array,
+) -> Result<ArrayRef> {
+ let string_array = array
+ .as_any()
+ .downcast_ref::<GenericStringArray<Offset>>()
+ .unwrap();
+
+ let iter = (0..string_array.len()).map(|i| {
+ if string_array.is_null(i) {
+ None
+ } else {
+ string_array
+ .value(i)
+ .parse::<chrono::NaiveDateTime>()
+ .map(|datetime| datetime.timestamp_millis())
+ .ok()
+ }
+ });
+
+ // Benefit:
+ // 20% performance improvement
+ // Soundness:
+ // The iterator is trustedLen because it comes from an `StringArray`.
+ let array = unsafe { Date64Array::from_trusted_len_iter(iter) };
+ Ok(Arc::new(array) as ArrayRef)
+}
+
+/// Casts generic string arrays to TimeStampNanosecondArray
+#[allow(clippy::unnecessary_wraps)]
+fn cast_string_to_timestamp_ns<Offset: StringOffsetSizeTrait>(
+ array: &dyn Array,
+) -> Result<ArrayRef> {
+ let string_array = array
+ .as_any()
+ .downcast_ref::<GenericStringArray<Offset>>()
+ .unwrap();
+
+ let iter = (0..string_array.len()).map(|i| {
+ if string_array.is_null(i) {
+ None
+ } else {
+ string_to_timestamp_nanos(string_array.value(i)).ok()
+ }
+ });
+
+ // Benefit:
+ // 20% performance improvement
+ // Soundness:
+ // The iterator is trustedLen because it comes from an `StringArray`.
+ let array = unsafe { TimestampNanosecondArray::from_trusted_len_iter(iter)
};
+ Ok(Arc::new(array) as ArrayRef)
+}
+
/// Cast numeric types to Boolean
///
/// Any zero value returns `false` while non-zero returns `true`
@@ -1719,20 +1791,27 @@ mod tests {
#[test]
fn test_cast_string_to_timestamp() {
- let a = StringArray::from(vec![
+ let a1 = Arc::new(StringArray::from(vec![
Some("2020-09-08T12:00:00+00:00"),
Some("Not a valid date"),
None,
- ]);
- let array = Arc::new(a) as ArrayRef;
- let b = cast(&array, &DataType::Timestamp(TimeUnit::Nanosecond,
None)).unwrap();
- let c = b
- .as_any()
- .downcast_ref::<TimestampNanosecondArray>()
- .unwrap();
- assert_eq!(1599566400000000000, c.value(0));
- assert!(c.is_null(1));
- assert!(c.is_null(2));
+ ])) as ArrayRef;
+ let a2 = Arc::new(LargeStringArray::from(vec![
+ Some("2020-09-08T12:00:00+00:00"),
+ Some("Not a valid date"),
+ None,
+ ])) as ArrayRef;
+ for array in &[a1, a2] {
+ let b =
+ cast(array, &DataType::Timestamp(TimeUnit::Nanosecond,
None)).unwrap();
+ let c = b
+ .as_any()
+ .downcast_ref::<TimestampNanosecondArray>()
+ .unwrap();
+ assert_eq!(1599566400000000000, c.value(0));
+ assert!(c.is_null(1));
+ assert!(c.is_null(2));
+ }
}
#[test]