This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 57f79c03a Enable casting of string to timestamp with microsecond
resolution (#3752)
57f79c03a is described below
commit 57f79c03a8dee9d8bf8601bf555aa271746913fe
Author: Marko Grujic <[email protected]>
AuthorDate: Thu Feb 23 17:02:52 2023 +0100
Enable casting of string to timestamp with microsecond resolution (#3752)
* Enable casting of string to timestamp with microsecond resolution
* Enable string conversion to timestamp with second and millisecond
resolution
---
arrow-cast/src/cast.rs | 128 ++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 101 insertions(+), 27 deletions(-)
diff --git a/arrow-cast/src/cast.rs b/arrow-cast/src/cast.rs
index 49461b14c..d49775c98 100644
--- a/arrow-cast/src/cast.rs
+++ b/arrow-cast/src/cast.rs
@@ -166,6 +166,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Time32(TimeUnit::Millisecond)
| Time64(TimeUnit::Microsecond)
| Time64(TimeUnit::Nanosecond)
+ | Timestamp(TimeUnit::Second, _)
+ | Timestamp(TimeUnit::Millisecond, _)
+ | Timestamp(TimeUnit::Microsecond, _)
| Timestamp(TimeUnit::Nanosecond, _)
) => true,
(Utf8, _) => to_type.is_numeric() && to_type != &Float16,
@@ -179,6 +182,9 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool {
| Time32(TimeUnit::Millisecond)
| Time64(TimeUnit::Microsecond)
| Time64(TimeUnit::Nanosecond)
+ | Timestamp(TimeUnit::Second, _)
+ | Timestamp(TimeUnit::Millisecond, _)
+ | Timestamp(TimeUnit::Microsecond, _)
| Timestamp(TimeUnit::Nanosecond, _)
) => true,
(LargeUtf8, _) => to_type.is_numeric() && to_type != &Float16,
@@ -1141,8 +1147,17 @@ pub fn cast_with_options(
Time64(TimeUnit::Nanosecond) => {
cast_string_to_time64nanosecond::<i32>(array, cast_options)
}
+ Timestamp(TimeUnit::Second, _) => {
+ cast_string_to_timestamp::<i32, TimestampSecondType>(array, cast_options)
+ }
+ Timestamp(TimeUnit::Millisecond, _) => {
+ cast_string_to_timestamp::<i32, TimestampMillisecondType>(array, cast_options)
+ }
+ Timestamp(TimeUnit::Microsecond, _) => {
+ cast_string_to_timestamp::<i32, TimestampMicrosecondType>(array, cast_options)
+ }
Timestamp(TimeUnit::Nanosecond, _) => {
- cast_string_to_timestamp_ns::<i32>(array, cast_options)
+ cast_string_to_timestamp::<i32, TimestampNanosecondType>(array, cast_options)
}
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
@@ -1182,8 +1197,17 @@ pub fn cast_with_options(
Time64(TimeUnit::Nanosecond) => {
cast_string_to_time64nanosecond::<i64>(array, cast_options)
}
+ Timestamp(TimeUnit::Second, _) => {
+ cast_string_to_timestamp::<i64, TimestampSecondType>(array, cast_options)
+ }
+ Timestamp(TimeUnit::Millisecond, _) => {
+ cast_string_to_timestamp::<i64, TimestampMillisecondType>(array, cast_options)
+ }
+ Timestamp(TimeUnit::Microsecond, _) => {
+ cast_string_to_timestamp::<i64, TimestampMicrosecondType>(array, cast_options)
+ }
Timestamp(TimeUnit::Nanosecond, _) => {
- cast_string_to_timestamp_ns::<i64>(array, cast_options)
+ cast_string_to_timestamp::<i64, TimestampNanosecondType>(array, cast_options)
}
_ => Err(ArrowError::CastError(format!(
"Casting from {from_type:?} to {to_type:?} not supported",
@@ -2552,8 +2576,11 @@ fn cast_string_to_time64nanosecond<Offset: OffsetSizeTrait>(
Ok(Arc::new(array) as ArrayRef)
}
-/// Casts generic string arrays to TimeStampNanosecondArray
-fn cast_string_to_timestamp_ns<Offset: OffsetSizeTrait>(
+/// Casts generic string arrays to an ArrowTimestampType (TimeStampNanosecondArray, etc.)
+fn cast_string_to_timestamp<
+ Offset: OffsetSizeTrait,
+ TimestampType: ArrowTimestampType<Native = i64>,
+>(
array: &dyn Array,
cast_options: &CastOptions,
) -> Result<ArrayRef, ArrowError> {
@@ -2562,26 +2589,36 @@ fn cast_string_to_timestamp_ns<Offset: OffsetSizeTrait>(
.downcast_ref::<GenericStringArray<Offset>>()
.unwrap();
+ let scale_factor = match TimestampType::get_time_unit() {
+ TimeUnit::Second => 1_000_000_000,
+ TimeUnit::Millisecond => 1_000_000,
+ TimeUnit::Microsecond => 1_000,
+ TimeUnit::Nanosecond => 1,
+ };
+
let array = if cast_options.safe {
- let iter = string_array
- .iter()
- .map(|v| v.and_then(|v| string_to_timestamp_nanos(v).ok()));
+ let iter = string_array.iter().map(|v| {
+ v.and_then(|v| string_to_timestamp_nanos(v).ok().map(|t| t / scale_factor))
+ });
// Benefit:
// 20% performance improvement
// Soundness:
// The iterator is trustedLen because it comes from an `StringArray`.
- unsafe { TimestampNanosecondArray::from_trusted_len_iter(iter) }
+ unsafe { PrimitiveArray::<TimestampType>::from_trusted_len_iter(iter) }
} else {
let vec = string_array
.iter()
- .map(|v| v.map(string_to_timestamp_nanos).transpose())
+ .map(|v| {
+ v.map(|v| string_to_timestamp_nanos(v).map(|t| t / scale_factor))
+ .transpose()
+ })
.collect::<Result<Vec<Option<i64>>, _>>()?;
// Benefit:
// 20% performance improvement
// Soundness:
// The iterator is trustedLen because it comes from an `StringArray`.
- unsafe { TimestampNanosecondArray::from_trusted_len_iter(vec.iter()) }
+ unsafe { PrimitiveArray::<TimestampType>::from_trusted_len_iter(vec.iter()) }
};
Ok(Arc::new(array) as ArrayRef)
@@ -4704,32 +4741,69 @@ mod tests {
#[test]
fn test_cast_string_to_timestamp() {
let a1 = Arc::new(StringArray::from(vec![
- Some("2020-09-08T12:00:00+00:00"),
+ Some("2020-09-08T12:00:00.123456789+00:00"),
Some("Not a valid date"),
None,
])) as ArrayRef;
let a2 = Arc::new(LargeStringArray::from(vec![
- Some("2020-09-08T12:00:00+00:00"),
+ Some("2020-09-08T12:00:00.123456789+00:00"),
Some("Not a valid date"),
None,
])) as ArrayRef;
for array in &[a1, a2] {
- let to_type = DataType::Timestamp(TimeUnit::Nanosecond, None);
- let b = cast(array, &to_type).unwrap();
- let c = b
- .as_any()
- .downcast_ref::<TimestampNanosecondArray>()
- .unwrap();
- assert_eq!(1599566400000000000, c.value(0));
- assert!(c.is_null(1));
- assert!(c.is_null(2));
+ for time_unit in &[
+ TimeUnit::Second,
+ TimeUnit::Millisecond,
+ TimeUnit::Microsecond,
+ TimeUnit::Nanosecond,
+ ] {
+ let to_type = DataType::Timestamp(time_unit.clone(), None);
+ let b = cast(array, &to_type).unwrap();
+
+ match time_unit {
+ TimeUnit::Second => {
+ let c =
+ b.as_any().downcast_ref::<TimestampSecondArray>().unwrap();
+ assert_eq!(1599566400, c.value(0));
+ assert!(c.is_null(1));
+ assert!(c.is_null(2));
+ }
+ TimeUnit::Millisecond => {
+ let c = b
+ .as_any()
+ .downcast_ref::<TimestampMillisecondArray>()
+ .unwrap();
+ assert_eq!(1599566400123, c.value(0));
+ assert!(c.is_null(1));
+ assert!(c.is_null(2));
+ }
+ TimeUnit::Microsecond => {
+ let c = b
+ .as_any()
+ .downcast_ref::<TimestampMicrosecondArray>()
+ .unwrap();
+ assert_eq!(1599566400123456, c.value(0));
+ assert!(c.is_null(1));
+ assert!(c.is_null(2));
+ }
+ TimeUnit::Nanosecond => {
+ let c = b
+ .as_any()
+ .downcast_ref::<TimestampNanosecondArray>()
+ .unwrap();
+ assert_eq!(1599566400123456789, c.value(0));
+ assert!(c.is_null(1));
+ assert!(c.is_null(2));
+ }
+ }
- let options = CastOptions { safe: false };
- let err = cast_with_options(array, &to_type, &options).unwrap_err();
- assert_eq!(
- err.to_string(),
- "Cast error: Error parsing 'Not a valid date' as timestamp"
- );
+ let options = CastOptions { safe: false };
+ let err = cast_with_options(array, &to_type, &options).unwrap_err();
+ assert_eq!(
+ err.to_string(),
+ "Cast error: Error parsing 'Not a valid date' as timestamp"
+ );
+ }
}
}