jorgecarleitao commented on a change in pull request #8142:
URL: https://github.com/apache/arrow/pull/8142#discussion_r485333061



##########
File path: rust/datafusion/src/physical_plan/datetime_expressions.rs
##########
@@ -0,0 +1,225 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! DateTime expressions
+
+use crate::error::{ExecutionError, Result};
+use arrow::array::{Array, ArrayRef, StringArray, TimestampNanosecondArray};
+use chrono::prelude::*;
+
+/// Parses a timestamp string into nanoseconds since the Unix epoch.
+///
+/// The following layouts are tried in order; the first one that
+/// matches wins:
+///
+/// * RFC3339, e.g. `2020-09-08T13:42:29.190855Z`
+/// * `' '` separator with a UTC offset, e.g. `2020-09-08 13:42:29.190855-05:00`
+/// * `' '` separator with a literal `Z`, e.g. `2020-09-08 13:42:29Z`
+/// * `T` separator without an offset, e.g. `2020-09-08T13:42:29.190855`
+/// * `' '` separator without an offset, e.g. `2020-09-08 13:42:29.190855`
+///
+/// The fractional seconds part is optional in every layout.
+///
+/// # Errors
+///
+/// Returns `ExecutionError::General("Timestamp parse error")` when `s`
+/// matches none of the layouts above.
+#[inline]
+fn string_to_timestamp_nanos(s: &str) -> Result<i64> {
+    // Fast path:  RFC3339 timestamp (with a T)
+    // Example: 2020-09-08T13:42:29.190855Z
+    if let Ok(ts) = DateTime::parse_from_rfc3339(s) {
+        return Ok(ts.timestamp_nanos());
+    }
+
+    // Implement quasi-RFC3339 support by trying to parse the
+    // timestamp with various other format specifiers to support
+    // separating the date and time with a space ' ' rather than 'T' to be
+    // (more) compatible with Apache Spark SQL
+
+    // timezone offset, using ' ' as a separator
+    // Example: 2020-09-08 13:42:29.190855-05:00
+    if let Ok(ts) = DateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f%:z") {
+        return Ok(ts.timestamp_nanos());
+    }
+
+    // with an explicit Z, using ' ' as a separator
+    // Example: 2020-09-08 13:42:29Z
+    if let Ok(ts) = Utc.datetime_from_str(s, "%Y-%m-%d %H:%M:%S%.fZ") {
+        return Ok(ts.timestamp_nanos());
+    }
+
+    // Support timestamps without an explicit timezone offset, again
+    // to be compatible with what Apache Spark SQL does.
+    //
+    // NOTE: `NaiveDateTime::timestamp_nanos` applies no offset, so
+    // these values are effectively interpreted as UTC, not as the
+    // machine's local time.
+    //
+    // The fraction is parsed with `%.f` (a decimal fraction of a
+    // second, same as the branches above) rather than `.%f`: `%f`
+    // reads the digits as a whole number of nanoseconds, which would
+    // turn `.190855` into 190855ns instead of 190855us.
+
+    // without a timezone specifier, using T as a separator
+    // Example: 2020-09-08T13:42:29.190855
+    if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%dT%H:%M:%S%.f") {
+        return Ok(ts.timestamp_nanos());
+    }
+
+    // without a timezone specifier, using ' ' as a separator
+    // Example: 2020-09-08 13:42:29.190855
+    if let Ok(ts) = NaiveDateTime::parse_from_str(s, "%Y-%m-%d %H:%M:%S%.f") {
+        return Ok(ts.timestamp_nanos());
+    }
+
+    // Note we don't pass along the error message from the underlying
+    // chrono parsing because we tried several different format
+    // strings and we don't know which the user was trying to
+    // match. Thus any of the specific error messages is likely to
+    // be more confusing than helpful
+    Err(ExecutionError::General("Timestamp parse error".into()))
+}
+
+/// convert an array of strings into `Timestamp(Nanosecond, None)`
+pub fn to_timestamp(args: &[ArrayRef]) -> Result<TimestampNanosecondArray> {
+    let num_rows = args[0].len();
+    let mut ts_builder = TimestampNanosecondArray::builder(num_rows);
+    let string_args = &args[0]
+        .as_any()
+        .downcast_ref::<StringArray>()
+        .expect("input cast to StringArray failed");
+
+    for i in 0..string_args.len() {

Review comment:
       I think that we get a massive performance boost if we build `ArrayData` 
here (learning from @nevi-me :P). 
   
   @nevi-me did this recently 
[here](https://github.com/apache/arrow/commit/9ea24092064205b9966c4e08da50ea344bf042e5#diff-084bc9b19a2397f6ba80602e2d136833R33),
 e.g.
   
   ```rust
   
   let num_rows = args[0].len();
   let string_args = &args[0]
       .as_any()
       .downcast_ref::<StringArray>()
       .expect("input cast to StringArray failed");
   
   let result = (0..num_rows).map(|i| 
string_to_timestamp_nanos(string_args.value(i))).collect::<Result<Vec<i64>>>()?;
   
   let data = ArrayData::new(
       DataType::Timestamp(TimeUnit::Nanosecond, None),
       num_rows,
       Some(string_args.null_count()),
       string_args.data().null_buffer().cloned(),
       0,
       vec![Buffer::from(result.to_byte_slice())],
       vec![],
   );
   Ok(make_array(Arc::new(data)))
   ```




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to