alamb commented on code in PR #3762:
URL: https://github.com/apache/arrow-rs/pull/3762#discussion_r1123436418


##########
arrow-cast/src/cast.rs:
##########
@@ -2624,6 +2647,75 @@ fn cast_string_to_timestamp<
     Ok(Arc::new(array) as ArrayRef)
 }
 
+fn cast_string_to_year_month_interval<Offset: OffsetSizeTrait>(
+    array: &dyn Array,
+    cast_options: &CastOptions,
+) -> Result<ArrayRef, ArrowError> {
+    let string_array = array
+        .as_any()
+        .downcast_ref::<GenericStringArray<Offset>>()
+        .unwrap();
+    let interval_array = if cast_options.safe {
+        let iter = string_array
+            .iter()
+            .map(|v| v.and_then(|v| parse_interval_year_month(v).ok()));
+        unsafe { IntervalYearMonthArray::from_trusted_len_iter(iter) }

Review Comment:
   I think we should add some justification for using `unsafe` in the code 
below (which is that the iterator came from a string_array so it has a known 
good length)



##########
arrow-cast/src/test_util.rs:
##########
@@ -0,0 +1,64 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Utility functions to make testing DataFusion based crates easier
+
+/// A macro to assert that one string is contained within another with
+/// a nice error message if they are not.
+///
+/// Usage: `assert_contains!(actual, expected)`
+///
+/// Is a macro so test error
+/// messages are on the same line as the failure;
+///
+/// Both arguments must be convertable into Strings ([`Into`]<[`String`]>)
+#[macro_export]
+macro_rules! assert_contains {
+    ($ACTUAL: expr, $EXPECTED: expr) => {
+        let actual_value: String = $ACTUAL.into();
+        let expected_value: String = $EXPECTED.into();
+        assert!(
+            actual_value.contains(&expected_value),
+            "Can not find expected in actual.\n\nExpected:\n{}\n\nActual:\n{}",
+            expected_value,
+            actual_value
+        );
+    };
+}
+
+/// A macro to assert that one string is NOT contained within another with
+/// a nice error message if they are are.
+///
+/// Usage: `assert_not_contains!(actual, unexpected)`
+///
+/// Is a macro so test error
+/// messages are on the same line as the failure;
+///
+/// Both arguments must be convertable into Strings ([`Into`]<[`String`]>)
+#[macro_export]
+macro_rules! assert_not_contains {

Review Comment:
   This does not appear to be used



##########
arrow-cast/src/parse.rs:
##########
@@ -445,9 +446,229 @@ impl Parser for Date64Type {
     }
 }
 
+pub(crate) fn parse_interval_year_month(
+    value: &str,
+) -> Result<<IntervalYearMonthType as ArrowPrimitiveType>::Native, ArrowError> 
{
+    let (result_months, result_days, result_nanos) = parse_interval("years", 
value)?;
+    if result_days != 0 || result_nanos != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast ${value} to IntervalYearMonth because the value isn't 
multiple of months"
+        )));
+    }
+    Ok(IntervalYearMonthType::make_value(0, result_months))
+}
+
+pub(crate) fn parse_interval_day_time(
+    value: &str,
+) -> Result<<IntervalDayTimeType as ArrowPrimitiveType>::Native, ArrowError> {
+    let (result_months, mut result_days, result_nanos) = 
parse_interval("days", value)?;
+    if result_nanos % 1_000_000 != 0 {
+        return Err(ArrowError::CastError(format!(
+            "Cannot cast ${value} to IntervalDayTime because the nanos part 
isn't multiple of milliseconds"
+        )));
+    }
+    result_days += result_months * 30;
+    Ok(IntervalDayTimeType::make_value(
+        result_days,
+        (result_nanos / 1_000_000) as i32,
+    ))
+}
+
+pub(crate) fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, 
ArrowError> {
+    let (result_months, result_days, result_nanos) = parse_interval("months", 
value)?;
+    Ok(IntervalMonthDayNanoType::make_value(
+        result_months,
+        result_days,
+        result_nanos,
+    ))
+}
+
+const SECONDS_PER_HOUR: f64 = 3_600_f64;
+const NANOS_PER_MILLIS: f64 = 1_000_000_f64;
+const NANOS_PER_SECOND: f64 = 1_000_f64 * NANOS_PER_MILLIS;
+#[cfg(test)]
+const NANOS_PER_MINUTE: f64 = 60_f64 * NANOS_PER_SECOND;
+#[cfg(test)]
+const NANOS_PER_HOUR: f64 = 60_f64 * NANOS_PER_MINUTE;
+
+#[derive(Clone, Copy)]
+#[repr(u16)]
+enum IntervalType {
+    Century = 0b_00_0000_0001,
+    Decade = 0b_00_0000_0010,
+    Year = 0b_00_0000_0100,
+    Month = 0b_00_0000_1000,
+    Week = 0b_00_0001_0000,
+    Day = 0b_00_0010_0000,
+    Hour = 0b_00_0100_0000,
+    Minute = 0b_00_1000_0000,
+    Second = 0b_01_0000_0000,
+    Millisecond = 0b_10_0000_0000,
+}
+
+impl FromStr for IntervalType {
+    type Err = ArrowError;
+
+    fn from_str(s: &str) -> Result<Self, ArrowError> {
+        match s.to_lowercase().as_str() {
+            "century" | "centuries" => Ok(Self::Century),
+            "decade" | "decades" => Ok(Self::Decade),
+            "year" | "years" => Ok(Self::Year),
+            "month" | "months" => Ok(Self::Month),
+            "week" | "weeks" => Ok(Self::Week),
+            "day" | "days" => Ok(Self::Day),
+            "hour" | "hours" => Ok(Self::Hour),
+            "minute" | "minutes" => Ok(Self::Minute),
+            "second" | "seconds" => Ok(Self::Second),
+            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
+            _ => Err(ArrowError::NotYetImplemented(format!(
+                "Unknown interval type: {s}"
+            ))),
+        }
+    }
+}
+
+pub type MonthDayNano = (i32, i32, i64);
+
+/// parse string value to a triple of aligned months, days, nanos
+pub fn parse_interval(
+    leading_field: &str,
+    value: &str,
+) -> Result<MonthDayNano, ArrowError> {
+    let mut used_interval_types = 0;
+
+    let mut calculate_from_part = |interval_period_str: &str,
+                                   interval_type: &str|
+     -> Result<(i64, i64, f64), ArrowError> {
+        // @todo It's better to use Decimal in order to protect rounding errors
+        // Wait https://github.com/apache/arrow/pull/9232
+        let interval_period = match f64::from_str(interval_period_str) {
+            Ok(n) => n,
+            Err(_) => {
+                return Err(ArrowError::NotYetImplemented(format!(
+                    "Unsupported Interval Expression with value {value:?}"
+                )));
+            }
+        };
+
+        if interval_period > (i64::MAX as f64) {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "Interval field value out of range: {value:?}"
+            )));
+        }
+
+        let it = IntervalType::from_str(interval_type).map_err(|_| {
+            ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}"
+            ))
+        })?;
+
+        // Disallow duplicate interval types
+        if used_interval_types & (it as u16) != 0 {
+            return Err(ArrowError::ParseError(format!(
+                "Invalid input syntax for type interval: {value:?}. Repeated 
type '{interval_type}'"
+            )));
+        } else {
+            used_interval_types |= it as u16;
+        }
+
+        match it {
+            IntervalType::Century => {
+                Ok(align_interval_parts(interval_period * 1200_f64, 0.0, 0.0))
+            }
+            IntervalType::Decade => {
+                Ok(align_interval_parts(interval_period * 120_f64, 0.0, 0.0))
+            }
+            IntervalType::Year => {
+                Ok(align_interval_parts(interval_period * 12_f64, 0.0, 0.0))
+            }
+            IntervalType::Month => Ok(align_interval_parts(interval_period, 
0.0, 0.0)),
+            IntervalType::Week => {
+                Ok(align_interval_parts(0.0, interval_period * 7_f64, 0.0))
+            }
+            IntervalType::Day => Ok(align_interval_parts(0.0, interval_period, 
0.0)),
+            IntervalType::Hour => {
+                Ok((0, 0, interval_period * SECONDS_PER_HOUR * 
NANOS_PER_SECOND))
+            }
+            IntervalType::Minute => {
+                Ok((0, 0, interval_period * 60_f64 * NANOS_PER_SECOND))
+            }
+            IntervalType::Second => Ok((0, 0, interval_period * 
NANOS_PER_SECOND)),
+            IntervalType::Millisecond => Ok((0, 0, interval_period * 
1_000_000f64)),
+        }
+    };
+
+    let mut result_month: i64 = 0;
+    let mut result_days: i64 = 0;
+    let mut result_nanos: i128 = 0;
+
+    let mut parts = value.split_whitespace();
+
+    loop {
+        let interval_period_str = parts.next();
+        if interval_period_str.is_none() {
+            break;
+        }
+
+        let unit = parts.next().unwrap_or(leading_field);
+
+        let (diff_month, diff_days, diff_nanos) =
+            calculate_from_part(interval_period_str.unwrap(), unit)?;
+
+        result_month += diff_month;
+
+        if result_month > (i32::MAX as i64) {
+            return Err(ArrowError::NotYetImplemented(format!(

Review Comment:
   I think `ParseError` would be more appropriate for the errors in this 
function: 
https://docs.rs/arrow/34.0.0/arrow/error/enum.ArrowError.html#variant.ParseError
 



##########
arrow-cast/src/parse.rs:
##########
@@ -871,6 +1092,117 @@ mod tests {
         );
     }
 
+    #[test]
+    fn test_parse_interval() {
+        assert_eq!(
+            (1i32, 0i32, 0i64),
+            parse_interval("months", "1 month").unwrap(),
+        );
+
+        assert_eq!(
+            (2i32, 0i32, 0i64),
+            parse_interval("months", "2 month").unwrap(),
+        );
+
+        assert_contains!(
+            parse_interval("months", "1 centurys 1 month")
+                .unwrap_err()
+                .to_string(),
+            r#"Invalid input syntax for type interval: "1 centurys 1 month""#
+        );
+

Review Comment:
   Since this is a single use of `assert_contains` what do you think about 
simply checking the error directly:
   
   
   ```suggestion
           assert_eq!(
               parse_interval("months", "1 centurys 1 month")
                   .unwrap_err()
                   .to_string(), 
               r#"(ADD DETAILS) Invalid input syntax for type interval: "1 
centurys 1 month""#
           );
   
   ```



##########
arrow-cast/src/cast.rs:
##########
@@ -4975,6 +5067,56 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_cast_string_to_interval() {
+        let interval_string_values = vec![
+            Some("1 year 1 month 1 day"),
+            None,
+            Some("1.5 years 13 month 35 days 1.4 milliseconds"),
+            Some("3 days"),
+            Some("8 seconds"),
+            None,
+            Some("1 day 29800 milliseconds"),
+            Some("3 months 1 second"),
+            Some("6 minutes 120 second"),
+            Some("2 years 39 months 9 days 19 hours 1 minute 83 seconds 399222 
milliseconds"),
+        ];
+        let string_array =
+            Arc::new(StringArray::from(interval_string_values.clone())) as 
ArrayRef;
+        let options = CastOptions { safe: false };
+        let array_ref = cast_with_options(
+            &string_array.clone(),
+            &DataType::Interval(IntervalUnit::MonthDayNano),

Review Comment:
   Can we please add coverage for:
   1.  `IntervalUnit::DayTime` and `IntervalUnit::YearMonth` 
   2. converting something that doesn't parse (`"foobar"`) for example, and 
showing it raises an error with `safe = true`, and is converted to null with 
`safe = true`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to