This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new a693f0f9c Improve interval parsing (#6211)
a693f0f9c is described below

commit a693f0f9c37567b2b121e261fc0a4587776d5ca4
Author: Samuel Colvin <[email protected]>
AuthorDate: Mon Aug 12 22:47:01 2024 +0100

    Improve interval parsing (#6211)
    
    * improve interval parsing
    
    * rename
    
    * cleanup
    
    * fix formatting
    
    * make IntervalParseConfig public
    
    * add debug to IntervalParseConfig
    
    * fmt
---
 arrow-cast/src/parse.rs | 241 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 179 insertions(+), 62 deletions(-)

diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
index 65cb5f542..20fa882e9 100644
--- a/arrow-cast/src/parse.rs
+++ b/arrow-cast/src/parse.rs
@@ -994,10 +994,10 @@ pub fn parse_interval_day_time(
     Ok(IntervalDayTimeType::make_value(days, millis))
 }
 
-pub fn parse_interval_month_day_nano(
+pub fn parse_interval_month_day_nano_config(
     value: &str,
+    config: IntervalParseConfig,
 ) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
-    let config = IntervalParseConfig::new(IntervalUnit::Month);
     let interval = Interval::parse(value, &config)?;
 
     let (months, days, nanos) = interval.to_month_day_nanos();
@@ -1005,6 +1005,12 @@ pub fn parse_interval_month_day_nano(
     Ok(IntervalMonthDayNanoType::make_value(months, days, nanos))
 }
 
+pub fn parse_interval_month_day_nano(
+    value: &str,
+) -> Result<<IntervalMonthDayNanoType as ArrowPrimitiveType>::Native, ArrowError> {
+    parse_interval_month_day_nano_config(value, IntervalParseConfig::new(IntervalUnit::Month))
+}
+
 const NANOS_PER_MILLIS: i64 = 1_000_000;
 const NANOS_PER_SECOND: i64 = 1_000 * NANOS_PER_MILLIS;
 const NANOS_PER_MINUTE: i64 = 60 * NANOS_PER_SECOND;
@@ -1012,10 +1018,23 @@ const NANOS_PER_HOUR: i64 = 60 * NANOS_PER_MINUTE;
 #[cfg(test)]
 const NANOS_PER_DAY: i64 = 24 * NANOS_PER_HOUR;
 
+#[derive(Debug, Clone)]
+pub struct IntervalParseConfig {
+    /// The default unit to use if none is specified
+    /// e.g. `INTERVAL 1` represents `INTERVAL 1 SECOND` when default_unit = IntervalType::Second
+    default_unit: IntervalUnit,
+}
+
+impl IntervalParseConfig {
+    pub fn new(default_unit: IntervalUnit) -> Self {
+        Self { default_unit }
+    }
+}
+
 #[rustfmt::skip]
-#[derive(Clone, Copy)]
+#[derive(Debug, Clone, Copy)]
 #[repr(u16)]
-enum IntervalUnit {
+pub enum IntervalUnit {
     Century     = 0b_0000_0000_0001,
     Decade      = 0b_0000_0000_0010,
     Year        = 0b_0000_0000_0100,
@@ -1030,30 +1049,50 @@ enum IntervalUnit {
     Nanosecond  = 0b_1000_0000_0000,
 }
 
+/// Logic for parsing interval unit strings
+///
+/// See <https://github.com/postgres/postgres/blob/2caa85f4aae689e6f6721d7363b4c66a2a6417d6/src/backend/utils/adt/datetime.c#L189>
+/// for a list of unit names supported by PostgreSQL which we try to match here.
 impl FromStr for IntervalUnit {
     type Err = ArrowError;
 
     fn from_str(s: &str) -> Result<Self, ArrowError> {
         match s.to_lowercase().as_str() {
-            "century" | "centuries" => Ok(Self::Century),
-            "decade" | "decades" => Ok(Self::Decade),
-            "year" | "years" => Ok(Self::Year),
-            "month" | "months" => Ok(Self::Month),
-            "week" | "weeks" => Ok(Self::Week),
-            "day" | "days" => Ok(Self::Day),
-            "hour" | "hours" => Ok(Self::Hour),
-            "minute" | "minutes" => Ok(Self::Minute),
-            "second" | "seconds" => Ok(Self::Second),
-            "millisecond" | "milliseconds" => Ok(Self::Millisecond),
-            "microsecond" | "microseconds" => Ok(Self::Microsecond),
+            "c" | "cent" | "cents" | "century" | "centuries" => Ok(Self::Century),
+            "dec" | "decs" | "decade" | "decades" => Ok(Self::Decade),
+            "y" | "yr" | "yrs" | "year" | "years" => Ok(Self::Year),
+            "mon" | "mons" | "month" | "months" => Ok(Self::Month),
+            "w" | "week" | "weeks" => Ok(Self::Week),
+            "d" | "day" | "days" => Ok(Self::Day),
+            "h" | "hr" | "hrs" | "hour" | "hours" => Ok(Self::Hour),
+            "m" | "min" | "mins" | "minute" | "minutes" => Ok(Self::Minute),
+            "s" | "sec" | "secs" | "second" | "seconds" => Ok(Self::Second),
+            "ms" | "msec" | "msecs" | "msecond" | "mseconds" | "millisecond" | "milliseconds" => {
+                Ok(Self::Millisecond)
+            }
+            "us" | "usec" | "usecs" | "usecond" | "useconds" | "microsecond" | "microseconds" => {
+                Ok(Self::Microsecond)
+            }
             "nanosecond" | "nanoseconds" => Ok(Self::Nanosecond),
-            _ => Err(ArrowError::NotYetImplemented(format!(
+            _ => Err(ArrowError::InvalidArgumentError(format!(
                 "Unknown interval type: {s}"
             ))),
         }
     }
 }
 
+impl IntervalUnit {
+    fn from_str_or_config(
+        s: Option<&str>,
+        config: &IntervalParseConfig,
+    ) -> Result<Self, ArrowError> {
+        match s {
+            Some(s) => s.parse(),
+            None => Ok(config.default_unit),
+        }
+    }
+}
+
 pub type MonthDayNano = (i32, i32, i64);
 
 /// Chosen based on the number of decimal digits in 1 week in nanoseconds
@@ -1352,68 +1391,35 @@ impl Interval {
     }
 }
 
-struct IntervalParseConfig {
-    /// The default unit to use if none is specified
-    /// e.g. `INTERVAL 1` represents `INTERVAL 1 SECOND` when default_unit = IntervalType::Second
-    default_unit: IntervalUnit,
-}
-
-impl IntervalParseConfig {
-    fn new(default_unit: IntervalUnit) -> Self {
-        Self { default_unit }
-    }
-}
-
 /// parse the string into a vector of interval components i.e. (amount, unit) tuples
 fn parse_interval_components(
     value: &str,
     config: &IntervalParseConfig,
 ) -> Result<Vec<(IntervalAmount, IntervalUnit)>, ArrowError> {
-    let parts = value.split_whitespace();
-
-    let raw_amounts = parts.clone().step_by(2);
-    let raw_units = parts.skip(1).step_by(2);
-
-    // parse amounts
-    let (amounts, invalid_amounts) = raw_amounts
-        .map(IntervalAmount::from_str)
-        .partition::<Vec<_>, _>(Result::is_ok);
-
-    // invalid amounts?
-    if !invalid_amounts.is_empty() {
-        return Err(ArrowError::ParseError(format!(
-            "Invalid input syntax for type interval: {value:?}"
-        )));
-    }
+    let raw_pairs = split_interval_components(value);
 
-    // parse units
-    let (units, invalid_units): (Vec<_>, Vec<_>) = raw_units
-        .clone()
-        .map(IntervalUnit::from_str)
-        .partition(Result::is_ok);
-
-    // invalid units?
-    if !invalid_units.is_empty() {
+    // parse amounts and units
+    let Ok(pairs): Result<Vec<(IntervalAmount, IntervalUnit)>, ArrowError> = raw_pairs
+        .iter()
+        .map(|(a, u)| Ok((a.parse()?, IntervalUnit::from_str_or_config(*u, config)?)))
+        .collect()
+    else {
         return Err(ArrowError::ParseError(format!(
             "Invalid input syntax for type interval: {value:?}"
         )));
-    }
+    };
 
     // collect parsed results
-    let amounts = amounts.into_iter().map(Result::unwrap).collect::<Vec<_>>();
-    let units = units.into_iter().map(Result::unwrap).collect::<Vec<_>>();
-
-    // if only an amount is specified, use the default unit
-    if amounts.len() == 1 && units.is_empty() {
-        return Ok(vec![(amounts[0], config.default_unit)]);
-    };
+    let (amounts, units): (Vec<_>, Vec<_>) = pairs.into_iter().unzip();
 
     // duplicate units?
     let mut observed_interval_types = 0;
-    for (unit, raw_unit) in units.iter().zip(raw_units) {
+    for (unit, (_, raw_unit)) in units.iter().zip(raw_pairs) {
         if observed_interval_types & (*unit as u16) != 0 {
             return Err(ArrowError::ParseError(format!(
-                "Invalid input syntax for type interval: {value:?}. Repeated type '{raw_unit}'",
+                "Invalid input syntax for type interval: {:?}. Repeated type '{}'",
+                value,
+                raw_unit.unwrap_or_default(),
             )));
         }
 
@@ -1425,6 +1431,33 @@ fn parse_interval_components(
     Ok(result.collect::<Vec<_>>())
 }
 
+/// Split an interval into a vec of amounts and units.
+///
+/// Pairs are separated by spaces, but within a pair the amount and unit may or may not be separated by a space.
+///
+/// This should match the behavior of PostgreSQL's interval parser.
+fn split_interval_components(value: &str) -> Vec<(&str, Option<&str>)> {
+    let mut result = vec![];
+    let mut words = value.split(char::is_whitespace);
+    while let Some(word) = words.next() {
+        if let Some(split_word_at) = word.find(not_interval_amount) {
+            let (amount, unit) = word.split_at(split_word_at);
+            result.push((amount, Some(unit)));
+        } else if let Some(unit) = words.next() {
+            result.push((word, Some(unit)));
+        } else {
+            result.push((word, None));
+            break;
+        }
+    }
+    result
+}
+
+/// test if a character is NOT part of an interval numeric amount
+fn not_interval_amount(c: char) -> bool {
+    !c.is_ascii_digit() && c != '.' && c != '-'
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -2202,6 +2235,78 @@ mod tests {
             )
             .unwrap(),
         );
+
+        // no units
+        assert_eq!(
+            Interval::new(1, 0, 0),
+            Interval::parse("1", &config).unwrap()
+        );
+        assert_eq!(
+            Interval::new(42, 0, 0),
+            Interval::parse("42", &config).unwrap()
+        );
+        assert_eq!(
+            Interval::new(0, 0, 42_000_000_000),
+            Interval::parse("42", &IntervalParseConfig::new(IntervalUnit::Second)).unwrap()
+        );
+
+        // shorter units
+        assert_eq!(
+            Interval::new(1, 0, 0),
+            Interval::parse("1 mon", &config).unwrap()
+        );
+        assert_eq!(
+            Interval::new(1, 0, 0),
+            Interval::parse("1 mons", &config).unwrap()
+        );
+        assert_eq!(
+            Interval::new(0, 0, 1_000_000),
+            Interval::parse("1 ms", &config).unwrap()
+        );
+        assert_eq!(
+            Interval::new(0, 0, 1_000),
+            Interval::parse("1 us", &config).unwrap()
+        );
+
+        // no space
+        assert_eq!(
+            Interval::new(0, 0, 1_000),
+            Interval::parse("1us", &config).unwrap()
+        );
+        assert_eq!(
+            Interval::new(0, 0, NANOS_PER_SECOND),
+            Interval::parse("1s", &config).unwrap()
+        );
+        assert_eq!(
+            Interval::new(1, 2, 10_864_000_000_000),
+            Interval::parse("1mon 2days 3hr 1min 4sec", &config).unwrap()
+        );
+
+        assert_eq!(
+            Interval::new(
+                -13i32,
+                -8i32,
+                -NANOS_PER_HOUR
+                    - NANOS_PER_MINUTE
+                    - NANOS_PER_SECOND
+                    - (1.11_f64 * NANOS_PER_MILLIS as f64) as i64
+            ),
+            Interval::parse(
+                "-1year -1month -1week -1day -1 hour -1 minute -1 second -1.11millisecond",
+                &config
+            )
+            .unwrap(),
+        );
+
+        assert_eq!(
+            Interval::parse("1h s", &config).unwrap_err().to_string(),
+            r#"Parser error: Invalid input syntax for type interval: "1h s""#
+        );
+
+        assert_eq!(
+            Interval::parse("1XX", &config).unwrap_err().to_string(),
+            r#"Parser error: Invalid input syntax for type interval: "1XX""#
+        );
     }
 
     #[test]
@@ -2625,4 +2730,16 @@ mod tests {
         assert_eq!(TimestampNanosecondType::parse(""), None);
         assert_eq!(Date32Type::parse(""), None);
     }
+
+    #[test]
+    fn test_parse_interval_month_day_nano_config() {
+        let interval = parse_interval_month_day_nano_config(
+            "1",
+            IntervalParseConfig::new(IntervalUnit::Second),
+        )
+        .unwrap();
+        assert_eq!(interval.months, 0);
+        assert_eq!(interval.days, 0);
+        assert_eq!(interval.nanoseconds, NANOS_PER_SECOND);
+    }
 }

Reply via email to