alamb commented on a change in pull request #9373: URL: https://github.com/apache/arrow/pull/9373#discussion_r570403114
########## File path: rust/datafusion/src/sql/planner.rs ########## @@ -982,6 +996,171 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ))), } } + + fn sql_interval_to_literal( + &self, + value: &str, + leading_field: &Option<DateTimeField>, + leading_precision: &Option<u64>, + last_field: &Option<DateTimeField>, + fractional_seconds_precision: &Option<u64>, + ) -> Result<Expr> { + if leading_field.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with leading_field {:?}", + leading_field + ))); + } + + if leading_precision.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with leading_precision {:?}", + leading_precision + ))); + } + + if last_field.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with last_field {:?}", + last_field + ))); + } + + if fractional_seconds_precision.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with fractional_seconds_precision {:?}", + fractional_seconds_precision + ))); + } + + const SECONDS_PER_HOUR: f32 = 3_600_f32; + const MILLIS_PER_SECOND: f32 = 1_000_f32; + + // We are storing parts as integers, it's why we need to align parts fractional + // INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days Review comment: I found it strange that the example of`1.5 months` is not supported whereas `0.5` and `1` are supported ``` > select (interval '0.5 month'); +-------------------------------------------------+ | IntervalDayTime("64424509440") | +-------------------------------------------------+ | 0 years 0 mons 15 days 0 hours 0 mins 0.00 secs | +-------------------------------------------------+ 1 rows in set. Query took 0 seconds. > select (interval '1 month'); +------------------------------------------------+ | IntervalYearMonth("1") | +------------------------------------------------+ | 0 years 1 mons 0 days 0 hours 0 mins 0.00 secs | +------------------------------------------------+ 1 rows in set. Query took 0 seconds. > select (interval '1.5 month'); NotImplemented("DF doesnt support complex intervals: \"1.1 month\"") ``` ########## File path: rust/arrow/src/util/display.rs ########## @@ -44,6 +44,66 @@ macro_rules! make_string { }}; } +macro_rules! make_string_interval_year_month { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::<array::IntervalYearMonthArray>() + .unwrap(); + + let s = if array.is_null($row) { + "0 years 0 mons 0 days 0 hours 0 mins 0.00 secs".to_string() + } else { + let interval = array.value($row) as f64; + let years = (interval / 12_f64).floor(); + let month = interval - (years * 12_f64); + + format!( + "{} years {} mons 0 days 0 hours 0 mins 0.00 secs", + years, month, + ) + }; + + Ok(s) + }}; +} + +macro_rules! make_string_interval_day_time { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::<array::IntervalDayTimeArray>() + .unwrap(); + + let s = if array.is_null($row) { + "0 years 0 mons 0 days 0 hours 0 mins 0.00 secs".to_string() Review comment: I wonder if `""` would be a more appropriate representation for `NULL`, to conform with the handling of null strings: https://github.com/apache/arrow/pull/9373/files#diff-821be3a8a9239cdd12ab2b3c979e174f7b936aa6bfad4b9bc76489b9923cf747R164 I also think `"0 years 0 mons 0 days 0 hours 0 mins 0.00 secs"` might be the same representation if `value = 0` (so you couldn't distinguish between `0` and `NULL`( ########## File path: rust/arrow/src/util/display.rs ########## @@ -44,6 +44,66 @@ macro_rules! make_string { }}; } +macro_rules! make_string_interval_year_month { + ($column: ident, $row: ident) => {{ + let array = $column + .as_any() + .downcast_ref::<array::IntervalYearMonthArray>() + .unwrap(); + + let s = if array.is_null($row) { + "0 years 0 mons 0 days 0 hours 0 mins 0.00 secs".to_string() Review comment: same comment as below about NULL vs 0 handling ########## File path: rust/datafusion/src/sql/planner.rs ########## @@ -982,6 +996,171 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ))), } } + + fn sql_interval_to_literal( + &self, + value: &str, + leading_field: &Option<DateTimeField>, + leading_precision: &Option<u64>, + last_field: &Option<DateTimeField>, + fractional_seconds_precision: &Option<u64>, + ) -> Result<Expr> { + if leading_field.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with leading_field {:?}", + leading_field + ))); + } + + if leading_precision.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with leading_precision {:?}", + leading_precision + ))); + } + + if last_field.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with last_field {:?}", + last_field + ))); + } + + if fractional_seconds_precision.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with fractional_seconds_precision {:?}", + fractional_seconds_precision + ))); + } + + const SECONDS_PER_HOUR: f32 = 3_600_f32; + const MILLIS_PER_SECOND: f32 = 1_000_f32; + + // We are storing parts as integers, it's why we need to align parts fractional + // INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days Review comment: I can understand why we might not support fractional months (as depending on the month the length of a month can not necessarily be expressed as an integral number of days or months). But it seems inconsitent to me to say `0.5 month` is `15 days` but `1.5 month` is not `45 days` I think the easiest thing for me to reason about is if they both error. ########## File path: rust/datafusion/src/sql/planner.rs ########## @@ -982,6 +996,171 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ))), } } + + fn sql_interval_to_literal( + &self, + value: &str, + leading_field: &Option<DateTimeField>, + leading_precision: &Option<u64>, + last_field: &Option<DateTimeField>, + fractional_seconds_precision: &Option<u64>, + ) -> Result<Expr> { + if leading_field.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with leading_field {:?}", + leading_field + ))); + } + + if leading_precision.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with leading_precision {:?}", + leading_precision + ))); + } + + if last_field.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with last_field {:?}", + last_field + ))); + } + + if fractional_seconds_precision.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with fractional_seconds_precision {:?}", + fractional_seconds_precision + ))); + } + + const SECONDS_PER_HOUR: f32 = 3_600_f32; + const MILLIS_PER_SECOND: f32 = 1_000_f32; + + // We are storing parts as integers, it's why we need to align parts fractional + // INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days + // INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours + let align_interval_parts = |month_part: f32, + mut day_part: f32, + mut milles_part: f32| + -> (i32, i32, f32) { + // Convert fractional month to days, It's not supported by Arrow types, but anyway + day_part += (month_part - (month_part as i32) as f32) * 30_f32; + + // Convert fractional days to hours + milles_part += (day_part - ((day_part as i32) as f32)) + * 24_f32 + * SECONDS_PER_HOUR + * MILLIS_PER_SECOND; + + (month_part as i32, day_part as i32, milles_part) + }; + + let calculate_from_part = |interval_period_str: &str, + interval_type: &str| + -> Result<(i32, i32, f32)> { + let interval_period = match f32::from_str(interval_period_str) { + Ok(n) => n, + Err(_) => { + return Err(DataFusionError::SQL(ParserError(format!( + "Unsupported Interval Expression with value {:?}", + value + )))) + } + }; + + if interval_period > (i32::MAX as f32) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + + match interval_type.to_lowercase().as_str() { + "year" => Ok(align_interval_parts(interval_period * 12_f32, 0.0, 0.0)), + "month" => Ok(align_interval_parts(interval_period, 0.0, 0.0)), + "day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)), + "hour" | "hours" => { + Ok((0, 0, interval_period * SECONDS_PER_HOUR * MILLIS_PER_SECOND)) + } + "minutes" | "minute" => { + Ok((0, 0, interval_period * 60_f32 * MILLIS_PER_SECOND)) + } + "seconds" | "second" => Ok((0, 0, interval_period * MILLIS_PER_SECOND)), + "milliseconds" | "millisecond" => Ok((0, 0, interval_period)), + _ => Err(DataFusionError::NotImplemented(format!( + "Invalid input syntax for type interval: {:?}", + value + ))), + } + }; + + let mut result_month: i64 = 0; + let mut result_days: i64 = 0; + let mut result_millis: i64 = 0; + + let mut parts = value.split_whitespace(); + + loop { + let interval_period_str = parts.next(); + if interval_period_str.is_none() { + break; + } + + let (diff_month, diff_days, diff_millis) = calculate_from_part( + interval_period_str.unwrap(), + parts.next().unwrap_or("second"), + )?; + + result_month += diff_month as i64; + + if result_month > (i32::MAX as i64) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + + result_days += diff_days as i64; + + if result_days > (i32::MAX as i64) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + + result_millis += diff_millis as i64; + + if result_millis > (i32::MAX as i64) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + } + + // Interval is tricky thing + // 1 day is not 24 hours because timezones, 1 year != 365/364! 30 days != 1 month + // The true way to store and calculate intervals is to store it as it defined + // Due the fact that Arrow supports only two types YearMonth (month) and DayTime (day, time) + // It's not possible to store complex intervals + // It's possible to do select (NOW() + INTERVAL '1 year') + INTERVAL '1 day'; as workaround + if result_month != 0 && (result_days != 0 || result_millis != 0) { + return Err(DataFusionError::NotImplemented(format!( + "DF doesnt support complex intervals: {:?}", Review comment: ```suggestion "DF does not support intervals that have both a Year/Month part as well as Days/Hours/Mins/Seconds: {:?}. Hint: try breaking the interval into two parts, one with Year/Month and the other with Days/Hours/Mins/Seconds - e.g. (NOW() + INTERVAL '1 year') + INTERVAL '1 day'", ``` Maybe too much in an error message but as I user of DataFusion I would appreciate having this context :) ########## File path: rust/datafusion/src/sql/planner.rs ########## @@ -982,6 +996,171 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ))), } } + + fn sql_interval_to_literal( + &self, + value: &str, + leading_field: &Option<DateTimeField>, + leading_precision: &Option<u64>, + last_field: &Option<DateTimeField>, + fractional_seconds_precision: &Option<u64>, + ) -> Result<Expr> { + if leading_field.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with leading_field {:?}", + leading_field + ))); + } + + if leading_precision.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with leading_precision {:?}", + leading_precision + ))); + } + + if last_field.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with last_field {:?}", + last_field + ))); + } + + if fractional_seconds_precision.is_some() { + return Err(DataFusionError::NotImplemented(format!( + "Unsupported Interval Expression with fractional_seconds_precision {:?}", + fractional_seconds_precision + ))); + } + + const SECONDS_PER_HOUR: f32 = 3_600_f32; + const MILLIS_PER_SECOND: f32 = 1_000_f32; + + // We are storing parts as integers, it's why we need to align parts fractional + // INTERVAL '0.5 MONTH' = 15 days, INTERVAL '1.5 MONTH' = 1 month 15 days + // INTERVAL '0.5 DAY' = 12 hours, INTERVAL '1.5 DAY' = 1 day 12 hours + let align_interval_parts = |month_part: f32, + mut day_part: f32, + mut milles_part: f32| + -> (i32, i32, f32) { + // Convert fractional month to days, It's not supported by Arrow types, but anyway + day_part += (month_part - (month_part as i32) as f32) * 30_f32; + + // Convert fractional days to hours + milles_part += (day_part - ((day_part as i32) as f32)) + * 24_f32 + * SECONDS_PER_HOUR + * MILLIS_PER_SECOND; + + (month_part as i32, day_part as i32, milles_part) + }; + + let calculate_from_part = |interval_period_str: &str, + interval_type: &str| + -> Result<(i32, i32, f32)> { + let interval_period = match f32::from_str(interval_period_str) { + Ok(n) => n, + Err(_) => { + return Err(DataFusionError::SQL(ParserError(format!( + "Unsupported Interval Expression with value {:?}", + value + )))) + } + }; + + if interval_period > (i32::MAX as f32) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + + match interval_type.to_lowercase().as_str() { + "year" => Ok(align_interval_parts(interval_period * 12_f32, 0.0, 0.0)), + "month" => Ok(align_interval_parts(interval_period, 0.0, 0.0)), + "day" | "days" => Ok(align_interval_parts(0.0, interval_period, 0.0)), + "hour" | "hours" => { + Ok((0, 0, interval_period * SECONDS_PER_HOUR * MILLIS_PER_SECOND)) + } + "minutes" | "minute" => { + Ok((0, 0, interval_period * 60_f32 * MILLIS_PER_SECOND)) + } + "seconds" | "second" => Ok((0, 0, interval_period * MILLIS_PER_SECOND)), + "milliseconds" | "millisecond" => Ok((0, 0, interval_period)), + _ => Err(DataFusionError::NotImplemented(format!( + "Invalid input syntax for type interval: {:?}", + value + ))), + } + }; + + let mut result_month: i64 = 0; + let mut result_days: i64 = 0; + let mut result_millis: i64 = 0; + + let mut parts = value.split_whitespace(); + + loop { + let interval_period_str = parts.next(); + if interval_period_str.is_none() { + break; + } + + let (diff_month, diff_days, diff_millis) = calculate_from_part( + interval_period_str.unwrap(), + parts.next().unwrap_or("second"), + )?; + + result_month += diff_month as i64; + + if result_month > (i32::MAX as i64) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + + result_days += diff_days as i64; + + if result_days > (i32::MAX as i64) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + + result_millis += diff_millis as i64; + + if result_millis > (i32::MAX as i64) { + return Err(DataFusionError::NotImplemented(format!( + "Interval field value out of range: {:?}", + value + ))); + } + } + + // Interval is tricky thing + // 1 day is not 24 hours because timezones, 1 year != 365/364! 30 days != 1 month + // The true way to store and calculate intervals is to store it as it defined Review comment: 💯 agree with this comment. I think an additional restriction we should perhaps consider is 'only support integral numbers of months + years' as the `YearMonthInterval` interval type can not capture precisely fractional parts https://github.com/apache/arrow/blob/master/format/Schema.fbs#L253-L254 ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org