This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 0df21883c Faster time parsing (~93% faster) (#3860)
0df21883c is described below
commit 0df21883cb7a5f414894a833ae301fcd8d8e464c
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Mar 16 19:40:50 2023 +0000
Faster time parsing (~93% faster) (#3860)
* Faster time parsing
* Clippy
* Clippy
* WIP
* Tests and fixes
* Review feedback
---
arrow-cast/Cargo.toml | 4 +
arrow-cast/benches/parse_time.rs | 42 ++++++
arrow-cast/src/parse.rs | 277 ++++++++++++++++++++++++++++-----------
3 files changed, 243 insertions(+), 80 deletions(-)
diff --git a/arrow-cast/Cargo.toml b/arrow-cast/Cargo.toml
index 235dca135..859254c3a 100644
--- a/arrow-cast/Cargo.toml
+++ b/arrow-cast/Cargo.toml
@@ -63,3 +63,7 @@ half = { version = "2.1", default-features = false }
[[bench]]
name = "parse_timestamp"
harness = false
+
+[[bench]]
+name = "parse_time"
+harness = false
diff --git a/arrow-cast/benches/parse_time.rs b/arrow-cast/benches/parse_time.rs
new file mode 100644
index 000000000..d28b9c7c6
--- /dev/null
+++ b/arrow-cast/benches/parse_time.rs
@@ -0,0 +1,42 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_cast::parse::string_to_time_nanoseconds;
+use criterion::*;
+
+fn criterion_benchmark(c: &mut Criterion) {
+ let timestamps = [
+ "9:50",
+ "09:50",
+ "09:50 PM",
+ "9:50:12 AM",
+ "09:50:12 PM",
+ "09:50:12.123456789",
+ "9:50:12.123456789",
+ "09:50:12.123456789 PM",
+ ];
+
+ for timestamp in timestamps {
+ let t = black_box(timestamp);
+ c.bench_function(t, |b| {
+ b.iter(|| string_to_time_nanoseconds(t).unwrap());
+ });
+ }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
index 23c2642e7..30cebb4bf 100644
--- a/arrow-cast/src/parse.rs
+++ b/arrow-cast/src/parse.rs
@@ -23,11 +23,12 @@ use arrow_schema::ArrowError;
use chrono::prelude::*;
use std::str::FromStr;
+/// Parse nanoseconds from the first `N` values in digits, subtracting the
offset `O`
#[inline]
-fn parse_nanos<const N: usize>(digits: &[u8]) -> u32 {
+fn parse_nanos<const N: usize, const O: u8>(digits: &[u8]) -> u32 {
digits[..N]
.iter()
- .fold(0_u32, |acc, v| acc * 10 + *v as u32)
+ .fold(0_u32, |acc, v| acc * 10 + v.wrapping_sub(O) as u32)
* 10_u32.pow((9 - N) as _)
}
@@ -110,15 +111,15 @@ impl TimestampParser {
let digits = (self.mask >> 20).trailing_ones();
let nanos = match digits {
0 => return None,
- 1 => parse_nanos::<1>(&self.digits[20..21]),
- 2 => parse_nanos::<2>(&self.digits[20..22]),
- 3 => parse_nanos::<3>(&self.digits[20..23]),
- 4 => parse_nanos::<4>(&self.digits[20..24]),
- 5 => parse_nanos::<5>(&self.digits[20..25]),
- 6 => parse_nanos::<6>(&self.digits[20..26]),
- 7 => parse_nanos::<7>(&self.digits[20..27]),
- 8 => parse_nanos::<8>(&self.digits[20..28]),
- _ => parse_nanos::<9>(&self.digits[20..29]),
+ 1 => parse_nanos::<1, 0>(&self.digits[20..21]),
+ 2 => parse_nanos::<2, 0>(&self.digits[20..22]),
+ 3 => parse_nanos::<3, 0>(&self.digits[20..23]),
+ 4 => parse_nanos::<4, 0>(&self.digits[20..24]),
+ 5 => parse_nanos::<5, 0>(&self.digits[20..25]),
+ 6 => parse_nanos::<6, 0>(&self.digits[20..26]),
+ 7 => parse_nanos::<7, 0>(&self.digits[20..27]),
+ 8 => parse_nanos::<8, 0>(&self.digits[20..28]),
+ _ => parse_nanos::<9, 0>(&self.digits[20..29]),
};
Some((time(hour, minute, second, nanos)?, 20 + digits
as usize))
}
@@ -299,79 +300,120 @@ fn to_timestamp_nanos(dt: NaiveDateTime) -> Result<i64,
ArrowError> {
/// This function does not support parsing strings with a timezone
/// or offset specified, as it considers only time since midnight.
pub fn string_to_time_nanoseconds(s: &str) -> Result<i64, ArrowError> {
- // colon count, presence of decimal, presence of whitespace
- fn preprocess_time_string(string: &str) -> (usize, bool, bool) {
- string
- .as_bytes()
- .iter()
- .fold((0, false, false), |tup, char| match char {
- b':' => (tup.0 + 1, tup.1, tup.2),
- b'.' => (tup.0, true, tup.2),
- b' ' => (tup.0, tup.1, true),
- _ => tup,
- })
+ let nt = string_to_time(s).ok_or_else(|| {
+ ArrowError::ParseError(format!("Failed to parse \'{s}\' as time"))
+ })?;
+ Ok(nt.num_seconds_from_midnight() as i64 * 1_000_000_000 + nt.nanosecond()
as i64)
+}
+
+fn string_to_time(s: &str) -> Option<NaiveTime> {
+ let bytes = s.as_bytes();
+ if bytes.len() < 4 {
+ return None;
}
- // Do a preprocess pass of the string to prune which formats to attempt
parsing for
- let formats: &[&str] = match preprocess_time_string(s.trim()) {
- // 24-hour clock, with hour, minutes, seconds and fractions of a
second specified
- // Examples:
- // * 09:50:12.123456789
- // * 9:50:12.123456789
- (2, true, false) => &["%H:%M:%S%.f", "%k:%M:%S%.f"],
-
- // 12-hour clock, with hour, minutes, seconds and fractions of a
second specified
- // Examples:
- // * 09:50:12.123456789 PM
- // * 09:50:12.123456789 pm
- // * 9:50:12.123456789 AM
- // * 9:50:12.123456789 am
- (2, true, true) => &[
- "%I:%M:%S%.f %P",
- "%I:%M:%S%.f %p",
- "%l:%M:%S%.f %P",
- "%l:%M:%S%.f %p",
- ],
-
- // 24-hour clock, with hour, minutes and seconds specified
- // Examples:
- // * 09:50:12
- // * 9:50:12
- (2, false, false) => &["%H:%M:%S", "%k:%M:%S"],
-
- // 12-hour clock, with hour, minutes and seconds specified
- // Examples:
- // * 09:50:12 PM
- // * 09:50:12 pm
- // * 9:50:12 AM
- // * 9:50:12 am
- (2, false, true) => &["%I:%M:%S %P", "%I:%M:%S %p", "%l:%M:%S %P",
"%l:%M:%S %p"],
-
- // 24-hour clock, with hour and minutes specified
- // Examples:
- // * 09:50
- // * 9:50
- (1, false, false) => &["%H:%M", "%k:%M"],
-
- // 12-hour clock, with hour and minutes specified
- // Examples:
- // * 09:50 PM
- // * 09:50 pm
- // * 9:50 AM
- // * 9:50 am
- (1, false, true) => &["%I:%M %P", "%I:%M %p", "%l:%M %P", "%l:%M %p"],
-
- _ => &[],
+ let (am, bytes) = match bytes.get(bytes.len() - 3..) {
+ Some(b" AM" | b" am" | b" Am" | b" aM") => {
+ (Some(true), &bytes[..bytes.len() - 3])
+ }
+ Some(b" PM" | b" pm" | b" pM" | b" Pm") => {
+ (Some(false), &bytes[..bytes.len() - 3])
+ }
+ _ => (None, bytes),
};
- formats
- .iter()
- .find_map(|f| NaiveTime::parse_from_str(s, f).ok())
- .map(|nt| {
- nt.num_seconds_from_midnight() as i64 * 1_000_000_000 +
nt.nanosecond() as i64
- })
- // Return generic error if failed to parse as unknown which format
user intended for the string
- .ok_or_else(|| ArrowError::CastError(format!("Error parsing '{s}' as
time")))
+ if bytes.len() < 4 {
+ return None;
+ }
+
+ let mut digits = [b'0'; 6];
+
+ // Extract hour
+ let bytes = match (bytes[1], bytes[2]) {
+ (b':', _) => {
+ digits[1] = bytes[0];
+ &bytes[2..]
+ }
+ (_, b':') => {
+ digits[0] = bytes[0];
+ digits[1] = bytes[1];
+ &bytes[3..]
+ }
+ _ => return None,
+ };
+
+ if bytes.len() < 2 {
+ return None; // Minutes required
+ }
+
+ // Extract minutes
+ digits[2] = bytes[0];
+ digits[3] = bytes[1];
+
+ let nanoseconds = match bytes.get(2) {
+ Some(b':') => {
+ if bytes.len() < 5 {
+ return None;
+ }
+
+ // Extract seconds
+ digits[4] = bytes[3];
+ digits[5] = bytes[4];
+
+ // Extract sub-seconds if any
+ match bytes.get(5) {
+ Some(b'.') => {
+ let decimal = &bytes[6..];
+ if decimal.iter().any(|x| !x.is_ascii_digit()) {
+ return None;
+ }
+ match decimal.len() {
+ 0 => return None,
+ 1 => parse_nanos::<1, b'0'>(decimal),
+ 2 => parse_nanos::<2, b'0'>(decimal),
+ 3 => parse_nanos::<3, b'0'>(decimal),
+ 4 => parse_nanos::<4, b'0'>(decimal),
+ 5 => parse_nanos::<5, b'0'>(decimal),
+ 6 => parse_nanos::<6, b'0'>(decimal),
+ 7 => parse_nanos::<7, b'0'>(decimal),
+ 8 => parse_nanos::<8, b'0'>(decimal),
+ _ => parse_nanos::<9, b'0'>(decimal),
+ }
+ }
+ Some(_) => return None,
+ None => 0,
+ }
+ }
+ Some(_) => return None,
+ None => 0,
+ };
+
+ digits.iter_mut().for_each(|x| *x = x.wrapping_sub(b'0'));
+ if digits.iter().any(|x| *x > 9) {
+ return None;
+ }
+
+ let hour = match (digits[0] * 10 + digits[1], am) {
+ (12, Some(true)) => 0, // 12:00 AM -> 00:00
+ (h @ 1..=11, Some(true)) => h, // 1:00 AM -> 01:00
+ (12, Some(false)) => 12, // 12:00 PM -> 12:00
+ (h @ 1..=11, Some(false)) => h + 12, // 1:00 PM -> 13:00
+ (_, Some(_)) => return None,
+ (h, None) => h,
+ };
+
+ // Handle leap second
+ let (second, nanoseconds) = match digits[4] * 10 + digits[5] {
+ 60 => (59, nanoseconds + 1_000_000_000),
+ s => (s, nanoseconds),
+ };
+
+ NaiveTime::from_hms_nano_opt(
+ hour as _,
+ (digits[2] * 10 + digits[3]) as _,
+ second as _,
+ nanoseconds,
+ )
}
/// Specialized parsing implementations
@@ -900,6 +942,13 @@ mod tests {
use arrow_array::timezone::Tz;
use arrow_buffer::i256;
+ #[test]
+ fn test_parse_nanos() {
+ assert_eq!(parse_nanos::<3, 0>(&[1, 2, 3]), 123_000_000);
+ assert_eq!(parse_nanos::<5, 0>(&[1, 2, 3, 4, 5]), 123_450_000);
+ assert_eq!(parse_nanos::<6, b'0'>(b"123456"), 123_456_000);
+ }
+
#[test]
fn string_to_timestamp_timezone() {
// Explicit timezone
@@ -1398,6 +1447,74 @@ mod tests {
);
}
+ #[test]
+ fn test_string_to_time_invalid() {
+ let cases = [
+ "25:00",
+ "9:00:",
+ "009:00",
+ "09:0:00",
+ "25:00:00",
+ "13:00 AM",
+ "13:00 PM",
+ "12:00. AM",
+ "09:0:00",
+ "09:01:0",
+ "09:01:1",
+ "9:1:0",
+ "09:01:0",
+ "1:00.123",
+ "1:00:00.123f",
+ " 9:00:00",
+ ":09:00",
+ "T9:00:00",
+ "AM",
+ ];
+ for case in cases {
+ assert!(string_to_time(case).is_none(), "{case}");
+ }
+ }
+
+ #[test]
+ fn test_string_to_time_chrono() {
+ let cases = [
+ ("1:00", "%H:%M"),
+ ("12:00", "%H:%M"),
+ ("13:00", "%H:%M"),
+ ("24:00", "%H:%M"),
+ ("1:00:00", "%H:%M:%S"),
+ ("12:00:30", "%H:%M:%S"),
+ ("13:00:59", "%H:%M:%S"),
+ ("24:00:60", "%H:%M:%S"),
+ ("09:00:00", "%H:%M:%S%.f"),
+ ("0:00:30.123456", "%H:%M:%S%.f"),
+ ("0:00 AM", "%I:%M %P"),
+ ("1:00 AM", "%I:%M %P"),
+ ("12:00 AM", "%I:%M %P"),
+ ("13:00 AM", "%I:%M %P"),
+ ("0:00 PM", "%I:%M %P"),
+ ("1:00 PM", "%I:%M %P"),
+ ("12:00 PM", "%I:%M %P"),
+ ("13:00 PM", "%I:%M %P"),
+ ("1:00 pM", "%I:%M %P"),
+ ("1:00 Pm", "%I:%M %P"),
+ ("1:00 aM", "%I:%M %P"),
+ ("1:00 Am", "%I:%M %P"),
+ ("1:00:30.123456 PM", "%I:%M:%S%.f %P"),
+ ("1:00:30.123456789 PM", "%I:%M:%S%.f %P"),
+ ("1:00:30.123456789123 PM", "%I:%M:%S%.f %P"),
+ ("1:00:30.1234 PM", "%I:%M:%S%.f %P"),
+ ("1:00:30.123456 PM", "%I:%M:%S%.f %P"),
+ ("1:00:30.123456789123456789 PM", "%I:%M:%S%.f %P"),
+ ("1:00:30.12F456 PM", "%I:%M:%S%.f %P"),
+ ];
+ for (s, format) in cases {
+ let chrono = NaiveTime::parse_from_str(s, format).ok();
+ let custom = string_to_time(s);
+ assert_eq!(chrono, custom, "{s}");
+ }
+ }
+
#[test]
fn test_parse_interval() {
assert_eq!(