This is an automated email from the ASF dual-hosted git repository.
tustvold pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/master by this push:
new 37758df25 Faster i256 parsing (#3950)
37758df25 is described below
commit 37758df25147ce36d81824650d24679e6c581b19
Author: Raphael Taylor-Davies <[email protected]>
AuthorDate: Thu Mar 30 22:06:57 2023 +0100
Faster i256 parsing (#3950)
* Faster i256 parsing
* Fix overflow
---
arrow-buffer/Cargo.toml | 5 ++
arrow-buffer/benches/i256.rs | 44 ++++++++++++++
arrow-buffer/src/bigint.rs | 142 ++++++++++++++++++++++++++++++++++++++++---
3 files changed, 181 insertions(+), 10 deletions(-)
diff --git a/arrow-buffer/Cargo.toml b/arrow-buffer/Cargo.toml
index 3d2fd71c9..c5b0c6c26 100644
--- a/arrow-buffer/Cargo.toml
+++ b/arrow-buffer/Cargo.toml
@@ -38,6 +38,11 @@ num = { version = "0.4", default-features = false, features
= ["std"] }
half = { version = "2.1", default-features = false }
[dev-dependencies]
+criterion = { version = "0.4", default-features = false }
rand = { version = "0.8", default-features = false, features = ["std",
"std_rng"] }
[build-dependencies]
+
+[[bench]]
+name = "i256"
+harness = false
\ No newline at end of file
diff --git a/arrow-buffer/benches/i256.rs b/arrow-buffer/benches/i256.rs
new file mode 100644
index 000000000..a04e4cb6c
--- /dev/null
+++ b/arrow-buffer/benches/i256.rs
@@ -0,0 +1,44 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use arrow_buffer::i256;
+use criterion::*;
+use std::str::FromStr;
+
+fn criterion_benchmark(c: &mut Criterion) {
+ let numbers = vec![
+ i256::ZERO,
+ i256::ONE,
+ i256::MINUS_ONE,
+ i256::from_i128(1233456789),
+ i256::from_i128(-1233456789),
+ i256::from_i128(i128::MAX),
+ i256::from_i128(i128::MIN),
+ i256::MIN,
+ i256::MAX,
+ ];
+
+ for number in numbers {
+ let t = black_box(number.to_string());
+ c.bench_function(&format!("i256_parse({t})"), |b| {
+ b.iter(|| i256::from_str(&t).unwrap());
+ });
+ }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/arrow-buffer/src/bigint.rs b/arrow-buffer/src/bigint.rs
index 5abfb7c85..3a9c4aac8 100644
--- a/arrow-buffer/src/bigint.rs
+++ b/arrow-buffer/src/bigint.rs
@@ -16,9 +16,28 @@
// under the License.
use num::cast::AsPrimitive;
-use num::{BigInt, FromPrimitive, Num, ToPrimitive};
+use num::{BigInt, FromPrimitive, ToPrimitive};
use std::cmp::Ordering;
-use std::ops::{BitAnd, BitOr, BitXor, Shl, Shr};
+use std::num::ParseIntError;
+use std::ops::{BitAnd, BitOr, BitXor, Neg, Shl, Shr};
+use std::str::FromStr;
+
+/// An opaque error similar to [`std::num::ParseIntError`]
+#[derive(Debug)]
+pub struct ParseI256Error {}
+
+impl From<ParseIntError> for ParseI256Error {
+ fn from(_: ParseIntError) -> Self {
+ Self {}
+ }
+}
+
+impl std::fmt::Display for ParseI256Error {
+ fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+ write!(f, "Failed to parse as i256")
+ }
+}
+impl std::error::Error for ParseI256Error {}
/// A signed 256-bit integer
#[allow(non_camel_case_types)]
@@ -40,6 +59,67 @@ impl std::fmt::Display for i256 {
}
}
+impl FromStr for i256 {
+ type Err = ParseI256Error;
+
+ fn from_str(s: &str) -> Result<Self, Self::Err> {
+ // i128 can store up to 38 decimal digits
+ if s.len() <= 38 {
+ return Ok(Self::from_i128(i128::from_str(s)?));
+ }
+
+ let (negative, s) = match s.as_bytes()[0] {
+ b'-' => (true, &s[1..]),
+ b'+' => (false, &s[1..]),
+ _ => (false, s),
+ };
+
+ // Trim leading 0s
+ let s = s.trim_start_matches('0');
+ if s.is_empty() {
+ return Ok(i256::ZERO);
+ }
+
+ if !s.as_bytes()[0].is_ascii_digit() {
+ // Ensures no duplicate sign
+ return Err(ParseI256Error {});
+ }
+
+ parse_impl(s, negative)
+ }
+}
+
+/// Parse `s` with any sign and leading 0s removed
+fn parse_impl(s: &str, negative: bool) -> Result<i256, ParseI256Error> {
+ if s.len() <= 38 {
+ let low = i128::from_str(s)?;
+ return Ok(match negative {
+ true => i256::from_parts(low.neg() as _, -1),
+ false => i256::from_parts(low as _, 0),
+ });
+ }
+
+ let split = s.len() - 38;
+ if !s.as_bytes()[split].is_ascii_digit() {
+ // Ensures not splitting codepoint and no sign
+ return Err(ParseI256Error {});
+ }
+ let (hs, ls) = s.split_at(split);
+
+ let mut low = i128::from_str(ls)?;
+ let high = parse_impl(hs, negative)?;
+
+ if negative {
+ low = -low;
+ }
+
+ let low = i256::from_i128(low);
+
+ high.checked_mul(i256::from_i128(10_i128.pow(38)))
+ .and_then(|high| high.checked_add(low))
+ .ok_or(ParseI256Error {})
+}
+
impl PartialOrd for i256 {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
@@ -106,14 +186,7 @@ impl i256 {
/// Create an integer value from its representation as string.
#[inline]
pub fn from_string(value_str: &str) -> Option<Self> {
- let numbers = BigInt::from_str_radix(value_str, 10).ok()?;
- let (integer, overflow) = Self::from_bigint_with_overflow(numbers);
-
- if overflow {
- None
- } else {
- Some(integer)
- }
+ value_str.parse().ok()
}
/// Create an optional i256 from the provided `f64`. Returning `None`
@@ -915,4 +988,53 @@ mod tests {
let i128 = a.as_i128();
assert_eq!(i128, i128::MAX - 1);
}
+
+ #[test]
+ fn test_string_roundtrip() {
+ let roundtrip_cases = [
+ i256::ZERO,
+ i256::ONE,
+ i256::MINUS_ONE,
+ i256::from_i128(123456789),
+ i256::from_i128(-123456789),
+ i256::from_i128(i128::MIN),
+ i256::from_i128(i128::MAX),
+ i256::MIN,
+ i256::MAX,
+ ];
+ for case in roundtrip_cases {
+ let formatted = case.to_string();
+ let back: i256 = formatted.parse().unwrap();
+ assert_eq!(case, back);
+ }
+ }
+
+ #[test]
+ fn test_from_string() {
+ let cases = [
+ (
+ "000000000000000000000000000000000000000011",
+ Some(i256::from_i128(11)),
+ ),
+ (
+ "-000000000000000000000000000000000000000011",
+ Some(i256::from_i128(-11)),
+ ),
+ (
+ "-0000000000000000000000000000000000000000123456789",
+ Some(i256::from_i128(-123456789)),
+ ),
+ ("-", None),
+ ("+", None),
+ ("--1", None),
+ ("-+1", None),
+ ("000000000000000000000000000000000000000", Some(i256::ZERO)),
+ ("0000000000000000000000000000000000000000-11", None),
+ ("11-1111111111111111111111111111111111111", None),
+
("115792089237316195423570985008687907853269984665640564039457584007913129639936",
None)
+ ];
+ for (case, expected) in cases {
+ assert_eq!(i256::from_string(case), expected)
+ }
+ }
}