tustvold commented on code in PR #3805:
URL: https://github.com/apache/arrow-rs/pull/3805#discussion_r1126099715
##########
arrow-array/src/types.rs:
##########
@@ -699,6 +701,132 @@ fn format_decimal_str(value_str: &str, precision: usize,
scale: i8) -> String {
}
}
+lazy_static! {
+ static ref PARSE_DECIMAL_RE: Regex =
+ Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap();
+}
+
+/// Parse the string format decimal value to i128/i256 format and checking the
precision and scale.
+/// The result value can't be out of bounds.
+pub fn parse_decimal_with_parameter<T: DecimalType>(
Review Comment:
Could we put this in arrow-cast/parse instead?
##########
arrow-array/src/types.rs:
##########
@@ -699,6 +701,132 @@ fn format_decimal_str(value_str: &str, precision: usize,
scale: i8) -> String {
}
}
+lazy_static! {
+ static ref PARSE_DECIMAL_RE: Regex =
Review Comment:
This seems like quite a heavy way to achieve this. I wonder if we could
avoid bringing in this dependency by using a simple for loop over the string bytes?
##########
arrow-array/src/types.rs:
##########
@@ -699,6 +701,132 @@ fn format_decimal_str(value_str: &str, precision: usize,
scale: i8) -> String {
}
}
+lazy_static! {
+ static ref PARSE_DECIMAL_RE: Regex =
+ Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap();
+}
+
+/// Parse the string format decimal value to i128/i256 format and checking the
precision and scale.
+/// The result value can't be out of bounds.
+pub fn parse_decimal_with_parameter<T: DecimalType>(
+ s: &str,
+ precision: u8,
+ scale: i8,
+) -> Result<T::Native, ArrowError> {
+ if PARSE_DECIMAL_RE.is_match(s) {
+ let mut offset = s.len();
+ let len = s.len();
+ let mut base = T::Native::usize_as(1);
+ let scale_usize = usize::from(scale as u8);
+
+ // handle the value after the '.' and meet the scale
+ let delimiter_position = s.find('.');
+ match delimiter_position {
+ None => {
+ // there is no '.'
+ base = T::Native::usize_as(10).pow_checked(scale as u32)?;
+ }
+ Some(mid) => {
+ // there is the '.'
+ if len - mid >= scale_usize + 1 {
+ // If the string value is "123.12345" and the scale is 2,
we should just remain '.12' and drop the '345' value.
+ offset -= len - mid - 1 - scale_usize;
+ } else {
+ // If the string value is "123.12" and the scale is 4, we
should append '00' to the tail.
+ base = T::Native::usize_as(10)
+ .pow_checked((scale_usize + 1 + mid - len) as u32)?;
+ }
+ }
+ };
+
+ // each byte is digit、'-' or '.'
+ let bytes = s.as_bytes();
+ let mut negative = false;
+ let mut result = T::Native::usize_as(0);
+
+ bytes[0..offset]
+ .iter()
+ .rev()
+ .try_for_each::<_, Result<(), ArrowError>>(|&byte| {
+ match byte {
+ b'-' => {
+ negative = true;
+ }
+ b'0'..=b'9' => {
+ let add = T::Native::usize_as((byte - b'0') as usize)
+ .mul_checked(base)?;
+ result = result.add_checked(add)?;
+ base = base.mul_checked(T::Native::usize_as(10))?;
+ }
+ // because of the PARSE_DECIMAL_RE, bytes just contains
digit、'-' and '.'.
+ _ => (),
+ }
+ Ok(())
+ })?;
+
+ if negative {
+ result = result.neg_checked()?;
+ }
+
+ match T::validate_decimal_precision(result, precision) {
+ Ok(_) => Ok(result),
+ Err(e) => Err(ArrowError::ParseError(format!(
+ "parse decimal overflow: {e}"
+ ))),
+ }
+ } else {
+ Err(ArrowError::ParseError(format!(
+ "can't parse the string value {s} to decimal"
+ )))
+ }
+}
+
+// Parse the string format decimal value to i128 format without checking the
precision and scale.
+// Like "125.12" to 12512_i128.
+#[cfg(test)]
Review Comment:
Could we drop this and rename the above method to parse_decimal?
##########
arrow-array/src/types.rs:
##########
@@ -699,6 +701,132 @@ fn format_decimal_str(value_str: &str, precision: usize,
scale: i8) -> String {
}
}
+lazy_static! {
+ static ref PARSE_DECIMAL_RE: Regex =
+ Regex::new(r"^-?(\d+\.?\d*|\d*\.?\d+)$").unwrap();
+}
+
+/// Parse the string format decimal value to i128/i256 format and checking the
precision and scale.
+/// The result value can't be out of bounds.
+pub fn parse_decimal_with_parameter<T: DecimalType>(
+ s: &str,
+ precision: u8,
+ scale: i8,
+) -> Result<T::Native, ArrowError> {
+ if PARSE_DECIMAL_RE.is_match(s) {
+ let mut offset = s.len();
+ let len = s.len();
+ let mut base = T::Native::usize_as(1);
+ let scale_usize = usize::from(scale as u8);
+
+ // handle the value after the '.' and meet the scale
+ let delimiter_position = s.find('.');
+ match delimiter_position {
+ None => {
+ // there is no '.'
+ base = T::Native::usize_as(10).pow_checked(scale as u32)?;
+ }
+ Some(mid) => {
+ // there is the '.'
+ if len - mid >= scale_usize + 1 {
+ // If the string value is "123.12345" and the scale is 2,
we should just remain '.12' and drop the '345' value.
+ offset -= len - mid - 1 - scale_usize;
+ } else {
+ // If the string value is "123.12" and the scale is 4, we
should append '00' to the tail.
+ base = T::Native::usize_as(10)
+ .pow_checked((scale_usize + 1 + mid - len) as u32)?;
+ }
+ }
+ };
+
+ // each byte is digit、'-' or '.'
+ let bytes = s.as_bytes();
+ let mut negative = false;
+ let mut result = T::Native::usize_as(0);
+
+ bytes[0..offset]
+ .iter()
+ .rev()
+ .try_for_each::<_, Result<(), ArrowError>>(|&byte| {
+ match byte {
+ b'-' => {
+ negative = true;
+ }
+ b'0'..=b'9' => {
+ let add = T::Native::usize_as((byte - b'0') as usize)
+ .mul_checked(base)?;
+ result = result.add_checked(add)?;
+ base = base.mul_checked(T::Native::usize_as(10))?;
+ }
+ // because of the PARSE_DECIMAL_RE, bytes just contains
digit、'-' and '.'.
+ _ => (),
+ }
+ Ok(())
+ })?;
+
+ if negative {
+ result = result.neg_checked()?;
+ }
+
+ match T::validate_decimal_precision(result, precision) {
+ Ok(_) => Ok(result),
+ Err(e) => Err(ArrowError::ParseError(format!(
+ "parse decimal overflow: {e}"
+ ))),
+ }
+ } else {
+ Err(ArrowError::ParseError(format!(
+ "can't parse the string value {s} to decimal"
+ )))
+ }
+}
+
+// Parse the string format decimal value to i128 format without checking the
precision and scale.
+// Like "125.12" to 12512_i128.
+#[cfg(test)]
+fn parse_decimal(s: &str) -> Result<i128, ArrowError> {
+ use std::ops::Neg;
+
+ if PARSE_DECIMAL_RE.is_match(s) {
+ let mut offset = s.len();
+ // each byte is digit、'-' or '.'
+ let bytes = s.as_bytes();
+ let mut negative = false;
+ let mut result: i128 = 0;
+ let mut base = 1;
+ while offset > 0 {
+ match bytes[offset - 1] {
+ b'-' => {
+ negative = true;
+ }
+ b'.' => {
+ // do nothing
Review Comment:
It occurs to me that if this checked that the decimal point only appears once,
we could drop the regex.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]