This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new a1949e46f3 fix: liberal parsing of zero scale decimals (#8700)
a1949e46f3 is described below
commit a1949e46f31f2d33d7b2163052492a4c47ee6306
Author: Marko Grujic <[email protected]>
AuthorDate: Mon Jan 19 16:08:13 2026 +0100
fix: liberal parsing of zero scale decimals (#8700)
# Which issue does this PR close?
- Closes #8699.
Alternative to https://github.com/apache/arrow-rs/pull/8702 (see #8699
for a discussion).
# Rationale for this change
Make parsing of zero-scale decimals more correct/consistent.
# What changes are included in this PR?
When a decimal with scale 0 is being parsed, but it has some digits pass
the decimal point those will effectively be discarded.
# Are these changes tested?
Yes.
# Are there any user-facing changes?
Aligned parsing of zero-scale decimals.
---------
Co-authored-by: Martin Grigorov <[email protected]>
---
arrow-cast/src/parse.rs | 153 +++++++++++++++++++++++++++++-------------------
1 file changed, 92 insertions(+), 61 deletions(-)
diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs
index b266cc4aa3..a23f421c34 100644
--- a/arrow-cast/src/parse.rs
+++ b/arrow-cast/src/parse.rs
@@ -741,32 +741,27 @@ fn parse_e_notation<T: DecimalType>(
let mut exp: i16 = 0;
let base = T::Native::usize_as(10);
- let mut exp_start: bool = false;
// e has a plus sign
let mut pos_shift_direction: bool = true;
- // skip to point or exponent index
- let mut bs;
- if fractionals > 0 {
- // it's a fraction, so the point index needs to be skipped, so +1
- bs = s.as_bytes().iter().skip(index + fractionals as usize + 1);
- } else {
- // it's actually an integer that is already written into the result,
so let's skip on to e
- bs = s.as_bytes().iter().skip(index);
- }
+ // skip to the exponent index directly or just after any processed
fractionals
+ let mut bs = s.as_bytes().iter().skip(index + fractionals as usize);
- while let Some(b) = bs.next() {
+ // This function is only called from `parse_decimal`, in which we skip
parsing any fractionals
+ // after we reach `scale` digits, not knowing ahead of time whether the
decimal contains an
+ // e-notation or not.
+ // So once we do hit into an e-notation, and drop down into this function,
we need to parse the
+ // remaining unprocessed fractionals too, since otherwise we might lose
precision.
+ for b in bs.by_ref() {
match b {
b'0'..=b'9' => {
result = result.mul_wrapping(base);
result = result.add_wrapping(T::Native::usize_as((b - b'0') as
usize));
- if fractionals > 0 {
- fractionals += 1;
- }
+ fractionals += 1;
digits += 1;
}
- &b'e' | &b'E' => {
- exp_start = true;
+ b'e' | b'E' => {
+ break;
}
_ => {
return Err(ArrowError::ParseError(format!(
@@ -774,39 +769,29 @@ fn parse_e_notation<T: DecimalType>(
)));
}
};
+ }
- if exp_start {
- pos_shift_direction = match bs.next() {
- Some(&b'-') => false,
- Some(&b'+') => true,
- Some(b) => {
- if !b.is_ascii_digit() {
- return Err(ArrowError::ParseError(format!(
- "can't parse the string value {s} to decimal"
- )));
- }
-
- exp *= 10;
- exp += (b - b'0') as i16;
-
- true
- }
- None => {
- return Err(ArrowError::ParseError(format!(
- "can't parse the string value {s} to decimal"
- )));
- }
- };
-
- for b in bs.by_ref() {
- if !b.is_ascii_digit() {
- return Err(ArrowError::ParseError(format!(
- "can't parse the string value {s} to decimal"
- )));
- }
+ // parse the exponent itself
+ let mut signed = false;
+ for b in bs {
+ match b {
+ b'-' if !signed => {
+ pos_shift_direction = false;
+ signed = true;
+ }
+ b'+' if !signed => {
+ pos_shift_direction = true;
+ signed = true;
+ }
+ b if b.is_ascii_digit() => {
exp *= 10;
exp += (b - b'0') as i16;
}
+ _ => {
+ return Err(ArrowError::ParseError(format!(
+ "can't parse the string value {s} to decimal"
+ )));
+ }
}
}
@@ -850,7 +835,11 @@ fn parse_e_notation<T: DecimalType>(
}
/// Parse the string format decimal value to i128/i256 format and checking the
precision and scale.
-/// The result value can't be out of bounds.
+/// Expected behavior:
+/// - The result value can't be out of bounds.
+/// - When parsing a decimal with scale 0, all fractional digits will be
discarded. The final
+/// fractional digits may be a subset or a superset of the digits after the
decimal point when
+/// e-notation is used.
pub fn parse_decimal<T: DecimalType>(
s: &str,
precision: u8,
@@ -862,18 +851,24 @@ pub fn parse_decimal<T: DecimalType>(
let base = T::Native::usize_as(10);
let bs = s.as_bytes();
- let (signed, negative) = match bs.first() {
- Some(b'-') => (true, true),
- Some(b'+') => (true, false),
- _ => (false, false),
- };
- if bs.is_empty() || signed && bs.len() == 1 {
+ if !bs
+ .last()
+ .is_some_and(|b| b.is_ascii_digit() || (b == &b'.' && s.len() > 1))
+ {
+ // If the last character is not a digit (or a decimal point prefixed
with some digits), then
+ // it's not a valid decimal.
return Err(ArrowError::ParseError(format!(
"can't parse the string value {s} to decimal"
)));
}
+ let (signed, negative) = match bs.first() {
+ Some(b'-') => (true, true),
+ Some(b'+') => (true, false),
+ _ => (false, false),
+ };
+
// Iterate over the raw input bytes, skipping the sign if any
let mut bs = bs.iter().enumerate().skip(signed as usize);
@@ -903,7 +898,7 @@ pub fn parse_decimal<T: DecimalType>(
digits as u16,
fractionals as i16,
result,
- point_index,
+ point_index + 1,
precision as u16,
scale as i16,
)?;
@@ -916,7 +911,7 @@ pub fn parse_decimal<T: DecimalType>(
"can't parse the string value {s} to decimal"
)));
}
- if fractionals == scale && scale != 0 {
+ if fractionals == scale {
// We have processed all the digits that we need. All
that
// is left is to validate that the rest of the string
contains
// valid digits.
@@ -931,13 +926,6 @@ pub fn parse_decimal<T: DecimalType>(
if is_e_notation {
break;
}
-
- // Fail on "."
- if digits == 0 {
- return Err(ArrowError::ParseError(format!(
- "can't parse the string value {s} to decimal"
- )));
- }
}
b'e' | b'E' => {
result = parse_e_notation::<T>(
@@ -2613,6 +2601,9 @@ mod tests {
"1.1e.12",
"1.23e+3.",
"1.23e+3.1",
+ "1e",
+ "1e+",
+ "1e-",
];
for s in can_not_parse_tests {
let result_128 = parse_decimal::<Decimal128Type>(s, 20, 3);
@@ -2636,6 +2627,7 @@ mod tests {
("12345678908765.123456", 3),
("123456789087651234.56e-4", 3),
("1234560000000", 0),
+ ("12345678900.0", 0),
("1.23456e12", 0),
];
for (s, scale) in overflow_parse_tests {
@@ -2752,6 +2744,45 @@ mod tests {
let result = parse_decimal::<Decimal256Type>(s, 76, scale);
assert_eq!(i, result.unwrap());
}
+
+ let zero_scale_tests = [
+ (".123", 0, 3),
+ ("0.123", 0, 3),
+ ("1.0", 1, 3),
+ ("1.2", 1, 3),
+ ("1.00", 1, 3),
+ ("1.23", 1, 3),
+ ("1.000", 1, 3),
+ ("1.123", 1, 3),
+ ("123.0", 123, 3),
+ ("123.4", 123, 3),
+ ("123.00", 123, 3),
+ ("123.45", 123, 3),
+ ("123.000000000000000000004", 123, 3),
+ ("0.123e2", 12, 3),
+ ("0.123e4", 1230, 10),
+ ("1.23e4", 12300, 10),
+ ("12.3e4", 123000, 10),
+ ("123e4", 1230000, 10),
+ (
+ "20000000000000000000000000000000000002.0",
+ 20000000000000000000000000000000000002,
+ 38,
+ ),
+ ];
+ for (s, i, precision) in zero_scale_tests {
+ let result_128 = parse_decimal::<Decimal128Type>(s, precision,
0).unwrap();
+ assert_eq!(i, result_128);
+ }
+
+ let can_not_parse_zero_scale = [".", "blag", "", "+", "-", "e"];
+ for s in can_not_parse_zero_scale {
+ let result_128 = parse_decimal::<Decimal128Type>(s, 5, 0);
+ assert_eq!(
+ format!("Parser error: can't parse the string value {s} to
decimal"),
+ result_128.unwrap_err().to_string(),
+ );
+ }
}
#[test]