This is an automated email from the ASF dual-hosted git repository.
jonah pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new b7fb32a7c0 feat: `parse_float_as_decimal` supports scientific notation
and Decimal256 (#13806)
b7fb32a7c0 is described below
commit b7fb32a7c037000cb1f80b491964d655ae01763d
Author: Jonah Gao <[email protected]>
AuthorDate: Thu Dec 19 09:34:31 2024 +0800
feat: `parse_float_as_decimal` supports scientific notation and Decimal256
(#13806)
* feat: `parse_float_as_decimal` supports scientific notation and Decimal256
* Fix test
* Add test
* Add test
* Refine negative scales
* Update comment
* Refine bigint_to_i256
* UT for bigint_to_i256
* Add ut for parse_decimal
---
datafusion-cli/Cargo.lock | 1 +
datafusion/sql/Cargo.toml | 1 +
datafusion/sql/src/expr/value.rs | 194 ++++++++++++++++++++-----
datafusion/sqllogictest/test_files/options.slt | 96 ++++++++++--
4 files changed, 245 insertions(+), 47 deletions(-)
diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock
index bac4192b10..d33cbf3964 100644
--- a/datafusion-cli/Cargo.lock
+++ b/datafusion-cli/Cargo.lock
@@ -1625,6 +1625,7 @@ dependencies = [
"arrow",
"arrow-array",
"arrow-schema",
+ "bigdecimal",
"datafusion-common",
"datafusion-expr",
"indexmap",
diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml
index 01bf920438..e1e4d8df3d 100644
--- a/datafusion/sql/Cargo.toml
+++ b/datafusion/sql/Cargo.toml
@@ -44,6 +44,7 @@ unparser = []
arrow = { workspace = true }
arrow-array = { workspace = true }
arrow-schema = { workspace = true }
+bigdecimal = { workspace = true }
datafusion-common = { workspace = true, default-features = true }
datafusion-expr = { workspace = true }
indexmap = { workspace = true }
diff --git a/datafusion/sql/src/expr/value.rs b/datafusion/sql/src/expr/value.rs
index a651567abe..a70934b5cd 100644
--- a/datafusion/sql/src/expr/value.rs
+++ b/datafusion/sql/src/expr/value.rs
@@ -19,10 +19,13 @@ use crate::planner::{ContextProvider, PlannerContext,
SqlToRel};
use arrow::compute::kernels::cast_utils::{
parse_interval_month_day_nano_config, IntervalParseConfig, IntervalUnit,
};
-use arrow::datatypes::DECIMAL128_MAX_PRECISION;
-use arrow_schema::DataType;
+use arrow::datatypes::{i256, DECIMAL128_MAX_PRECISION};
+use arrow_schema::{DataType, DECIMAL256_MAX_PRECISION};
+use bigdecimal::num_bigint::BigInt;
+use bigdecimal::{BigDecimal, Signed, ToPrimitive};
use datafusion_common::{
- internal_err, not_impl_err, plan_err, DFSchema, DataFusionError, Result,
ScalarValue,
+ internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema,
+ DataFusionError, Result, ScalarValue,
};
use datafusion_expr::expr::{BinaryExpr, Placeholder};
use datafusion_expr::planner::PlannerResult;
@@ -31,6 +34,9 @@ use log::debug;
use sqlparser::ast::{BinaryOperator, Expr as SQLExpr, Interval, UnaryOperator,
Value};
use sqlparser::parser::ParserError::ParserError;
use std::borrow::Cow;
+use std::cmp::Ordering;
+use std::ops::Neg;
+use std::str::FromStr;
impl<S: ContextProvider> SqlToRel<'_, S> {
pub(crate) fn parse_value(
@@ -84,7 +90,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
}
if self.options.parse_float_as_decimal {
- parse_decimal_128(unsigned_number, negative)
+ parse_decimal(unsigned_number, negative)
} else {
signed_number.parse::<f64>().map(lit).map_err(|_| {
DataFusionError::from(ParserError(format!(
@@ -315,45 +321,84 @@ const fn try_decode_hex_char(c: u8) -> Option<u8> {
}
}
-/// Parse Decimal128 from a string
-///
-/// TODO: support parsing from scientific notation
-fn parse_decimal_128(unsigned_number: &str, negative: bool) -> Result<Expr> {
- // remove leading zeroes
- let trimmed = unsigned_number.trim_start_matches('0');
- // Parse precision and scale, remove decimal point if exists
- let (precision, scale, replaced_str) = if trimmed == "." {
- // Special cases for numbers such as “0.”, “000.”, and so on.
- (1, 0, Cow::Borrowed("0"))
- } else if let Some(i) = trimmed.find('.') {
- (
- trimmed.len() - 1,
- trimmed.len() - i - 1,
- Cow::Owned(trimmed.replace('.', "")),
- )
- } else {
- // No decimal point, keep as is
- (trimmed.len(), 0, Cow::Borrowed(trimmed))
- };
+/// Returns None if the value can't be converted to i256.
+/// Modified from
<https://github.com/apache/arrow-rs/blob/c4dbf0d8af6ca5a19b8b2ea777da3c276807fc5e/arrow-buffer/src/bigint/mod.rs#L303>
+fn bigint_to_i256(v: &BigInt) -> Option<i256> {
+ let v_bytes = v.to_signed_bytes_le();
+ match v_bytes.len().cmp(&32) {
+ Ordering::Less => {
+ let mut bytes = if v.is_negative() {
+ [255_u8; 32]
+ } else {
+ [0; 32]
+ };
+ bytes[0..v_bytes.len()].copy_from_slice(&v_bytes[..v_bytes.len()]);
+ Some(i256::from_le_bytes(bytes))
+ }
+ Ordering::Equal =>
Some(i256::from_le_bytes(v_bytes.try_into().unwrap())),
+ Ordering::Greater => None,
+ }
+}
- let number = replaced_str.parse::<i128>().map_err(|e| {
+fn parse_decimal(unsigned_number: &str, negative: bool) -> Result<Expr> {
+ let mut dec = BigDecimal::from_str(unsigned_number).map_err(|e| {
DataFusionError::from(ParserError(format!(
- "Cannot parse {replaced_str} as i128 when building decimal: {e}"
+ "Cannot parse {unsigned_number} as BigDecimal: {e}"
)))
})?;
-
- // Check precision overflow
- if precision as u8 > DECIMAL128_MAX_PRECISION {
- return Err(DataFusionError::from(ParserError(format!(
- "Cannot parse {replaced_str} as i128 when building decimal:
precision overflow"
- ))));
+ if negative {
+ dec = dec.neg();
}
- Ok(Expr::Literal(ScalarValue::Decimal128(
- Some(if negative { -number } else { number }),
- precision as u8,
- scale as i8,
- )))
+ let digits = dec.digits();
+ let (int_val, scale) = dec.into_bigint_and_exponent();
+ if scale < i8::MIN as i64 {
+ return not_impl_err!(
+ "Decimal scale {} exceeds the minimum supported scale: {}",
+ scale,
+ i8::MIN
+ );
+ }
+ let precision = if scale > 0 {
+ // arrow-rs requires the precision to include the positive scale.
+ // See
<https://github.com/apache/arrow-rs/blob/123045cc766d42d1eb06ee8bb3f09e39ea995ddc/arrow-array/src/types.rs#L1230>
+ std::cmp::max(digits, scale.unsigned_abs())
+ } else {
+ digits
+ };
+ if precision <= DECIMAL128_MAX_PRECISION as u64 {
+ let val = int_val.to_i128().ok_or_else(|| {
+ // Failures are unexpected here as we have already checked the
precision
+ internal_datafusion_err!(
+ "Unexpected overflow when converting {} to i128",
+ int_val
+ )
+ })?;
+ Ok(Expr::Literal(ScalarValue::Decimal128(
+ Some(val),
+ precision as u8,
+ scale as i8,
+ )))
+ } else if precision <= DECIMAL256_MAX_PRECISION as u64 {
+ let val = bigint_to_i256(&int_val).ok_or_else(|| {
+ // Failures are unexpected here as we have already checked the
precision
+ internal_datafusion_err!(
+ "Unexpected overflow when converting {} to i256",
+ int_val
+ )
+ })?;
+ Ok(Expr::Literal(ScalarValue::Decimal256(
+ Some(val),
+ precision as u8,
+ scale as i8,
+ )))
+ } else {
+ not_impl_err!(
+ "Decimal precision {} exceeds the maximum supported precision: {}",
+ precision,
+ DECIMAL256_MAX_PRECISION
+ )
+ }
}
#[cfg(test)]
@@ -379,4 +424,79 @@ mod tests {
assert_eq!(output, expect);
}
}
+
+ #[test]
+ fn test_bigint_to_i256() {
+ let cases = [
+ (BigInt::from(0), Some(i256::from(0))),
+ (BigInt::from(1), Some(i256::from(1))),
+ (BigInt::from(-1), Some(i256::from(-1))),
+ (
+ BigInt::from_str(i256::MAX.to_string().as_str()).unwrap(),
+ Some(i256::MAX),
+ ),
+ (
+ BigInt::from_str(i256::MIN.to_string().as_str()).unwrap(),
+ Some(i256::MIN),
+ ),
+ (
+ // Can't fit into i256
+ BigInt::from_str((i256::MAX.to_string() +
"1").as_str()).unwrap(),
+ None,
+ ),
+ ];
+
+ for (input, expect) in cases {
+ let output = bigint_to_i256(&input);
+ assert_eq!(output, expect);
+ }
+ }
+
+ #[test]
+ fn test_parse_decimal() {
+ // Supported cases
+ let cases = [
+ ("0", ScalarValue::Decimal128(Some(0), 1, 0)),
+ ("1", ScalarValue::Decimal128(Some(1), 1, 0)),
+ ("123.45", ScalarValue::Decimal128(Some(12345), 5, 2)),
+ // Digit count is less than scale
+ ("0.001", ScalarValue::Decimal128(Some(1), 3, 3)),
+ // Scientific notation
+ ("123.456e-2", ScalarValue::Decimal128(Some(123456), 6, 5)),
+ // Negative scale
+ ("123456e128", ScalarValue::Decimal128(Some(123456), 6, -128)),
+ // Decimal256
+ (
+ &("9".repeat(39) + "." + "99999"),
+ ScalarValue::Decimal256(
+ Some(i256::from_string(&"9".repeat(44)).unwrap()),
+ 44,
+ 5,
+ ),
+ ),
+ ];
+ for (input, expect) in cases {
+ let output = parse_decimal(input, true).unwrap();
+ assert_eq!(output,
Expr::Literal(expect.arithmetic_negate().unwrap()));
+
+ let output = parse_decimal(input, false).unwrap();
+ assert_eq!(output, Expr::Literal(expect));
+ }
+
+ // scale < i8::MIN
+ assert_eq!(
+ parse_decimal("1e129", false)
+ .unwrap_err()
+ .strip_backtrace(),
+ "This feature is not implemented: Decimal scale -129 exceeds the
minimum supported scale: -128"
+ );
+
+ // Unsupported precision
+ assert_eq!(
+ parse_decimal(&"1".repeat(77), false)
+ .unwrap_err()
+ .strip_backtrace(),
+ "This feature is not implemented: Decimal precision 77 exceeds the
maximum supported precision: 76"
+ );
+ }
}
diff --git a/datafusion/sqllogictest/test_files/options.slt
b/datafusion/sqllogictest/test_files/options.slt
index aafaa05496..14b0e2d43c 100644
--- a/datafusion/sqllogictest/test_files/options.slt
+++ b/datafusion/sqllogictest/test_files/options.slt
@@ -192,19 +192,95 @@ select
arrow_typeof(00009999999999999999999999999999999999.9999),
----
Decimal128(38, 4) Decimal128(38, 4) Decimal128(20, 0)
-# precision overflow
-statement error DataFusion error: SQL error: ParserError\("Cannot parse
123456789012345678901234567890123456789 as i128 when building decimal:
precision overflow"\)
-select 123456789.012345678901234567890123456789
+# scientific notation
+query RTRTRT
+select 1.23e3, arrow_typeof(1.23e3),
+ +1.23e1, arrow_typeof(+1.23e1),
+ -1234.56e-3, arrow_typeof(-1234.56e-3)
+----
+1230 Decimal128(3, -1) 12.3 Decimal128(3, 1) -1.23456 Decimal128(6, 5)
+
+query RTRTRT
+select 1.23e-2, arrow_typeof(1.23e-2),
+ 1.23456e0, arrow_typeof(1.23456e0),
+ -.0123e2, arrow_typeof(-.0123e2)
+----
+0.0123 Decimal128(4, 4) 1.23456 Decimal128(6, 5) -1.23 Decimal128(3, 2)
+
+# Decimal256 cases
+query RT
+select 123456789.0123456789012345678901234567890,
+ arrow_typeof(123456789.0123456789012345678901234567890)
+----
+123456789.012345678901 Decimal256(40, 31)
+
+query RT
+select -123456789.0123456789012345678901234567890,
+ arrow_typeof(-123456789.0123456789012345678901234567890)
+----
+-123456789.012345678901 Decimal256(40, 31)
+
+# max precision and scale of Decimal256
+query RTRT
+select -1e-76, arrow_typeof(-1e-76),
+ -1.234567e-70, arrow_typeof(-1.234567e-70)
+----
+0 Decimal256(76, 76) 0 Decimal256(76, 76)
-statement error SQL error: ParserError\("Cannot parse
123456789012345678901234567890123456789 as i128 when building decimal:
precision overflow"\)
-select -123456789.012345678901234567890123456789
+# Decimal256::MAX for nonnegative scale
+query RT
+select
9999999999999999999999999999999999999999999999999999999999999999999999999999,
+
arrow_typeof(9999999999999999999999999999999999999999999999999999999999999999999999999999);
+----
+9999999999999999999999999999999999999999999999999999999999999999999999999999
Decimal256(76, 0)
-# can not fit in i128
-statement error SQL error: ParserError\("Cannot parse
1234567890123456789012345678901234567890 as i128 when building decimal: number
too large to fit in target type"\)
-select 123456789.0123456789012345678901234567890
+# Decimal256::MIN
+query RT
+select
-9999999999999999999999999999999999999999999999999999999999999999999999999999,
+
arrow_typeof(-9999999999999999999999999999999999999999999999999999999999999999999999999999);
+----
+-9999999999999999999999999999999999999999999999999999999999999999999999999999
Decimal256(76, 0)
-statement error SQL error: ParserError\("Cannot parse
1234567890123456789012345678901234567890 as i128 when building decimal: number
too large to fit in target type"\)
-select -123456789.0123456789012345678901234567890
+# boundaries between decimal128 and decimal256
+query RTRT
+select 1e-38, arrow_typeof(1e-38),
+ 1e-39, arrow_typeof(1e-39);
+----
+0 Decimal128(38, 38) 0 Decimal256(39, 39)
+
+query RTRT
+select -1e-38, arrow_typeof(-1e-38),
+ -1e-39, arrow_typeof(-1e-39);
+----
+0 Decimal128(38, 38) 0 Decimal256(39, 39)
+
+# unsupported precision
+query error Decimal precision 77 exceeds the maximum supported precision: 76
+select -1e-77;
+
+query error Decimal precision 79 exceeds the maximum supported precision: 76
+select
1.000000000000000000000000000000000000000000000000000000000000000000000000000001;
+
+# negative scales
+query TR
+select arrow_typeof(1e77), 1e77
+----
+Decimal128(1, -77)
100000000000000000000000000000000000000000000000000000000000000000000000000000
+
+query T
+select arrow_typeof(1e128)
+----
+Decimal128(1, -128)
+
+query error Decimal scale \-129 exceeds the minimum supported scale: \-128
+select 1e129
+
+# simple arithmetic
+query RTRT
+select 1e40 + 1e40, arrow_typeof(1e40 + 1e40),
+ 1e-40 + -1e-40, arrow_typeof(1e-40 + -1e-40)
+----
+20000000000000000000000000000000000000000 Decimal128(2, -40) 0 Decimal256(41,
40)
# Restore option to default value
statement ok
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]