This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch branch-4.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-4.1 by this push:
     new f708be5c7d8 branch-4.1: [fix](decimal) Fix incorrect decimal cast results for scientific-notation strings #63119 (#63184)
f708be5c7d8 is described below

commit f708be5c7d848c27fcb4e2fe3131a2c30575e99d
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed May 13 11:45:19 2026 +0800

    branch-4.1: [fix](decimal) Fix incorrect decimal cast results for scientific-notation strings #63119 (#63184)
    
    Cherry-picked from #63119
    
    Co-authored-by: TengJianPing <[email protected]>
---
 be/src/util/string_parser.cpp                      | 49 +++++++++++++++++++---
 .../column_array_update_crc32c_batch_37.out        |  4 +-
 ...n_array_update_crc32c_batch_37.out_with_nullmap |  4 +-
 .../column_array_update_crc32c_single_37.out       |  2 +-
 ..._array_update_crc32c_single_37.out_with_nullmap |  2 +-
 be/test/exprs/function/cast/cast_to_decimal.cpp    | 33 ++++++++++++++-
 6 files changed, 81 insertions(+), 13 deletions(-)

diff --git a/be/src/util/string_parser.cpp b/be/src/util/string_parser.cpp
index 5dcb65ae072..0b057467e12 100644
--- a/be/src/util/string_parser.cpp
+++ b/be/src/util/string_parser.cpp
@@ -41,6 +41,20 @@ namespace doris {
 // <exponent> ::= <e_marker> <sign>? <digits>
 //
 // <e_marker> ::= "e" | "E"
+//
+// Parsing algorithm:
+// 1. Trim spaces and the sign, then normalize the significand by skipping leading zeros and an
+//    optional leading dot. During this scan, count digits that belong to the original integral
+//    part (`int_part_count`) and remember where the significand ends (`end_digit_index`).
+// 2. Parse the optional exponent. Scientific notation is handled by moving the decimal point:
+//    `result_int_part_digit_count = int_part_count + exponent`. For example, "12.34e-1" has
+//    int_part_count=2 and exponent=-1, so the result has one integral digit: "1.234".
+// 3. Build the result in scaled-integer form: first collect the integral digits up to the shifted
+//    decimal point, then collect up to `type_scale` fractional digits, padding with zeros when the
+//    input has fewer fractional digits than the target scale.
+// 4. If there are extra fractional digits, round half up using the first discarded digit. Finally,
+//    check the integral digit count against `type_precision - type_scale` and return the signed
+//    scaled integer value.
 template <PrimitiveType P>
 typename PrimitiveTypeTraits<P>::CppType::NativeType 
StringParser::string_to_decimal(
         const char* __restrict s, size_t len, int type_precision, int 
type_scale,
@@ -50,6 +64,16 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType 
StringParser::string_to_dec
                           std::is_same_v<T, __int128> || std::is_same_v<T, 
wide::Int256>,
                   "Cast string to decimal only support target type int32_t, 
int64_t, __int128 or "
                   "wide::Int256.");
+
+    // Parse in two logical coordinate systems:
+    // 1. `s[0, end_digit_index)` is the normalized significand after trimming spaces, sign and
+    //    leading zeros. If the original value starts with '.', the dot is also skipped so
+    //    ".14E+3" is parsed as significand "14" with exponent 3.
+    // 2. `result_int_part_digit_count = int_part_count + exponent` is the decimal point position
+    //    after applying scientific notation. For example, "1.4E+2" has int_part_count=1,
+    //    exponent=2, result_int_part_digit_count=3, so "14" becomes integer 140.
+    // `digit_index` always indexes the normalized significand string, which may still contain a
+    // dot for inputs like "1.4E+2"; loops that build numbers skip that dot explicitly.
     // Ignore leading and trailing spaces.
     s = skip_ascii_whitespaces(s, len);
 
@@ -102,7 +126,9 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType 
StringParser::string_to_dec
         *result = StringParser::PARSE_FAILURE;
         return 0;
     }
-    // parse exponent if any
+    // Parse exponent if any. Keep `end_digit_index` before consuming 'e/E' so later digit counts
+    // ignore exponent syntax. For "1.4E+2", end_digit_index points just after "1.4", not after
+    // "E+2".
     int64_t exponent = 0;
     auto end_digit_index = i;
     if (i != len) {
@@ -149,8 +175,6 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType 
StringParser::string_to_dec
             return 0;
         }
     }
-    T int_part_number = 0;
-    T frac_part_number = 0;
     // TODO: check limit values of exponent and add UT
     // max string len is config::string_type_length_soft_limit_bytes,
     // whose max value is std::numeric_limits<int32_t>::max() - 4,
@@ -163,9 +187,15 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType 
StringParser::string_to_dec
         return 0;
     }
     int result_int_part_digit_count = tmp_result_int_part_digit_count;
+    T int_part_number = 0;
+    T frac_part_number = 0;
     int actual_frac_part_count = 0;
     int digit_index = 0;
     if (result_int_part_digit_count >= 0) {
+        // `max_index` is the raw significand index where integer-part digits stop. Add one extra
+        // raw character only when crossing an in-buffer dot, e.g. "1.4E+2" must scan "1.4" to
+        // collect three integer digits after the exponent shift. It is capped by end_digit_index
+        // because missing digits are appended later by multiplying with powers of 10.
         int max_index = std::min(found_dot ? (result_int_part_digit_count +
                                               ((int_part_count > 0 && exponent 
> 0) ? 1 : 0))
                                            : result_int_part_digit_count,
@@ -188,7 +218,11 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType 
StringParser::string_to_dec
             }
             int_part_number = int_part_number * 10 + (s[digit_index] - '0');
         }
-        auto total_significant_digit_count = i - ((found_dot && int_part_count 
> 0) ? 1 : 0);
+        // Count only significand digits, not exponent syntax. If the exponent moves the decimal
+        // point past all available significant digits, append zeros by scaling the integer part:
+        // "1.4E+2" scans integer 14, total_significant_digit_count=2, then multiplies by 10.
+        auto total_significant_digit_count =
+                end_digit_index - ((found_dot && int_part_count > 0) ? 1 : 0);
         if (result_int_part_digit_count > total_significant_digit_count) {
             int_part_number *= 
get_scale_multiplier<T>(result_int_part_digit_count -
                                                        
total_significant_digit_count);
@@ -206,8 +240,11 @@ typename PrimitiveTypeTraits<P>::CppType::NativeType 
StringParser::string_to_dec
         ++actual_frac_part_count;
     }
     auto type_scale_multiplier = get_scale_multiplier<T>(type_scale);
-    // there are still extra fraction digits left, check rounding
-    if (digit_index != end_digit_index) {
+    // Round only when the next parsed significand digit is exactly the first discarded fractional
+    // digit. If `actual_frac_part_count` is already greater than type_scale, the missing positions
+    // are implicit zeros from a negative exponent, so "5e-17" to scale 15 must stay 0 instead of
+    // rounding up.
+    if (actual_frac_part_count == type_scale && digit_index != 
end_digit_index) {
         if (UNLIKELY(s[digit_index] == '.')) {
             ++digit_index;
         }
diff --git 
a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out 
b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out
index d699fb6177a..c419670b58f 100644
--- 
a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out
+++ 
b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out
@@ -2,5 +2,5 @@
 0
 0
 4230634956
-166888020
-1932016285
\ No newline at end of file
+572890395
+2601481115
\ No newline at end of file
diff --git 
a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap
 
b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap
index d699fb6177a..c419670b58f 100644
--- 
a/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap
+++ 
b/be/test/expected_result/vec/columns/column_array_update_crc32c_batch_37.out_with_nullmap
@@ -2,5 +2,5 @@
 0
 0
 4230634956
-166888020
-1932016285
\ No newline at end of file
+572890395
+2601481115
\ No newline at end of file
diff --git 
a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out 
b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out
index 45a1f82c6f1..faaab6bc634 100644
--- 
a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out
+++ 
b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out
@@ -1 +1 @@
-106414486;4062799302;0
\ No newline at end of file
+880726687;3657333385;0
\ No newline at end of file
diff --git 
a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap
 
b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap
index 45a1f82c6f1..faaab6bc634 100644
--- 
a/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap
+++ 
b/be/test/expected_result/vec/columns/column_array_update_crc32c_single_37.out_with_nullmap
@@ -1 +1 @@
-106414486;4062799302;0
\ No newline at end of file
+880726687;3657333385;0
\ No newline at end of file
diff --git a/be/test/exprs/function/cast/cast_to_decimal.cpp 
b/be/test/exprs/function/cast/cast_to_decimal.cpp
index 677c7a3fa16..b8e1bef4825 100644
--- a/be/test/exprs/function/cast/cast_to_decimal.cpp
+++ b/be/test/exprs/function/cast/cast_to_decimal.cpp
@@ -88,6 +88,37 @@ TEST_F(FunctionCastToDecimalTest, 
test_from_string_invalid_input) {
     int table_index = 0;
     from_string_invalid_input_test_func<Decimal32>(9, 3, table_index++);
 }
+
+TEST_F(FunctionCastToDecimalTest, test_from_string_scientific_notation) {
+    InputTypeSet input_types = {PrimitiveType::TYPE_VARCHAR};
+    DataSet data_set = {
+            {{std::string("1.4E+2")}, DECIMAL128V3(140, 0, 15)},
+            {{std::string(".14E+3")}, DECIMAL128V3(140, 0, 15)},
+            {{std::string("0.001E+5")}, DECIMAL128V3(100, 0, 15)},
+            {{std::string("1.E+2")}, DECIMAL128V3(100, 0, 15)},
+            {{std::string("1.4E+0")}, DECIMAL128V3(1, 400000000000000, 15)},
+            {{std::string("1.4E-2")}, DECIMAL128V3(0, 14000000000000, 15)},
+    };
+    check_function_for_cast<DataTypeDecimal<Decimal128V3::PType>>(input_types, 
data_set, 15, 38);
+}
+
+TEST_F(FunctionCastToDecimalTest, string_parser_scientific_rounding) {
+    auto parse_decimal128 = [](std::string_view value) {
+        StringParser::ParseResult result = StringParser::PARSE_SUCCESS;
+        auto parsed = 
StringParser::string_to_decimal<TYPE_DECIMAL128I>(value.data(), value.size(),
+                                                                        38, 
15, &result);
+        EXPECT_EQ(result, StringParser::PARSE_SUCCESS);
+        return parsed;
+    };
+
+    EXPECT_EQ(parse_decimal128("5e-16"), 1);
+    EXPECT_EQ(parse_decimal128("5e-17"), 0);
+    EXPECT_EQ(parse_decimal128("9e-17"), 0);
+    EXPECT_EQ(parse_decimal128("-5e-17"), 0);
+    EXPECT_EQ(parse_decimal128("0.0000000000000005"), 1);
+    EXPECT_EQ(parse_decimal128("0.00000000000000005"), 0);
+}
+
 TEST_F(FunctionCastToDecimalTest, test_from_bool) {
     from_bool_test_func<Decimal32>(9, 0);
     from_bool_test_func<Decimal32>(9, 1);
@@ -122,4 +153,4 @@ TEST_F(FunctionCastToDecimalTest, test_from_bool_overflow) {
     from_bool_overflow_test_func<Decimal128V3>();
     from_bool_overflow_test_func<Decimal256>();
 }
-} // namespace doris
\ No newline at end of file
+} // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to