This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 7c145faa80 [Enhance] use fast_float::from_chars to do str cast to
float/double to avoid lose precision (#16190)
7c145faa80 is described below
commit 7c145faa809f6ff70d895c91097ea7cc0edf8fc1
Author: HappenLee <[email protected]>
AuthorDate: Wed Feb 1 23:53:34 2023 +0800
[Enhance] use fast_float::from_chars to do str cast to float/double to
avoid lose precision (#16190)
---
be/src/util/string_parser.hpp | 146 ++++++++++--------------------------
be/test/util/string_parser_test.cpp | 5 +-
2 files changed, 41 insertions(+), 110 deletions(-)
diff --git a/be/src/util/string_parser.hpp b/be/src/util/string_parser.hpp
index 653f0dac14..02006b7c7d 100644
--- a/be/src/util/string_parser.hpp
+++ b/be/src/util/string_parser.hpp
@@ -20,6 +20,8 @@
#pragma once
+#include <fast_float/fast_float.h>
+
#include <cmath>
#include <cstdint>
#include <cstring>
@@ -111,13 +113,7 @@ public:
template <typename T>
static inline T string_to_float(const char* s, int len, ParseResult*
result) {
- T ans = string_to_float_internal<T>(s, len, result);
- if (LIKELY(*result == PARSE_SUCCESS)) {
- return ans;
- }
-
- int i = skip_leading_whitespace(s, len);
- return string_to_float_internal<T>(s + i, len - i, result);
+ return string_to_float_internal<T>(s, len, result);
}
// Parses a string for 'true' or 'false', case insensitive.
@@ -425,118 +421,54 @@ inline T StringParser::string_to_int_no_overflow(const
char* s, int len, ParseRe
template <typename T>
inline T StringParser::string_to_float_internal(const char* s, int len,
ParseResult* result) {
- if (UNLIKELY(len <= 0)) {
+ int i = 0;
+ // skip leading spaces
+ for (; i < len; ++i) {
+ if (!is_whitespace(s[i])) {
+ break;
+ }
+ }
+
+ // skip back spaces
+ int j = len - 1;
+ for (; j >= i; j--) {
+ if (!is_whitespace(s[j])) {
+ break;
+ }
+ }
+
+ // skip leading '+', from_chars can handle '-'
+ if (i < len && s[i] == '+') {
+ i++;
+ }
+ if (UNLIKELY(i > j)) {
*result = PARSE_FAILURE;
return 0;
}
// Use double here to not lose precision while accumulating the result
double val = 0;
- bool negative = false;
- int i = 0;
- double divide = 1;
- bool decimal = false;
- int64_t remainder = 0;
- // The number of 'significant figures' we've encountered so far (i.e.,
digits excluding
- // leading 0s). This technically shouldn't count trailing 0s either, but
for us it
- // doesn't matter if we count them based on the implementation below.
- int sig_figs = 0;
-
- switch (*s) {
- case '-':
- negative = true;
- case '+':
- i = 1;
- }
-
- int first = i;
- for (; i < len; ++i) {
- if (LIKELY(s[i] >= '0' && s[i] <= '9')) {
- if (s[i] != '0' || sig_figs > 0) {
- ++sig_figs;
- }
- if (decimal) {
- // According to the IEEE floating-point spec, a double has up
to 15-17
- // significant decimal digits (see
- //
http://en.wikipedia.org/wiki/Double-precision_floating-point_format). We stop
- // processing digits after we've already seen at least 18 sig
figs to avoid
- // overflowing 'remainder' (we stop after 18 instead of 17 to
get the rounding
- // right).
- if (sig_figs <= 18) {
- remainder = remainder * 10 + s[i] - '0';
- divide *= 10;
+ auto res = fast_float::from_chars(s + i, s + j + 1, val);
+
+ if (res.ec == std::errc() && res.ptr == s + j + 1) {
+ if (abs(val) == std::numeric_limits<T>::infinity()) {
+ auto contain_inf = false;
+ for (int k = i; k < j + 1; k++) {
+ if (s[k] == 'i' || s[k] == 'I') {
+ contain_inf = true;
+ break;
}
- } else {
- val = val * 10 + s[i] - '0';
- }
- } else if (s[i] == '.') {
- decimal = true;
- } else if (s[i] == 'e' || s[i] == 'E') {
- break;
- } else if (s[i] == 'i' || s[i] == 'I') {
- if (len > i + 2 && (s[i + 1] == 'n' || s[i + 1] == 'N') &&
- (s[i + 2] == 'f' || s[i + 2] == 'F')) {
- // Note: Hive writes inf as Infinity, at least for text. We'll
be a little loose
- // here and interpret any column with inf as a prefix as
infinity rather than
- // checking every remaining byte.
- *result = PARSE_SUCCESS;
- return negative ? -INFINITY : INFINITY;
- } else {
- // Starts with 'i', but isn't inf...
- *result = PARSE_FAILURE;
- return 0;
- }
- } else if (s[i] == 'n' || s[i] == 'N') {
- if (len > i + 2 && (s[i + 1] == 'a' || s[i + 1] == 'A') &&
- (s[i + 2] == 'n' || s[i + 2] == 'N')) {
- *result = PARSE_SUCCESS;
- return negative ? -NAN : NAN;
- } else {
- // Starts with 'n', but isn't NaN...
- *result = PARSE_FAILURE;
- return 0;
- }
- } else {
- if ((UNLIKELY(i == first || !is_all_whitespace(s + i, len - i)))) {
- // Reject the string because either the first char was not a
digit, "," or "e",
- // or the remaining chars are not all whitespace
- *result = PARSE_FAILURE;
- return 0;
}
- // skip trailing whitespace.
- break;
- }
- }
- val += remainder / divide;
-
- if (i < len && (s[i] == 'e' || s[i] == 'E')) {
- // Create a C-string from s starting after the optional '-' sign and
fall back to
- // strtod to avoid conversion inaccuracy for scientific notation.
- // Do not use boost::lexical_cast because it causes codegen to crash
for an
- // unknown reason (exception handling?).
- char c_str[len - negative + 1];
- memcpy(c_str, s + negative, len - negative);
- c_str[len - negative] = '\0';
- char* s_end;
- val = strtod(c_str, &s_end);
- if (s_end != c_str + len - negative) {
- // skip trailing whitespace
- int trailing_len = len - negative - (int)(s_end - c_str);
- if (UNLIKELY(!is_all_whitespace(s_end, trailing_len))) {
- *result = PARSE_FAILURE;
- return val;
- }
+ *result = contain_inf ? PARSE_SUCCESS : PARSE_OVERFLOW;
+ } else {
+ *result = PARSE_SUCCESS;
}
- }
-
- // Determine if it is an overflow case and update the result
- if (UNLIKELY(val == std::numeric_limits<T>::infinity())) {
- *result = PARSE_OVERFLOW;
+ return val;
} else {
- *result = PARSE_SUCCESS;
+ *result = PARSE_FAILURE;
}
- return (T)(negative ? -val : val);
+ return 0;
}
inline bool StringParser::string_to_bool_internal(const char* s, int len,
ParseResult* result) {
diff --git a/be/test/util/string_parser_test.cpp
b/be/test/util/string_parser_test.cpp
index a85e5b97ed..f9e9c37050 100644
--- a/be/test/util/string_parser_test.cpp
+++ b/be/test/util/string_parser_test.cpp
@@ -441,7 +441,6 @@ TEST(StringToFloat, Basic) {
std::string double_max =
boost::lexical_cast<std::string>(std::numeric_limits<double>::max());
test_float_value<double>(double_min, StringParser::PARSE_SUCCESS);
test_float_value<double>(double_max, StringParser::PARSE_SUCCESS);
-
// Non-finite values
test_all_float_variants("INFinity", StringParser::PARSE_SUCCESS);
test_all_float_variants("infinity", StringParser::PARSE_SUCCESS);
@@ -451,8 +450,8 @@ TEST(StringToFloat, Basic) {
test_float_value_is_nan<double>("nan", StringParser::PARSE_SUCCESS);
test_float_value_is_nan<float>("NaN", StringParser::PARSE_SUCCESS);
test_float_value_is_nan<double>("NaN", StringParser::PARSE_SUCCESS);
- test_float_value_is_nan<float>("nana", StringParser::PARSE_SUCCESS);
- test_float_value_is_nan<double>("nana", StringParser::PARSE_SUCCESS);
+ test_float_value_is_nan<float>("nana", StringParser::PARSE_FAILURE);
+ test_float_value_is_nan<double>("nana", StringParser::PARSE_FAILURE);
test_float_value_is_nan<float>("naN", StringParser::PARSE_SUCCESS);
test_float_value_is_nan<double>("naN", StringParser::PARSE_SUCCESS);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]