Repository: incubator-impala Updated Branches: refs/heads/master 310edd5d0 -> 211f60d83
IMPALA-1731,IMPALA-3868: Float values are not parsed correctly Fixed StringToFloatInternal() not to parse strings like "1.23inf" and "infinite" with leading/trailing garbage as Infinity. These strings are now rejected with PARSE_FAILURE. Only "inf" and "infinity" are accepted, parsing is case-insensitive. "NaN" values are handled similarly: strings with leading/trailing garbage like "nana" are rejected, parsing is case-insensitive. Other changes: - StringToFloatInternal() was cleaned up a bit. Parsing inf and NaN strings was moved out of the main loop. - Use std::numeric_limits<T>::infinity() instead of INFINITY macro and std::numeric_limits<T>::quiet_NaN() instead of NAN macro. - Fixed another minor bug: multiple dots are allowed when parsing float values (e.g. "2.1..6" is interpreted as 2.16). - New BE and E2E tests were added. Change-Id: I9e17d0f051b300a22a520ce34e276c2d4460d35e Reviewed-on: http://gerrit.cloudera.org:8080/3791 Reviewed-by: Michael Ho <[email protected]> Tested-by: Internal Jenkins Project: http://git-wip-us.apache.org/repos/asf/incubator-impala/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-impala/commit/211f60d8 Tree: http://git-wip-us.apache.org/repos/asf/incubator-impala/tree/211f60d8 Diff: http://git-wip-us.apache.org/repos/asf/incubator-impala/diff/211f60d8 Branch: refs/heads/master Commit: 211f60d8318859155b3f8f8043c7cae039d76bbc Parents: 310edd5 Author: Attila Jeges <[email protected]> Authored: Wed Jul 27 16:15:05 2016 +0200 Committer: Internal Jenkins <[email protected]> Committed: Wed Aug 24 03:34:01 2016 +0000 ---------------------------------------------------------------------- be/src/exprs/expr-test.cc | 89 +++++++++++++++++++- be/src/util/string-parser-test.cc | 50 +++++++++-- be/src/util/string-parser.h | 77 ++++++++++------- .../queries/QueryTest/exprs.test | 30 ++++++- 4 files changed, 205 insertions(+), 41 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/211f60d8/be/src/exprs/expr-test.cc ---------------------------------------------------------------------- diff --git a/be/src/exprs/expr-test.cc b/be/src/exprs/expr-test.cc index fadb73e..744038b 100644 --- a/be/src/exprs/expr-test.cc +++ b/be/src/exprs/expr-test.cc @@ -2788,23 +2788,108 @@ TEST_F(ExprTest, NonFiniteFloats) { TestValue("CAST(1/0 AS FLOAT)", TYPE_FLOAT, numeric_limits<float>::infinity()); TestValue("CAST(1/0 AS DOUBLE)", TYPE_DOUBLE, numeric_limits<double>::infinity()); TestValue("CAST(CAST(1/0 as FLOAT) as DOUBLE)", TYPE_DOUBLE, - numeric_limits<double>::infinity()); + numeric_limits<double>::infinity()); + TestValue("CAST(CAST(1/0 as DOUBLE) as FLOAT)", TYPE_FLOAT, + numeric_limits<float>::infinity()); TestStringValue("CAST(1/0 AS STRING)", "inf"); TestStringValue("CAST(CAST(1/0 AS FLOAT) AS STRING)", "inf"); TestValue("CAST('inf' AS FLOAT)", TYPE_FLOAT, numeric_limits<float>::infinity()); TestValue("CAST('inf' AS DOUBLE)", TYPE_DOUBLE, numeric_limits<double>::infinity()); - TestValue("CAST('Infinity' AS FLOAT)", TYPE_FLOAT, numeric_limits<float>::infinity()); + TestValue("CAST(' inf ' AS FLOAT)", TYPE_FLOAT, + numeric_limits<float>::infinity()); + TestValue("CAST(' inf ' AS DOUBLE)", TYPE_DOUBLE, + numeric_limits<double>::infinity()); + TestValue("CAST('+inf' AS FLOAT)", TYPE_FLOAT, + numeric_limits<float>::infinity()); + TestValue("CAST('+inf' AS DOUBLE)", TYPE_DOUBLE, + numeric_limits<double>::infinity()); + TestValue("CAST('-inf' AS FLOAT)", TYPE_FLOAT, + -numeric_limits<float>::infinity()); + TestValue("CAST('-inf' AS DOUBLE)", TYPE_DOUBLE, + -numeric_limits<double>::infinity()); + TestValue("CAST('Infinity' AS FLOAT)", TYPE_FLOAT, + numeric_limits<float>::infinity()); TestValue("CAST('-Infinity' AS DOUBLE)", TYPE_DOUBLE, -numeric_limits<double>::infinity()); + // Parsing inf values is case-insensitive. + TestValue("CAST('iNf' AS FLOAT)", TYPE_FLOAT, + numeric_limits<float>::infinity()); + TestValue("CAST('iNf' AS DOUBLE)", TYPE_DOUBLE, + numeric_limits<double>::infinity()); + TestValue("CAST('INf' AS FLOAT)", TYPE_FLOAT, + numeric_limits<float>::infinity()); + TestValue("CAST('INf' AS DOUBLE)", TYPE_DOUBLE, + numeric_limits<double>::infinity()); + TestValue("CAST('inF' AS FLOAT)", TYPE_FLOAT, + numeric_limits<float>::infinity()); + TestValue("CAST('inF' AS DOUBLE)", TYPE_DOUBLE, + numeric_limits<double>::infinity()); // NaN != NaN, so we have to wrap the value in a string TestStringValue("CAST(CAST('nan' AS FLOAT) AS STRING)", string("nan")); TestStringValue("CAST(CAST('nan' AS DOUBLE) AS STRING)", string("nan")); + TestStringValue("CAST(CAST(' nan ' AS FLOAT) AS STRING)", string("nan")); + TestStringValue("CAST(CAST(' nan ' AS DOUBLE) AS STRING)", string("nan")); + TestStringValue("CAST(CAST('-nan' AS FLOAT) AS STRING)", string("nan")); + TestStringValue("CAST(CAST('-nan' AS DOUBLE) AS STRING)", string("nan")); + TestStringValue("CAST(CAST('+nan' AS FLOAT) AS STRING)", string("nan")); + TestStringValue("CAST(CAST('+nan' AS DOUBLE) AS STRING)", string("nan")); + // Parsing NaN values is case-insensitive + TestStringValue("CAST(CAST('nAn' AS FLOAT) AS STRING)", string("nan")); + TestStringValue("CAST(CAST('nAn' AS DOUBLE) AS STRING)", string("nan")); + TestStringValue("CAST(CAST('NAn' AS FLOAT) AS STRING)", string("nan")); + TestStringValue("CAST(CAST('NAn' AS DOUBLE) AS STRING)", string("nan")); + TestStringValue("CAST(CAST('naN' AS FLOAT) AS STRING)", string("nan")); + TestStringValue("CAST(CAST('naN' AS DOUBLE) AS STRING)", string("nan")); // 0/0 evalutes to -nan, test that we return "nan" TestStringValue("CAST(0/0 AS STRING)", string("nan")); } +TEST_F(ExprTest, InvalidFloats) { + // IMPALA-1731: Test that leading/trailing garbage is not allowed when parsing inf. + TestIsNull("CAST('1.23inf' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('1.23inf' AS DOUBLE)", TYPE_DOUBLE); + TestIsNull("CAST('1.23inf456' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('1.23inf456' AS DOUBLE)", TYPE_DOUBLE); + TestIsNull("CAST('inf123' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('inf123' AS DOUBLE)", TYPE_DOUBLE); + TestIsNull("CAST('infinity2' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('infinity2' AS DOUBLE)", TYPE_DOUBLE); + TestIsNull("CAST('infinite' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('infinite' AS DOUBLE)", TYPE_DOUBLE); + TestIsNull("CAST('inf123nan' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('inf123nan' AS DOUBLE)", TYPE_DOUBLE); + + // IMPALA-1731: Test that leading/trailing garbage is not allowed when parsing NaN. + TestIsNull("CAST('1.23nan' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('1.23nan' AS DOUBLE)", TYPE_DOUBLE); + TestIsNull("CAST('1.23nan456' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('1.23nan456' AS DOUBLE)", TYPE_DOUBLE); + TestIsNull("CAST('nana' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('nana' AS DOUBLE)", TYPE_DOUBLE); + TestIsNull("CAST('nan123' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('nan123' AS DOUBLE)", TYPE_DOUBLE); + TestIsNull("CAST('nan123inf' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('nan123inf' AS DOUBLE)", TYPE_DOUBLE); + + // IMPALA-3868: Test that multiple dots are not allowed in float values + TestIsNull("CAST('1.2.3.4.5' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('1.2.3.4.5' AS DOUBLE)", TYPE_DOUBLE); + TestIsNull("CAST('1..e' AS FLOAT)", TYPE_FLOAT); + TestIsNull("CAST('1..e' AS DOUBLE)", TYPE_DOUBLE); + + // Broken string with null-character in the middle + string s1("CAST('in\0f' AS DOUBLE)", 22); + TestIsNull(s1, TYPE_DOUBLE); + string s2("CAST('N\0aN' AS FLOAT)", 21); + TestIsNull(s2, TYPE_FLOAT); + + // Empty string + TestIsNull("CAST('' AS DOUBLE)", TYPE_DOUBLE); + TestIsNull("CAST('' AS FLOAT)", TYPE_FLOAT); +} + TEST_F(ExprTest, MathTrigonometricFunctions) { // It is important to calculate the expected values // using math functions, and not simply use constants. http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/211f60d8/be/src/util/string-parser-test.cc ---------------------------------------------------------------------- diff --git a/be/src/util/string-parser-test.cc b/be/src/util/string-parser-test.cc index a49782f..6d4007b 100644 --- a/be/src/util/string-parser-test.cc +++ b/be/src/util/string-parser-test.cc @@ -367,21 +367,19 @@ TEST(StringToFloat, Basic) { // Non-finite values TestAllFloatVariants("INFinity", StringParser::PARSE_SUCCESS); TestAllFloatVariants("infinity", StringParser::PARSE_SUCCESS); + TestAllFloatVariants("inFIniTy", StringParser::PARSE_SUCCESS); TestAllFloatVariants("inf", StringParser::PARSE_SUCCESS); + TestAllFloatVariants("InF", StringParser::PARSE_SUCCESS); + TestAllFloatVariants("INF", StringParser::PARSE_SUCCESS); + TestAllFloatVariants("iNf", StringParser::PARSE_SUCCESS); TestFloatValueIsNan<float>("nan", StringParser::PARSE_SUCCESS); TestFloatValueIsNan<double>("nan", StringParser::PARSE_SUCCESS); TestFloatValueIsNan<float>("NaN", StringParser::PARSE_SUCCESS); TestFloatValueIsNan<double>("NaN", StringParser::PARSE_SUCCESS); - TestFloatValueIsNan<float>("nana", StringParser::PARSE_SUCCESS); - TestFloatValueIsNan<double>("nana", StringParser::PARSE_SUCCESS); TestFloatValueIsNan<float>("naN", StringParser::PARSE_SUCCESS); TestFloatValueIsNan<double>("naN", StringParser::PARSE_SUCCESS); - TestFloatValueIsNan<float>("n aN", StringParser::PARSE_FAILURE); - TestFloatValueIsNan<float>("nnaN", StringParser::PARSE_FAILURE); - - // Overflow. TestFloatValue<float>(float_max + "11111", StringParser::PARSE_OVERFLOW); TestFloatValue<double>(double_max + "11111", StringParser::PARSE_OVERFLOW); @@ -428,6 +426,26 @@ TEST(StringToFloat, Basic) { TestAllFloatVariants("in finity", StringParser::PARSE_FAILURE); TestAllFloatVariants("na", StringParser::PARSE_FAILURE); TestAllFloatVariants("ThisIsANaN", StringParser::PARSE_FAILURE); + TestAllFloatVariants("n aN", StringParser::PARSE_FAILURE); + TestAllFloatVariants("nnaN", StringParser::PARSE_FAILURE); + + // IMPALA-3868: Test that multiple dots are not allowed. + TestAllFloatVariants(".1.2", StringParser::PARSE_FAILURE); + TestAllFloatVariants("..12", StringParser::PARSE_FAILURE); + TestAllFloatVariants("12..", StringParser::PARSE_FAILURE); + TestAllFloatVariants("1.23.", StringParser::PARSE_FAILURE); + TestAllFloatVariants("1.23.4", StringParser::PARSE_FAILURE); + TestAllFloatVariants("1234.5678.90", StringParser::PARSE_FAILURE); + TestAllFloatVariants("12.34.5.6", StringParser::PARSE_FAILURE); + TestAllFloatVariants("0..e", StringParser::PARSE_FAILURE); + + // Test broken strings with null-character in the middle. + string s1("in\0f", 4); + TestAllFloatVariants(s1, StringParser::PARSE_FAILURE); + string s2("n\0an", 4); + TestAllFloatVariants(s2, StringParser::PARSE_FAILURE); + string s3("1\0.2", 4); + TestAllFloatVariants(s3, StringParser::PARSE_FAILURE); } TEST(StringToFloat, InvalidLeadingTrailing) { @@ -448,6 +466,26 @@ TEST(StringToFloat, InvalidLeadingTrailing) { // Test empty string and string with only whitespaces. TestFloatValue<double>("", StringParser::PARSE_FAILURE); TestFloatValue<double>(" ", StringParser::PARSE_FAILURE); + + // IMPALA-1731: Test that leading/trailing garbage is not allowed when parsing inf. + TestAllFloatVariants("1.23inf", StringParser::PARSE_FAILURE); + TestAllFloatVariants("1inf", StringParser::PARSE_FAILURE); + TestAllFloatVariants("iinf", StringParser::PARSE_FAILURE); + TestAllFloatVariants("1.23inf456", StringParser::PARSE_FAILURE); + TestAllFloatVariants("info", StringParser::PARSE_FAILURE); + TestAllFloatVariants("inf123", StringParser::PARSE_FAILURE); + TestAllFloatVariants("infinity2", StringParser::PARSE_FAILURE); + TestAllFloatVariants("infinite", StringParser::PARSE_FAILURE); + TestAllFloatVariants("inf123nan", StringParser::PARSE_FAILURE); + + // IMPALA-1731: Test that leading/trailing garbage is not allowed when parsing NaN. + TestAllFloatVariants("1.23nan", StringParser::PARSE_FAILURE); + TestAllFloatVariants("1nan", StringParser::PARSE_FAILURE); + TestAllFloatVariants("anan", StringParser::PARSE_FAILURE); + TestAllFloatVariants("1.23nan456", StringParser::PARSE_FAILURE); + TestAllFloatVariants("nana", StringParser::PARSE_FAILURE); + TestAllFloatVariants("nan2", StringParser::PARSE_FAILURE); + TestAllFloatVariants("nan123inf", StringParser::PARSE_FAILURE); } TEST(StringToFloat, BruteForce) { http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/211f60d8/be/src/util/string-parser.h ---------------------------------------------------------------------- diff --git a/be/src/util/string-parser.h b/be/src/util/string-parser.h index be8eb8d..a123924 100644 --- a/be/src/util/string-parser.h +++ b/be/src/util/string-parser.h @@ -354,6 +354,29 @@ class StringParser { return static_cast<T>(negative ? -val : val); } + /// Checks if "inf" or "infinity" matches 's' in a case-insensitive manner. The match + /// has to start at the beginning of 's', leading whitespace is considered invalid. + /// Trailing whitespace characters are allowed. + /// Returns true if a match was found and false otherwise. + static inline bool IsInfinity(const char *s, int len) { + if (len >= 3 && strncasecmp(s, "inf", 3) == 0) { + int i = 3; + if (len >= 8 && strncasecmp(s + 3, "inity", 5) == 0) { + i = 8; + } + return IsAllWhitespace(s + i, len - i); + } + return false; + } + + /// Checks if "nan" matches 's' in a case-insensitive manner. The match has to start at + /// the beginning of 's', leading whitespace is considered invalid. Trailing whitespace + /// characters are allowed. + /// Returns true if a match was found and false otherwise. + static inline bool IsNaN(const char *s, int len) { + return len >= 3 && strncasecmp(s, "nan", 3) == 0 && IsAllWhitespace(s + 3, len - 3); + } + /// This is considerably faster than glibc's implementation (>100x why???) /// No special case handling needs to be done for overflows, the floating point spec /// already does it and will cap the values to -inf/inf @@ -369,10 +392,27 @@ class StringParser { return 0; } - // Use double here to not lose precision while accumulating the result - double val = 0; bool negative = false; int i = 0; + switch (*s) { + case '-': negative = true; // Fallthrough is intended. + case '+': i = 1; + } + + // Check if we have inf or NaN. + if (IsInfinity(s + i, len - i)) { + *result = PARSE_SUCCESS; + return negative ? -std::numeric_limits<T>::infinity() + : std::numeric_limits<T>::infinity(); + } + if (IsNaN(s + i, len - i)) { + *result = PARSE_SUCCESS; + return negative ? -std::numeric_limits<T>::quiet_NaN() + : std::numeric_limits<T>::quiet_NaN(); + } + + // Use double here to not lose precision while accumulating the result + double val = 0; double divide = 1; bool decimal = false; int64_t remainder = 0; @@ -380,11 +420,7 @@ class StringParser { // leading 0s). This technically shouldn't count trailing 0s either, but for us it // doesn't matter if we count them based on the implementation below. int sig_figs = 0; - switch (*s) { - case '-': negative = true; - case '+': i = 1; - } - int first = i; + const int first = i; for (; i < len; ++i) { if (LIKELY(s[i] >= '0' && s[i] <= '9')) { if (s[i] != '0' || sig_figs > 0) ++sig_figs; @@ -402,36 +438,13 @@ class StringParser { } else { val = val * 10 + s[i] - '0'; } - } else if (s[i] == '.') { + } else if (!decimal && s[i] == '.') { decimal = true; } else if (s[i] == 'e' || s[i] == 'E') { break; - } else if (s[i] == 'i' || s[i] == 'I') { - if (len > i + 2 && (s[i+1] == 'n' || s[i+1] == 'N') && - (s[i+2] == 'f' || s[i+2] == 'F')) { - // Note: Hive writes inf as Infinity, at least for text. We'll be a little loose - // here and interpret any column with inf as a prefix as infinity rather than - // checking every remaining byte. - *result = PARSE_SUCCESS; - return negative ? -INFINITY : INFINITY; - } else { - // Starts with 'i', but isn't inf... - *result = PARSE_FAILURE; - return 0; - } - } else if (s[i] == 'n' || s[i] == 'N') { - if (len > i + 2 && (s[i+1] == 'a' || s[i+1] == 'A') && - (s[i+2] == 'n' || s[i+2] == 'N')) { - *result = PARSE_SUCCESS; - return negative ? -NAN : NAN; - } else { - // Starts with 'n', but isn't NaN... - *result = PARSE_FAILURE; - return 0; - } } else { if ((UNLIKELY(i == first || !IsAllWhitespace(s + i, len - i)))) { - // Reject the string because either the first char was not a digit, "," or "e", + // Reject the string because either the first char was not a digit, "." or "e", // or the remaining chars are not all whitespace *result = PARSE_FAILURE; return 0; http://git-wip-us.apache.org/repos/asf/incubator-impala/blob/211f60d8/testdata/workloads/functional-query/queries/QueryTest/exprs.test ---------------------------------------------------------------------- diff --git a/testdata/workloads/functional-query/queries/QueryTest/exprs.test b/testdata/workloads/functional-query/queries/QueryTest/exprs.test index 7234d4d..1061247 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/exprs.test +++ b/testdata/workloads/functional-query/queries/QueryTest/exprs.test @@ -2453,4 +2453,32 @@ select base64decode('abc%') STRING ---- ERRORS UDF WARNING: Could not base64 decode input in space 4; actual output length 0 -==== \ No newline at end of file +==== +---- QUERY +# IMPALA-1731: test parsing infinity values +select cast('inf' as double), cast('InFinity' as float), + cast('inf ' as float), cast(' infinity ' as double), + cast('infinite' as double), cast('1.23inf' as double), cast('1inf' as float) +---- RESULTS +Infinity,Infinity,Infinity,Infinity,NULL,NULL,NULL +---- TYPES +double,float,float,double,double,double,float +==== +---- QUERY +# IMPALA-1731: test parsing NaN values +select cast('nan' as double), cast('NaN' as float), cast(' nan ' as double), + cast('nana' as double), cast('1.23nan' as double), cast('1nan' as float) +---- RESULTS +NaN,NaN,NaN,NULL,NULL,NULL +---- TYPES +double,float,double,double,double,float +==== +---- QUERY +# IMPALA-3868: test parsing float with multiple dots +select cast('1.23' as double), cast('.1.23' as float), cast('123.456.' as double), + cast('1.23.456' as double), cast('1.23.4.5' as float), cast('0..e' as double) +---- RESULTS +1.23,NULL,NULL,NULL,NULL,NULL +---- TYPES +double,float,double,double,float,double +====
