This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-doris.git
The following commit(s) were added to refs/heads/master by this push:
new 5029ef4 [fix] fix ltrim result may incorrect in some case (#7963)
5029ef4 is described below
commit 5029ef46c92fdba10a35136fa62eb7ac2d425d54
Author: Zhengguo Yang <[email protected]>
AuthorDate: Wed Feb 9 13:06:37 2022 +0800
[fix] fix ltrim result may incorrect in some case (#7963)
fix ltrim result may incorrect in some case
according to https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
Built-in Function: int __builtin_cl/tz (unsigned int x)
If x is 0, the result is undefined.
So we handle the case of 0 separately
this function return different between gcc and clang when x is 0
---
be/CMakeLists.txt | 3 +-
be/src/exprs/v_string_functions.h | 219 -------------------------------
be/src/vec/functions/function_string.cpp | 17 ++-
be/test/exprs/string_functions_test.cpp | 56 ++++----
4 files changed, 39 insertions(+), 256 deletions(-)
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 6bf9813..2d50c67 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -674,7 +674,6 @@ else()
${DORIS_DEPENDENCIES}
${WL_START_GROUP}
${X86_DEPENDENCIES}
- ${WL_END_GROUP}
)
endif()
@@ -690,6 +689,8 @@ if (WITH_MYSQL)
)
endif()
+set(DORIS_DEPENDENCIES ${DORIS_DEPENDENCIES} ${WL_END_GROUP})
+
message(STATUS "DORIS_DEPENDENCIES is ${DORIS_DEPENDENCIES}")
# Add all external dependencies. They should come after the palo libs.
diff --git a/be/src/exprs/v_string_functions.h
b/be/src/exprs/v_string_functions.h
deleted file mode 100644
index 3fd9845..0000000
--- a/be/src/exprs/v_string_functions.h
+++ /dev/null
@@ -1,219 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#pragma once
-
-#ifndef BE_V_STRING_FUNCTIONS_H
-#define BE_V_STRING_FUNCTIONS_H
-
-#include <stdint.h>
-#include <unistd.h>
-#include "runtime/string_value.hpp"
-
-#ifdef __SSE2__
-#include <emmintrin.h>
-#endif
-
-namespace doris {
-class VStringFunctions {
-public:
-#ifdef __SSE2__
- /// n equals to 16 chars length
- static constexpr auto REGISTER_SIZE = sizeof(__m128i);
-#endif
-public:
- static StringVal rtrim(const StringVal& str) {
- if (str.is_null || str.len == 0) {
- return str;
- }
- auto begin = 0;
- auto end = str.len - 1;
-#ifdef __SSE2__
- char blank = ' ';
- const auto pattern = _mm_set1_epi8(blank);
- while (end - begin + 1 >= REGISTER_SIZE) {
- const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const
__m128i *>(str.ptr + end + 1 - REGISTER_SIZE));
- const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern);
- const auto mask = _mm_movemask_epi8(v_against_pattern);
- int offset = __builtin_clz(~(mask << REGISTER_SIZE));
- /// means not found
- if (offset == 0)
- {
- return StringVal(str.ptr + begin, end - begin + 1);
- } else {
- end -= offset;
- }
- }
-#endif
- while (end >= begin && str.ptr[end] == ' ') {
- --end;
- }
- if (end < 0) {
- return StringVal("");
- }
- return StringVal(str.ptr + begin, end - begin + 1);
- }
-
- static StringVal ltrim(const StringVal& str) {
- if (str.is_null || str.len == 0) {
- return str;
- }
- auto begin = 0;
- auto end = str.len - 1;
-#ifdef __SSE2__
- char blank = ' ';
- const auto pattern = _mm_set1_epi8(blank);
- while (end - begin + 1 >= REGISTER_SIZE) {
- const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const
__m128i *>(str.ptr + begin));
- const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack, pattern);
- const auto mask = _mm_movemask_epi8(v_against_pattern);
- const auto offset = __builtin_ctz(mask ^ 0xffff);
- /// means not found
- if (offset == 0)
- {
- return StringVal(str.ptr + begin, end - begin + 1);
- } else if (offset > REGISTER_SIZE) {
- begin += REGISTER_SIZE;
- } else {
- begin += offset;
- return StringVal(str.ptr + begin, end - begin + 1);
- }
- }
-#endif
- while (begin <= end && str.ptr[begin] == ' ') {
- ++begin;
- }
- return StringVal(str.ptr + begin, end - begin + 1);
- }
-
- static StringVal trim(const StringVal& str) {
- if (str.is_null || str.len == 0) {
- return str;
- }
- return rtrim(ltrim(str));
- }
-
- static bool is_ascii(StringVal str) {
- #ifdef __SSE2__
- size_t i = 0;
- __m128i binary_code = _mm_setzero_si128();
- if (str.len >= REGISTER_SIZE) {
- for (; i <= str.len - REGISTER_SIZE; i += REGISTER_SIZE) {
- __m128i chars = _mm_loadu_si128((const __m128i*)(str.ptr + i));
- binary_code = _mm_or_si128(binary_code, chars);
- }
- }
- int mask = _mm_movemask_epi8(binary_code);
-
- char or_code = 0;
- for (; i < str.len; i++) {
- or_code |= str.ptr[i];
- }
- mask |= (or_code & 0x80);
-
- return !mask;
- #else
- char or_code = 0;
- for (size_t i = 0; i < str.len; i++) {
- or_code |= str.ptr[i];
- }
- return !(or_code & 0x80);
- #endif
- }
-
- static void reverse(const StringVal& str, StringVal dst) {
- if (str.is_null) {
- dst.ptr = NULL;
- return;
- }
- const bool is_ascii = VStringFunctions::is_ascii(str);
- if (is_ascii) {
- int64_t begin = 0;
- int64_t end = str.len;
- int64_t result_end = dst.len;
- #if defined(__SSE2__)
- const auto shuffle_array =
_mm_set_epi64((__m64)0x00'01'02'03'04'05'06'07ull,
(__m64)0x08'09'0a'0b'0c'0d'0e'0full);
- for (; (begin + REGISTER_SIZE) < end; begin += REGISTER_SIZE) {
- result_end -= REGISTER_SIZE;
- _mm_storeu_si128((__m128i*)(dst.ptr + result_end),
-
_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(str.ptr + begin)), shuffle_array));
- }
- #endif
- for (; begin < end; ++begin) {
- --result_end;
- dst.ptr[result_end] = str.ptr[begin];
- }
- } else {
- for (size_t i = 0, char_size = 0; i < str.len; i += char_size) {
- char_size = get_utf8_byte_length((unsigned)(str.ptr)[i]);
- std::copy(str.ptr + i, str.ptr + i + char_size, dst.ptr +
str.len - i - char_size);
- }
- }
- }
-
- static size_t get_utf8_byte_length(unsigned char byte) {
- size_t char_size = 0;
- if (byte >= 0xFC) {
- char_size = 6;
- } else if (byte >= 0xF8) {
- char_size = 5;
- } else if (byte >= 0xF0) {
- char_size = 4;
- } else if (byte >= 0xE0) {
- char_size = 3;
- } else if (byte >= 0xC0) {
- char_size = 2;
- } else {
- char_size = 1;
- }
- return char_size;
- }
-
- static void hex_encode(const unsigned char* src_str, size_t length, char*
dst_str) {
- static constexpr auto hex_table = "0123456789ABCDEF";
- auto src_str_end = src_str + length;
-
-#if defined(__SSE2__)
- constexpr auto step = sizeof(uint64);
- if (src_str + step < src_str_end) {
- const auto hex_map = _mm_loadu_si128(reinterpret_cast<const
__m128i *>(hex_table));
- const auto mask_map = _mm_set1_epi8(0x0F);
-
- do {
- auto data = _mm_loadu_si64(src_str);
- auto hex_loc =
_mm_and_si128(_mm_unpacklo_epi8(_mm_srli_epi64(data, 4), data), mask_map);
- _mm_storeu_si128(reinterpret_cast<__m128i *>(dst_str),
_mm_shuffle_epi8(hex_map, hex_loc));
-
- src_str += step;
- dst_str += step * 2;
- } while (src_str + step < src_str_end);
- }
-#endif
- char res[2];
- // hex(str) str length is n, result must be 2 * n length
- for (; src_str < src_str_end; src_str += 1, dst_str += 2) {
- // low 4 bits
- *(res + 1) = hex_table[src_str[0] & 0x0F];
- // high 4 bits
- *res = hex_table[(src_str[0] >> 4)];
- std::copy(res, res + 2, dst_str);
- }
- }
-};
-}
-
-#endif //BE_V_STRING_FUNCTIONS_H
\ No newline at end of file
diff --git a/be/src/vec/functions/function_string.cpp
b/be/src/vec/functions/function_string.cpp
index bdeb7e5..a89edf5 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -23,9 +23,9 @@
#include <cstdlib>
#include <string_view>
-#include "exprs/v_string_functions.h"
#include "runtime/string_search.hpp"
#include "util/encryption_util.h"
+#include "util/simd/vstring_function.h"
#include "util/url_coding.h"
#include "vec/common/pod_array_fwd.h"
#include "vec/functions/function_string_to_string.h"
@@ -258,8 +258,8 @@ struct ReverseImpl {
auto src_str = reinterpret_cast<const char*>(&data[offsets[i -
1]]);
int64_t src_len = offsets[i] - offsets[i - 1] - 1;
char dst[src_len];
- VStringFunctions::reverse(StringVal((uint8_t*)src_str, src_len),
- StringVal((uint8_t*)dst, src_len));
+ simd::VStringFunctions::reverse(StringVal((uint8_t*)src_str,
src_len),
+ StringVal((uint8_t*)dst, src_len));
StringOP::push_value_string(std::string_view(dst, src_len), i,
res_data, res_offsets);
}
return Status::OK();
@@ -271,9 +271,7 @@ struct HexStringName {
};
struct HexStringImpl {
- static DataTypes get_variadic_argument_types() {
- return {std::make_shared<DataTypeString>()};
- }
+ static DataTypes get_variadic_argument_types() { return
{std::make_shared<DataTypeString>()}; }
static Status vector(const ColumnString::Chars& data, const
ColumnString::Offsets& offsets,
ColumnString::Chars& dst_data, ColumnString::Offsets&
dst_offsets) {
@@ -293,7 +291,8 @@ struct HexStringImpl {
dst_data_ptr++;
offset++;
} else {
- VStringFunctions::hex_encode(source, srclen,
reinterpret_cast<char*>(dst_data_ptr));
+ simd::VStringFunctions::hex_encode(source, srclen,
+
reinterpret_cast<char*>(dst_data_ptr));
dst_data_ptr[srclen * 2] = '\0';
dst_data_ptr += (srclen * 2 + 1);
offset += (srclen * 2 + 1);
@@ -355,10 +354,10 @@ struct TrimImpl {
const char* raw_str = reinterpret_cast<const
char*>(&data[offsets[i - 1]]);
StringVal str(raw_str);
if constexpr (is_ltrim) {
- str = VStringFunctions::ltrim(str);
+ str = simd::VStringFunctions::ltrim(str);
}
if constexpr (is_rtrim) {
- str = VStringFunctions::rtrim(str);
+ str = simd::VStringFunctions::rtrim(str);
}
StringOP::push_value_string(std::string_view((char*)str.ptr,
str.len), i, res_data,
res_offsets);
diff --git a/be/test/exprs/string_functions_test.cpp
b/be/test/exprs/string_functions_test.cpp
index 035e6a1..acae0e8 100644
--- a/be/test/exprs/string_functions_test.cpp
+++ b/be/test/exprs/string_functions_test.cpp
@@ -16,17 +16,18 @@
// under the License.
#include "exprs/string_functions.h"
-#include "exprs/v_string_functions.h"
+#include <fmt/os.h>
#include <gtest/gtest.h>
+
#include <iostream>
#include <string>
-#include <fmt/os.h>
#include "exprs/anyval_util.h"
#include "test_util/test_util.h"
#include "testutil/function_utils.h"
#include "util/logging.h"
+#include "util/simd/vstring_function.h"
namespace doris {
@@ -47,8 +48,7 @@ private:
TEST_F(StringFunctionsTest, do_money_format_for_bigint_bench) {
doris_udf::FunctionContext* context = new doris_udf::FunctionContext();
- StringVal expected =
- AnyValUtil::from_string(ctx,
std::string("9,223,372,036,854,775,807.00"));
+ StringVal expected = AnyValUtil::from_string(ctx,
std::string("9,223,372,036,854,775,807.00"));
BigIntVal bigIntVal(9223372036854775807);
for (int i = 0; i < LOOP_LESS_OR_MORE(10, 10000000); i++) {
StringVal result = StringFunctions::money_format(context, bigIntVal);
@@ -98,7 +98,7 @@ TEST_F(StringFunctionsTest, money_format_large_int) {
value = MIN_INT128;
result = StringFunctions::money_format(context,
doris_udf::LargeIntVal(value));
expected = AnyValUtil::from_string_temp(
- context,
std::string("-170,141,183,460,469,231,731,687,303,715,884,105,728.00"));
+ context,
std::string("-170,141,183,460,469,231,731,687,303,715,884,105,728.00"));
ASSERT_EQ(expected, result);
delete context;
}
@@ -682,80 +682,82 @@ TEST_F(StringFunctionsTest, upper) {
TEST_F(StringFunctionsTest, ltrim) {
// no blank
StringVal src("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
- StringVal res = VStringFunctions::ltrim(src);
+ StringVal res = simd::VStringFunctions::ltrim(src);
ASSERT_EQ(src, res);
// empty string
StringVal src1("");
- res = VStringFunctions::ltrim(src1);
+ res = simd::VStringFunctions::ltrim(src1);
ASSERT_EQ(src1, res);
// null string
StringVal src2(StringVal::null());
- res = VStringFunctions::ltrim(src2);
+ res = simd::VStringFunctions::ltrim(src2);
ASSERT_EQ(src2, res);
// less than 16 blanks
StringVal src3(" hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
- res = VStringFunctions::ltrim(src3);
+ res = simd::VStringFunctions::ltrim(src3);
ASSERT_EQ(src, res);
// more than 16 blanks
StringVal src4(" hello
worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
- res = VStringFunctions::ltrim(src4);
+ res = simd::VStringFunctions::ltrim(src4);
ASSERT_EQ(src, res);
// all are blanks, less than 16 blanks
StringVal src5(" ");
- res = VStringFunctions::ltrim(src5);
+ res = simd::VStringFunctions::ltrim(src5);
ASSERT_EQ(StringVal(""), res);
// all are blanks, more than 16 blanks
StringVal src6(" ");
- res = VStringFunctions::ltrim(src6);
+ res = simd::VStringFunctions::ltrim(src6);
ASSERT_EQ(StringVal(""), res);
// src less than 16 length
StringVal src7(" 12345678910");
- res = VStringFunctions::ltrim(src7);
+ res = simd::VStringFunctions::ltrim(src7);
ASSERT_EQ(StringVal("12345678910"), res);
}
TEST_F(StringFunctionsTest, rtrim) {
// no blank
StringVal src("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa");
- StringVal res = VStringFunctions::rtrim(src);
+ StringVal res = simd::VStringFunctions::rtrim(src);
ASSERT_EQ(src, res);
// empty string
StringVal src1("");
- res = VStringFunctions::rtrim(src1);
+ res = simd::VStringFunctions::rtrim(src1);
ASSERT_EQ(src1, res);
// null string
StringVal src2(StringVal::null());
- res = VStringFunctions::rtrim(src2);
+ res = simd::VStringFunctions::rtrim(src2);
ASSERT_EQ(src2, res);
// less than 16 blanks
StringVal src3("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa ");
- res = VStringFunctions::rtrim(src3);
+ res = simd::VStringFunctions::rtrim(src3);
ASSERT_EQ(src, res);
// more than 16 blanks
StringVal src4("hello worldaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
");
- res = VStringFunctions::rtrim(src4);
+ res = simd::VStringFunctions::rtrim(src4);
ASSERT_EQ(src, res);
// all are blanks, less than 16 blanks
StringVal src5(" ");
- res = VStringFunctions::rtrim(src5);
+ res = simd::VStringFunctions::rtrim(src5);
ASSERT_EQ(StringVal(""), res);
// all are blanks, more than 16 blanks
StringVal src6(" ");
- res = VStringFunctions::rtrim(src6);
+ res = simd::VStringFunctions::rtrim(src6);
ASSERT_EQ(StringVal(""), res);
// src less than 16 length
StringVal src7("12345678910 ");
- res = VStringFunctions::rtrim(src7);
+ res = simd::VStringFunctions::rtrim(src7);
ASSERT_EQ(StringVal("12345678910"), res);
}
TEST_F(StringFunctionsTest, is_ascii) {
- ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal("hello123")));
- ASSERT_EQ(true,
VStringFunctions::is_ascii(StringVal("hello123fwrewerwerwerwrsfqrwerwefwfwrwfsfwe")));
- ASSERT_EQ(false, VStringFunctions::is_ascii(StringVal("运维组123")));
- ASSERT_EQ(false,
VStringFunctions::is_ascii(StringVal("hello123运维组fwrewerwerwerwrsfqrwerwefwfwrwfsfwe")));
- ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal::null()));
- ASSERT_EQ(true, VStringFunctions::is_ascii(StringVal("")));
+ ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal("hello123")));
+ ASSERT_EQ(true, simd::VStringFunctions::is_ascii(
+
StringVal("hello123fwrewerwerwerwrsfqrwerwefwfwrwfsfwe")));
+ ASSERT_EQ(false, simd::VStringFunctions::is_ascii(StringVal("运维组123")));
+ ASSERT_EQ(false, simd::VStringFunctions::is_ascii(
+
StringVal("hello123运维组fwrewerwerwerwrsfqrwerwefwfwrwfsfwe")));
+ ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal::null()));
+ ASSERT_EQ(true, simd::VStringFunctions::is_ascii(StringVal("")));
}
} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]