This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 090a3423401 [Improvement](function) optimization for substr with ascii 
string (#29799)
090a3423401 is described below

commit 090a34234012eeb751cf84d71818e6139f94e356
Author: Pxl <[email protected]>
AuthorDate: Thu Jan 11 11:31:35 2024 +0800

    [Improvement](function) optimization for substr with ascii string (#29799)
---
 be/src/util/simd/vstring_function.h    |  57 ++++++++-
 be/src/vec/common/pod_array.h          |   8 ++
 be/src/vec/functions/function_string.h | 226 ++++++++++++++++-----------------
 3 files changed, 173 insertions(+), 118 deletions(-)

diff --git a/be/src/util/simd/vstring_function.h 
b/be/src/util/simd/vstring_function.h
index 306a18a6d66..dac964b1b94 100644
--- a/be/src/util/simd/vstring_function.h
+++ b/be/src/util/simd/vstring_function.h
@@ -44,6 +44,54 @@ inline uint8_t get_utf8_byte_length(uint8_t character) {
     return UTF8_BYTE_LENGTH[character];
 }
 
+// copy from 
https://github.com/lemire/fastvalidate-utf-8/blob/master/include/simdasciicheck.h
+// The function returns true (1) if all chars passed in src are
+// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
+inline bool validate_ascii_fast(const char* src, size_t len) {
+    size_t i = 0;
+    __m128i has_error = _mm_setzero_si128();
+    if (len >= 16) {
+        for (; i <= len - 16; i += 16) {
+            __m128i current_bytes = _mm_loadu_si128((const __m128i*)(src + i));
+            has_error = _mm_or_si128(has_error, current_bytes);
+        }
+    }
+    int error_mask = _mm_movemask_epi8(has_error);
+
+    char tail_has_error = 0;
+    for (; i < len; i++) {
+        tail_has_error |= src[i];
+    }
+    error_mask |= (tail_has_error & 0x80);
+
+    return !error_mask;
+}
+
+#ifdef __AVX2__
+#include <x86intrin.h>
+// The function returns true (1) if all chars passed in src are
+// 7-bit values (0x00..0x7F). Otherwise, it returns false (0).
+inline bool validate_ascii_fast_avx(const char* src, size_t len) {
+    size_t i = 0;
+    __m256i has_error = _mm256_setzero_si256();
+    if (len >= 32) {
+        for (; i <= len - 32; i += 32) {
+            __m256i current_bytes = _mm256_loadu_si256((const __m256i*)(src + 
i));
+            has_error = _mm256_or_si256(has_error, current_bytes);
+        }
+    }
+    int error_mask = _mm256_movemask_epi8(has_error);
+
+    char tail_has_error = 0;
+    for (; i < len; i++) {
+        tail_has_error |= src[i];
+    }
+    error_mask |= (tail_has_error & 0x80);
+
+    return !error_mask;
+}
+#endif
+
 namespace simd {
 
 class VStringFunctions {
@@ -219,11 +267,10 @@ public:
 
     // Gcc will do auto simd in this function
     static bool is_ascii(const StringRef& str) {
-        char or_code = 0;
-        for (size_t i = 0; i < str.size; i++) {
-            or_code |= str.data[i];
-        }
-        return !(or_code & 0x80);
+#ifdef __AVX2__
+        return validate_ascii_fast_avx(str.data, str.size);
+#endif
+        return validate_ascii_fast(str.data, str.size);
     }
 
     static void reverse(const StringRef& str, StringRef dst) {
diff --git a/be/src/vec/common/pod_array.h b/be/src/vec/common/pod_array.h
index 91642aebd47..0d7cde6503d 100644
--- a/be/src/vec/common/pod_array.h
+++ b/be/src/vec/common/pod_array.h
@@ -480,6 +480,14 @@ public:
         this->c_end += bytes_to_copy;
     }
 
+    template <typename It1, typename It2>
+    void insert_assume_reserved_and_allow_overflow(It1 from_begin, It2 
from_end) {
+        size_t bytes_to_copy = this->byte_size(from_end - from_begin);
+        memcpy_small_allow_read_write_overflow15(
+                this->c_end, reinterpret_cast<const void*>(&*from_begin), 
bytes_to_copy);
+        this->c_end += bytes_to_copy;
+    }
+
     void swap(PODArray& rhs) {
 #ifndef NDEBUG
         this->unprotect();
diff --git a/be/src/vec/functions/function_string.h 
b/be/src/vec/functions/function_string.h
index 2b1d5891a66..4363d040cd5 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -17,7 +17,6 @@
 
 #pragma once
 
-#include <glog/logging.h>
 #include <limits.h>
 #include <stdlib.h>
 #include <string.h>
@@ -52,6 +51,7 @@
 #include "vec/common/memcpy_small.h"
 #include "vec/common/pod_array.h"
 #include "vec/common/pod_array_fwd.h"
+#include "vec/common/string_utils/string_utils.h"
 #include "vec/common/typeid_cast.h"
 #include "vec/core/block.h"
 #include "vec/core/column_numbers.h"
@@ -120,6 +120,14 @@ struct StringOP {
         chars.insert(string_value.data(), string_value.data() + 
string_value.size());
         offsets[index] = chars.size();
     }
+
+    static void push_value_string_reserved_and_allow_overflow(const 
std::string_view& string_value,
+                                                              int index, 
ColumnString::Chars& chars,
+                                                              
ColumnString::Offsets& offsets) {
+        chars.insert_assume_reserved_and_allow_overflow(string_value.data(),
+                                                        string_value.data() + 
string_value.size());
+        offsets[index] = chars.size();
+    }
 };
 
 struct SubstringUtil {
@@ -147,30 +155,37 @@ struct SubstringUtil {
             check_set_nullable(argument_columns[i], null_map, col_const[i]);
         }
 
-        auto specific_str_column = assert_cast<const 
ColumnString*>(argument_columns[0].get());
-        auto specific_start_column =
+        const auto* specific_str_column =
+                assert_cast<const ColumnString*>(argument_columns[0].get());
+        const auto* specific_start_column =
                 assert_cast<const 
ColumnVector<Int32>*>(argument_columns[1].get());
-        auto specific_len_column =
+        const auto* specific_len_column =
                 assert_cast<const 
ColumnVector<Int32>*>(argument_columns[2].get());
-        if (col_const[1] && col_const[2]) {
-            vectors<true>(specific_str_column->get_chars(), 
specific_str_column->get_offsets(),
-                          specific_start_column->get_data(), 
specific_len_column->get_data(),
-                          null_map->get_data(), res->get_chars(), 
res->get_offsets());
-        } else {
-            vectors<false>(specific_str_column->get_chars(), 
specific_str_column->get_offsets(),
-                           specific_start_column->get_data(), 
specific_len_column->get_data(),
-                           null_map->get_data(), res->get_chars(), 
res->get_offsets());
+
+        auto vectors = vectors_utf8<false>;
+        bool is_ascii = simd::VStringFunctions::is_ascii(
+                {specific_str_column->get_chars().data(), 
specific_str_column->get_chars().size()});
+        if (col_const[1] && col_const[2] && is_ascii) {
+            vectors = vectors_ascii<true>;
+        } else if (col_const[1] && col_const[2]) {
+            vectors = vectors_utf8<true>;
+        } else if (is_ascii) {
+            vectors = vectors_ascii<false>;
         }
+        vectors(specific_str_column->get_chars(), 
specific_str_column->get_offsets(),
+                specific_start_column->get_data(), 
specific_len_column->get_data(),
+                null_map->get_data(), res->get_chars(), res->get_offsets());
+
         block.get_by_position(result).column =
                 ColumnNullable::create(std::move(res), std::move(null_map));
     }
 
 private:
-    template <bool Const>
-    static void vectors(const ColumnString::Chars& chars, const 
ColumnString::Offsets& offsets,
-                        const PaddedPODArray<Int32>& start, const 
PaddedPODArray<Int32>& len,
-                        NullMap& null_map, ColumnString::Chars& res_chars,
-                        ColumnString::Offsets& res_offsets) {
+    template <bool is_const>
+    static void vectors_utf8(const ColumnString::Chars& chars, const 
ColumnString::Offsets& offsets,
+                             const PaddedPODArray<Int32>& start, const 
PaddedPODArray<Int32>& len,
+                             NullMap& null_map, ColumnString::Chars& res_chars,
+                             ColumnString::Offsets& res_offsets) {
         size_t size = offsets.size();
         res_offsets.resize(size);
         res_chars.reserve(chars.size());
@@ -179,119 +194,104 @@ private:
         PMR::monotonic_buffer_resource pool {buf.data(), buf.size()};
         PMR::vector<size_t> index {&pool};
 
-        auto* __restrict data_ptr = chars.data();
-        auto* __restrict offset_ptr = offsets.data();
-
-        if constexpr (Const) {
-            const auto start_value = start[0];
-            const auto len_value = len[0];
-            if (start_value == 0 || len_value <= 0) {
+        if constexpr (is_const) {
+            if (start[0] == 0 || len[0] <= 0) {
                 for (size_t i = 0; i < size; ++i) {
                     StringOP::push_empty_string(i, res_chars, res_offsets);
                 }
-            } else {
-                for (size_t i = 0; i < size; ++i) {
-                    const int str_size = offset_ptr[i] - offset_ptr[i - 1];
-                    const uint8_t* raw_str = data_ptr + offset_ptr[i - 1];
-                    // return empty string if start > src.length
-                    if (start_value > str_size || start_value < -str_size || 
str_size == 0) {
-                        StringOP::push_empty_string(i, res_chars, res_offsets);
-                        continue;
-                    }
-                    // reference to string_function.cpp: substring
-                    size_t byte_pos = 0;
-                    index.clear();
-                    for (size_t j = 0, char_size = 0;
-                         j < str_size &&
-                         (start_value <= 0 || index.size() <= start_value + 
len_value);
-                         j += char_size) {
-                        char_size = UTF8_BYTE_LENGTH[(unsigned 
char)(raw_str)[j]];
-                        index.push_back(j);
-                    }
+                return;
+            }
+        }
 
-                    int fixed_pos = start_value;
-                    if (fixed_pos < 0) {
-                        fixed_pos = str_size + fixed_pos + 1;
-                    } else if (fixed_pos > index.size()) {
-                        StringOP::push_null_string(i, res_chars, res_offsets, 
null_map);
-                        continue;
-                    }
+        for (size_t i = 0; i < size; ++i) {
+            int str_size = offsets[i] - offsets[i - 1];
+            const char* str_data = (char*)chars.data() + offsets[i - 1];
+            int start_value = is_const ? start[0] : start[i];
+            int len_value = is_const ? len[0] : len[i];
 
-                    byte_pos = index[fixed_pos - 1];
-                    int fixed_len = str_size - byte_pos;
-                    if (fixed_pos + len_value <= index.size()) {
-                        fixed_len = index[fixed_pos + len_value - 1] - 
byte_pos;
-                    }
+            // return empty string if start > src.length
+            if (start_value > str_size || str_size == 0 || start_value == 0 || 
len_value <= 0) {
+                StringOP::push_empty_string(i, res_chars, res_offsets);
+                continue;
+            }
 
-                    if (byte_pos <= str_size && fixed_len > 0) {
-                        // return StringRef(str.data + byte_pos, fixed_len);
-                        StringOP::push_value_string(
-                                std::string_view {reinterpret_cast<const 
char*>(raw_str + byte_pos),
-                                                  (size_t)fixed_len},
-                                i, res_chars, res_offsets);
-                    } else {
-                        StringOP::push_empty_string(i, res_chars, res_offsets);
-                    }
+            size_t byte_pos = 0;
+            index.clear();
+            for (size_t j = 0, char_size = 0; j < str_size; j += char_size) {
+                char_size = get_utf8_byte_length(str_data[j]);
+                index.push_back(j);
+                if (start_value > 0 && index.size() > start_value + len_value) 
{
+                    break;
                 }
             }
-        } else {
-            PMR::vector<std::pair<const unsigned char*, int>> strs(&pool);
-            strs.resize(size);
-            for (int i = 0; i < size; ++i) {
-                strs[i].first = data_ptr + offset_ptr[i - 1];
-                strs[i].second = offset_ptr[i] - offset_ptr[i - 1];
+
+            int fixed_pos = start_value;
+            if (fixed_pos < -(int)index.size()) {
+                StringOP::push_empty_string(i, res_chars, res_offsets);
+                continue;
+            }
+            if (fixed_pos < 0) {
+                fixed_pos = index.size() + fixed_pos + 1;
+            }
+            if (fixed_pos > index.size()) {
+                StringOP::push_null_string(i, res_chars, res_offsets, 
null_map);
+                continue;
             }
 
-            for (size_t i = 0; i < size; ++i) {
-                auto [raw_str, str_size] = strs[i];
-                const auto& start_value = start[i];
-                const auto& len_value = len[i];
+            byte_pos = index[fixed_pos - 1];
+            size_t fixed_len = str_size - byte_pos;
+            if (fixed_pos + len_value <= index.size()) {
+                fixed_len = index[fixed_pos + len_value - 1] - byte_pos;
+            }
 
-                // return empty string if start > src.length
-                if (start_value > str_size || str_size == 0 || start_value == 
0 || len_value <= 0) {
-                    StringOP::push_empty_string(i, res_chars, res_offsets);
-                    continue;
-                }
-                // reference to string_function.cpp: substring
-                size_t byte_pos = 0;
-                index.clear();
-                for (size_t j = 0, char_size = 0; j < str_size; j += 
char_size) {
-                    char_size = UTF8_BYTE_LENGTH[(unsigned char)(raw_str)[j]];
-                    index.push_back(j);
-                    if (start_value > 0 && index.size() > start_value + 
len_value) {
-                        break;
-                    }
-                }
+            if (byte_pos <= str_size && fixed_len > 0) {
+                StringOP::push_value_string_reserved_and_allow_overflow(
+                        {str_data + byte_pos, fixed_len}, i, res_chars, 
res_offsets);
+            } else {
+                StringOP::push_empty_string(i, res_chars, res_offsets);
+            }
+        }
+    }
 
-                int fixed_pos = start_value;
-                if (fixed_pos < -(int)index.size()) {
+    template <bool is_const>
+    static void vectors_ascii(const ColumnString::Chars& chars,
+                              const ColumnString::Offsets& offsets,
+                              const PaddedPODArray<Int32>& start, const 
PaddedPODArray<Int32>& len,
+                              NullMap& null_map, ColumnString::Chars& 
res_chars,
+                              ColumnString::Offsets& res_offsets) {
+        size_t size = offsets.size();
+        res_offsets.resize(size);
+
+        if constexpr (is_const) {
+            if (start[0] == 0 || len[0] <= 0) {
+                for (size_t i = 0; i < size; ++i) {
                     StringOP::push_empty_string(i, res_chars, res_offsets);
-                    continue;
-                }
-                if (fixed_pos < 0) {
-                    fixed_pos = index.size() + fixed_pos + 1;
-                }
-                if (fixed_pos > index.size()) {
-                    StringOP::push_null_string(i, res_chars, res_offsets, 
null_map);
-                    continue;
                 }
+                return;
+            }
+            res_chars.reserve(std::min(chars.size(), len[0] * size));
+        } else {
+            res_chars.reserve(chars.size());
+        }
 
-                byte_pos = index[fixed_pos - 1];
-                int fixed_len = str_size - byte_pos;
-                if (fixed_pos + len_value <= index.size()) {
-                    fixed_len = index[fixed_pos + len_value - 1] - byte_pos;
-                }
+        for (size_t i = 0; i < size; ++i) {
+            int str_size = offsets[i] - offsets[i - 1];
+            const char* str_data = (char*)chars.data() + offsets[i - 1];
 
-                if (byte_pos <= str_size && fixed_len > 0) {
-                    // return StringRef(str.data + byte_pos, fixed_len);
-                    StringOP::push_value_string(
-                            std::string_view {reinterpret_cast<const 
char*>(raw_str + byte_pos),
-                                              (size_t)fixed_len},
-                            i, res_chars, res_offsets);
-                } else {
-                    StringOP::push_empty_string(i, res_chars, res_offsets);
-                }
+            int start_value = is_const ? start[0] : start[i];
+            int len_value = is_const ? len[0] : len[i];
+
+            if (start_value > str_size || start_value < -str_size || str_size 
== 0) {
+                StringOP::push_empty_string(i, res_chars, res_offsets);
+                continue;
+            }
+            int fixed_pos = start_value - 1;
+            if (fixed_pos < 0) {
+                fixed_pos = str_size + fixed_pos + 1;
             }
+            size_t fixed_len = std::min(str_size - fixed_pos, len_value);
+            StringOP::push_value_string_reserved_and_allow_overflow(
+                    {str_data + fixed_pos, fixed_len}, i, res_chars, 
res_offsets);
         }
     }
 };


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to