This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 4e9d5a7f7a optimize substr performance and fix ASAN global buffer
overflow (#10442)
4e9d5a7f7a is described below
commit 4e9d5a7f7a955973262a5c59f11f9119d989acce
Author: Kang <[email protected]>
AuthorDate: Tue Jul 12 08:36:21 2022 +0800
optimize substr performance and fix ASAN global buffer overflow (#10442)
* add volnitsky substr algorithm
* replace std::search with volnitsky search algorithm in StringSearch
* optimize substring for constant_substring_fn case
use long run length search for performance
---
be/src/exprs/like_predicate.h | 2 +-
be/src/runtime/string_search.hpp | 29 ++-
be/src/vec/common/string_searcher.h | 357 +++++++++++++++++++++++++++
be/src/vec/common/volnitsky.h | 475 ++++++++++++++++++++++++++++++++++++
be/src/vec/functions/like.cpp | 39 ++-
be/src/vec/functions/like.h | 2 +-
6 files changed, 894 insertions(+), 10 deletions(-)
diff --git a/be/src/exprs/like_predicate.h b/be/src/exprs/like_predicate.h
index c530e32f6f..7f0822ef07 100644
--- a/be/src/exprs/like_predicate.h
+++ b/be/src/exprs/like_predicate.h
@@ -73,7 +73,7 @@ private:
void set_search_string(const std::string& search_string_arg) {
search_string = search_string_arg;
search_string_sv = StringValue(search_string);
- substring_pattern = StringSearch(&search_string_sv);
+ substring_pattern.set_pattern(&search_string_sv);
}
};
diff --git a/be/src/runtime/string_search.hpp b/be/src/runtime/string_search.hpp
index 3f657b6fa4..463719f279 100644
--- a/be/src/runtime/string_search.hpp
+++ b/be/src/runtime/string_search.hpp
@@ -23,6 +23,7 @@
#include "common/logging.h"
#include "runtime/string_value.h"
+#include "vec/common/volnitsky.h"
namespace doris {
@@ -31,18 +32,18 @@ public:
virtual ~StringSearch() {}
StringSearch() : _pattern(nullptr) {}
- StringSearch(const StringValue* pattern) : _pattern(pattern) {}
+ StringSearch(const StringValue* pattern) { set_pattern(pattern); }
+
+ void set_pattern(const StringValue* pattern) {
+ _pattern = pattern;
+ _vol_searcher.reset(new Volnitsky(pattern->ptr, pattern->len));
+ }
// search for this pattern in str.
// Returns the offset into str if the pattern exists
// Returns -1 if the pattern is not found
int search(const StringValue* str) const {
- if (!str || !_pattern || _pattern->len == 0) {
- return -1;
- }
-
- auto it = std::search(str->ptr, str->ptr + str->len,
- std::default_searcher(_pattern->ptr,
_pattern->ptr + _pattern->len));
+ auto it = search(str->ptr, str->len);
if (it == str->ptr + str->len) {
return -1;
} else {
@@ -50,8 +51,22 @@ public:
}
}
+ // search for this pattern in str.
+ // Returns the offset into str if the pattern exists
+ // Returns str+len if the pattern is not found
+ const char* search(char* str, size_t len) const {
+ if (!str || !_pattern || _pattern->len == 0) {
+ return str + len;
+ }
+
+ return _vol_searcher->search(str, len);
+ }
+
+ inline size_t get_pattern_length() { return _pattern ? _pattern->len : 0; }
+
private:
const StringValue* _pattern;
+ std::unique_ptr<Volnitsky> _vol_searcher;
};
} // namespace doris
diff --git a/be/src/vec/common/string_searcher.h
b/be/src/vec/common/string_searcher.h
new file mode 100644
index 0000000000..895ba538a9
--- /dev/null
+++ b/be/src/vec/common/string_searcher.h
@@ -0,0 +1,357 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+//
https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/StringSearcher.h
+// and modified by Doris
+
+#pragma once
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+#include "vec/common/string_utils/string_utils.h"
+
+#ifdef __SSE2__
+#include <emmintrin.h>
+#endif
+
+#ifdef __SSE4_1__
+#include <smmintrin.h>
+#endif
+
+namespace doris {
+
+// namespace ErrorCodes
+// {
+// extern const int BAD_ARGUMENTS;
+// }
+
+/** Variants for searching a substring in a string.
+ * In most cases, performance is less than Volnitsky (see Volnitsky.h).
+ */
+
+class StringSearcherBase {
+public:
+ bool force_fallback = false;
+#ifdef __SSE2__
+protected:
+ static constexpr auto n = sizeof(__m128i);
+ const int page_size = sysconf(_SC_PAGESIZE); //::getPageSize();
+
+ bool page_safe(const void* const ptr) const {
+ return ((page_size - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <=
page_size - n;
+ }
+#endif
+};
+
+/// Performs case-sensitive and case-insensitive search of UTF-8 strings
+template <bool CaseSensitive, bool ASCII>
+class StringSearcher;
+
+/// Case-sensitive searcher (both ASCII and UTF-8)
+template <bool ASCII>
+class StringSearcher<true, ASCII> : public StringSearcherBase {
+private:
+ /// string to be searched for
+ const uint8_t* const needle;
+ const uint8_t* const needle_end;
+ /// first character in `needle`
+ uint8_t first {};
+
+#ifdef __SSE4_1__
+ /// vector filled `first` for determining leftmost position of the first
symbol
+ __m128i pattern;
+ /// vector of first 16 characters of `needle`
+ __m128i cache = _mm_setzero_si128();
+ int cachemask {};
+#endif
+
+public:
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ StringSearcher(const CharT* needle_, const size_t needle_size)
+ : needle {reinterpret_cast<const uint8_t*>(needle_)},
+ needle_end {needle + needle_size} {
+ if (0 == needle_size) return;
+
+ first = *needle;
+
+#ifdef __SSE4_1__
+ pattern = _mm_set1_epi8(first);
+
+ const auto* needle_pos = needle;
+
+ //for (const auto i : collections::range(0, n))
+ for (size_t i = 0; i < n; i++) {
+ cache = _mm_srli_si128(cache, 1);
+
+ if (needle_pos != needle_end) {
+ cache = _mm_insert_epi8(cache, *needle_pos, n - 1);
+ cachemask |= 1 << i;
+ ++needle_pos;
+ }
+ }
+#endif
+ }
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ ALWAYS_INLINE bool compare(const CharT* /*haystack*/, const CharT*
/*haystack_end*/,
+ const CharT* pos) const {
+#ifdef __SSE4_1__
+ if (needle_end - needle > n && page_safe(pos)) {
+ const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const
__m128i*>(pos));
+ const auto v_against_cache = _mm_cmpeq_epi8(v_haystack, cache);
+ const auto mask = _mm_movemask_epi8(v_against_cache);
+
+ if (0xffff == cachemask) {
+ if (mask == cachemask) {
+ pos += n;
+ const auto* needle_pos = needle + n;
+
+ while (needle_pos < needle_end && *pos == *needle_pos)
++pos, ++needle_pos;
+
+ if (needle_pos == needle_end) return true;
+ }
+ } else if ((mask & cachemask) == cachemask)
+ return true;
+
+ return false;
+ }
+#endif
+
+ if (*pos == first) {
+ ++pos;
+ const auto* needle_pos = needle + 1;
+
+ while (needle_pos < needle_end && *pos == *needle_pos) ++pos,
++needle_pos;
+
+ if (needle_pos == needle_end) return true;
+ }
+
+ return false;
+ }
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ const CharT* search(const CharT* haystack, const CharT* const
haystack_end) const {
+ if (needle == needle_end) return haystack;
+
+ while (haystack < haystack_end) {
+#ifdef __SSE4_1__
+ if (haystack + n <= haystack_end && page_safe(haystack)) {
+ /// find first character
+ const auto v_haystack = _mm_loadu_si128(reinterpret_cast<const
__m128i*>(haystack));
+ const auto v_against_pattern = _mm_cmpeq_epi8(v_haystack,
pattern);
+
+ const auto mask = _mm_movemask_epi8(v_against_pattern);
+
+ /// first character not present in 16 octets starting at
`haystack`
+ if (mask == 0) {
+ haystack += n;
+ continue;
+ }
+
+ const auto offset = __builtin_ctz(mask);
+ haystack += offset;
+
+ if (haystack + n <= haystack_end && page_safe(haystack)) {
+ /// check for first 16 octets
+ const auto v_haystack_offset =
+ _mm_loadu_si128(reinterpret_cast<const
__m128i*>(haystack));
+ const auto v_against_cache =
_mm_cmpeq_epi8(v_haystack_offset, cache);
+ const auto mask_offset =
_mm_movemask_epi8(v_against_cache);
+
+ if (0xffff == cachemask) {
+ if (mask_offset == cachemask) {
+ const auto* haystack_pos = haystack + n;
+ const auto* needle_pos = needle + n;
+
+ while (haystack_pos < haystack_end && needle_pos <
needle_end &&
+ *haystack_pos == *needle_pos)
+ ++haystack_pos, ++needle_pos;
+
+ if (needle_pos == needle_end) return haystack;
+ }
+ } else if ((mask_offset & cachemask) == cachemask)
+ return haystack;
+
+ ++haystack;
+ continue;
+ }
+ }
+#endif
+
+ if (haystack == haystack_end) return haystack_end;
+
+ if (*haystack == first) {
+ const auto* haystack_pos = haystack + 1;
+ const auto* needle_pos = needle + 1;
+
+ while (haystack_pos < haystack_end && needle_pos < needle_end
&&
+ *haystack_pos == *needle_pos)
+ ++haystack_pos, ++needle_pos;
+
+ if (needle_pos == needle_end) return haystack;
+ }
+
+ ++haystack;
+ }
+
+ return haystack_end;
+ }
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ const CharT* search(const CharT* haystack, const size_t haystack_size)
const {
+ return search(haystack, haystack + haystack_size);
+ }
+};
+
+// Searches for needle surrounded by token-separators.
+// Separators are anything inside ASCII (0-128) and not alphanum.
+// Any value outside of basic ASCII (>=128) is considered a non-separator
symbol, hence UTF-8 strings
+// should work just fine. But any Unicode whitespace is not considered a token
separtor.
+template <typename StringSearcher>
+class TokenSearcher : public StringSearcherBase {
+ StringSearcher searcher;
+ size_t needle_size;
+
+public:
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ TokenSearcher(const CharT* needle_, const size_t needle_size_)
+ : searcher {needle_, needle_size_}, needle_size(needle_size_) {
+ if (std::any_of(needle_, needle_ + needle_size_, isTokenSeparator)) {
+ //throw Exception{"Needle must not contain whitespace or separator
characters", ErrorCodes::BAD_ARGUMENTS};
+ }
+ }
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ ALWAYS_INLINE bool compare(const CharT* haystack, const CharT*
haystack_end,
+ const CharT* pos) const {
+ // use searcher only if pos is in the beginning of token and pos +
searcher.needle_size is end of token.
+ if (isToken(haystack, haystack_end, pos))
+ return searcher.compare(haystack, haystack_end, pos);
+
+ return false;
+ }
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ const CharT* search(const CharT* haystack, const CharT* const
haystack_end) const {
+ // use searcher.search(), then verify that returned value is a token
+ // if it is not, skip it and re-run
+
+ const auto* pos = haystack;
+ while (pos < haystack_end) {
+ pos = searcher.search(pos, haystack_end);
+ if (pos == haystack_end || isToken(haystack, haystack_end, pos))
return pos;
+
+ // assuming that heendle does not contain any token separators.
+ pos += needle_size;
+ }
+ return haystack_end;
+ }
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ const CharT* search(const CharT* haystack, const size_t haystack_size)
const {
+ return search(haystack, haystack + haystack_size);
+ }
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ ALWAYS_INLINE bool isToken(const CharT* haystack, const CharT* const
haystack_end,
+ const CharT* p) const {
+ return (p == haystack || isTokenSeparator(*(p - 1))) &&
+ (p + needle_size >= haystack_end || isTokenSeparator(*(p +
needle_size)));
+ }
+
+ ALWAYS_INLINE static bool isTokenSeparator(const uint8_t c) {
+ return !(is_alpha_numeric_ascii(c) || !is_ascii(c));
+ }
+};
+
+using ASCIICaseSensitiveStringSearcher = StringSearcher<true, true>;
+// using ASCIICaseInsensitiveStringSearcher = StringSearcher<false, true>;
+using UTF8CaseSensitiveStringSearcher = StringSearcher<true, false>;
+// using UTF8CaseInsensitiveStringSearcher = StringSearcher<false, false>;
+using ASCIICaseSensitiveTokenSearcher =
TokenSearcher<ASCIICaseSensitiveStringSearcher>;
+// using ASCIICaseInsensitiveTokenSearcher =
TokenSearcher<ASCIICaseInsensitiveStringSearcher>;
+
+/** Uses functions from libc.
+ * It makes sense to use only with short haystacks when cheap initialization
is required.
+ * There is no option for case-insensitive search for UTF-8 strings.
+ * It is required that strings are zero-terminated.
+ */
+
+struct LibCASCIICaseSensitiveStringSearcher : public StringSearcherBase {
+ const char* const needle;
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ LibCASCIICaseSensitiveStringSearcher(const CharT* const needle_, const
size_t /* needle_size */)
+ : needle(reinterpret_cast<const char*>(needle_)) {}
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ const CharT* search(const CharT* haystack, const CharT* const
haystack_end) const {
+ const auto* res = strstr(reinterpret_cast<const char*>(haystack),
+ reinterpret_cast<const char*>(needle));
+ if (!res) return haystack_end;
+ return reinterpret_cast<const CharT*>(res);
+ }
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ const CharT* search(const CharT* haystack, const size_t haystack_size)
const {
+ return search(haystack, haystack + haystack_size);
+ }
+};
+
+struct LibCASCIICaseInsensitiveStringSearcher : public StringSearcherBase {
+ const char* const needle;
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ LibCASCIICaseInsensitiveStringSearcher(const CharT* const needle_,
+ const size_t /* needle_size */)
+ : needle(reinterpret_cast<const char*>(needle_)) {}
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ const CharT* search(const CharT* haystack, const CharT* const
haystack_end) const {
+ const auto* res = strcasestr(reinterpret_cast<const char*>(haystack),
+ reinterpret_cast<const char*>(needle));
+ if (!res) return haystack_end;
+ return reinterpret_cast<const CharT*>(res);
+ }
+
+ template <typename CharT>
+ // requires (sizeof(CharT) == 1)
+ const CharT* search(const CharT* haystack, const size_t haystack_size)
const {
+ return search(haystack, haystack + haystack_size);
+ }
+};
+
+} // namespace doris
diff --git a/be/src/vec/common/volnitsky.h b/be/src/vec/common/volnitsky.h
new file mode 100644
index 0000000000..08413a45f9
--- /dev/null
+++ b/be/src/vec/common/volnitsky.h
@@ -0,0 +1,475 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+// This file is copied from
+// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/Volnitsky.h
+// and modified by Doris
+
+#pragma once
+
+#include <stdint.h>
+#include <string.h>
+
+#include <algorithm>
+#include <limits>
+#include <vector>
+
+#include "vec/common/string_searcher.h"
+#include "vec/common/unaligned.h"
+
+/** Search for a substring in a string by Volnitsky's algorithm
+ * http://volnitsky.com/project/str_search/
+ *
+ * `haystack` and `needle` can contain zero bytes.
+ *
+ * Algorithm:
+ * - if the `needle` is too small or too large, or too small `haystack`, use
std::search or memchr;
+ * - when initializing, fill in an open-addressing linear probing hash table
of the form
+ * hash from the bigram of needle -> the position of this bigram in needle
+ 1.
+ * (one is added only to distinguish zero offset from an empty cell)
+ * - the keys are not stored in the hash table, only the values are stored;
+ * - bigrams can be inserted several times if they occur in the needle
several times;
+ * - when searching, take from haystack bigram, which should correspond to
the last bigram of needle (comparing from the end);
+ * - look for it in the hash table, if found - get the offset from the hash
table and compare the string bytewise;
+ * - if it did not match, we check the next cell of the hash table from the
collision resolution chain;
+ * - if not found, skip to haystack almost the size of the needle bytes;
+ *
+ * MultiVolnitsky - search for multiple substrings in a string:
+ * - Add bigrams to hash table with string index. Then the usual Volnitsky
search is used.
+ * - We are adding while searching, limiting the number of fallback searchers
and the total number of added bigrams
+ */
+
+namespace doris {
+using UInt8 = uint8_t;
+using UInt16 = uint16_t;
+using UInt64 = uint64_t;
+
+namespace VolnitskyTraits {
+using Offset =
+ UInt8; /// Offset in the needle. For the basic algorithm, the length
of the needle must not be greater than 255.
+using Id =
+ UInt8; /// Index of the string (within the array of multiple needles),
must not be greater than 255.
+using Ngram = UInt16; /// n-gram (2 bytes).
+
+/** Fits into the L2 cache (of common Intel CPUs).
+ * This number is extremely good for compilers as it is
numeric_limits<Uint16>::max() and there are optimizations with movzwl and other
instructions with 2 bytes
+ */
+static constexpr size_t hash_size = 64 * 1024;
+
+/// min haystack size to use main algorithm instead of fallback
+static constexpr size_t min_haystack_size_for_algorithm = 20000;
+
+static inline bool isFallbackNeedle(const size_t needle_size, size_t
haystack_size_hint = 0) {
+ return needle_size < 2 * sizeof(Ngram) || needle_size >=
std::numeric_limits<Offset>::max() ||
+ (haystack_size_hint && haystack_size_hint <
min_haystack_size_for_algorithm);
+}
+
+static inline Ngram toNGram(const UInt8* const pos) {
+ return unaligned_load<Ngram>(pos);
+}
+
+template <typename Callback>
+static inline bool putNGramASCIICaseInsensitive(const UInt8* pos, int offset,
+ Callback&& putNGramBase) {
+ struct Chars {
+ UInt8 c0;
+ UInt8 c1;
+ };
+
+ union {
+ Ngram n;
+ Chars chars;
+ };
+
+ n = toNGram(pos);
+
+ const auto c0_al = isAlphaASCII(chars.c0);
+ const auto c1_al = isAlphaASCII(chars.c1);
+
+ if (c0_al && c1_al) {
+ /// 4 combinations: AB, aB, Ab, ab
+ putNGramBase(n, offset);
+ chars.c0 = alternateCaseIfAlphaASCII(chars.c0);
+ putNGramBase(n, offset);
+ chars.c1 = alternateCaseIfAlphaASCII(chars.c1);
+ putNGramBase(n, offset);
+ chars.c0 = alternateCaseIfAlphaASCII(chars.c0);
+ putNGramBase(n, offset);
+ } else if (c0_al) {
+ /// 2 combinations: A1, a1
+ putNGramBase(n, offset);
+ chars.c0 = alternateCaseIfAlphaASCII(chars.c0);
+ putNGramBase(n, offset);
+ } else if (c1_al) {
+ /// 2 combinations: 0B, 0b
+ putNGramBase(n, offset);
+ chars.c1 = alternateCaseIfAlphaASCII(chars.c1);
+ putNGramBase(n, offset);
+ } else
+ /// 1 combination: 01
+ putNGramBase(n, offset);
+
+ return true;
+}
+
+template <bool CaseSensitive, bool ASCII, typename Callback>
+static inline bool putNGram(const UInt8* pos, int offset, [[maybe_unused]]
const UInt8* begin,
+ size_t size, Callback&& putNGramBase) {
+ if constexpr (CaseSensitive) {
+ putNGramBase(toNGram(pos), offset);
+ return true;
+ } else if constexpr (ASCII) {
+ return putNGramASCIICaseInsensitive(pos, offset,
std::forward<Callback>(putNGramBase));
+ } else {
+ // return putNGramUTF8CaseInsensitive(pos, offset, begin, size,
std::forward<Callback>(putNGramBase));
+ return false;
+ }
+}
+} // namespace VolnitskyTraits
+
+/// @todo store lowercase needle to speed up in case there are numerous
occurrences of bigrams from needle in haystack
+template <bool CaseSensitive, bool ASCII, typename FallbackSearcher>
+class VolnitskyBase {
+protected:
+ const UInt8* needle;
+ size_t needle_size;
+ const UInt8* needle_end = needle + needle_size;
+ /// For how long we move, if the n-gram from haystack is not found in the
hash table.
+ size_t step = needle_size - sizeof(VolnitskyTraits::Ngram) + 1;
+
+ /** max needle length is 255, max distinct ngrams for case-sensitive is
(255 - 1), case-insensitive is 4 * (255 - 1)
+ * storage of 64K ngrams (n = 2, 128 KB) should be large enough for both
cases */
+ std::unique_ptr<VolnitskyTraits::Offset[]> hash; /// Hash table.
+
+ bool fallback; /// Do we need to use the fallback algorithm.
+
+ FallbackSearcher fallback_searcher;
+
+public:
+ using Searcher = FallbackSearcher;
+
+ /** haystack_size_hint - the expected total size of the haystack for
`search` calls. Optional (zero means unspecified).
+ * If you specify it small enough, the fallback algorithm will be used,
+ * since it is considered that it's useless to waste time initializing
the hash table.
+ */
+ VolnitskyBase(const char* const needle_, const size_t needle_size_,
+ size_t haystack_size_hint = 0)
+ : needle {reinterpret_cast<const UInt8*>(needle_)},
+ needle_size {needle_size_},
+ fallback {VolnitskyTraits::isFallbackNeedle(needle_size,
haystack_size_hint)},
+ fallback_searcher {needle_, needle_size} {
+ if (fallback || fallback_searcher.force_fallback) return;
+
+ hash = std::unique_ptr<VolnitskyTraits::Offset[]>(
+ new VolnitskyTraits::Offset[VolnitskyTraits::hash_size] {});
+
+ auto callback = [this](const VolnitskyTraits::Ngram ngram, const int
offset) {
+ return this->putNGramBase(ngram, offset);
+ };
+ /// ssize_t is used here because unsigned can't be used with condition
like `i >= 0`, unsigned always >= 0
+ /// And also adding from the end guarantees that we will find first
occurrence because we will lookup bigger offsets first.
+ for (auto i = static_cast<ssize_t>(needle_size -
sizeof(VolnitskyTraits::Ngram)); i >= 0;
+ --i) {
+ bool ok = VolnitskyTraits::putNGram<CaseSensitive, ASCII>(needle +
i, i + 1, needle,
+
needle_size, callback);
+
+ /** `putNGramUTF8CaseInsensitive` does not work if characters with
lower and upper cases
+ * are represented by different number of bytes or code points.
+ * So, use fallback if error occurred.
+ */
+ if (!ok) {
+ fallback_searcher.force_fallback = true;
+ hash = nullptr;
+ return;
+ }
+ }
+ }
+
+ /// If not found, the end of the haystack is returned.
+ const UInt8* search(const UInt8* const haystack, const size_t
haystack_size) const {
+ if (needle_size == 0) return haystack;
+
+ const auto* haystack_end = haystack + haystack_size;
+
+ if (fallback || haystack_size <= needle_size ||
fallback_searcher.force_fallback)
+ return fallback_searcher.search(haystack, haystack_end);
+
+ /// Let's "apply" the needle to the haystack and compare the n-gram
from the end of the needle.
+ const auto* pos = haystack + needle_size -
sizeof(VolnitskyTraits::Ngram);
+ for (; pos <= haystack_end - needle_size; pos += step) {
+ /// We look at all the cells of the hash table that can correspond
to the n-gram from haystack.
+ for (size_t cell_num = VolnitskyTraits::toNGram(pos) %
VolnitskyTraits::hash_size;
+ hash[cell_num]; cell_num = (cell_num + 1) %
VolnitskyTraits::hash_size) {
+ /// When found - compare bytewise, using the offset from the
hash table.
+ const auto* res = pos - (hash[cell_num] - 1);
+
+ /// pointer in the code is always padded array so we can use
pagesafe semantics
+ if (fallback_searcher.compare(haystack, haystack_end, res))
return res;
+ }
+ }
+
+ return fallback_searcher.search(pos - step + 1, haystack_end);
+ }
+
+ const char* search(const char* haystack, size_t haystack_size) const {
+ return reinterpret_cast<const char*>(
+ search(reinterpret_cast<const UInt8*>(haystack),
haystack_size));
+ }
+
+protected:
+ void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset) {
+ /// Put the offset for the n-gram in the corresponding cell or the
nearest free cell.
+ size_t cell_num = ngram % VolnitskyTraits::hash_size;
+
+ while (hash[cell_num])
+ cell_num =
+ (cell_num + 1) % VolnitskyTraits::hash_size; /// Search
for the next free cell.
+
+ hash[cell_num] = offset;
+ }
+};
+
+template <bool CaseSensitive, bool ASCII, typename FallbackSearcher>
+class MultiVolnitskyBase {
+private:
+ /// needles and their offsets
+ const std::vector<StringRef>& needles;
+
+ /// fallback searchers
+ std::vector<size_t> fallback_needles;
+ std::vector<FallbackSearcher> fallback_searchers;
+
+ /// because std::pair<> is not POD
+ struct OffsetId {
+ VolnitskyTraits::Id id;
+ VolnitskyTraits::Offset off;
+ };
+
+ std::unique_ptr<OffsetId[]> hash;
+
+ /// step for each bunch of strings
+ size_t step;
+
+ /// last index of offsets that was not processed
+ size_t last;
+
+ /// limit for adding to hashtable. In worst case with case insentive
search, the table will be filled at most as half
+ static constexpr size_t small_limit = VolnitskyTraits::hash_size / 8;
+
+public:
+ explicit MultiVolnitskyBase(const std::vector<StringRef>& needles_)
+ : needles {needles_}, step {0}, last {0} {
+ fallback_searchers.reserve(needles.size());
+ hash = std::unique_ptr<OffsetId[]>(
+ new OffsetId[VolnitskyTraits::
+ hash_size]); /// No zero initialization,
it will be done later.
+ }
+
+ /**
+ * This function is needed to initialize hash table
+ * Returns `true` if there is nothing to initialize
+ * and `false` if we have something to initialize and initializes it.
+ * This function is a kind of fallback if there are many needles.
+ * We actually destroy the hash table and initialize it with uninitialized
needles
+ * and search through the haystack again.
+ * The actual usage of this function is like this:
+ * while (hasMoreToSearch())
+ * {
+ * search inside the haystack with the known needles
+ * }
+ */
+ bool hasMoreToSearch() {
+ if (last == needles.size()) return false;
+
+ memset(hash.get(), 0, VolnitskyTraits::hash_size * sizeof(OffsetId));
+ fallback_needles.clear();
+ step = std::numeric_limits<size_t>::max();
+
+ size_t buf = 0;
+ size_t size = needles.size();
+
+ for (; last < size; ++last) {
+ const char* cur_needle_data = needles[last].data;
+ const size_t cur_needle_size = needles[last].size;
+
+ /// save the indices of fallback searchers
+ if (VolnitskyTraits::isFallbackNeedle(cur_needle_size)) {
+ fallback_needles.push_back(last);
+ } else {
+ /// put all bigrams
+ auto callback = [this](const VolnitskyTraits::Ngram ngram,
const int offset) {
+ return this->putNGramBase(ngram, offset, this->last);
+ };
+
+ buf += cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1;
+
+ /// this is the condition when we actually need to stop and
start searching with known needles
+ if (buf > small_limit) break;
+
+ step = std::min(step, cur_needle_size -
sizeof(VolnitskyTraits::Ngram) + 1);
+ for (auto i = static_cast<int>(cur_needle_size -
sizeof(VolnitskyTraits::Ngram));
+ i >= 0; --i) {
+ VolnitskyTraits::putNGram<CaseSensitive, ASCII>(
+ reinterpret_cast<const UInt8*>(cur_needle_data) +
i, i + 1,
+ reinterpret_cast<const UInt8*>(cur_needle_data),
cur_needle_size,
+ callback);
+ }
+ }
+ fallback_searchers.emplace_back(cur_needle_data, cur_needle_size);
+ }
+ return true;
+ }
+
+ inline bool searchOne(const UInt8* haystack, const UInt8* haystack_end)
const {
+ const size_t fallback_size = fallback_needles.size();
+ for (size_t i = 0; i < fallback_size; ++i)
+ if (fallback_searchers[fallback_needles[i]].search(haystack,
haystack_end) !=
+ haystack_end)
+ return true;
+
+ /// check if we have one non empty volnitsky searcher
+ if (step != std::numeric_limits<size_t>::max()) {
+ const auto* pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
+ for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos
+= step) {
+ for (size_t cell_num = VolnitskyTraits::toNGram(pos) %
VolnitskyTraits::hash_size;
+ hash[cell_num].off; cell_num = (cell_num + 1) %
VolnitskyTraits::hash_size) {
+ if (pos >= haystack + hash[cell_num].off - 1) {
+ const auto res = pos - (hash[cell_num].off - 1);
+ const size_t ind = hash[cell_num].id;
+ if (res + needles[ind].size <= haystack_end &&
+ fallback_searchers[ind].compare(haystack,
haystack_end, res))
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+ }
+
+ inline size_t searchOneFirstIndex(const UInt8* haystack, const UInt8*
haystack_end) const {
+ const size_t fallback_size = fallback_needles.size();
+
+ size_t answer = std::numeric_limits<size_t>::max();
+
+ for (size_t i = 0; i < fallback_size; ++i)
+ if (fallback_searchers[fallback_needles[i]].search(haystack,
haystack_end) !=
+ haystack_end)
+ answer = std::min(answer, fallback_needles[i]);
+
+ /// check if we have one non empty volnitsky searcher
+ if (step != std::numeric_limits<size_t>::max()) {
+ const auto* pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
+ for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos
+= step) {
+ for (size_t cell_num = VolnitskyTraits::toNGram(pos) %
VolnitskyTraits::hash_size;
+ hash[cell_num].off; cell_num = (cell_num + 1) %
VolnitskyTraits::hash_size) {
+ if (pos >= haystack + hash[cell_num].off - 1) {
+ const auto res = pos - (hash[cell_num].off - 1);
+ const size_t ind = hash[cell_num].id;
+ if (res + needles[ind].size <= haystack_end &&
+ fallback_searchers[ind].compare(haystack,
haystack_end, res))
+ answer = std::min(answer, ind);
+ }
+ }
+ }
+ }
+
+ /*
+ * if nothing was found, answer + 1 will be equal to zero and we can
+ * assign it into the result because we need to return the position
starting with one
+ */
+ return answer + 1;
+ }
+
+ template <typename CountCharsCallback>
+ inline UInt64 searchOneFirstPosition(const UInt8* haystack, const UInt8*
haystack_end,
+ const CountCharsCallback&
count_chars) const {
+ const size_t fallback_size = fallback_needles.size();
+
+ UInt64 answer = std::numeric_limits<UInt64>::max();
+
+ for (size_t i = 0; i < fallback_size; ++i)
+ if (auto pos =
fallback_searchers[fallback_needles[i]].search(haystack, haystack_end);
+ pos != haystack_end)
+ answer = std::min<UInt64>(answer, pos - haystack);
+
+ /// check if we have one non empty volnitsky searcher
+ if (step != std::numeric_limits<size_t>::max()) {
+ const auto* pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
+ for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos
+= step) {
+ for (size_t cell_num = VolnitskyTraits::toNGram(pos) %
VolnitskyTraits::hash_size;
+ hash[cell_num].off; cell_num = (cell_num + 1) %
VolnitskyTraits::hash_size) {
+ if (pos >= haystack + hash[cell_num].off - 1) {
+ const auto res = pos - (hash[cell_num].off - 1);
+ const size_t ind = hash[cell_num].id;
+ if (res + needles[ind].size <= haystack_end &&
+ fallback_searchers[ind].compare(haystack,
haystack_end, res))
+ answer = std::min<UInt64>(answer, res - haystack);
+ }
+ }
+ }
+ }
+ if (answer == std::numeric_limits<UInt64>::max()) return 0;
+ return count_chars(haystack, haystack + answer);
+ }
+
+ template <typename CountCharsCallback, typename AnsType>
+ inline void searchOneAll(const UInt8* haystack, const UInt8* haystack_end,
AnsType* answer,
+ const CountCharsCallback& count_chars) const {
+ const size_t fallback_size = fallback_needles.size();
+ for (size_t i = 0; i < fallback_size; ++i) {
+ const UInt8* ptr =
+ fallback_searchers[fallback_needles[i]].search(haystack,
haystack_end);
+ if (ptr != haystack_end) answer[fallback_needles[i]] =
count_chars(haystack, ptr);
+ }
+
+ /// check if we have one non empty volnitsky searcher
+ if (step != std::numeric_limits<size_t>::max()) {
+ const auto* pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
+ for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos
+= step) {
+ for (size_t cell_num = VolnitskyTraits::toNGram(pos) %
VolnitskyTraits::hash_size;
+ hash[cell_num].off; cell_num = (cell_num + 1) %
VolnitskyTraits::hash_size) {
+ if (pos >= haystack + hash[cell_num].off - 1) {
+ const auto* res = pos - (hash[cell_num].off - 1);
+ const size_t ind = hash[cell_num].id;
+ if (answer[ind] == 0 && res + needles[ind].size <=
haystack_end &&
+ fallback_searchers[ind].compare(haystack,
haystack_end, res))
+ answer[ind] = count_chars(haystack, res);
+ }
+ }
+ }
+ }
+ }
+
+ void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset,
const size_t num) {
+ size_t cell_num = ngram % VolnitskyTraits::hash_size;
+
+ while (hash[cell_num].off) cell_num = (cell_num + 1) %
VolnitskyTraits::hash_size;
+
+ hash[cell_num] = {static_cast<VolnitskyTraits::Id>(num),
+ static_cast<VolnitskyTraits::Offset>(offset)};
+ }
+};
+
+using Volnitsky = VolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
+using VolnitskyUTF8 =
+ VolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; ///
exactly same as Volnitsky
+
+using VolnitskyCaseSensitiveToken = VolnitskyBase<true, true,
ASCIICaseSensitiveTokenSearcher>;
+
+using MultiVolnitsky = MultiVolnitskyBase<true, true,
ASCIICaseSensitiveStringSearcher>;
+using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false,
ASCIICaseSensitiveStringSearcher>;
+
+} // namespace doris
diff --git a/be/src/vec/functions/like.cpp b/be/src/vec/functions/like.cpp
index 6a04e654ab..522fb0b621 100644
--- a/be/src/vec/functions/like.cpp
+++ b/be/src/vec/functions/like.cpp
@@ -101,7 +101,8 @@ Status FunctionLikeBase::execute_impl(FunctionContext*
context, Block& block,
// result column
auto res = ColumnUInt8::create();
ColumnUInt8::Container& vec_res = res->get_data();
- vec_res.resize(values->size());
+ // set default value to 0, and match functions only need to set 1/true
+ vec_res.resize_fill(values->size());
auto* state = reinterpret_cast<LikeState*>(
context->get_function_state(FunctionContext::THREAD_LOCAL));
@@ -129,6 +130,42 @@ Status FunctionLikeBase::vector_vector(const
ColumnString::Chars& values,
const ColumnString::Offsets&
pattern_offsets,
ColumnUInt8::Container& result, const
LikeFn& function,
LikeSearchState* search_state) {
+ // for constant_substring_fn, use long run length search for performance
+ if (constant_substring_fn ==
+ *(function.target<doris::Status (*)(LikeSearchState * state, const
StringValue&,
+ const StringValue&, unsigned
char*)>())) {
+ // treat continous multi string data as a long string data
+ const UInt8* begin = values.data();
+ const UInt8* end = begin + values.size();
+ const UInt8* pos = begin;
+
+ /// Current index in the array of strings.
+ size_t i = 0;
+ size_t needle_size =
search_state->substring_pattern.get_pattern_length();
+
+ /// We will search for the next occurrence in all strings at once.
+ while (pos < end) {
+ // search return matched substring start offset
+ pos = (UInt8*)search_state->substring_pattern.search((char*)pos,
end - pos);
+ if (pos >= end) break;
+
+ /// Determine which index it refers to.
+ /// begin + value_offsets[i] is the start offset of string at i+1
+ while (begin + value_offsets[i] <= pos) ++i;
+
+ /// We check that the entry does not pass through the boundaries
of strings.
+ if (pos + needle_size < begin + value_offsets[i]) {
+ result[i] = 1;
+ }
+
+ // move to next string offset
+ pos = begin + value_offsets[i];
+ ++i;
+ }
+
+ return Status::OK();
+ }
+
const auto size = value_offsets.size();
for (int i = 0; i < size; ++i) {
diff --git a/be/src/vec/functions/like.h b/be/src/vec/functions/like.h
index e66692a0fa..e19b92d0c2 100644
--- a/be/src/vec/functions/like.h
+++ b/be/src/vec/functions/like.h
@@ -61,7 +61,7 @@ struct LikeSearchState {
void set_search_string(const std::string& search_string_arg) {
search_string = search_string_arg;
search_string_sv = StringValue(search_string);
- substring_pattern = StringSearch(&search_string_sv);
+ substring_pattern.set_pattern(&search_string_sv);
}
};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]