github-actions[bot] commented on code in PR #64012: URL: https://github.com/apache/doris/pull/64012#discussion_r3339728747
########## be/src/exec/common/multi_string_searcher.h: ########## @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <limits> +#include <memory> +#include <vector> + +#include "core/string_ref.h" +#include "core/types.h" +#include "exec/common/string_searcher.h" + +namespace doris { + +// Searches many literal needles in one haystack. The table stores 2-byte ngrams from a +// batch of needles, so scanning the haystack can find candidates for many needles at once. +class MultiStringSearcher { + using Offset = uint8_t; + using Id = uint8_t; + using Ngram = uint16_t; + + // One table entry is one candidate ngram occurrence in a needle. It stores the global + // needle id and the 1-based offset of this ngram inside that needle. off == 0 means + // the slot is empty, so needle offsets are stored as 1-based values. + struct OffsetId { + Id id = 0; + Offset off = 0; + }; + +public: + explicit MultiStringSearcher(const std::vector<StringRef>& needles) + : _needles(needles), _hash(new OffsetId[HASH_SIZE]) { + _searchers.reserve(_needles.size()); + for (const auto& needle : _needles) { + _searchers.emplace_back(needle.data, needle.size); + } + } + + bool has_more_to_search() { + if (_last == _needles.size()) { + return false; + } + + std::memset(_hash.get(), 0, HASH_SIZE * sizeof(OffsetId)); + _fallback_needles.clear(); + _step = std::numeric_limits<size_t>::max(); + + // Not all needles are necessarily inserted in one table. Keeping the load factor + // low makes the linear-probing chains short; if the current batch reaches + // SMALL_LIMIT, the caller will scan all rows with this batch and call this method + // again for the next batch. _last is the global needle index of the next batch. + size_t ngrams = 0; + for (; _last < _needles.size(); ++_last) { + const auto& needle = _needles[_last]; + if (is_fallback_needle(needle.size)) { + _fallback_needles.push_back(_last); + } else { + const size_t needle_ngrams = needle.size - sizeof(Ngram) + 1; + if (ngrams + needle_ngrams > SMALL_LIMIT && ngrams != 0) { + break; + } + + ngrams += needle_ngrams; + _step = std::min(_step, needle_ngrams); + // Insert ngrams from the end to the beginning, matching the way the scan + // position is later converted back to the candidate needle start. + for (auto i = static_cast<int>(needle.size - sizeof(Ngram)); i >= 0; --i) { + put_ngram(to_ngram(reinterpret_cast<const uint8_t*>(needle.data + i)), i + 1, + _last); + } + } + } + + return true; + } + + void search_one_all(const uint8_t* haystack, const uint8_t* haystack_end, Int32* answer) const { + // answer points to the beginning of one result array row. Entries are indexed by + // global needle id, so different batches write into the same full-size row result. + for (const size_t needle_index : _fallback_needles) { + const auto* match = _searchers[needle_index].search(haystack, haystack_end); + if (match != haystack_end) { + answer[needle_index] = static_cast<Int32>(match - haystack + 1); + } + } + + if (_step == std::numeric_limits<size_t>::max() || + haystack_end - haystack < static_cast<ptrdiff_t>(sizeof(Ngram))) { + // The batch may contain only fallback needles, or the haystack may be too short + // to read a 2-byte ngram. Fallback needles have already been handled above. + return; + } + + for (const uint8_t* pos = haystack + _step - sizeof(Ngram); Review Comment: This can create an out-of-range pointer before the loop condition has a chance to fail. For example, with a 254-byte non-fallback needle `_step` becomes 253; a 10-byte haystack passes the 2-byte check above, then `haystack + _step - sizeof(Ngram)` forms `haystack + 251`, which is undefined behavior even if the loop never runs. Please either guard `haystack_end - haystack < _step` before this expression or rewrite the scan using integer offsets. The same issue exists a few lines below at `pos < haystack + _hash[cell].off - 1`; compare offsets like `(pos - haystack) < _hash[cell].off - 1` instead of forming a potentially out-of-range pointer. ########## be/src/exec/common/multi_string_searcher.h: ########## @@ -0,0 +1,180 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include <algorithm> +#include <cstddef> +#include <cstdint> +#include <cstring> +#include <limits> +#include <memory> +#include <vector> + +#include "core/string_ref.h" +#include "core/types.h" +#include "exec/common/string_searcher.h" + +namespace doris { + +// Searches many literal needles in one haystack. The table stores 2-byte ngrams from a +// batch of needles, so scanning the haystack can find candidates for many needles at once. +class MultiStringSearcher { + using Offset = uint8_t; + using Id = uint8_t; + using Ngram = uint16_t; + + // One table entry is one candidate ngram occurrence in a needle. It stores the global + // needle id and the 1-based offset of this ngram inside that needle. off == 0 means + // the slot is empty, so needle offsets are stored as 1-based values. + struct OffsetId { + Id id = 0; + Offset off = 0; + }; + +public: + explicit MultiStringSearcher(const std::vector<StringRef>& needles) + : _needles(needles), _hash(new OffsetId[HASH_SIZE]) { + _searchers.reserve(_needles.size()); + for (const auto& needle : _needles) { + _searchers.emplace_back(needle.data, needle.size); + } + } + + bool has_more_to_search() { + if (_last == _needles.size()) { + return false; + } + + std::memset(_hash.get(), 0, HASH_SIZE * sizeof(OffsetId)); + _fallback_needles.clear(); + _step = std::numeric_limits<size_t>::max(); + + // Not all needles are necessarily inserted in one table. Keeping the load factor + // low makes the linear-probing chains short; if the current batch reaches + // SMALL_LIMIT, the caller will scan all rows with this batch and call this method + // again for the next batch. _last is the global needle index of the next batch. + size_t ngrams = 0; + for (; _last < _needles.size(); ++_last) { + const auto& needle = _needles[_last]; + if (is_fallback_needle(needle.size)) { + _fallback_needles.push_back(_last); + } else { + const size_t needle_ngrams = needle.size - sizeof(Ngram) + 1; + if (ngrams + needle_ngrams > SMALL_LIMIT && ngrams != 0) { + break; + } + + ngrams += needle_ngrams; + _step = std::min(_step, needle_ngrams); + // Insert ngrams from the end to the beginning, matching the way the scan + // position is later converted back to the candidate needle start. + for (auto i = static_cast<int>(needle.size - sizeof(Ngram)); i >= 0; --i) { + put_ngram(to_ngram(reinterpret_cast<const uint8_t*>(needle.data + i)), i + 1, + _last); + } + } + } + + return true; + } + + void search_one_all(const uint8_t* haystack, const uint8_t* haystack_end, Int32* answer) const { + // answer points to the beginning of one result array row. Entries are indexed by + // global needle id, so different batches write into the same full-size row result. + for (const size_t needle_index : _fallback_needles) { + const auto* match = _searchers[needle_index].search(haystack, haystack_end); + if (match != haystack_end) { + answer[needle_index] = static_cast<Int32>(match - haystack + 1); + } + } + + if (_step == std::numeric_limits<size_t>::max() || + haystack_end - haystack < static_cast<ptrdiff_t>(sizeof(Ngram))) { + // The batch may contain only fallback needles, or the haystack may be too short + // to read a 2-byte ngram. Fallback needles have already been handled above. + return; + } + + for (const uint8_t* pos = haystack + _step - sizeof(Ngram); + pos <= haystack_end - sizeof(Ngram); pos += _step) { + // A 2-byte ngram has exactly 65536 possible values. HASH_SIZE is also 65536, + // so ngram % HASH_SIZE is effectively direct addressing. Collisions and + // duplicate ngrams are kept by linear probing and checked until an empty slot. + for (size_t cell = to_ngram(pos) % HASH_SIZE; _hash[cell].off; + cell = (cell + 1) % HASH_SIZE) { + if (pos < haystack + _hash[cell].off - 1) { + continue; + } + + const size_t needle_index = _hash[cell].id; + // Convert the matched ngram position in the haystack back to the candidate + // start position of the whole needle, then verify the full needle below. + const uint8_t* match = pos - (_hash[cell].off - 1); + const auto candidate_position = static_cast<Int32>(match - haystack + 1); + if (answer[needle_index] != 0 && candidate_position >= answer[needle_index]) { + continue; + } + + const auto& needle = _needles[needle_index]; + // The hash table only proves that one 2-byte ngram matched. Full memcmp is + // required to discard false positives from duplicate ngrams and collisions. + if (match + needle.size <= haystack_end && Review Comment: This bounds check still performs `match + needle.size` before verifying that the result is within the haystack object. For a long needle and a candidate near the end of a short row, that pointer arithmetic can go far past one-past-end and is undefined behavior. Please use a subtraction-style check such as `haystack_end - match >= needle.size` before `memcmp`, and add coverage for long constant needles against shorter haystacks / near-end candidates. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
