This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new cb2e915869d [refactor](opt) improve BE code readability of
multi_match_any function (#39354)
cb2e915869d is described below
commit cb2e915869d1590640ba44b25c40207b3603ae99
Author: Chester <[email protected]>
AuthorDate: Mon Sep 23 09:34:30 2024 +0800
[refactor](opt) improve BE code readability of multi_match_any function
(#39354)
To improve the readability of the BE code for the **multi_match_any** function,
this PR refactors the code as follows:
1. replace the deprecated header 'stddef.h' with its C++ equivalent
'cstddef'
2. use readability-qualified-auto
3. use readability-braces-around-statements
4. extract the code shared by `vector_constant()` and `vector_vector()` into
two functions:
`prepare_regexps_and_scratch()` and `on_match()`
5. simplify the null-input handling in `execute_impl()` by:
removing the two rarely used flag variables `haystack_nullable` and
`needles_nullable`;
adding the function `handle_nullable_column()`
---
.../functions/functions_multi_string_search.cpp | 200 +++++++++++----------
1 file changed, 101 insertions(+), 99 deletions(-)
diff --git a/be/src/vec/functions/functions_multi_string_search.cpp
b/be/src/vec/functions/functions_multi_string_search.cpp
index f7a1b8d7a90..7736a1a039b 100644
--- a/be/src/vec/functions/functions_multi_string_search.cpp
+++ b/be/src/vec/functions/functions_multi_string_search.cpp
@@ -20,10 +20,10 @@
#include <hs/hs_common.h>
#include <hs/hs_runtime.h>
-#include <stddef.h>
#include <algorithm>
#include <boost/iterator/iterator_facade.hpp>
+#include <cstddef>
#include <limits>
#include <memory>
#include <optional>
@@ -80,42 +80,30 @@ public:
auto haystack_column = block.get_by_position(arguments[0]).column;
auto needles_column = block.get_by_position(arguments[1]).column;
- bool haystack_nullable = false;
- bool needles_nullable = false;
-
- if (haystack_column->is_nullable()) {
- haystack_nullable = true;
- }
-
- if (needles_column->is_nullable()) {
- needles_nullable = true;
- }
-
auto haystack_ptr = remove_nullable(haystack_column);
auto needles_ptr = remove_nullable(needles_column);
- const ColumnString* col_haystack_vector =
- check_and_get_column<ColumnString>(&*haystack_ptr);
+ const auto* col_haystack_vector =
check_and_get_column<ColumnString>(&*haystack_ptr);
const ColumnConst* col_haystack_const =
check_and_get_column_const<ColumnString>(&*haystack_ptr);
- const ColumnArray* col_needles_vector =
- check_and_get_column<ColumnArray>(needles_ptr.get());
+ const auto* col_needles_vector =
check_and_get_column<ColumnArray>(needles_ptr.get());
const ColumnConst* col_needles_const =
check_and_get_column_const<ColumnArray>(needles_ptr.get());
- if (!col_needles_const && !col_needles_vector)
+ if (!col_needles_const && !col_needles_vector) {
return Status::InvalidArgument(
"function '{}' encountered unsupported needles column,
found {}", name,
needles_column->get_name());
+ }
- if (col_haystack_const && col_needles_vector)
+ if (col_haystack_const && col_needles_vector) {
return Status::InvalidArgument(
"function '{}' doesn't support search with non-constant
needles "
"in constant haystack",
name);
+ }
- using ResultType = typename Impl::ResultType;
auto col_res = ColumnVector<ResultType>::create();
auto col_offsets = ColumnArray::ColumnOffsets::create();
@@ -140,25 +128,8 @@ public:
return status;
}
- if (haystack_nullable) {
- auto column_nullable =
check_and_get_column<ColumnNullable>(haystack_column.get());
- auto& null_map = column_nullable->get_null_map_data();
- for (size_t i = 0; i != input_rows_count; ++i) {
- if (null_map[i] == 1) {
- vec_res[i] = 0;
- }
- }
- }
-
- if (needles_nullable) {
- auto column_nullable =
check_and_get_column<ColumnNullable>(needles_column.get());
- auto& null_map = column_nullable->get_null_map_data();
- for (size_t i = 0; i != input_rows_count; ++i) {
- if (null_map[i] == 1) {
- vec_res[i] = 0;
- }
- }
- }
+ handle_nullable_column(haystack_column, vec_res, input_rows_count);
+ handle_nullable_column(needles_column, vec_res, input_rows_count);
block.replace_by_position(result, std::move(col_res));
@@ -166,9 +137,25 @@ public:
}
private:
+ using ResultType = typename Impl::ResultType;
+
const bool allow_hyperscan_ = true;
const size_t max_hyperscan_regexp_length_ = 0; // not limited
const size_t max_hyperscan_regexp_total_length_ = 0; // not limited
+
+ /// Handles nullable column by setting result to 0 if the input is null
+ void handle_nullable_column(const ColumnPtr& column,
PaddedPODArray<ResultType>& vec_res,
+ size_t input_rows_count) const {
+ if (column->is_nullable()) {
+ const auto* column_nullable = assert_cast<const
ColumnNullable*>(column.get());
+ const auto& null_map = column_nullable->get_null_map_data();
+ for (size_t i = 0; i != input_rows_count; ++i) {
+ if (null_map[i] == 1) {
+ vec_res[i] = 0;
+ }
+ }
+ }
+ }
};
/// For more readable instantiations of MultiMatchAnyImpl<>
@@ -187,17 +174,67 @@ struct FunctionMultiMatchAnyImpl {
static auto get_return_type() { return
std::make_shared<DataTypeNumber<ResultType>>(); }
+ /**
+ * Prepares the regular expressions and scratch space for Hyperscan.
+ *
+ * This function takes a vector of needles (substrings to search for) and
initializes
+ * the regular expressions and scratch space required for Hyperscan, a
high-performance
+ * regular expression matching library.
+ *
+ */
+ static Status prepare_regexps_and_scratch(const std::vector<StringRef>&
needles,
+ multiregexps::Regexps*& regexps,
+ multiregexps::ScratchPtr&
smart_scratch) {
+ multiregexps::DeferredConstructedRegexpsPtr
deferred_constructed_regexps =
+ multiregexps::getOrSet</*SaveIndices*/
+ FindAnyIndex,
WithEditDistance>(needles, std::nullopt);
+ regexps = deferred_constructed_regexps->get();
+
+ hs_scratch_t* scratch = nullptr;
+ hs_error_t err = hs_clone_scratch(regexps->getScratch(), &scratch);
+
+ if (err != HS_SUCCESS) {
+ return Status::InternalError("could not clone scratch space for
vectorscan");
+ }
+
+ smart_scratch.reset(scratch);
+ return Status::OK();
+ }
+
+ /**
+ * Static callback function to handle the match results of the hs_scan
function.
+ *
+ * This function is called when a matching substring is found while
scanning with
+ * Hyperscan. It updates the result based on the match information.
+ *
+ */
+ static int on_match([[maybe_unused]] unsigned int id, unsigned long long
/* from */, // NOLINT
+ unsigned long long /* to */,
// NOLINT
+ unsigned int /* flags */, void* context) {
+ if constexpr (FindAnyIndex) {
+ *reinterpret_cast<ResultType*>(context) = id;
+ } else if constexpr (FindAny) {
+ *reinterpret_cast<ResultType*>(context) = 1;
+ }
+ /// Once we hit the callback, there is no need to search for others.
+ return 1;
+ }
+
static Status vector_constant(const ColumnString::Chars& haystack_data,
const ColumnString::Offsets&
haystack_offsets,
const Array& needles_arr,
PaddedPODArray<ResultType>& res,
PaddedPODArray<UInt64>& offsets, bool
allow_hyperscan,
size_t max_hyperscan_regexp_length,
size_t max_hyperscan_regexp_total_length) {
- if (!allow_hyperscan) return Status::InvalidArgument("Hyperscan
functions are disabled");
+ if (!allow_hyperscan) {
+ return Status::InvalidArgument("Hyperscan functions are disabled");
+ }
std::vector<StringRef> needles;
needles.reserve(needles_arr.size());
- for (const auto& needle : needles_arr)
needles.emplace_back(needle.get<StringRef>());
+ for (const auto& needle : needles_arr) {
+ needles.emplace_back(needle.get<StringRef>());
+ }
res.resize(haystack_offsets.size());
@@ -206,44 +243,26 @@ struct FunctionMultiMatchAnyImpl {
return Status::OK();
}
- multiregexps::DeferredConstructedRegexpsPtr
deferred_constructed_regexps =
- multiregexps::getOrSet</*SaveIndices*/ FindAnyIndex,
WithEditDistance>(
- needles, std::nullopt);
- multiregexps::Regexps* regexps = deferred_constructed_regexps->get();
-
- hs_scratch_t* scratch = nullptr;
- hs_error_t err = hs_clone_scratch(regexps->getScratch(), &scratch);
+ multiregexps::Regexps* regexps = nullptr;
+ multiregexps::ScratchPtr smart_scratch;
+ RETURN_IF_ERROR(prepare_regexps_and_scratch(needles, regexps,
smart_scratch));
- if (err != HS_SUCCESS)
- return Status::InternalError("could not clone scratch space for
vectorscan");
-
- multiregexps::ScratchPtr smart_scratch(scratch);
-
- auto on_match = []([[maybe_unused]] unsigned int id,
- unsigned long long /* from */, // NOLINT
- unsigned long long /* to */, // NOLINT
- unsigned int /* flags */, void* context) -> int {
- if constexpr (FindAnyIndex)
- *reinterpret_cast<ResultType*>(context) = id;
- else if constexpr (FindAny)
- *reinterpret_cast<ResultType*>(context) = 1;
- /// Once we hit the callback, there is no need to search for
others.
- return 1;
- };
const size_t haystack_offsets_size = haystack_offsets.size();
UInt64 offset = 0;
for (size_t i = 0; i < haystack_offsets_size; ++i) {
UInt64 length = haystack_offsets[i] - offset;
/// vectorscan restriction.
- if (length > std::numeric_limits<UInt32>::max())
+ if (length > std::numeric_limits<UInt32>::max()) {
return Status::InternalError("too long string to search");
+ }
/// zero the result, scan, check, update the offset.
res[i] = 0;
- err = hs_scan(regexps->getDB(),
- reinterpret_cast<const char*>(haystack_data.data())
+ offset,
- static_cast<unsigned>(length), 0,
smart_scratch.get(), on_match, &res[i]);
- if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED)
+ hs_error_t err = hs_scan(
+ regexps->getDB(), reinterpret_cast<const
char*>(haystack_data.data()) + offset,
+ static_cast<unsigned>(length), 0, smart_scratch.get(),
on_match, &res[i]);
+ if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED) {
return Status::InternalError("failed to scan with vectorscan");
+ }
offset = haystack_offsets[i];
}
@@ -257,20 +276,22 @@ struct FunctionMultiMatchAnyImpl {
PaddedPODArray<ResultType>& res,
PaddedPODArray<UInt64>& offsets,
bool allow_hyperscan, size_t
max_hyperscan_regexp_length,
size_t max_hyperscan_regexp_total_length) {
- if (!allow_hyperscan) return Status::InvalidArgument("Hyperscan
functions are disabled");
+ if (!allow_hyperscan) {
+ return Status::InvalidArgument("Hyperscan functions are disabled");
+ }
res.resize(haystack_offsets.size());
size_t prev_haystack_offset = 0;
size_t prev_needles_offset = 0;
- auto& nested_column =
+ const auto& nested_column =
vectorized::check_and_get_column<vectorized::ColumnNullable>(needles_data)
->get_nested_column();
- const ColumnString* needles_data_string =
check_and_get_column<ColumnString>(nested_column);
+ const auto* needles_data_string =
check_and_get_column<ColumnString>(nested_column);
if (!needles_data_string) {
- return Status::InvalidArgument("needles should be string");
+ return Status::InvalidArgument("needles should be string column");
}
std::vector<StringRef> needles;
@@ -287,46 +308,27 @@ struct FunctionMultiMatchAnyImpl {
continue;
}
- multiregexps::DeferredConstructedRegexpsPtr
deferred_constructed_regexps =
- multiregexps::getOrSet</*SaveIndices*/ FindAnyIndex,
WithEditDistance>(
- needles, std::nullopt);
- multiregexps::Regexps* regexps =
deferred_constructed_regexps->get();
-
- hs_scratch_t* scratch = nullptr;
- hs_error_t err = hs_clone_scratch(regexps->getScratch(), &scratch);
-
- if (err != HS_SUCCESS)
- return Status::InternalError("could not clone scratch space
for vectorscan");
-
- multiregexps::ScratchPtr smart_scratch(scratch);
-
- auto on_match = []([[maybe_unused]] unsigned int id,
- unsigned long long /* from */, // NOLINT
- unsigned long long /* to */, // NOLINT
- unsigned int /* flags */, void* context) -> int
{
- if constexpr (FindAnyIndex)
- *reinterpret_cast<ResultType*>(context) = id;
- else if constexpr (FindAny)
- *reinterpret_cast<ResultType*>(context) = 1;
- /// Once we hit the callback, there is no need to search for
others.
- return 1;
- };
+ multiregexps::Regexps* regexps = nullptr;
+ multiregexps::ScratchPtr smart_scratch;
+ RETURN_IF_ERROR(prepare_regexps_and_scratch(needles, regexps,
smart_scratch));
const size_t cur_haystack_length = haystack_offsets[i] -
prev_haystack_offset;
/// vectorscan restriction.
- if (cur_haystack_length > std::numeric_limits<UInt32>::max())
+ if (cur_haystack_length > std::numeric_limits<UInt32>::max()) {
return Status::InternalError("too long string to search");
+ }
/// zero the result, scan, check, update the offset.
res[i] = 0;
- err = hs_scan(
+ hs_error_t err = hs_scan(
regexps->getDB(),
reinterpret_cast<const char*>(haystack_data.data()) +
prev_haystack_offset,
static_cast<unsigned>(cur_haystack_length), 0,
smart_scratch.get(), on_match,
&res[i]);
- if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED)
+ if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED) {
return Status::InternalError("failed to scan with vectorscan");
+ }
prev_haystack_offset = haystack_offsets[i];
prev_needles_offset = needles_offsets[i];
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]