github-actions[bot] commented on code in PR #60412: URL: https://github.com/apache/doris/pull/60412#discussion_r2935418956
########## be/src/exprs/function/function_hamming_distance.cpp: ########## @@ -0,0 +1,271 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <algorithm> +#include <cstring> +#include <vector> + +#include "common/status.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/data_type/data_type_number.h" +#include "core/string_ref.h" +#include "exprs/function/function_totype.h" +#include "exprs/function/simple_function_factory.h" +#include "util/simd/vstring_function.h" + +namespace doris { +#include "common/compile_check_begin.h" + +struct NameHammingDistance { + static constexpr auto name = "hamming_distance"; +}; + +template <typename LeftDataType, typename RightDataType> +struct HammingDistanceImpl { + using ResultDataType = DataTypeInt64; + using ResultPaddedPODArray = PaddedPODArray<Int64>; + + static Status vector_vector(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, + const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { + DCHECK_EQ(loffsets.size(), roffsets.size()); + + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, i), + string_ref_at(rdata, roffsets, i), res[i], i)); + } + return Status::OK(); + } + + static Status vector_scalar(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, const StringRef& rdata, + ResultPaddedPODArray& res) { + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, i), rdata, res[i], i)); + } + return Status::OK(); + } + + static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { + const size_t size = roffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + RETURN_IF_ERROR(hamming_distance(ldata, string_ref_at(rdata, roffsets, i), res[i], i)); + } + return Status::OK(); + } + +private: + static StringRef string_ref_at(const ColumnString::Chars& data, + const ColumnString::Offsets& offsets, size_t i) { + DCHECK_LT(i, offsets.size()); + const size_t begin = (i == 0) ? 0 : offsets[i - 1]; + const size_t end = offsets[i]; + if (end <= begin || end > data.size()) { Review Comment: **Coding standards violation (AGENTS.md):** This is defensive programming. In a well-formed `ColumnString`, `end > begin` always holds (every string has at least a `\0` terminator) and `end <= data.size()` is a structural invariant. Per AGENTS.md: "Assert correctness only—never use defensive programming with `if` or similar constructs." Also, the `(i == 0) ? 0 : offsets[i - 1]` ternary on line 83 is unnecessary — `PaddedPODArray` guarantees `offsets[-1] == 0` (see `column_string.h` comment: "-1th index is Ok"). All standard code in the codebase simply uses `offsets[i - 1]`. Recommendation: Replace this entire `string_ref_at` helper with the standard pattern: ```cpp const char* raw_str = reinterpret_cast<const char*>(&data[offsets[i - 1]]); size_t str_size = offsets[i] - offsets[i - 1]; ``` Or use `column->get_data_at(i)` when the column object is available (as done in the nullable path at line 252). ########## be/src/exprs/function/function_levenshtein.cpp: ########## @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <algorithm> +#include <cstring> +#include <vector> + +#include "common/status.h" +#include "core/data_type/data_type_number.h" +#include "core/string_ref.h" +#include "exprs/function/function_totype.h" +#include "exprs/function/simple_function_factory.h" +#include "util/simd/vstring_function.h" + +namespace doris { +#include "common/compile_check_begin.h" + +struct NameLevenshtein { + static constexpr auto name = "levenshtein"; +}; + +template <typename LeftDataType, typename RightDataType> +struct LevenshteinImpl { + using ResultDataType = DataTypeInt32; + using ResultPaddedPODArray = PaddedPODArray<Int32>; + + static Status vector_vector(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, + const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { + DCHECK_EQ(loffsets.size(), roffsets.size()); + + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i), + string_ref_at(rdata, roffsets, i)); + } + return Status::OK(); + } + + static Status vector_scalar(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, const StringRef& rdata, + ResultPaddedPODArray& res) { + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i), rdata); + } + return Status::OK(); + } + + static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { + const size_t size = roffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + res[i] = levenshtein_distance(ldata, string_ref_at(rdata, roffsets, i)); + } + return Status::OK(); + } + +private: + static StringRef string_ref_at(const ColumnString::Chars& data, + const ColumnString::Offsets& offsets, size_t i) { + DCHECK_LT(i, offsets.size()); + const size_t begin = (i == 0) ? 0 : offsets[i - 1]; + const size_t end = offsets[i]; + if (end <= begin || end > data.size()) { + return StringRef("", 0); + } + + size_t str_size = end - begin; + if (str_size > 0 && data[end - 1] == '\0') { + --str_size; + } + return StringRef(reinterpret_cast<const char*>(data.data() + begin), str_size); + } + + static void utf8_char_offsets(const StringRef& ref, std::vector<size_t>& offsets) { + offsets.clear(); + offsets.reserve(ref.size); + simd::VStringFunctions::get_char_len(ref.data, ref.size, offsets); + } + + static bool utf8_char_equal(const StringRef& left, size_t left_off, size_t left_next, + const StringRef& right, size_t right_off, size_t right_next) { + const size_t left_len = left_next - left_off; + const size_t right_len = right_next - right_off; + return left_len == right_len && + std::memcmp(left.data + left_off, right.data + right_off, left_len) == 0; + } + + static Int32 levenshtein_distance_ascii(const StringRef& left, const StringRef& right) { Review Comment: **Performance concern — no input size limit:** `levenshtein_distance` is O(m*n) in both time and performs per-row `std::vector` allocations. Two 1MB strings would require ~10^12 operations. There is no guard against extremely long inputs. Other functions in the codebase (e.g., `functions_multi_string_search.cpp`) have explicit size limits for expensive operations. Consider adding a reasonable character-count limit (e.g., 10,000 or 65,535 characters) and returning `Status::InvalidArgument` when exceeded. This would require changing the return type to `Status` (similar to `hamming_distance`). This is a moderate concern — practical usage rarely involves very long strings, but it's a potential DoS vector via crafted SQL. ########## be/src/exprs/function/function_hamming_distance.cpp: ########## @@ -0,0 +1,271 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <algorithm> +#include <cstring> +#include <vector> + +#include "common/status.h" +#include "core/column/column_nullable.h" +#include "core/column/column_string.h" +#include "core/data_type/data_type_number.h" +#include "core/string_ref.h" +#include "exprs/function/function_totype.h" +#include "exprs/function/simple_function_factory.h" +#include "util/simd/vstring_function.h" + +namespace doris { +#include "common/compile_check_begin.h" + +struct NameHammingDistance { + static constexpr auto name = "hamming_distance"; +}; + +template <typename LeftDataType, typename RightDataType> +struct HammingDistanceImpl { + using ResultDataType = DataTypeInt64; + using ResultPaddedPODArray = PaddedPODArray<Int64>; + + static Status vector_vector(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, + const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { + DCHECK_EQ(loffsets.size(), roffsets.size()); + + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, i), + string_ref_at(rdata, roffsets, i), res[i], i)); + } + return Status::OK(); + } + + static Status vector_scalar(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, const StringRef& rdata, + ResultPaddedPODArray& res) { + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, i), rdata, res[i], i)); + } + return Status::OK(); + } + + static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { + const size_t size = roffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + RETURN_IF_ERROR(hamming_distance(ldata, string_ref_at(rdata, roffsets, i), res[i], i)); + } + return Status::OK(); + } + +private: + static StringRef string_ref_at(const ColumnString::Chars& data, + const ColumnString::Offsets& offsets, size_t i) { + DCHECK_LT(i, offsets.size()); + const size_t begin = (i == 0) ? 0 : offsets[i - 1]; + const size_t end = offsets[i]; + if (end <= begin || end > data.size()) { + return StringRef("", 0); + } + + size_t str_size = end - begin; + if (str_size > 0 && data[end - 1] == '\0') { Review Comment: **Non-standard pattern:** Stripping the trailing `\0` is not part of the standard `ColumnString` access pattern. `ColumnString::get_data_at()` returns the size **including** the terminating zero (see `size_at()` comment: "Size of i-th element, including terminating zero"). Other string functions in the codebase do not strip `\0` from raw offset-based access either — they rely on the `StringRef.size` being computed as `offsets[i] - offsets[i-1]`, which already accounts for the terminator. Note that on line 252, `get_data_at()` is used (which includes the `\0` in the size), creating an inconsistency within this same file between how string data is accessed in the non-nullable vs nullable paths. This inconsistency could lead to subtle off-by-one differences: the non-nullable path (using `string_ref_at`) strips `\0` and compares character content only, while the nullable path (using `get_data_at`) includes `\0` in the comparison. For `hamming_distance`, this would mean different results depending on whether the input column is nullable or not. ########## be/src/exprs/function/function_levenshtein.cpp: ########## @@ -0,0 +1,209 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <algorithm> +#include <cstring> +#include <vector> + +#include "common/status.h" +#include "core/data_type/data_type_number.h" +#include "core/string_ref.h" +#include "exprs/function/function_totype.h" +#include "exprs/function/simple_function_factory.h" +#include "util/simd/vstring_function.h" + +namespace doris { +#include "common/compile_check_begin.h" + +struct NameLevenshtein { + static constexpr auto name = "levenshtein"; +}; + +template <typename LeftDataType, typename RightDataType> +struct LevenshteinImpl { + using ResultDataType = DataTypeInt32; + using ResultPaddedPODArray = PaddedPODArray<Int32>; + + static Status vector_vector(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, + const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { + DCHECK_EQ(loffsets.size(), roffsets.size()); + + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i), + string_ref_at(rdata, roffsets, i)); + } + return Status::OK(); + } + + static Status vector_scalar(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, const StringRef& rdata, + ResultPaddedPODArray& res) { + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i), rdata); + } + return Status::OK(); + } + + static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { + const size_t size = roffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + res[i] = levenshtein_distance(ldata, string_ref_at(rdata, roffsets, i)); + } + return Status::OK(); + } + +private: + static StringRef string_ref_at(const ColumnString::Chars& data, + const ColumnString::Offsets& offsets, size_t i) { + DCHECK_LT(i, offsets.size()); + const size_t begin = (i == 0) ? 0 : offsets[i - 1]; + const size_t end = offsets[i]; + if (end <= begin || end > data.size()) { Review Comment: Same issue as in `function_hamming_distance.cpp`: defensive `if` check and non-standard `(i == 0) ? 0 : offsets[i - 1]` pattern. See the comment on the hamming_distance file for details. Additionally, `FunctionBinaryToType` (which `FunctionLevenshtein` uses) employs `use_default_implementation_for_nulls() = true`, so the framework strips nullable wrappers and calls `vector_vector`/`vector_scalar`/`scalar_vector` with non-nullable columns. These methods use `string_ref_at`, but the framework also uses `get_data_at()` in some paths — maintaining consistency with the standard access pattern would be safer. ########## regression-test/data/nereids_p0/sql_functions/string_functions/test_string_function.out: ########## @@ -569,6 +569,93 @@ W520 -- !soundex -- \N +-- !levenshtein -- +0 + +-- !levenshtein -- +3 + +-- !levenshtein -- +2 + +-- !levenshtein -- +0 + +-- !levenshtein -- +3 + +-- !levenshtein -- +3 + +-- !levenshtein -- +\N + +-- !levenshtein -- +\N + +-- !levenshtein -- +1 + +-- !levenshtein -- +1 + +-- !levenshtein -- +2 + +-- !levenshtein -- +1 + +-- !levenshtein -- +1 + +-- !levenshtein_tbl -- +1 3 +2 0 +3 1 +4 \N +5 1 +6 2 +7 3 + +-- !hamming_distance -- +0 + +-- !hamming_distance -- +0 + +-- !hamming_distance -- +1 + +-- !hamming_distance -- +1 + +-- !hamming_distance -- +\N + +-- !hamming_distance -- +\N + +-- !hamming_distance -- +4 + +-- !hamming_distance -- +1 + +-- !hamming_distance -- +2 + +-- !hamming_distance_tbl -- +1 0 +2 1 +3 1 +4 \N +5 4 +6 1 +7 2 + +-- !hamming_distance -- Review Comment: **Orphaned output entry:** This `-- !hamming_distance --` block with `\N` has no corresponding `qt_hamming_distance` query in the `.groovy` test file. After the `qt_hamming_distance_tbl` query (the table test), the groovy file proceeds to `qt_soundex` for non-ASCII testing — there is no additional `qt_hamming_distance` call. This orphaned entry will cause the regression test to fail because the `.out` file has more output blocks than the `.groovy` file produces. The `.out` file should be auto-generated by running the test, not hand-edited. Please regenerate this file. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
