This is an automated email from the ASF dual-hosted git repository.
zclllyybb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 113fd2da342 Add levenshtein and hamming_distance functions (#60412)
113fd2da342 is described below
commit 113fd2da3424f81c41e3ec2ed427074a56d2cf3f
Author: 收集群风 <[email protected]>
AuthorDate: Fri May 29 11:07:46 2026 +0800
Add levenshtein and hamming_distance functions (#60412)
Related Issue: #48203
Related PR: #57144 (reference)
Problem Summary: support levenshtein (Hive) and hamming_distance
(Trino/Presto).
---
.../exprs/function/function_hamming_distance.cpp | 320 +++++++++++++++++++++
be/src/exprs/function/function_levenshtein.cpp | 263 +++++++++++++++++
be/src/exprs/function/simple_function_factory.h | 4 +
be/src/util/simd/vstring_function.h | 16 ++
.../doris/catalog/BuiltinScalarFunctions.java | 4 +
.../functions/executable/StringArithmetic.java | 68 +++++
.../functions/scalar/HammingDistance.java | 78 +++++
.../expressions/functions/scalar/Levenshtein.java | 76 +++++
.../expressions/visitor/ScalarFunctionVisitor.java | 10 +
.../string_functions/test_string_all.out | 197 ++++++++++++-
.../string_functions/test_string_all.groovy | 191 +++++++++++-
11 files changed, 1225 insertions(+), 2 deletions(-)
diff --git a/be/src/exprs/function/function_hamming_distance.cpp
b/be/src/exprs/function/function_hamming_distance.cpp
new file mode 100644
index 00000000000..230b02c32d6
--- /dev/null
+++ b/be/src/exprs/function/function_hamming_distance.cpp
@@ -0,0 +1,320 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <vector>
+
+#include "common/status.h"
+#include "core/column/column_nullable.h"
+#include "core/column/column_string.h"
+#include "core/data_type/data_type_number.h"
+#include "core/string_ref.h"
+#include "exprs/function/simple_function_factory.h"
+#include "util/simd/vstring_function.h"
+
+namespace doris {
+
+class FunctionHammingDistance : public IFunction {
+public:
+ using ResultDataType = DataTypeInt64;
+ using ResultPaddedPODArray = PaddedPODArray<Int64>;
+ using ResultColumnType = ColumnVector<ResultDataType::PType>;
+
+ static constexpr auto name = "hamming_distance";
+
+ static FunctionPtr create() { return
std::make_shared<FunctionHammingDistance>(); }
+
+ String get_name() const override { return name; }
+ size_t get_number_of_arguments() const override { return 2; }
+
+ DataTypePtr get_return_type_impl(const DataTypes& arguments) const
override {
+ const bool has_nullable = std::ranges::any_of(
+ arguments, [](const DataTypePtr& type) { return
type->is_nullable(); });
+ if (has_nullable) {
+ return make_nullable(std::make_shared<ResultDataType>());
+ }
+ return std::make_shared<ResultDataType>();
+ }
+
+ bool use_default_implementation_for_nulls() const override { return false;
}
+
+ Status execute_impl(FunctionContext* /*context*/, Block& block, const
ColumnNumbers& arguments,
+ uint32_t result, size_t input_rows_count) const
override {
+ const auto& [left_col, left_const] =
+ unpack_if_const(block.get_by_position(arguments[0]).column);
+ const auto& [right_col, right_const] =
+ unpack_if_const(block.get_by_position(arguments[1]).column);
+
+ const auto* left_nullable =
check_and_get_column<ColumnNullable>(left_col.get());
+ const auto* right_nullable =
check_and_get_column<ColumnNullable>(right_col.get());
+
+ const IColumn* left_nested =
+ left_nullable ? &left_nullable->get_nested_column() :
left_col.get();
+ const IColumn* right_nested =
+ right_nullable ? &right_nullable->get_nested_column() :
right_col.get();
+
+ const auto* left_str_col = assert_cast<const
ColumnString*>(left_nested);
+ const auto* right_str_col = assert_cast<const
ColumnString*>(right_nested);
+
+ auto res_col = ResultColumnType::create(input_rows_count);
+ auto& res_data = res_col->get_data();
+
+ const NullMap* left_null_map =
+ left_nullable ? &left_nullable->get_null_map_data() : nullptr;
+ const NullMap* right_null_map =
+ right_nullable ? &right_nullable->get_null_map_data() :
nullptr;
+ const bool has_nullable = left_null_map != nullptr || right_null_map
!= nullptr;
+
+ if (!has_nullable) {
+ if (left_const) {
+
RETURN_IF_ERROR(scalar_vector(left_str_col->get_data_at(0).trim_tail_padding_zero(),
+ *right_str_col, res_data));
+ } else if (right_const) {
+ RETURN_IF_ERROR(vector_scalar(
+ *left_str_col,
right_str_col->get_data_at(0).trim_tail_padding_zero(),
+ res_data));
+ } else {
+ RETURN_IF_ERROR(vector_vector(*left_str_col, *right_str_col,
res_data));
+ }
+ block.replace_by_position(result, std::move(res_col));
+ return Status::OK();
+ }
+
+ auto null_col = ColumnUInt8::create(input_rows_count, 0);
+ auto& null_map = null_col->get_data();
+ if (left_const) {
+ if (left_null_map && (*left_null_map)[0]) {
+ std::fill(null_map.begin(), null_map.end(), 1);
+ block.replace_by_position(
+ result, ColumnNullable::create(std::move(res_col),
std::move(null_col)));
+ return Status::OK();
+ }
+
+ const auto left =
left_str_col->get_data_at(0).trim_tail_padding_zero();
+ RETURN_IF_ERROR(scalar_vector_nullable(left, *right_str_col,
right_null_map, res_data,
+ null_map));
+ } else if (right_const) {
+ if (right_null_map && (*right_null_map)[0]) {
+ std::fill(null_map.begin(), null_map.end(), 1);
+ block.replace_by_position(
+ result, ColumnNullable::create(std::move(res_col),
std::move(null_col)));
+ return Status::OK();
+ }
+
+ RETURN_IF_ERROR(vector_scalar_nullable(
+ *left_str_col,
right_str_col->get_data_at(0).trim_tail_padding_zero(),
+ left_null_map, res_data, null_map));
+ } else {
+ for (size_t i = 0; i < input_rows_count; ++i) {
+ const bool left_is_null = left_null_map && (*left_null_map)[i];
+ const bool right_is_null = right_null_map &&
(*right_null_map)[i];
+ if (left_is_null || right_is_null) {
+ null_map[i] = 1;
+ res_data[i] = 0;
+ continue;
+ }
+
+ RETURN_IF_ERROR(hamming_distance(
+ left_str_col->get_data_at(i).trim_tail_padding_zero(),
+
right_str_col->get_data_at(i).trim_tail_padding_zero(), res_data[i], i));
+ }
+ }
+
+ block.replace_by_position(result,
+ ColumnNullable::create(std::move(res_col),
std::move(null_col)));
+ return Status::OK();
+ }
+
+private:
+ static Status vector_vector(const ColumnString& lcol, const ColumnString&
rcol,
+ ResultPaddedPODArray& res) {
+ DCHECK_EQ(lcol.size(), rcol.size());
+
+ const size_t size = lcol.size();
+ res.resize(size);
+ std::vector<size_t> left_offsets;
+ std::vector<size_t> right_offsets;
+ for (size_t i = 0; i < size; ++i) {
+ const auto left = lcol.get_data_at(i).trim_tail_padding_zero();
+ const auto right = rcol.get_data_at(i).trim_tail_padding_zero();
+ RETURN_IF_ERROR(hamming_distance_with_offsets(
+ left, left_offsets, false,
simd::VStringFunctions::is_ascii(left), right,
+ right_offsets, false,
simd::VStringFunctions::is_ascii(right), res[i], i));
+ }
+ return Status::OK();
+ }
+
+ static Status vector_scalar(const ColumnString& lcol, const StringRef&
rdata,
+ ResultPaddedPODArray& res) {
+ const size_t size = lcol.size();
+ res.resize(size);
+ const bool right_ascii = simd::VStringFunctions::is_ascii(rdata);
+ std::vector<size_t> right_offsets;
+ simd::VStringFunctions::get_utf8_char_offsets(rdata, right_offsets);
+ std::vector<size_t> left_offsets;
+ for (size_t i = 0; i < size; ++i) {
+ const auto left = lcol.get_data_at(i).trim_tail_padding_zero();
+ RETURN_IF_ERROR(hamming_distance_with_offsets(
+ left, left_offsets, false,
simd::VStringFunctions::is_ascii(left), rdata,
+ right_offsets, true, right_ascii, res[i], i));
+ }
+ return Status::OK();
+ }
+
+ static Status scalar_vector(const StringRef& ldata, const ColumnString&
rcol,
+ ResultPaddedPODArray& res) {
+ const size_t size = rcol.size();
+ res.resize(size);
+ const bool left_ascii = simd::VStringFunctions::is_ascii(ldata);
+ std::vector<size_t> left_offsets;
+ simd::VStringFunctions::get_utf8_char_offsets(ldata, left_offsets);
+ std::vector<size_t> right_offsets;
+ for (size_t i = 0; i < size; ++i) {
+ const auto right = rcol.get_data_at(i).trim_tail_padding_zero();
+ RETURN_IF_ERROR(hamming_distance_with_offsets(
+ ldata, left_offsets, true, left_ascii, right,
right_offsets, false,
+ simd::VStringFunctions::is_ascii(right), res[i], i));
+ }
+ return Status::OK();
+ }
+
+ static Status vector_scalar_nullable(const ColumnString& lcol, const
StringRef& rdata,
+ const NullMap* left_null_map,
ResultPaddedPODArray& res,
+ NullMap& null_map) {
+ const size_t size = lcol.size();
+ res.resize(size);
+ const bool right_ascii = simd::VStringFunctions::is_ascii(rdata);
+ std::vector<size_t> right_offsets;
+ simd::VStringFunctions::get_utf8_char_offsets(rdata, right_offsets);
+ std::vector<size_t> left_offsets;
+ for (size_t i = 0; i < size; ++i) {
+ if (left_null_map && (*left_null_map)[i]) {
+ null_map[i] = 1;
+ res[i] = 0;
+ continue;
+ }
+
+ const auto left = lcol.get_data_at(i).trim_tail_padding_zero();
+ RETURN_IF_ERROR(hamming_distance_with_offsets(
+ left, left_offsets, false,
simd::VStringFunctions::is_ascii(left), rdata,
+ right_offsets, true, right_ascii, res[i], i));
+ }
+ return Status::OK();
+ }
+
+ static Status scalar_vector_nullable(const StringRef& ldata, const
ColumnString& rcol,
+ const NullMap* right_null_map,
ResultPaddedPODArray& res,
+ NullMap& null_map) {
+ const size_t size = rcol.size();
+ res.resize(size);
+ const bool left_ascii = simd::VStringFunctions::is_ascii(ldata);
+ std::vector<size_t> left_offsets;
+ simd::VStringFunctions::get_utf8_char_offsets(ldata, left_offsets);
+ std::vector<size_t> right_offsets;
+ for (size_t i = 0; i < size; ++i) {
+ if (right_null_map && (*right_null_map)[i]) {
+ null_map[i] = 1;
+ res[i] = 0;
+ continue;
+ }
+
+ const auto right = rcol.get_data_at(i).trim_tail_padding_zero();
+ RETURN_IF_ERROR(hamming_distance_with_offsets(
+ ldata, left_offsets, true, left_ascii, right,
right_offsets, false,
+ simd::VStringFunctions::is_ascii(right), res[i], i));
+ }
+ return Status::OK();
+ }
+
+ static Status hamming_distance_ascii(const StringRef& left, const
StringRef& right,
+ Int64& result, size_t row) {
+ if (left.size != right.size) {
+ return Status::InvalidArgument(
+ "hamming_distance requires strings of the same length at
row {}", row);
+ }
+
+ Int64 distance = 0;
+ for (size_t i = 0; i < left.size; ++i) {
+ distance += static_cast<Int64>(left.data[i] != right.data[i]);
+ }
+ result = distance;
+ return Status::OK();
+ }
+
+ static Status hamming_distance_utf8(const StringRef& left,
+ const std::vector<size_t>&
left_offsets,
+ const StringRef& right,
+ const std::vector<size_t>&
right_offsets, Int64& result,
+ size_t row) {
+ if (left_offsets.size() != right_offsets.size()) {
+ return Status::InvalidArgument(
+ "hamming_distance requires strings of the same length at
row {}", row);
+ }
+
+ Int64 distance = 0;
+ const size_t len = left_offsets.size();
+ for (size_t i = 0; i + 1 < len; ++i) {
+ const size_t left_off = left_offsets[i];
+ const size_t left_next = left_offsets[i + 1];
+ const size_t right_off = right_offsets[i];
+ const size_t right_next = right_offsets[i + 1];
+ distance +=
static_cast<Int64>(!simd::VStringFunctions::utf8_char_equal(
+ left, left_off, left_next, right, right_off, right_next));
+ }
+ if (len > 0) {
+ const size_t left_off = left_offsets[len - 1];
+ const size_t right_off = right_offsets[len - 1];
+ distance +=
static_cast<Int64>(!simd::VStringFunctions::utf8_char_equal(
+ left, left_off, left.size, right, right_off, right.size));
+ }
+
+ result = distance;
+ return Status::OK();
+ }
+
+ static Status hamming_distance_with_offsets(
+ const StringRef& left, std::vector<size_t>& left_offsets, bool
left_offsets_ready,
+ bool left_ascii, const StringRef& right, std::vector<size_t>&
right_offsets,
+ bool right_offsets_ready, bool right_ascii, Int64& result, size_t
row) {
+ if (left_ascii && right_ascii) {
+ return hamming_distance_ascii(left, right, result, row);
+ }
+
+ if (!left_offsets_ready) {
+ simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
+ }
+ if (!right_offsets_ready) {
+ simd::VStringFunctions::get_utf8_char_offsets(right,
right_offsets);
+ }
+ return hamming_distance_utf8(left, left_offsets, right, right_offsets,
result, row);
+ }
+
+ static Status hamming_distance(const StringRef& left, const StringRef&
right, Int64& result,
+ size_t row) {
+ std::vector<size_t> left_offsets;
+ std::vector<size_t> right_offsets;
+ return hamming_distance_with_offsets(
+ left, left_offsets, false,
simd::VStringFunctions::is_ascii(left), right,
+ right_offsets, false, simd::VStringFunctions::is_ascii(right),
result, row);
+ }
+};
+
+void register_function_hamming_distance(SimpleFunctionFactory& factory) {
+ factory.register_function<FunctionHammingDistance>();
+}
+
+} // namespace doris
diff --git a/be/src/exprs/function/function_levenshtein.cpp
b/be/src/exprs/function/function_levenshtein.cpp
new file mode 100644
index 00000000000..27d24be3ce6
--- /dev/null
+++ b/be/src/exprs/function/function_levenshtein.cpp
@@ -0,0 +1,263 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <vector>
+
+#include "common/status.h"
+#include "core/data_type/data_type_number.h"
+#include "core/string_ref.h"
+#include "exprs/function/function_totype.h"
+#include "exprs/function/simple_function_factory.h"
+#include "util/simd/vstring_function.h"
+
+namespace doris {
+
+struct NameLevenshtein {
+ static constexpr auto name = "levenshtein";
+};
+
+template <typename LeftDataType, typename RightDataType>
+struct LevenshteinImpl {
+ using ResultDataType = DataTypeInt32;
+ using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+ static Status vector_vector(const ColumnString::Chars& ldata,
+ const ColumnString::Offsets& loffsets,
+ const ColumnString::Chars& rdata,
+ const ColumnString::Offsets& roffsets,
ResultPaddedPODArray& res) {
+ DCHECK_EQ(loffsets.size(), roffsets.size());
+
+ const size_t size = loffsets.size();
+ res.resize(size);
+ std::vector<size_t> left_offsets;
+ std::vector<size_t> right_offsets;
+ for (size_t i = 0; i < size; ++i) {
+ res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i),
+ string_ref_at(rdata, roffsets, i),
left_offsets,
+ right_offsets);
+ }
+ return Status::OK();
+ }
+
+ static Status vector_scalar(const ColumnString::Chars& ldata,
+ const ColumnString::Offsets& loffsets, const
StringRef& rdata,
+ ResultPaddedPODArray& res) {
+ const size_t size = loffsets.size();
+ res.resize(size);
+ const auto right = rdata.trim_tail_padding_zero();
+ const bool right_ascii = simd::VStringFunctions::is_ascii(right);
+ std::vector<size_t> right_offsets;
+ simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
+ std::vector<size_t> left_offsets;
+ for (size_t i = 0; i < size; ++i) {
+ res[i] =
levenshtein_distance_with_right_offsets(string_ref_at(ldata, loffsets, i),
+ left_offsets,
right, right_offsets,
+ right_ascii);
+ }
+ return Status::OK();
+ }
+
+ static Status scalar_vector(const StringRef& ldata, const
ColumnString::Chars& rdata,
+ const ColumnString::Offsets& roffsets,
ResultPaddedPODArray& res) {
+ const size_t size = roffsets.size();
+ res.resize(size);
+ const auto left = ldata.trim_tail_padding_zero();
+ const bool left_ascii = simd::VStringFunctions::is_ascii(left);
+ std::vector<size_t> left_offsets;
+ simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
+ std::vector<size_t> right_offsets;
+ for (size_t i = 0; i < size; ++i) {
+ res[i] = levenshtein_distance_with_left_offsets(left,
left_offsets, left_ascii,
+
string_ref_at(rdata, roffsets, i),
+ right_offsets);
+ }
+ return Status::OK();
+ }
+
+private:
+ static StringRef string_ref_at(const ColumnString::Chars& data,
+ const ColumnString::Offsets& offsets,
size_t i) {
+ DCHECK_LT(i, offsets.size());
+ const auto idx = static_cast<ssize_t>(i);
+ return StringRef(data.data() + offsets[idx - 1], offsets[idx] -
offsets[idx - 1])
+ .trim_tail_padding_zero();
+ }
+
+ static Int32 levenshtein_distance_utf8(const StringRef& left,
+ const std::vector<size_t>&
left_offsets,
+ const StringRef& right,
+ const std::vector<size_t>&
right_offsets) {
+ const StringRef* left_ref = &left;
+ const StringRef* right_ref = &right;
+ const std::vector<size_t>* left_offsets_ref = &left_offsets;
+ const std::vector<size_t>* right_offsets_ref = &right_offsets;
+ if (right_offsets_ref->size() > left_offsets_ref->size()) {
+ std::swap(left_offsets_ref, right_offsets_ref);
+ std::swap(left_ref, right_ref);
+ }
+
+ const size_t m = left_offsets_ref->size();
+ const size_t n = right_offsets_ref->size();
+
+ std::vector<Int32> prev(n + 1);
+ std::vector<Int32> curr(n + 1);
+ for (size_t j = 0; j <= n; ++j) {
+ prev[j] = static_cast<Int32>(j);
+ }
+
+ for (size_t i = 1; i <= m; ++i) {
+ curr[0] = static_cast<Int32>(i);
+ const size_t left_off = (*left_offsets_ref)[i - 1];
+ const size_t left_next = i < m ? (*left_offsets_ref)[i] :
left_ref->size;
+
+ for (size_t j = 1; j <= n; ++j) {
+ const size_t right_off = (*right_offsets_ref)[j - 1];
+ const size_t right_next = j < n ? (*right_offsets_ref)[j] :
right_ref->size;
+
+ const Int32 cost =
+ simd::VStringFunctions::utf8_char_equal(*left_ref,
left_off, left_next,
+ *right_ref,
right_off, right_next)
+ ? 0
+ : 1;
+
+ const Int32 insert_cost = curr[j - 1] + 1;
+ const Int32 delete_cost = prev[j] + 1;
+ const Int32 replace_cost = prev[j - 1] + cost;
+ curr[j] = std::min(std::min(insert_cost, delete_cost),
replace_cost);
+ }
+ std::swap(prev, curr);
+ }
+
+ return prev[n];
+ }
+
+ static Int32 levenshtein_distance_ascii(const StringRef& left, const
StringRef& right) {
+ const StringRef* left_ref = &left;
+ const StringRef* right_ref = &right;
+ size_t m = left.size;
+ size_t n = right.size;
+
+ if (n > m) {
+ std::swap(left_ref, right_ref);
+ std::swap(m, n);
+ }
+
+ std::vector<Int32> prev(n + 1);
+ std::vector<Int32> curr(n + 1);
+ for (size_t j = 0; j <= n; ++j) {
+ prev[j] = static_cast<Int32>(j);
+ }
+
+ for (size_t i = 1; i <= m; ++i) {
+ curr[0] = static_cast<Int32>(i);
+ const char left_char = left_ref->data[i - 1];
+
+ for (size_t j = 1; j <= n; ++j) {
+ const Int32 cost = left_char == right_ref->data[j - 1] ? 0 : 1;
+ const Int32 insert_cost = curr[j - 1] + 1;
+ const Int32 delete_cost = prev[j] + 1;
+ const Int32 replace_cost = prev[j - 1] + cost;
+ curr[j] = std::min(std::min(insert_cost, delete_cost),
replace_cost);
+ }
+ std::swap(prev, curr);
+ }
+
+ return prev[n];
+ }
+
+ static Int32 levenshtein_distance(const StringRef& left, const StringRef&
right,
+ std::vector<size_t>& left_offsets,
+ std::vector<size_t>& right_offsets) {
+ const bool left_ascii = simd::VStringFunctions::is_ascii(left);
+ const bool right_ascii = simd::VStringFunctions::is_ascii(right);
+ if (left_ascii && right_ascii) {
+ return levenshtein_distance_ascii(left, right);
+ }
+
+ if (left.size == 0) {
+ return
static_cast<Int32>(simd::VStringFunctions::get_char_len(right.data,
right.size));
+ }
+ if (right.size == 0) {
+ return
static_cast<Int32>(simd::VStringFunctions::get_char_len(left.data, left.size));
+ }
+
+ simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
+ simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
+ return levenshtein_distance_utf8(left, left_offsets, right,
right_offsets);
+ }
+
+ static Int32 levenshtein_distance_with_right_offsets(const StringRef& left,
+ std::vector<size_t>&
left_offsets,
+ const StringRef&
right,
+ const
std::vector<size_t>& right_offsets,
+ bool right_ascii) {
+ const bool left_ascii = simd::VStringFunctions::is_ascii(left);
+ if (left_ascii && right_ascii) {
+ return levenshtein_distance_ascii(left, right);
+ }
+
+ if (left.size == 0) {
+ return static_cast<Int32>(right_offsets.size());
+ }
+ if (right.size == 0) {
+ return left_ascii ? static_cast<Int32>(left.size)
+ : static_cast<Int32>(
+
simd::VStringFunctions::get_char_len(left.data, left.size));
+ }
+
+ simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
+ return levenshtein_distance_utf8(left, left_offsets, right,
right_offsets);
+ }
+
+ static Int32 levenshtein_distance_with_left_offsets(const StringRef& left,
+ const
std::vector<size_t>& left_offsets,
+ bool left_ascii, const
StringRef& right,
+ std::vector<size_t>&
right_offsets) {
+ const bool right_ascii = simd::VStringFunctions::is_ascii(right);
+ if (left_ascii && right_ascii) {
+ return levenshtein_distance_ascii(left, right);
+ }
+
+ if (left.size == 0) {
+ return static_cast<Int32>(
+ right_ascii ? right.size
+ :
simd::VStringFunctions::get_char_len(right.data, right.size));
+ }
+ if (right.size == 0) {
+ return static_cast<Int32>(left_offsets.size());
+ }
+
+ simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
+ return levenshtein_distance_utf8(left, left_offsets, right,
right_offsets);
+ }
+
+ static Int32 levenshtein_distance(const StringRef& left, const StringRef&
right) {
+ std::vector<size_t> left_offsets;
+ std::vector<size_t> right_offsets;
+ return levenshtein_distance(left, right, left_offsets, right_offsets);
+ }
+};
+
+using FunctionLevenshtein =
+ FunctionBinaryToType<DataTypeString, DataTypeString, LevenshteinImpl,
NameLevenshtein>;
+
+void register_function_levenshtein(SimpleFunctionFactory& factory) {
+ factory.register_function<FunctionLevenshtein>();
+}
+
+} // namespace doris
diff --git a/be/src/exprs/function/simple_function_factory.h
b/be/src/exprs/function/simple_function_factory.h
index c1ebcc34535..1d7e26fe559 100644
--- a/be/src/exprs/function/simple_function_factory.h
+++ b/be/src/exprs/function/simple_function_factory.h
@@ -120,6 +120,8 @@ void register_function_ai(SimpleFunctionFactory& factory);
void register_function_score(SimpleFunctionFactory& factory);
void register_function_variant_type(SimpleFunctionFactory& factory);
void register_function_binary(SimpleFunctionFactory& factory);
+void register_function_levenshtein(SimpleFunctionFactory& factory);
+void register_function_hamming_distance(SimpleFunctionFactory& factory);
void register_function_soundex(SimpleFunctionFactory& factory);
#if defined(BE_TEST) && !defined(BE_BENCHMARK)
@@ -356,6 +358,8 @@ public:
register_function_ai(instance);
register_function_score(instance);
register_function_binary(instance);
+ register_function_levenshtein(instance);
+ register_function_hamming_distance(instance);
register_function_soundex(instance);
register_function_json_transform(instance);
register_function_json_hash(instance);
diff --git a/be/src/util/simd/vstring_function.h
b/be/src/util/simd/vstring_function.h
index b583dd67fc4..27375a0ea76 100644
--- a/be/src/util/simd/vstring_function.h
+++ b/be/src/util/simd/vstring_function.h
@@ -27,6 +27,7 @@
#include <array>
#include <cstddef>
#include <cstdint>
+#include <cstring>
#include "core/string_ref.h"
#include "util/simd/lower_upper_impl.h"
@@ -345,6 +346,21 @@ public:
return char_len;
}
+ static inline void get_utf8_char_offsets(const StringRef& ref,
std::vector<size_t>& offsets) {
+ offsets.clear();
+ offsets.reserve(ref.size);
+ get_char_len(ref.data, ref.size, offsets);
+ }
+
+ static inline bool utf8_char_equal(const StringRef& left, size_t left_off,
size_t left_next,
+ const StringRef& right, size_t
right_off,
+ size_t right_next) {
+ const size_t left_len = left_next - left_off;
+ const size_t right_len = right_next - right_off;
+ return left_len == right_len &&
+ std::memcmp(left.data + left_off, right.data + right_off,
left_len) == 0;
+ }
+
// utf8-encoding:
// - 1-byte: 0xxx_xxxx;
// - 2-byte: 110x_xxxx 10xx_xxxx;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index eb09cb7ab27..0883b93e5a8 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -235,6 +235,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.GetVariantTyp
import org.apache.doris.nereids.trees.expressions.functions.scalar.Greatest;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Grouping;
import org.apache.doris.nereids.trees.expressions.functions.scalar.GroupingId;
+import
org.apache.doris.nereids.trees.expressions.functions.scalar.HammingDistance;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Hex;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.HllCardinality;
import org.apache.doris.nereids.trees.expressions.functions.scalar.HllEmpty;
@@ -319,6 +320,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.Lcm;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Least;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Left;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Length;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.Levenshtein;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Ln;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Locate;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Log;
@@ -804,6 +806,7 @@ public class BuiltinScalarFunctions implements
FunctionHelper {
scalar(Greatest.class, "greatest"),
scalar(Grouping.class, "grouping"),
scalar(GroupingId.class, "grouping_id"),
+ scalar(HammingDistance.class, "hamming_distance"),
scalar(Hex.class, "hex"),
scalar(HllCardinality.class, "hll_cardinality"),
scalar(HllEmpty.class, "hll_empty"),
@@ -891,6 +894,7 @@ public class BuiltinScalarFunctions implements
FunctionHelper {
scalar(LastQueryId.class, "last_query_id"),
scalar(Lcm.class, "lcm"),
scalar(Least.class, "least"),
+ scalar(Levenshtein.class, "levenshtein"),
scalar(Left.class, "left", "strleft"),
scalar(Length.class, "length", "octet_length"),
scalar(Crc32.class, "crc32"),
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java
index 0172c3b4339..570d0bb4b98 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java
@@ -1123,6 +1123,74 @@ public class StringArithmetic {
return castStringLikeLiteral(first, result);
}
+ /**
+ * Executable arithmetic functions levenshtein
+ */
+ @ExecFunction(name = "levenshtein")
+ public static Expression levenshtein(StringLikeLiteral first,
StringLikeLiteral second) {
+ int[] left = first.getValue().codePoints().toArray();
+ int[] right = second.getValue().codePoints().toArray();
+
+ if (right.length > left.length) {
+ int[] tmp = left;
+ left = right;
+ right = tmp;
+ }
+
+ int m = left.length;
+ int n = right.length;
+ if (n == 0) {
+ return new IntegerLiteral(m);
+ }
+ if (m == 0) {
+ return new IntegerLiteral(n);
+ }
+
+ int[] prev = new int[n + 1];
+ int[] curr = new int[n + 1];
+ for (int j = 0; j <= n; j++) {
+ prev[j] = j;
+ }
+
+ for (int i = 1; i <= m; i++) {
+ curr[0] = i;
+ int leftChar = left[i - 1];
+ for (int j = 1; j <= n; j++) {
+ int cost = leftChar == right[j - 1] ? 0 : 1;
+ int insertCost = curr[j - 1] + 1;
+ int deleteCost = prev[j] + 1;
+ int replaceCost = prev[j - 1] + cost;
+ curr[j] = Math.min(insertCost, Math.min(deleteCost,
replaceCost));
+ }
+ int[] tmp = prev;
+ prev = curr;
+ curr = tmp;
+ }
+
+ return new IntegerLiteral(prev[n]);
+ }
+
+ /**
+ * Executable arithmetic functions hamming_distance
+ */
+ @ExecFunction(name = "hamming_distance")
+ public static Expression hammingDistance(StringLikeLiteral first,
StringLikeLiteral second) {
+ int[] left = first.getValue().codePoints().toArray();
+ int[] right = second.getValue().codePoints().toArray();
+
+ if (left.length != right.length) {
+ throw new AnalysisException("hamming_distance requires strings of
the same length");
+ }
+
+ long distance = 0;
+ for (int i = 0; i < left.length; i++) {
+ if (left[i] != right[i]) {
+ distance++;
+ }
+ }
+ return new BigIntLiteral(distance);
+ }
+
/**
* Executable arithmetic functions make_set
*/
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java
new file mode 100644
index 00000000000..a874ed7a912
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.BigIntType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'hamming_distance'.
+ */
+public class HammingDistance extends ScalarFunction
+ implements BinaryExpression, ExplicitlyCastableSignature,
PropagateNullable {
+
+ public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+ FunctionSignature.ret(BigIntType.INSTANCE)
+ .args(VarcharType.SYSTEM_DEFAULT,
VarcharType.SYSTEM_DEFAULT),
+ FunctionSignature.ret(BigIntType.INSTANCE)
+ .args(StringType.INSTANCE, StringType.INSTANCE)
+ );
+
+ /**
+ * constructor with 2 arguments.
+ */
+ public HammingDistance(Expression arg0, Expression arg1) {
+ super("hamming_distance", arg0, arg1);
+ }
+
+ /** constructor for withChildren and reuse signature */
+ private HammingDistance(ScalarFunctionParams functionParams) {
+ super(functionParams);
+ }
+
+ /**
+ * withChildren.
+ */
+ @Override
+ public HammingDistance withChildren(List<Expression> children) {
+ Preconditions.checkArgument(children.size() == 2);
+ return new HammingDistance(getFunctionParams(children));
+ }
+
+ @Override
+ public List<FunctionSignature> getSignatures() {
+ return SIGNATURES;
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitHammingDistance(this, context);
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Levenshtein.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Levenshtein.java
new file mode 100644
index 00000000000..c1095b27a26
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Levenshtein.java
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.IntegerType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'levenshtein'.
+ */
+public class Levenshtein extends ScalarFunction
+ implements BinaryExpression, ExplicitlyCastableSignature,
PropagateNullable {
+
+ public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+
FunctionSignature.ret(IntegerType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT,
VarcharType.SYSTEM_DEFAULT),
+
FunctionSignature.ret(IntegerType.INSTANCE).args(StringType.INSTANCE,
StringType.INSTANCE)
+ );
+
+ /**
+ * constructor with 2 arguments.
+ */
+ public Levenshtein(Expression arg0, Expression arg1) {
+ super("levenshtein", arg0, arg1);
+ }
+
+ /** constructor for withChildren and reuse signature */
+ private Levenshtein(ScalarFunctionParams functionParams) {
+ super(functionParams);
+ }
+
+ /**
+ * withChildren.
+ */
+ @Override
+ public Levenshtein withChildren(List<Expression> children) {
+ Preconditions.checkArgument(children.size() == 2);
+ return new Levenshtein(getFunctionParams(children));
+ }
+
+ @Override
+ public List<FunctionSignature> getSignatures() {
+ return SIGNATURES;
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitLevenshtein(this, context);
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index 52194f5bc5b..ce9deb2776d 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -248,6 +248,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.Gcd;
import org.apache.doris.nereids.trees.expressions.functions.scalar.GetFormat;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.GetVariantType;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Greatest;
+import
org.apache.doris.nereids.trees.expressions.functions.scalar.HammingDistance;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Hex;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.HllCardinality;
import org.apache.doris.nereids.trees.expressions.functions.scalar.HllEmpty;
@@ -338,6 +339,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.Lcm;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Least;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Left;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Length;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.Levenshtein;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Ln;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Locate;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Log;
@@ -1898,6 +1900,14 @@ public interface ScalarFunctionVisitor<R, C> {
return visitScalarFunction(locate, context);
}
+ default R visitHammingDistance(HammingDistance hammingDistance, C context)
{
+ return visitScalarFunction(hammingDistance, context);
+ }
+
+ default R visitLevenshtein(Levenshtein levenshtein, C context) {
+ return visitScalarFunction(levenshtein, context);
+ }
+
default R visitLog(Log log, C context) {
return visitScalarFunction(log, context);
}
diff --git
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out
index d126d2cd8ea..c770570ff89 100644
---
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out
+++
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out
@@ -965,6 +965,202 @@ S530 S530
-- !soundex_330 --
R163 R163
+-- !levenshtein_331 --
+0 3 2 1 1
+
+-- !levenshtein_332 --
+0 3 3 \N \N
+
+-- !levenshtein_333 --
+2 1 1
+
+-- !levenshtein_334 --
+\N 2 1
+
+-- !levenshtein_tbl --
+1 3
+2 0
+3 1
+4 \N
+5 1
+6 2
+7 3
+8 2
+9 1
+
+-- !levenshtein_nn_vector_vector --
+1 1
+2 4
+3 1
+4 2
+
+-- !levenshtein_nn_vector_scalar_ascii --
+1 0
+
+-- !levenshtein_nn_scalar_vector_ascii --
+1 0
+
+-- !levenshtein_nn_vector_scalar_utf8 --
+3 1
+
+-- !levenshtein_nn_scalar_vector_utf8 --
+3 1
+
+-- !levenshtein_vector_scalar_nullable --
+2 0
+4 \N
+
+-- !levenshtein_scalar_vector_nullable --
+2 0
+4 \N
+
+-- !levenshtein_vector_scalar_utf8 --
+5 1
+7 2
+
+-- !levenshtein_scalar_vector_utf8 --
+5 1
+7 2
+
+-- !levenshtein_vector_scalar_empty_utf8 --
+5 3
+8 2
+
+-- !levenshtein_scalar_vector_empty_utf8 --
+5 3
+8 2
+
+-- !levenshtein_lv_nn_vector_vector --
+1 3
+2 3
+3 3
+4 3
+5 1
+6 1
+7 3
+8 3
+
+-- !levenshtein_lv_nn_vector_scalar_empty --
+1 3
+2 6
+3 0
+4 3
+5 2
+6 3
+7 0
+8 3
+
+-- !levenshtein_lv_nn_scalar_vector_empty --
+1 3
+2 6
+3 0
+4 3
+5 2
+6 3
+7 0
+8 3
+
+-- !levenshtein_lv_nn_vector_scalar_ascii --
+1 3
+2 0
+3 6
+4 3
+
+-- !levenshtein_lv_nn_scalar_vector_ascii --
+1 3
+2 0
+3 6
+4 3
+
+-- !levenshtein_lv_nn_vector_scalar_utf8 --
+5 1
+6 0
+7 3
+8 0
+
+-- !levenshtein_lv_nn_scalar_vector_utf8 --
+5 1
+6 0
+7 3
+8 0
+
+-- !hamming_distance_333 --
+0 0 1 1
+
+-- !hamming_distance_334 --
+0 \N \N
+
+-- !hamming_distance_335 --
+4 1 2
+
+-- !hamming_distance_336 --
+\N \N \N
+
+-- !hamming_distance_tbl --
+1 0
+2 1
+3 1
+4 \N
+5 4
+6 1
+7 2
+
+-- !hamming_distance_nn_vector_vector --
+1 1
+2 4
+3 1
+4 2
+
+-- !hamming_distance_nn_vector_scalar_ascii --
+1 0
+
+-- !hamming_distance_nn_scalar_vector_ascii --
+1 0
+
+-- !hamming_distance_nn_vector_scalar_utf8 --
+3 1
+
+-- !hamming_distance_nn_scalar_vector_utf8 --
+3 1
+
+-- !hamming_distance_vector_scalar_nullable --
+1 0
+2 0
+4 \N
+
+-- !hamming_distance_scalar_vector_nullable --
+1 0
+2 0
+4 \N
+
+-- !hamming_distance_vector_scalar_nullable_utf8 --
+3 1
+
+-- !hamming_distance_scalar_vector_nullable_utf8 --
+3 1
+
+-- !hamming_distance_left_const_null_nullable --
+1 \N
+4 \N
+
+-- !hamming_distance_right_const_null_nullable --
+1 \N
+4 \N
+
+-- !hamming_distance_left_cast_null_nullable --
+1 \N
+4 \N
+
+-- !hamming_distance_right_cast_null_nullable --
+1 \N
+4 \N
+
+-- !nereids_levenshtein_337 --
+3 \N 1
+
+-- !nereids_hamming_distance_338 --
+1 \N 1
+
-- !space_333 --
@@ -1411,4 +1607,3 @@ Hello Test123
-- !xpath_string_486 --
123
-
diff --git
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy
index 9d7123b03a7..094014504bd 100644
---
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy
+++
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy
@@ -753,6 +753,195 @@ suite("string_functions_all") {
testFoldConst("SELECT soundex('R@b-e123rt'), soundex('Robert');")
// SOUNDEX tests with non-ASCII characters - Skipped (not supported)
+ // LEVENSHTEIN tests
+ qt_levenshtein_331 "SELECT levenshtein('', ''), levenshtein('kitten',
'sitting'), levenshtein('flaw', 'lawn'), levenshtein('你好', '你们'),
levenshtein('数据库', '数据');"
+ testFoldConst("SELECT levenshtein('', ''), levenshtein('kitten',
'sitting'), levenshtein('flaw', 'lawn'), levenshtein('你好', '你们'),
levenshtein('数据库', '数据');")
+ qt_levenshtein_332 "SELECT levenshtein('abc', 'abc'), levenshtein('abc',
''), levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc',
NULL);"
+ testFoldConst("SELECT levenshtein('abc', 'abc'), levenshtein('abc', ''),
levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc', NULL);")
+ qt_levenshtein_333 "SELECT levenshtein('abcd', 'abdc'), levenshtein('你好呀',
'你好'), levenshtein('a你b', 'a们b');"
+ testFoldConst("SELECT levenshtein('abcd', 'abdc'), levenshtein('你好呀',
'你好'), levenshtein('a你b', 'a们b');")
+ qt_levenshtein_334 "SELECT levenshtein(NULL, NULL), levenshtein('', '你好'),
levenshtein('你好世界', '你好世间');"
+ testFoldConst("SELECT levenshtein(NULL, NULL), levenshtein('', '你好'),
levenshtein('你好世界', '你好世间');")
+ sql """DROP TABLE IF EXISTS string_distance_lv_test"""
+ sql """
+ CREATE TABLE IF NOT EXISTS string_distance_lv_test (
+ id int,
+ s1 VARCHAR,
+ s2 VARCHAR
+ )
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num"="1")
+ """
+ sql """
+ insert into string_distance_lv_test values
+ (1, 'kitten', 'sitting'),
+ (2, 'abc', 'abc'),
+ (3, '数据库', '数据'),
+ (4, null, 'abc'),
+ (5, '你好呀', '你好'),
+ (6, 'abcd', 'abdc'),
+ (7, '', '数据库'),
+ (8, '你好', ''),
+ (9, '数据', '数据库')
+ """
+ qt_levenshtein_tbl "SELECT id, levenshtein(s1, s2) FROM
string_distance_lv_test ORDER BY id"
+
+ sql """DROP TABLE IF EXISTS string_distance_nn_test"""
+ sql """
+ CREATE TABLE IF NOT EXISTS string_distance_nn_test (
+ id int,
+ s1 VARCHAR(20) NOT NULL,
+ s2 VARCHAR(20) NOT NULL
+ )
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num"="1")
+ """
+ sql """
+ insert into string_distance_nn_test values
+ (1, 'abc', 'abd'),
+ (2, 'abcd', 'wxyz'),
+ (3, '你好', '你们'),
+ (4, '数据库', '数库据')
+ """
+ qt_levenshtein_nn_vector_vector "SELECT id, levenshtein(s1, s2) FROM
string_distance_nn_test ORDER BY id"
+ qt_levenshtein_nn_vector_scalar_ascii "SELECT id, levenshtein(s1, 'abc')
FROM string_distance_nn_test WHERE id = 1 ORDER BY id"
+ qt_levenshtein_nn_scalar_vector_ascii "SELECT id, levenshtein('abc', s1)
FROM string_distance_nn_test WHERE id = 1 ORDER BY id"
+ qt_levenshtein_nn_vector_scalar_utf8 "SELECT id, levenshtein(s1, '你们')
FROM string_distance_nn_test WHERE id = 3 ORDER BY id"
+ qt_levenshtein_nn_scalar_vector_utf8 "SELECT id, levenshtein('你们', s1)
FROM string_distance_nn_test WHERE id = 3 ORDER BY id"
+ qt_levenshtein_vector_scalar_nullable "SELECT id, levenshtein(s1, 'abc')
FROM string_distance_lv_test WHERE id IN (2, 4) ORDER BY id"
+ qt_levenshtein_scalar_vector_nullable "SELECT id, levenshtein('abc', s1)
FROM string_distance_lv_test WHERE id IN (2, 4) ORDER BY id"
+ qt_levenshtein_vector_scalar_utf8 "SELECT id, levenshtein(s1, '你好') FROM
string_distance_lv_test WHERE id IN (5, 7) ORDER BY id"
+ qt_levenshtein_scalar_vector_utf8 "SELECT id, levenshtein('你好', s1) FROM
string_distance_lv_test WHERE id IN (5, 7) ORDER BY id"
+ qt_levenshtein_vector_scalar_empty_utf8 "SELECT id, levenshtein(s1, '')
FROM string_distance_lv_test WHERE id IN (5, 8) ORDER BY id"
+ qt_levenshtein_scalar_vector_empty_utf8 "SELECT id, levenshtein('', s1)
FROM string_distance_lv_test WHERE id IN (5, 8) ORDER BY id"
+
+ sql """DROP TABLE IF EXISTS string_distance_lv_nn_test"""
+ sql """
+ CREATE TABLE IF NOT EXISTS string_distance_lv_nn_test (
+ id int,
+ s1 VARCHAR(20) NOT NULL,
+ s2 VARCHAR(20) NOT NULL
+ )
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num"="1")
+ """
+ sql """
+ insert into string_distance_lv_nn_test values
+ (1, 'abc', 'abcdef'),
+ (2, 'abcdef', 'abc'),
+ (3, '', 'abc'),
+ (4, 'abc', ''),
+ (5, '数据', '数据库'),
+ (6, '数据库', '数据'),
+ (7, '', '数据库'),
+ (8, '数据库', '')
+ """
+ qt_levenshtein_lv_nn_vector_vector "SELECT id, levenshtein(s1, s2) FROM
string_distance_lv_nn_test ORDER BY id"
+ qt_levenshtein_lv_nn_vector_scalar_empty "SELECT id, levenshtein(s1, '')
FROM string_distance_lv_nn_test ORDER BY id"
+ qt_levenshtein_lv_nn_scalar_vector_empty "SELECT id, levenshtein('', s1)
FROM string_distance_lv_nn_test ORDER BY id"
+ qt_levenshtein_lv_nn_vector_scalar_ascii "SELECT id, levenshtein(s1,
'abcdef') FROM string_distance_lv_nn_test WHERE id IN (1, 2, 3, 4) ORDER BY id"
+ qt_levenshtein_lv_nn_scalar_vector_ascii "SELECT id, levenshtein('abcdef',
s1) FROM string_distance_lv_nn_test WHERE id IN (1, 2, 3, 4) ORDER BY id"
+ qt_levenshtein_lv_nn_vector_scalar_utf8 "SELECT id, levenshtein(s1, '数据库')
FROM string_distance_lv_nn_test WHERE id IN (5, 6, 7, 8) ORDER BY id"
+ qt_levenshtein_lv_nn_scalar_vector_utf8 "SELECT id, levenshtein('数据库', s1)
FROM string_distance_lv_nn_test WHERE id IN (5, 6, 7, 8) ORDER BY id"
+
+ // HAMMING_DISTANCE tests
+ qt_hamming_distance_333 "SELECT hamming_distance('', ''),
hamming_distance('abc', 'abc'), hamming_distance('abc', 'abd'),
hamming_distance('你好', '你们');"
+ testFoldConst("SELECT hamming_distance('', ''), hamming_distance('abc',
'abc'), hamming_distance('abc', 'abd'), hamming_distance('你好', '你们');")
+ qt_hamming_distance_334 "SELECT hamming_distance('abc', 'abc'),
hamming_distance(NULL, 'abc'), hamming_distance('abc', NULL);"
+ testFoldConst("SELECT hamming_distance('abc', 'abc'),
hamming_distance(NULL, 'abc'), hamming_distance('abc', NULL);")
+ qt_hamming_distance_335 "SELECT hamming_distance('abcd', 'wxyz'),
hamming_distance('你好吗', '你们吗'), hamming_distance('数据库', '数库据');"
+ testFoldConst("SELECT hamming_distance('abcd', 'wxyz'),
hamming_distance('你好吗', '你们吗'), hamming_distance('数据库', '数库据');")
+ qt_hamming_distance_336 "SELECT hamming_distance(NULL, NULL),
hamming_distance(NULL, 'addd'), hamming_distance('addd', NULL);"
+ testFoldConst("SELECT hamming_distance(NULL, NULL), hamming_distance(NULL,
'addd'), hamming_distance('addd', NULL);")
+ sql """ set debug_skip_fold_constant = false; """
+ test {
+ sql "SELECT hamming_distance('abc', 'ab');"
+ exception "hamming_distance requires strings of the same length"
+ }
+ test {
+ sql "SELECT hamming_distance('你好', '你');"
+ exception "hamming_distance requires strings of the same length"
+ }
+ sql """ set debug_skip_fold_constant = true; """
+ test {
+ sql "SELECT hamming_distance('abc', 'ab');"
+ exception "hamming_distance requires strings of the same length"
+ }
+ test {
+ sql "SELECT hamming_distance('你好', '你');"
+ exception "hamming_distance requires strings of the same length"
+ }
+ sql """ set debug_skip_fold_constant = false; """
+ sql """DROP TABLE IF EXISTS string_distance_hd_test"""
+ sql """
+ CREATE TABLE IF NOT EXISTS string_distance_hd_test (
+ id int,
+ s1 VARCHAR,
+ s2 VARCHAR
+ )
+ DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES ("replication_num"="1")
+ """
+ sql """
+ insert into string_distance_hd_test values
+ (1, 'abc', 'abc'),
+ (2, 'abc', 'abd'),
+ (3, '你好', '你们'),
+ (4, null, 'abc'),
+ (5, 'abcd', 'wxyz'),
+ (6, '你好吗', '你们吗'),
+ (7, '数据库', '数库据')
+ """
+ qt_hamming_distance_tbl "SELECT id, hamming_distance(s1, s2) FROM
string_distance_hd_test ORDER BY id"
+ qt_hamming_distance_nn_vector_vector "SELECT id, hamming_distance(s1, s2)
FROM string_distance_nn_test ORDER BY id"
+ qt_hamming_distance_nn_vector_scalar_ascii "SELECT id,
hamming_distance(s1, 'abc') FROM string_distance_nn_test WHERE id = 1 ORDER BY
id"
+ qt_hamming_distance_nn_scalar_vector_ascii "SELECT id,
hamming_distance('abc', s1) FROM string_distance_nn_test WHERE id = 1 ORDER BY
id"
+ qt_hamming_distance_nn_vector_scalar_utf8 "SELECT id, hamming_distance(s1,
'你们') FROM string_distance_nn_test WHERE id = 3 ORDER BY id"
+ qt_hamming_distance_nn_scalar_vector_utf8 "SELECT id,
hamming_distance('你们', s1) FROM string_distance_nn_test WHERE id = 3 ORDER BY
id"
+ qt_hamming_distance_vector_scalar_nullable "SELECT id,
hamming_distance(s1, 'abc') FROM string_distance_hd_test WHERE id IN (1, 2, 4)
ORDER BY id"
+ qt_hamming_distance_scalar_vector_nullable "SELECT id,
hamming_distance('abc', s1) FROM string_distance_hd_test WHERE id IN (1, 2, 4)
ORDER BY id"
+ qt_hamming_distance_vector_scalar_nullable_utf8 "SELECT id,
hamming_distance(s1, '你们') FROM string_distance_hd_test WHERE id = 3 ORDER BY
id"
+ qt_hamming_distance_scalar_vector_nullable_utf8 "SELECT id,
hamming_distance('你们', s1) FROM string_distance_hd_test WHERE id = 3 ORDER BY
id"
+ qt_hamming_distance_left_const_null_nullable "SELECT id,
hamming_distance(NULL, s1) FROM string_distance_hd_test WHERE id IN (1, 4)
ORDER BY id"
+ qt_hamming_distance_right_const_null_nullable "SELECT id,
hamming_distance(s1, NULL) FROM string_distance_hd_test WHERE id IN (1, 4)
ORDER BY id"
+ qt_hamming_distance_left_cast_null_nullable "SELECT id,
hamming_distance(CAST(NULL AS STRING), s1) FROM string_distance_hd_test WHERE
id IN (1, 4) ORDER BY id"
+ qt_hamming_distance_right_cast_null_nullable "SELECT id,
hamming_distance(s1, CAST(NULL AS STRING)) FROM string_distance_hd_test WHERE
id IN (1, 4) ORDER BY id"
+ test {
+ sql "SELECT hamming_distance(s1, 'ab') FROM string_distance_hd_test
WHERE id = 1"
+ exception "hamming_distance requires strings of the same length"
+ }
+ test {
+ sql "SELECT hamming_distance('ab', s1) FROM string_distance_hd_test
WHERE id = 1"
+ exception "hamming_distance requires strings of the same length"
+ }
+ test {
+ sql "SELECT hamming_distance(s1, '你') FROM string_distance_hd_test
WHERE id = 3"
+ exception "hamming_distance requires strings of the same length"
+ }
+ test {
+ sql "SELECT hamming_distance(s1, 'ab') FROM string_distance_nn_test
WHERE id = 1"
+ exception "hamming_distance requires strings of the same length"
+ }
+ test {
+ sql "SELECT hamming_distance('ab', s1) FROM string_distance_nn_test
WHERE id = 1"
+ exception "hamming_distance requires strings of the same length"
+ }
+ test {
+ sql "SELECT hamming_distance(s1, '你') FROM string_distance_nn_test
WHERE id = 3"
+ exception "hamming_distance requires strings of the same length"
+ }
+
+ sql """ set enable_nereids_planner=true,
enable_fallback_to_original_planner=false; """
+ qt_nereids_levenshtein_337 "SELECT levenshtein('kitten', 'sitting'),
levenshtein(NULL, 'abc'), levenshtein('你好世界', '你好世间');"
+ testFoldConst("SELECT levenshtein('kitten', 'sitting'), levenshtein(NULL,
'abc'), levenshtein('你好世界', '你好世间');")
+ qt_nereids_hamming_distance_338 "SELECT hamming_distance('abcd', 'abcf'),
hamming_distance(NULL, 'addd'), hamming_distance('你好', '你们');"
+ testFoldConst("SELECT hamming_distance('abcd', 'abcf'),
hamming_distance(NULL, 'addd'), hamming_distance('你好', '你们');")
+ test {
+ sql "SELECT hamming_distance('abc', 'ab');"
+ exception "hamming_distance requires strings of the same length"
+ }
+ sql """ set enable_nereids_planner=false,
enable_fallback_to_original_planner=true; """
+
// SPACE tests
qt_space_333 "SELECT space(5);"
testFoldConst("SELECT space(5);")
@@ -1092,4 +1281,4 @@ suite("string_functions_all") {
testFoldConst("SELECT xpath_string(NULL, '/a');")
qt_xpath_string_486 "SELECT xpath_string('<a><!-- comment -->123</a>',
'/a');"
testFoldConst("SELECT xpath_string('<a><!-- comment -->123</a>', '/a');")
-}
\ No newline at end of file
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]