This is an automated email from the ASF dual-hosted git repository.

zclllyybb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 113fd2da342 Add levenshtein and hamming_distance functions (#60412)
113fd2da342 is described below

commit 113fd2da3424f81c41e3ec2ed427074a56d2cf3f
Author: 收集群风 <[email protected]>
AuthorDate: Fri May 29 11:07:46 2026 +0800

    Add levenshtein and hamming_distance functions (#60412)
    
    Related Issue: #48203
    Related PR: #57144 (reference)
    Problem Summary: support levenshtein (Hive) and hamming_distance
    (Trino/Presto).
---
 .../exprs/function/function_hamming_distance.cpp   | 320 +++++++++++++++++++++
 be/src/exprs/function/function_levenshtein.cpp     | 263 +++++++++++++++++
 be/src/exprs/function/simple_function_factory.h    |   4 +
 be/src/util/simd/vstring_function.h                |  16 ++
 .../doris/catalog/BuiltinScalarFunctions.java      |   4 +
 .../functions/executable/StringArithmetic.java     |  68 +++++
 .../functions/scalar/HammingDistance.java          |  78 +++++
 .../expressions/functions/scalar/Levenshtein.java  |  76 +++++
 .../expressions/visitor/ScalarFunctionVisitor.java |  10 +
 .../string_functions/test_string_all.out           | 197 ++++++++++++-
 .../string_functions/test_string_all.groovy        | 191 +++++++++++-
 11 files changed, 1225 insertions(+), 2 deletions(-)

diff --git a/be/src/exprs/function/function_hamming_distance.cpp 
b/be/src/exprs/function/function_hamming_distance.cpp
new file mode 100644
index 00000000000..230b02c32d6
--- /dev/null
+++ b/be/src/exprs/function/function_hamming_distance.cpp
@@ -0,0 +1,320 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <vector>
+
+#include "common/status.h"
+#include "core/column/column_nullable.h"
+#include "core/column/column_string.h"
+#include "core/data_type/data_type_number.h"
+#include "core/string_ref.h"
+#include "exprs/function/simple_function_factory.h"
+#include "util/simd/vstring_function.h"
+
+namespace doris {
+
+class FunctionHammingDistance : public IFunction {
+public:
+    using ResultDataType = DataTypeInt64;
+    using ResultPaddedPODArray = PaddedPODArray<Int64>;
+    using ResultColumnType = ColumnVector<ResultDataType::PType>;
+
+    static constexpr auto name = "hamming_distance";
+
+    // Factory hook: builds a new shared FunctionHammingDistance instance.
+    static FunctionPtr create() { return std::make_shared<FunctionHammingDistance>(); }
+
+    // Returns the class constant `name`, i.e. "hamming_distance".
+    String get_name() const override { return name; }
+    // The function is binary: it always takes exactly two arguments.
+    size_t get_number_of_arguments() const override { return 2; }
+
+    // Result type is Int64, wrapped in Nullable as soon as at least one argument type is nullable.
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        for (const DataTypePtr& argument_type : arguments) {
+            if (argument_type->is_nullable()) {
+                return make_nullable(std::make_shared<ResultDataType>());
+            }
+        }
+        return std::make_shared<ResultDataType>();
+    }
+
+    // Returns false; execute_impl below reads the argument null maps and builds the result
+    // null map itself.
+    bool use_default_implementation_for_nulls() const override { return false; }
+
+    // Computes hamming_distance for every row of the two string arguments and stores the result
+    // column at position `result` of `block`.
+    //
+    // Each argument may independently be constant and/or nullable:
+    //  - no nullable argument: the result is a plain Int64 column;
+    //  - otherwise the result is Nullable(Int64), and a row is NULL when either input is NULL.
+    // Returns the InvalidArgument status produced by the helpers when the two strings of a
+    // non-NULL row differ in length.
+    Status execute_impl(FunctionContext* /*context*/, Block& block, const ColumnNumbers& arguments,
+                        uint32_t result, size_t input_rows_count) const override {
+        const auto& [left_col, left_const] =
+                unpack_if_const(block.get_by_position(arguments[0]).column);
+        const auto& [right_col, right_const] =
+                unpack_if_const(block.get_by_position(arguments[1]).column);
+
+        const auto* left_nullable = check_and_get_column<ColumnNullable>(left_col.get());
+        const auto* right_nullable = check_and_get_column<ColumnNullable>(right_col.get());
+
+        // Strip the nullable wrapper (if present) to reach the underlying string columns.
+        const IColumn* left_nested =
+                left_nullable ? &left_nullable->get_nested_column() : left_col.get();
+        const IColumn* right_nested =
+                right_nullable ? &right_nullable->get_nested_column() : right_col.get();
+
+        const auto* left_str_col = assert_cast<const ColumnString*>(left_nested);
+        const auto* right_str_col = assert_cast<const ColumnString*>(right_nested);
+
+        auto res_col = ResultColumnType::create(input_rows_count);
+        auto& res_data = res_col->get_data();
+
+        const NullMap* left_null_map =
+                left_nullable ? &left_nullable->get_null_map_data() : nullptr;
+        const NullMap* right_null_map =
+                right_nullable ? &right_nullable->get_null_map_data() : nullptr;
+        const bool has_nullable = left_null_map != nullptr || right_null_map != nullptr;
+
+        // Fast path: neither input can be NULL, so the result needs no null map.
+        if (!has_nullable) {
+            if (left_const) {
+                RETURN_IF_ERROR(scalar_vector(left_str_col->get_data_at(0).trim_tail_padding_zero(),
+                                              *right_str_col, res_data));
+            } else if (right_const) {
+                RETURN_IF_ERROR(vector_scalar(
+                        *left_str_col, right_str_col->get_data_at(0).trim_tail_padding_zero(),
+                        res_data));
+            } else {
+                RETURN_IF_ERROR(vector_vector(*left_str_col, *right_str_col, res_data));
+            }
+            block.replace_by_position(result, std::move(res_col));
+            return Status::OK();
+        }
+
+        auto null_col = ColumnUInt8::create(input_rows_count, 0);
+        auto& null_map = null_col->get_data();
+        if (left_const) {
+            // A NULL constant makes every output row NULL; no distance has to be computed.
+            if (left_null_map && (*left_null_map)[0]) {
+                std::fill(null_map.begin(), null_map.end(), 1);
+                block.replace_by_position(
+                        result, ColumnNullable::create(std::move(res_col), std::move(null_col)));
+                return Status::OK();
+            }
+
+            const auto left = left_str_col->get_data_at(0).trim_tail_padding_zero();
+            RETURN_IF_ERROR(scalar_vector_nullable(left, *right_str_col, right_null_map, res_data,
+                                                   null_map));
+        } else if (right_const) {
+            // Same short-circuit for a NULL constant on the right-hand side.
+            if (right_null_map && (*right_null_map)[0]) {
+                std::fill(null_map.begin(), null_map.end(), 1);
+                block.replace_by_position(
+                        result, ColumnNullable::create(std::move(res_col), std::move(null_col)));
+                return Status::OK();
+            }
+
+            RETURN_IF_ERROR(vector_scalar_nullable(
+                    *left_str_col, right_str_col->get_data_at(0).trim_tail_padding_zero(),
+                    left_null_map, res_data, null_map));
+        } else {
+            // Offset buffers live outside the loop so their capacity is reused across rows
+            // (same approach as vector_vector) instead of building two vectors per row.
+            std::vector<size_t> left_offsets;
+            std::vector<size_t> right_offsets;
+            for (size_t i = 0; i < input_rows_count; ++i) {
+                const bool left_is_null = left_null_map && (*left_null_map)[i];
+                const bool right_is_null = right_null_map && (*right_null_map)[i];
+                if (left_is_null || right_is_null) {
+                    null_map[i] = 1;
+                    res_data[i] = 0;
+                    continue;
+                }
+
+                const auto left = left_str_col->get_data_at(i).trim_tail_padding_zero();
+                const auto right = right_str_col->get_data_at(i).trim_tail_padding_zero();
+                RETURN_IF_ERROR(hamming_distance_with_offsets(
+                        left, left_offsets, false, simd::VStringFunctions::is_ascii(left), right,
+                        right_offsets, false, simd::VStringFunctions::is_ascii(right), res_data[i],
+                        i));
+            }
+        }
+
+        block.replace_by_position(result,
+                                  ColumnNullable::create(std::move(res_col), std::move(null_col)));
+        return Status::OK();
+    }
+
+private:
+    // Both arguments are full columns: computes the distance row by row into `res`.
+    // Stops at, and returns, the first error status reported for a row.
+    static Status vector_vector(const ColumnString& lcol, const ColumnString& rcol,
+                                ResultPaddedPODArray& res) {
+        DCHECK_EQ(lcol.size(), rcol.size());
+
+        const size_t row_count = lcol.size();
+        res.resize(row_count);
+
+        // Scratch buffers shared by all rows; the callee refills them when it needs offsets.
+        std::vector<size_t> lhs_char_offsets;
+        std::vector<size_t> rhs_char_offsets;
+        for (size_t row = 0; row < row_count; ++row) {
+            const auto lhs = lcol.get_data_at(row).trim_tail_padding_zero();
+            const auto rhs = rcol.get_data_at(row).trim_tail_padding_zero();
+            const bool lhs_ascii = simd::VStringFunctions::is_ascii(lhs);
+            const bool rhs_ascii = simd::VStringFunctions::is_ascii(rhs);
+            RETURN_IF_ERROR(hamming_distance_with_offsets(lhs, lhs_char_offsets, false, lhs_ascii,
+                                                          rhs, rhs_char_offsets, false, rhs_ascii,
+                                                          res[row], row));
+        }
+        return Status::OK();
+    }
+
+    // Left argument is a column, right argument is a single constant string `rdata`.
+    // The constant's ASCII flag and UTF-8 character offsets are computed once and reused for
+    // every row.
+    static Status vector_scalar(const ColumnString& lcol, const StringRef& rdata,
+                                ResultPaddedPODArray& res) {
+        const size_t row_count = lcol.size();
+        res.resize(row_count);
+
+        std::vector<size_t> const_offsets;
+        simd::VStringFunctions::get_utf8_char_offsets(rdata, const_offsets);
+        const bool const_is_ascii = simd::VStringFunctions::is_ascii(rdata);
+
+        std::vector<size_t> row_offsets;
+        for (size_t row = 0; row < row_count; ++row) {
+            const auto lhs = lcol.get_data_at(row).trim_tail_padding_zero();
+            const bool lhs_ascii = simd::VStringFunctions::is_ascii(lhs);
+            RETURN_IF_ERROR(hamming_distance_with_offsets(lhs, row_offsets, false, lhs_ascii, rdata,
+                                                          const_offsets, true, const_is_ascii,
+                                                          res[row], row));
+        }
+        return Status::OK();
+    }
+
+    // Left argument is a single constant string `ldata`, right argument is a column.
+    // The constant's ASCII flag and UTF-8 character offsets are computed once and reused for
+    // every row.
+    static Status scalar_vector(const StringRef& ldata, const ColumnString& rcol,
+                                ResultPaddedPODArray& res) {
+        const size_t row_count = rcol.size();
+        res.resize(row_count);
+
+        std::vector<size_t> const_offsets;
+        simd::VStringFunctions::get_utf8_char_offsets(ldata, const_offsets);
+        const bool const_is_ascii = simd::VStringFunctions::is_ascii(ldata);
+
+        std::vector<size_t> row_offsets;
+        for (size_t row = 0; row < row_count; ++row) {
+            const auto rhs = rcol.get_data_at(row).trim_tail_padding_zero();
+            const bool rhs_ascii = simd::VStringFunctions::is_ascii(rhs);
+            RETURN_IF_ERROR(hamming_distance_with_offsets(ldata, const_offsets, true,
+                                                          const_is_ascii, rhs, row_offsets, false,
+                                                          rhs_ascii, res[row], row));
+        }
+        return Status::OK();
+    }
+
+    // Nullable flavour of vector_scalar: rows flagged in `left_null_map` (which may be nullptr
+    // when the left column is not nullable) are marked NULL in `null_map` and get 0 in `res`;
+    // all other rows are compared against the constant `rdata`.
+    static Status vector_scalar_nullable(const ColumnString& lcol, const StringRef& rdata,
+                                         const NullMap* left_null_map, ResultPaddedPODArray& res,
+                                         NullMap& null_map) {
+        const size_t row_count = lcol.size();
+        res.resize(row_count);
+
+        std::vector<size_t> const_offsets;
+        simd::VStringFunctions::get_utf8_char_offsets(rdata, const_offsets);
+        const bool const_is_ascii = simd::VStringFunctions::is_ascii(rdata);
+
+        std::vector<size_t> row_offsets;
+        for (size_t row = 0; row < row_count; ++row) {
+            const bool row_is_null = left_null_map != nullptr && (*left_null_map)[row] != 0;
+            if (row_is_null) {
+                res[row] = 0;
+                null_map[row] = 1;
+                continue;
+            }
+
+            const auto lhs = lcol.get_data_at(row).trim_tail_padding_zero();
+            const bool lhs_ascii = simd::VStringFunctions::is_ascii(lhs);
+            RETURN_IF_ERROR(hamming_distance_with_offsets(lhs, row_offsets, false, lhs_ascii, rdata,
+                                                          const_offsets, true, const_is_ascii,
+                                                          res[row], row));
+        }
+        return Status::OK();
+    }
+
+    // Nullable flavour of scalar_vector: rows flagged in `right_null_map` (which may be nullptr
+    // when the right column is not nullable) are marked NULL in `null_map` and get 0 in `res`;
+    // all other rows are compared against the constant `ldata`.
+    static Status scalar_vector_nullable(const StringRef& ldata, const ColumnString& rcol,
+                                         const NullMap* right_null_map, ResultPaddedPODArray& res,
+                                         NullMap& null_map) {
+        const size_t row_count = rcol.size();
+        res.resize(row_count);
+
+        std::vector<size_t> const_offsets;
+        simd::VStringFunctions::get_utf8_char_offsets(ldata, const_offsets);
+        const bool const_is_ascii = simd::VStringFunctions::is_ascii(ldata);
+
+        std::vector<size_t> row_offsets;
+        for (size_t row = 0; row < row_count; ++row) {
+            const bool row_is_null = right_null_map != nullptr && (*right_null_map)[row] != 0;
+            if (row_is_null) {
+                res[row] = 0;
+                null_map[row] = 1;
+                continue;
+            }
+
+            const auto rhs = rcol.get_data_at(row).trim_tail_padding_zero();
+            const bool rhs_ascii = simd::VStringFunctions::is_ascii(rhs);
+            RETURN_IF_ERROR(hamming_distance_with_offsets(ldata, const_offsets, true,
+                                                          const_is_ascii, rhs, row_offsets, false,
+                                                          rhs_ascii, res[row], row));
+        }
+        return Status::OK();
+    }
+
+    // Byte-wise Hamming distance: counts the positions where `left` and `right` hold different
+    // bytes. The dispatcher uses it only when both strings are ASCII, so one byte is one
+    // character. Returns InvalidArgument (leaving `result` untouched) if the byte lengths differ;
+    // `row` is only used in that error message.
+    static Status hamming_distance_ascii(const StringRef& left, const StringRef& right,
+                                         Int64& result, size_t row) {
+        const size_t byte_count = left.size;
+        if (byte_count != right.size) {
+            return Status::InvalidArgument(
+                    "hamming_distance requires strings of the same length at row {}", row);
+        }
+
+        Int64 mismatches = 0;
+        for (size_t pos = 0; pos < byte_count; ++pos) {
+            if (left.data[pos] != right.data[pos]) {
+                ++mismatches;
+            }
+        }
+        result = mismatches;
+        return Status::OK();
+    }
+
+    // Character-wise Hamming distance for UTF-8 input. `left_offsets` / `right_offsets` are used
+    // as the starting byte offset of each character; a character ends where the next one starts,
+    // or at the end of its string for the last character.
+    // Returns InvalidArgument (leaving `result` untouched) if the character counts differ;
+    // `row` is only used in that error message.
+    static Status hamming_distance_utf8(const StringRef& left,
+                                        const std::vector<size_t>& left_offsets,
+                                        const StringRef& right,
+                                        const std::vector<size_t>& right_offsets, Int64& result,
+                                        size_t row) {
+        const size_t char_count = left_offsets.size();
+        if (char_count != right_offsets.size()) {
+            return Status::InvalidArgument(
+                    "hamming_distance requires strings of the same length at row {}", row);
+        }
+
+        Int64 mismatches = 0;
+        for (size_t idx = 0; idx < char_count; ++idx) {
+            const bool is_last_char = (idx + 1 == char_count);
+            const size_t left_end = is_last_char ? left.size : left_offsets[idx + 1];
+            const size_t right_end = is_last_char ? right.size : right_offsets[idx + 1];
+            const bool same_char = simd::VStringFunctions::utf8_char_equal(
+                    left, left_offsets[idx], left_end, right, right_offsets[idx], right_end);
+            if (!same_char) {
+                ++mismatches;
+            }
+        }
+
+        result = mismatches;
+        return Status::OK();
+    }
+
+    // Dispatcher shared by all row loops.
+    //  - Both strings ASCII: byte-wise comparison, the offset buffers are not touched.
+    //  - Otherwise: character-wise comparison. A side whose `*_offsets_ready` flag is false has
+    //    its offsets (re)computed into the given buffer first; a ready side is used as passed.
+    static Status hamming_distance_with_offsets(
+            const StringRef& left, std::vector<size_t>& left_offsets, bool left_offsets_ready,
+            bool left_ascii, const StringRef& right, std::vector<size_t>& right_offsets,
+            bool right_offsets_ready, bool right_ascii, Int64& result, size_t row) {
+        const bool both_ascii = left_ascii && right_ascii;
+        if (!both_ascii) {
+            if (!left_offsets_ready) {
+                simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
+            }
+            if (!right_offsets_ready) {
+                simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
+            }
+            return hamming_distance_utf8(left, left_offsets, right, right_offsets, result, row);
+        }
+        return hamming_distance_ascii(left, right, result, row);
+    }
+
+    // Convenience wrapper for a single pair of strings: creates fresh offset buffers, detects
+    // ASCII-ness of both sides and forwards to hamming_distance_with_offsets.
+    static Status hamming_distance(const StringRef& left, const StringRef& right, Int64& result,
+                                   size_t row) {
+        const bool left_ascii = simd::VStringFunctions::is_ascii(left);
+        const bool right_ascii = simd::VStringFunctions::is_ascii(right);
+        std::vector<size_t> left_scratch;
+        std::vector<size_t> right_scratch;
+        return hamming_distance_with_offsets(left, left_scratch, false, left_ascii, right,
+                                             right_scratch, false, right_ascii, result, row);
+    }
+};
+
+// Registers FunctionHammingDistance with the given function factory.
+void register_function_hamming_distance(SimpleFunctionFactory& factory) {
+    factory.register_function<FunctionHammingDistance>();
+}
+
+} // namespace doris
diff --git a/be/src/exprs/function/function_levenshtein.cpp 
b/be/src/exprs/function/function_levenshtein.cpp
new file mode 100644
index 00000000000..27d24be3ce6
--- /dev/null
+++ b/be/src/exprs/function/function_levenshtein.cpp
@@ -0,0 +1,263 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <vector>
+
+#include "common/status.h"
+#include "core/data_type/data_type_number.h"
+#include "core/string_ref.h"
+#include "exprs/function/function_totype.h"
+#include "exprs/function/simple_function_factory.h"
+#include "util/simd/vstring_function.h"
+
+namespace doris {
+
+// Name tag passed as the last template argument of FunctionBinaryToType below.
+struct NameLevenshtein {
+    static constexpr auto name = "levenshtein";
+};
+
+template <typename LeftDataType, typename RightDataType>
+struct LevenshteinImpl {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    // Both arguments are string columns given as (chars, offsets) pairs: writes the Levenshtein
+    // distance of every row pair into `res`. Always returns OK.
+    static Status vector_vector(const ColumnString::Chars& ldata,
+                                const ColumnString::Offsets& loffsets,
+                                const ColumnString::Chars& rdata,
+                                const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) {
+        DCHECK_EQ(loffsets.size(), roffsets.size());
+
+        const size_t row_count = loffsets.size();
+        res.resize(row_count);
+
+        // Scratch buffers for UTF-8 character offsets, shared by all rows.
+        std::vector<size_t> lhs_char_offsets;
+        std::vector<size_t> rhs_char_offsets;
+        for (size_t row = 0; row < row_count; ++row) {
+            const StringRef lhs = string_ref_at(ldata, loffsets, row);
+            const StringRef rhs = string_ref_at(rdata, roffsets, row);
+            res[row] = levenshtein_distance(lhs, rhs, lhs_char_offsets, rhs_char_offsets);
+        }
+        return Status::OK();
+    }
+
+    // Column-vs-constant variant. The constant right string is trimmed, and its ASCII flag and
+    // UTF-8 character offsets are computed once and reused for every row of the left column.
+    // Always returns OK.
+    static Status vector_scalar(const ColumnString::Chars& ldata,
+                                const ColumnString::Offsets& loffsets, const StringRef& rdata,
+                                ResultPaddedPODArray& res) {
+        const size_t row_count = loffsets.size();
+        res.resize(row_count);
+
+        const auto const_rhs = rdata.trim_tail_padding_zero();
+        std::vector<size_t> const_rhs_offsets;
+        simd::VStringFunctions::get_utf8_char_offsets(const_rhs, const_rhs_offsets);
+        const bool const_rhs_ascii = simd::VStringFunctions::is_ascii(const_rhs);
+
+        std::vector<size_t> lhs_char_offsets;
+        for (size_t row = 0; row < row_count; ++row) {
+            const StringRef lhs = string_ref_at(ldata, loffsets, row);
+            res[row] = levenshtein_distance_with_right_offsets(lhs, lhs_char_offsets, const_rhs,
+                                                               const_rhs_offsets, const_rhs_ascii);
+        }
+        return Status::OK();
+    }
+
+    // Constant-vs-column variant. The constant left string is trimmed, and its ASCII flag and
+    // UTF-8 character offsets are computed once and reused for every row of the right column.
+    // Always returns OK.
+    static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata,
+                                const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) {
+        const size_t row_count = roffsets.size();
+        res.resize(row_count);
+
+        const auto const_lhs = ldata.trim_tail_padding_zero();
+        std::vector<size_t> const_lhs_offsets;
+        simd::VStringFunctions::get_utf8_char_offsets(const_lhs, const_lhs_offsets);
+        const bool const_lhs_ascii = simd::VStringFunctions::is_ascii(const_lhs);
+
+        std::vector<size_t> rhs_char_offsets;
+        for (size_t row = 0; row < row_count; ++row) {
+            const StringRef rhs = string_ref_at(rdata, roffsets, row);
+            res[row] = levenshtein_distance_with_left_offsets(const_lhs, const_lhs_offsets,
+                                                              const_lhs_ascii, rhs,
+                                                              rhs_char_offsets);
+        }
+        return Status::OK();
+    }
+
+private:
+    // Returns row `i` of a string column given as (chars, offsets), with trailing zero padding
+    // trimmed. Row i spans bytes [offsets[i - 1], offsets[i]).
+    // NOTE(review): for i == 0 this reads offsets[-1]; the signed index cast suggests the
+    // Offsets container is expected to allow that and yield 0 - confirm against the
+    // ColumnString::Offsets type.
+    static StringRef string_ref_at(const ColumnString::Chars& data,
+                                   const ColumnString::Offsets& offsets, size_t i) {
+        DCHECK_LT(i, offsets.size());
+        const auto idx = static_cast<ssize_t>(i);
+        return StringRef(data.data() + offsets[idx - 1], offsets[idx] - offsets[idx - 1])
+                .trim_tail_padding_zero();
+    }
+
+    // Levenshtein distance counted in UTF-8 characters. `left_offsets` / `right_offsets` are used
+    // as the starting byte offset of each character of `left` / `right`; a character ends where
+    // the next one starts, or at the end of its string for the last character.
+    // Two rolling DP rows are kept, sized by the string with fewer characters.
+    static Int32 levenshtein_distance_utf8(const StringRef& left,
+                                           const std::vector<size_t>& left_offsets,
+                                           const StringRef& right,
+                                           const std::vector<size_t>& right_offsets) {
+        // The string with more characters drives the outer loop.
+        const bool right_is_longer = right_offsets.size() > left_offsets.size();
+        const StringRef& outer_str = right_is_longer ? right : left;
+        const StringRef& inner_str = right_is_longer ? left : right;
+        const std::vector<size_t>& outer_offsets = right_is_longer ? right_offsets : left_offsets;
+        const std::vector<size_t>& inner_offsets = right_is_longer ? left_offsets : right_offsets;
+
+        const size_t outer_len = outer_offsets.size();
+        const size_t inner_len = inner_offsets.size();
+
+        // `above` is the previous DP row, `current` the one being filled.
+        std::vector<Int32> above(inner_len + 1);
+        std::vector<Int32> current(inner_len + 1);
+        for (size_t col = 0; col <= inner_len; ++col) {
+            above[col] = static_cast<Int32>(col);
+        }
+
+        for (size_t r = 1; r <= outer_len; ++r) {
+            current[0] = static_cast<Int32>(r);
+            const size_t outer_begin = outer_offsets[r - 1];
+            const size_t outer_end = (r == outer_len) ? outer_str.size : outer_offsets[r];
+
+            for (size_t c = 1; c <= inner_len; ++c) {
+                const size_t inner_begin = inner_offsets[c - 1];
+                const size_t inner_end = (c == inner_len) ? inner_str.size : inner_offsets[c];
+
+                const bool same_char = simd::VStringFunctions::utf8_char_equal(
+                        outer_str, outer_begin, outer_end, inner_str, inner_begin, inner_end);
+
+                const Int32 via_insert = current[c - 1] + 1;
+                const Int32 via_delete = above[c] + 1;
+                const Int32 via_replace = above[c - 1] + (same_char ? 0 : 1);
+                current[c] = std::min({via_insert, via_delete, via_replace});
+            }
+            above.swap(current);
+        }
+
+        return above[inner_len];
+    }
+
+    // Byte-wise Levenshtein distance; callers use it when both strings are ASCII, so one byte
+    // is one character. Two rolling DP rows are kept, sized by the shorter string.
+    static Int32 levenshtein_distance_ascii(const StringRef& left, const StringRef& right) {
+        // The longer string drives the outer loop.
+        const bool right_is_longer = right.size > left.size;
+        const StringRef& outer_str = right_is_longer ? right : left;
+        const StringRef& inner_str = right_is_longer ? left : right;
+        const size_t outer_len = outer_str.size;
+        const size_t inner_len = inner_str.size;
+
+        // `above` is the previous DP row, `current` the one being filled.
+        std::vector<Int32> above(inner_len + 1);
+        std::vector<Int32> current(inner_len + 1);
+        for (size_t col = 0; col <= inner_len; ++col) {
+            above[col] = static_cast<Int32>(col);
+        }
+
+        for (size_t r = 1; r <= outer_len; ++r) {
+            current[0] = static_cast<Int32>(r);
+            const char outer_char = outer_str.data[r - 1];
+
+            for (size_t c = 1; c <= inner_len; ++c) {
+                const bool same_char = (outer_char == inner_str.data[c - 1]);
+                const Int32 via_insert = current[c - 1] + 1;
+                const Int32 via_delete = above[c] + 1;
+                const Int32 via_replace = above[c - 1] + (same_char ? 0 : 1);
+                current[c] = std::min({via_insert, via_delete, via_replace});
+            }
+            above.swap(current);
+        }
+
+        return above[inner_len];
+    }
+
+    // Distance for one pair of strings when neither side has precomputed offsets.
+    // `left_offsets` / `right_offsets` are caller-owned scratch buffers that are overwritten only
+    // on the UTF-8 path.
+    static Int32 levenshtein_distance(const StringRef& left, const StringRef& right,
+                                      std::vector<size_t>& left_offsets,
+                                      std::vector<size_t>& right_offsets) {
+        const bool lhs_ascii = simd::VStringFunctions::is_ascii(left);
+        const bool rhs_ascii = simd::VStringFunctions::is_ascii(right);
+        const bool both_ascii = lhs_ascii && rhs_ascii;
+        if (both_ascii) {
+            return levenshtein_distance_ascii(left, right);
+        }
+
+        // With one side empty, the distance is the character count of the other side.
+        if (left.size == 0) {
+            const auto right_chars = simd::VStringFunctions::get_char_len(right.data, right.size);
+            return static_cast<Int32>(right_chars);
+        }
+        if (right.size == 0) {
+            const auto left_chars = simd::VStringFunctions::get_char_len(left.data, left.size);
+            return static_cast<Int32>(left_chars);
+        }
+
+        simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
+        simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
+        return levenshtein_distance_utf8(left, left_offsets, right, right_offsets);
+    }
+
+    // Distance for one row when the right string's character offsets and ASCII flag were
+    // computed up front by the caller. `left_offsets` is a scratch buffer that is overwritten
+    // only on the UTF-8 path.
+    static Int32 levenshtein_distance_with_right_offsets(const StringRef& left,
+                                                         std::vector<size_t>& left_offsets,
+                                                         const StringRef& right,
+                                                         const std::vector<size_t>& right_offsets,
+                                                         bool right_ascii) {
+        const bool lhs_ascii = simd::VStringFunctions::is_ascii(left);
+        const bool both_ascii = lhs_ascii && right_ascii;
+        if (both_ascii) {
+            return levenshtein_distance_ascii(left, right);
+        }
+
+        // With one side empty, the distance is the character count of the other side.
+        if (left.size == 0) {
+            return static_cast<Int32>(right_offsets.size());
+        }
+        if (right.size == 0) {
+            if (lhs_ascii) {
+                return static_cast<Int32>(left.size);
+            }
+            return static_cast<Int32>(simd::VStringFunctions::get_char_len(left.data, left.size));
+        }
+
+        simd::VStringFunctions::get_utf8_char_offsets(left, left_offsets);
+        return levenshtein_distance_utf8(left, left_offsets, right, right_offsets);
+    }
+
+    // Distance for one row when the left string's character offsets and ASCII flag were
+    // computed up front by the caller. `right_offsets` is a scratch buffer that is overwritten
+    // only on the UTF-8 path.
+    static Int32 levenshtein_distance_with_left_offsets(const StringRef& left,
+                                                        const std::vector<size_t>& left_offsets,
+                                                        bool left_ascii, const StringRef& right,
+                                                        std::vector<size_t>& right_offsets) {
+        const bool rhs_ascii = simd::VStringFunctions::is_ascii(right);
+        const bool both_ascii = left_ascii && rhs_ascii;
+        if (both_ascii) {
+            return levenshtein_distance_ascii(left, right);
+        }
+
+        // With one side empty, the distance is the character count of the other side.
+        if (left.size == 0) {
+            if (rhs_ascii) {
+                return static_cast<Int32>(right.size);
+            }
+            return static_cast<Int32>(
+                    simd::VStringFunctions::get_char_len(right.data, right.size));
+        }
+        if (right.size == 0) {
+            return static_cast<Int32>(left_offsets.size());
+        }
+
+        simd::VStringFunctions::get_utf8_char_offsets(right, right_offsets);
+        return levenshtein_distance_utf8(left, left_offsets, right, right_offsets);
+    }
+
+    // Convenience overload that allocates its own offset buffers and forwards to the
+    // four-argument levenshtein_distance.
+    // NOTE(review): no call site for this overload is visible in this file - confirm whether it
+    // is still needed.
+    static Int32 levenshtein_distance(const StringRef& left, const StringRef& right) {
+        std::vector<size_t> left_offsets;
+        std::vector<size_t> right_offsets;
+        return levenshtein_distance(left, right, left_offsets, right_offsets);
+    }
+};
+
+// levenshtein(string, string): LevenshteinImpl plugged into the generic binary function
+// wrapper; the result type is LevenshteinImpl::ResultDataType (DataTypeInt32).
+using FunctionLevenshtein =
+        FunctionBinaryToType<DataTypeString, DataTypeString, LevenshteinImpl, NameLevenshtein>;
+
+// Registers FunctionLevenshtein with the given function factory.
+void register_function_levenshtein(SimpleFunctionFactory& factory) {
+    factory.register_function<FunctionLevenshtein>();
+}
+
+} // namespace doris
diff --git a/be/src/exprs/function/simple_function_factory.h 
b/be/src/exprs/function/simple_function_factory.h
index c1ebcc34535..1d7e26fe559 100644
--- a/be/src/exprs/function/simple_function_factory.h
+++ b/be/src/exprs/function/simple_function_factory.h
@@ -120,6 +120,8 @@ void register_function_ai(SimpleFunctionFactory& factory);
 void register_function_score(SimpleFunctionFactory& factory);
 void register_function_variant_type(SimpleFunctionFactory& factory);
 void register_function_binary(SimpleFunctionFactory& factory);
+void register_function_levenshtein(SimpleFunctionFactory& factory);
+void register_function_hamming_distance(SimpleFunctionFactory& factory);
 void register_function_soundex(SimpleFunctionFactory& factory);
 
 #if defined(BE_TEST) && !defined(BE_BENCHMARK)
@@ -356,6 +358,8 @@ public:
             register_function_ai(instance);
             register_function_score(instance);
             register_function_binary(instance);
+            register_function_levenshtein(instance);
+            register_function_hamming_distance(instance);
             register_function_soundex(instance);
             register_function_json_transform(instance);
             register_function_json_hash(instance);
diff --git a/be/src/util/simd/vstring_function.h 
b/be/src/util/simd/vstring_function.h
index b583dd67fc4..27375a0ea76 100644
--- a/be/src/util/simd/vstring_function.h
+++ b/be/src/util/simd/vstring_function.h
@@ -27,6 +27,7 @@
 #include <array>
 #include <cstddef>
 #include <cstdint>
+#include <cstring>
 
 #include "core/string_ref.h"
 #include "util/simd/lower_upper_impl.h"
@@ -345,6 +346,21 @@ public:
         return char_len;
     }
 
+    // Resets `offsets` and refills it for `ref` via the get_char_len(data, size, offsets)
+    // overload. Capacity is reserved for ref.size entries up front.
+    // NOTE(review): presumably the overload appends the starting byte offset of every UTF-8
+    // character - confirm against its definition.
+    static inline void get_utf8_char_offsets(const StringRef& ref, std::vector<size_t>& offsets) {
+        offsets.clear();
+        offsets.reserve(ref.size);
+        get_char_len(ref.data, ref.size, offsets);
+    }
+
+    // Returns true when the byte range [left_off, left_next) of `left` and the byte range
+    // [right_off, right_next) of `right` have the same length and identical bytes.
+    static inline bool utf8_char_equal(const StringRef& left, size_t left_off, size_t left_next,
+                                       const StringRef& right, size_t right_off,
+                                       size_t right_next) {
+        const size_t byte_len = left_next - left_off;
+        if (byte_len != right_next - right_off) {
+            return false;
+        }
+        return std::memcmp(left.data + left_off, right.data + right_off, byte_len) == 0;
+    }
+
     // utf8-encoding:
     // - 1-byte: 0xxx_xxxx;
     // - 2-byte: 110x_xxxx 10xx_xxxx;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index eb09cb7ab27..0883b93e5a8 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -235,6 +235,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.GetVariantTyp
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Greatest;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Grouping;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.GroupingId;
+import 
org.apache.doris.nereids.trees.expressions.functions.scalar.HammingDistance;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Hex;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.HllCardinality;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.HllEmpty;
@@ -319,6 +320,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Lcm;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Least;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Left;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Length;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.Levenshtein;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Ln;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Locate;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Log;
@@ -804,6 +806,7 @@ public class BuiltinScalarFunctions implements 
FunctionHelper {
             scalar(Greatest.class, "greatest"),
             scalar(Grouping.class, "grouping"),
             scalar(GroupingId.class, "grouping_id"),
+            scalar(HammingDistance.class, "hamming_distance"),
             scalar(Hex.class, "hex"),
             scalar(HllCardinality.class, "hll_cardinality"),
             scalar(HllEmpty.class, "hll_empty"),
@@ -891,6 +894,7 @@ public class BuiltinScalarFunctions implements 
FunctionHelper {
             scalar(LastQueryId.class, "last_query_id"),
             scalar(Lcm.class, "lcm"),
             scalar(Least.class, "least"),
+            scalar(Levenshtein.class, "levenshtein"),
             scalar(Left.class, "left", "strleft"),
             scalar(Length.class, "length", "octet_length"),
             scalar(Crc32.class, "crc32"),
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java
index 0172c3b4339..570d0bb4b98 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java
@@ -1123,6 +1123,74 @@ public class StringArithmetic {
         return castStringLikeLiteral(first, result);
     }
 
+    /**
+     * Executable (constant-folding) implementation of levenshtein.
+     *
+     * Computes the edit distance between the two literals, where one insertion,
+     * deletion or substitution each costs 1. The inputs are compared by Unicode
+     * code point rather than by UTF-16 char, so a supplementary character counts
+     * as a single position. Only two rows of the dynamic-programming table are
+     * kept, and they are sized by the shorter of the two inputs.
+     *
+     * @param first the first string literal
+     * @param second the second string literal
+     * @return an IntegerLiteral holding the edit distance
+     */
+    @ExecFunction(name = "levenshtein")
+    public static Expression levenshtein(StringLikeLiteral first, 
StringLikeLiteral second) {
+        int[] left = first.getValue().codePoints().toArray();
+        int[] right = second.getValue().codePoints().toArray();
+
+        if (right.length > left.length) { // the distance is symmetric, so keep the shorter input in `right` to minimise row size
+            int[] tmp = left;
+            left = right;
+            right = tmp;
+        }
+
+        int m = left.length; // length of the longer input
+        int n = right.length; // length of the shorter input (n <= m after the swap above)
+        if (n == 0) { // against an empty string the distance is the other string's length
+            return new IntegerLiteral(m);
+        }
+        if (m == 0) { // NOTE(review): unreachable, because m >= n and n == 0 has already returned
+            return new IntegerLiteral(n);
+        }
+
+        int[] prev = new int[n + 1]; // distances for the previous prefix of `left`
+        int[] curr = new int[n + 1]; // distances for the prefix of `left` being processed
+        for (int j = 0; j <= n; j++) {
+            prev[j] = j; // empty prefix of `left` versus first j code points of `right`: j insertions
+        }
+
+        for (int i = 1; i <= m; i++) {
+            curr[0] = i; // first i code points of `left` versus empty prefix of `right`: i deletions
+            int leftChar = left[i - 1];
+            for (int j = 1; j <= n; j++) {
+                int cost = leftChar == right[j - 1] ? 0 : 1; // substitution is free when the code points match
+                int insertCost = curr[j - 1] + 1;
+                int deleteCost = prev[j] + 1;
+                int replaceCost = prev[j - 1] + cost;
+                curr[j] = Math.min(insertCost, Math.min(deleteCost, 
replaceCost));
+            }
+            int[] tmp = prev; // swap the rows instead of allocating a new one per iteration
+            prev = curr;
+            curr = tmp;
+        }
+
+        return new IntegerLiteral(prev[n]); // after the final swap `prev` holds the last computed row
+    }
+
+    /**
+     * Executable (constant-folding) implementation of hamming_distance.
+     *
+     * Counts the positions at which the two literals differ. The inputs are
+     * compared by Unicode code point rather than by UTF-16 char, and both must
+     * contain the same number of code points.
+     *
+     * @param first the first string literal
+     * @param second the second string literal
+     * @return a BigIntLiteral holding the number of differing positions
+     * @throws AnalysisException if the two inputs have different code point counts
+     */
+    @ExecFunction(name = "hamming_distance")
+    public static Expression hammingDistance(StringLikeLiteral first, 
StringLikeLiteral second) {
+        int[] left = first.getValue().codePoints().toArray();
+        int[] right = second.getValue().codePoints().toArray();
+
+        if (left.length != right.length) { // the length check is on code points, not on bytes or UTF-16 chars
+            throw new AnalysisException("hamming_distance requires strings of 
the same length");
+        }
+
+        long distance = 0; // long, to match the BigIntLiteral returned below
+        for (int i = 0; i < left.length; i++) {
+            if (left[i] != right[i]) {
+                distance++;
+            }
+        }
+        return new BigIntLiteral(distance);
+    }
+
     /**
      * Executable arithmetic functions make_set
      */
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java
new file mode 100644
index 00000000000..a874ed7a912
--- /dev/null
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import 
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.BigIntType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'hamming_distance'.
+ *
+ * Binary scalar function named "hamming_distance". It accepts either two
+ * VARCHAR or two STRING arguments and is typed as BIGINT in both cases
+ * (see {@link #SIGNATURES}).
+ */
+public class HammingDistance extends ScalarFunction
+        implements BinaryExpression, ExplicitlyCastableSignature, 
PropagateNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            FunctionSignature.ret(BigIntType.INSTANCE)
+                    .args(VarcharType.SYSTEM_DEFAULT, 
VarcharType.SYSTEM_DEFAULT),
+            FunctionSignature.ret(BigIntType.INSTANCE)
+                    .args(StringType.INSTANCE, StringType.INSTANCE)
+    );
+
+    /**
+     * constructor with 2 arguments.
+     *
+     * @param arg0 the first string expression
+     * @param arg1 the second string expression
+     */
+    public HammingDistance(Expression arg0, Expression arg1) {
+        super("hamming_distance", arg0, arg1);
+    }
+
+    /** constructor for withChildren and reuse signature */
+    private HammingDistance(ScalarFunctionParams functionParams) {
+        super(functionParams);
+    }
+
+    /**
+     * withChildren.
+     *
+     * Builds a copy of this function over the given children.
+     *
+     * @param children the replacement children; exactly two are required
+     * @return a new HammingDistance built from getFunctionParams(children)
+     */
+    @Override
+    public HammingDistance withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 2); // binary function: reject any other arity
+        return new HammingDistance(getFunctionParams(children));
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitHammingDistance(this, context); // dispatch to the function-specific visitor hook
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Levenshtein.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Levenshtein.java
new file mode 100644
index 00000000000..c1095b27a26
--- /dev/null
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Levenshtein.java
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import 
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.IntegerType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'levenshtein'.
+ *
+ * Binary scalar function named "levenshtein". It accepts either two VARCHAR
+ * or two STRING arguments and is typed as INT in both cases
+ * (see {@link #SIGNATURES}).
+ */
+public class Levenshtein extends ScalarFunction
+        implements BinaryExpression, ExplicitlyCastableSignature, 
PropagateNullable {
+
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            
FunctionSignature.ret(IntegerType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT, 
VarcharType.SYSTEM_DEFAULT),
+            
FunctionSignature.ret(IntegerType.INSTANCE).args(StringType.INSTANCE, 
StringType.INSTANCE)
+    );
+
+    /**
+     * constructor with 2 arguments.
+     *
+     * @param arg0 the first string expression
+     * @param arg1 the second string expression
+     */
+    public Levenshtein(Expression arg0, Expression arg1) {
+        super("levenshtein", arg0, arg1);
+    }
+
+    /** constructor for withChildren and reuse signature */
+    private Levenshtein(ScalarFunctionParams functionParams) {
+        super(functionParams);
+    }
+
+    /**
+     * withChildren.
+     *
+     * Builds a copy of this function over the given children.
+     *
+     * @param children the replacement children; exactly two are required
+     * @return a new Levenshtein built from getFunctionParams(children)
+     */
+    @Override
+    public Levenshtein withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 2); // binary function: reject any other arity
+        return new Levenshtein(getFunctionParams(children));
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitLevenshtein(this, context); // dispatch to the function-specific visitor hook
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index 52194f5bc5b..ce9deb2776d 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -248,6 +248,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Gcd;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.GetFormat;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.GetVariantType;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Greatest;
+import 
org.apache.doris.nereids.trees.expressions.functions.scalar.HammingDistance;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Hex;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.HllCardinality;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.HllEmpty;
@@ -338,6 +339,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Lcm;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Least;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Left;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Length;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.Levenshtein;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Ln;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Locate;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Log;
@@ -1898,6 +1900,14 @@ public interface ScalarFunctionVisitor<R, C> {
         return visitScalarFunction(locate, context);
     }
 
+    default R visitHammingDistance(HammingDistance hammingDistance, C context) 
{
+        return visitScalarFunction(hammingDistance, context); // default: no function-specific handling, fall back to the generic scalar-function visit
+    }
+
+    default R visitLevenshtein(Levenshtein levenshtein, C context) {
+        return visitScalarFunction(levenshtein, context); // default: no function-specific handling, fall back to the generic scalar-function visit
+    }
+
     default R visitLog(Log log, C context) {
         return visitScalarFunction(log, context);
     }
diff --git 
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out
 
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out
index d126d2cd8ea..c770570ff89 100644
--- 
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out
+++ 
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out
@@ -965,6 +965,202 @@ S530      S530
 -- !soundex_330 --
 R163   R163
 
+-- !levenshtein_331 --
+0      3       2       1       1
+
+-- !levenshtein_332 --
+0      3       3       \N      \N
+
+-- !levenshtein_333 --
+2      1       1
+
+-- !levenshtein_334 --
+\N     2       1
+
+-- !levenshtein_tbl --
+1      3
+2      0
+3      1
+4      \N
+5      1
+6      2
+7      3
+8      2
+9      1
+
+-- !levenshtein_nn_vector_vector --
+1      1
+2      4
+3      1
+4      2
+
+-- !levenshtein_nn_vector_scalar_ascii --
+1      0
+
+-- !levenshtein_nn_scalar_vector_ascii --
+1      0
+
+-- !levenshtein_nn_vector_scalar_utf8 --
+3      1
+
+-- !levenshtein_nn_scalar_vector_utf8 --
+3      1
+
+-- !levenshtein_vector_scalar_nullable --
+2      0
+4      \N
+
+-- !levenshtein_scalar_vector_nullable --
+2      0
+4      \N
+
+-- !levenshtein_vector_scalar_utf8 --
+5      1
+7      2
+
+-- !levenshtein_scalar_vector_utf8 --
+5      1
+7      2
+
+-- !levenshtein_vector_scalar_empty_utf8 --
+5      3
+8      2
+
+-- !levenshtein_scalar_vector_empty_utf8 --
+5      3
+8      2
+
+-- !levenshtein_lv_nn_vector_vector --
+1      3
+2      3
+3      3
+4      3
+5      1
+6      1
+7      3
+8      3
+
+-- !levenshtein_lv_nn_vector_scalar_empty --
+1      3
+2      6
+3      0
+4      3
+5      2
+6      3
+7      0
+8      3
+
+-- !levenshtein_lv_nn_scalar_vector_empty --
+1      3
+2      6
+3      0
+4      3
+5      2
+6      3
+7      0
+8      3
+
+-- !levenshtein_lv_nn_vector_scalar_ascii --
+1      3
+2      0
+3      6
+4      3
+
+-- !levenshtein_lv_nn_scalar_vector_ascii --
+1      3
+2      0
+3      6
+4      3
+
+-- !levenshtein_lv_nn_vector_scalar_utf8 --
+5      1
+6      0
+7      3
+8      0
+
+-- !levenshtein_lv_nn_scalar_vector_utf8 --
+5      1
+6      0
+7      3
+8      0
+
+-- !hamming_distance_333 --
+0      0       1       1
+
+-- !hamming_distance_334 --
+0      \N      \N
+
+-- !hamming_distance_335 --
+4      1       2
+
+-- !hamming_distance_336 --
+\N     \N      \N
+
+-- !hamming_distance_tbl --
+1      0
+2      1
+3      1
+4      \N
+5      4
+6      1
+7      2
+
+-- !hamming_distance_nn_vector_vector --
+1      1
+2      4
+3      1
+4      2
+
+-- !hamming_distance_nn_vector_scalar_ascii --
+1      0
+
+-- !hamming_distance_nn_scalar_vector_ascii --
+1      0
+
+-- !hamming_distance_nn_vector_scalar_utf8 --
+3      1
+
+-- !hamming_distance_nn_scalar_vector_utf8 --
+3      1
+
+-- !hamming_distance_vector_scalar_nullable --
+1      0
+2      0
+4      \N
+
+-- !hamming_distance_scalar_vector_nullable --
+1      0
+2      0
+4      \N
+
+-- !hamming_distance_vector_scalar_nullable_utf8 --
+3      1
+
+-- !hamming_distance_scalar_vector_nullable_utf8 --
+3      1
+
+-- !hamming_distance_left_const_null_nullable --
+1      \N
+4      \N
+
+-- !hamming_distance_right_const_null_nullable --
+1      \N
+4      \N
+
+-- !hamming_distance_left_cast_null_nullable --
+1      \N
+4      \N
+
+-- !hamming_distance_right_cast_null_nullable --
+1      \N
+4      \N
+
+-- !nereids_levenshtein_337 --
+3      \N      1
+
+-- !nereids_hamming_distance_338 --
+1      \N      1
+
 -- !space_333 --
      
 
@@ -1411,4 +1607,3 @@ Hello     Test123
 
 -- !xpath_string_486 --
 123
-
diff --git 
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy
 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy
index 9d7123b03a7..094014504bd 100644
--- 
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy
+++ 
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy
@@ -753,6 +753,195 @@ suite("string_functions_all") {
     testFoldConst("SELECT soundex('R@b-e123rt'), soundex('Robert');")
     // SOUNDEX tests with non-ASCII characters - Skipped (not supported)
 
+    // LEVENSHTEIN tests
+    qt_levenshtein_331 "SELECT levenshtein('', ''), levenshtein('kitten', 
'sitting'), levenshtein('flaw', 'lawn'), levenshtein('你好', '你们'), 
levenshtein('数据库', '数据');"
+    testFoldConst("SELECT levenshtein('', ''), levenshtein('kitten', 
'sitting'), levenshtein('flaw', 'lawn'), levenshtein('你好', '你们'), 
levenshtein('数据库', '数据');")
+    qt_levenshtein_332 "SELECT levenshtein('abc', 'abc'), levenshtein('abc', 
''), levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc', 
NULL);"
+    testFoldConst("SELECT levenshtein('abc', 'abc'), levenshtein('abc', ''), 
levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc', NULL);")
+    qt_levenshtein_333 "SELECT levenshtein('abcd', 'abdc'), levenshtein('你好呀', 
'你好'), levenshtein('a你b', 'a们b');"
+    testFoldConst("SELECT levenshtein('abcd', 'abdc'), levenshtein('你好呀', 
'你好'), levenshtein('a你b', 'a们b');")
+    qt_levenshtein_334 "SELECT levenshtein(NULL, NULL), levenshtein('', '你好'), 
levenshtein('你好世界', '你好世间');"
+    testFoldConst("SELECT levenshtein(NULL, NULL), levenshtein('', '你好'), 
levenshtein('你好世界', '你好世间');")
+    sql """DROP TABLE IF EXISTS string_distance_lv_test"""
+    sql """
+        CREATE TABLE IF NOT EXISTS string_distance_lv_test (
+            id int,
+            s1 VARCHAR,
+            s2 VARCHAR
+        )
+        DISTRIBUTED BY HASH(id) BUCKETS 1
+        PROPERTIES ("replication_num"="1")
+    """
+    sql """
+        insert into string_distance_lv_test values
+        (1, 'kitten', 'sitting'),
+        (2, 'abc', 'abc'),
+        (3, '数据库', '数据'),
+        (4, null, 'abc'),
+        (5, '你好呀', '你好'),
+        (6, 'abcd', 'abdc'),
+        (7, '', '数据库'),
+        (8, '你好', ''),
+        (9, '数据', '数据库')
+    """
+    qt_levenshtein_tbl "SELECT id, levenshtein(s1, s2) FROM 
string_distance_lv_test ORDER BY id"
+
+    sql """DROP TABLE IF EXISTS string_distance_nn_test"""
+    sql """
+        CREATE TABLE IF NOT EXISTS string_distance_nn_test (
+            id int,
+            s1 VARCHAR(20) NOT NULL,
+            s2 VARCHAR(20) NOT NULL
+        )
+        DISTRIBUTED BY HASH(id) BUCKETS 1
+        PROPERTIES ("replication_num"="1")
+    """
+    sql """
+        insert into string_distance_nn_test values
+        (1, 'abc', 'abd'),
+        (2, 'abcd', 'wxyz'),
+        (3, '你好', '你们'),
+        (4, '数据库', '数库据')
+    """
+    qt_levenshtein_nn_vector_vector "SELECT id, levenshtein(s1, s2) FROM 
string_distance_nn_test ORDER BY id"
+    qt_levenshtein_nn_vector_scalar_ascii "SELECT id, levenshtein(s1, 'abc') 
FROM string_distance_nn_test WHERE id = 1 ORDER BY id"
+    qt_levenshtein_nn_scalar_vector_ascii "SELECT id, levenshtein('abc', s1) 
FROM string_distance_nn_test WHERE id = 1 ORDER BY id"
+    qt_levenshtein_nn_vector_scalar_utf8 "SELECT id, levenshtein(s1, '你们') 
FROM string_distance_nn_test WHERE id = 3 ORDER BY id"
+    qt_levenshtein_nn_scalar_vector_utf8 "SELECT id, levenshtein('你们', s1) 
FROM string_distance_nn_test WHERE id = 3 ORDER BY id"
+    qt_levenshtein_vector_scalar_nullable "SELECT id, levenshtein(s1, 'abc') 
FROM string_distance_lv_test WHERE id IN (2, 4) ORDER BY id"
+    qt_levenshtein_scalar_vector_nullable "SELECT id, levenshtein('abc', s1) 
FROM string_distance_lv_test WHERE id IN (2, 4) ORDER BY id"
+    qt_levenshtein_vector_scalar_utf8 "SELECT id, levenshtein(s1, '你好') FROM 
string_distance_lv_test WHERE id IN (5, 7) ORDER BY id"
+    qt_levenshtein_scalar_vector_utf8 "SELECT id, levenshtein('你好', s1) FROM 
string_distance_lv_test WHERE id IN (5, 7) ORDER BY id"
+    qt_levenshtein_vector_scalar_empty_utf8 "SELECT id, levenshtein(s1, '') 
FROM string_distance_lv_test WHERE id IN (5, 8) ORDER BY id"
+    qt_levenshtein_scalar_vector_empty_utf8 "SELECT id, levenshtein('', s1) 
FROM string_distance_lv_test WHERE id IN (5, 8) ORDER BY id"
+
+    sql """DROP TABLE IF EXISTS string_distance_lv_nn_test"""
+    sql """
+        CREATE TABLE IF NOT EXISTS string_distance_lv_nn_test (
+            id int,
+            s1 VARCHAR(20) NOT NULL,
+            s2 VARCHAR(20) NOT NULL
+        )
+        DISTRIBUTED BY HASH(id) BUCKETS 1
+        PROPERTIES ("replication_num"="1")
+    """
+    sql """
+        insert into string_distance_lv_nn_test values
+        (1, 'abc', 'abcdef'),
+        (2, 'abcdef', 'abc'),
+        (3, '', 'abc'),
+        (4, 'abc', ''),
+        (5, '数据', '数据库'),
+        (6, '数据库', '数据'),
+        (7, '', '数据库'),
+        (8, '数据库', '')
+    """
+    qt_levenshtein_lv_nn_vector_vector "SELECT id, levenshtein(s1, s2) FROM 
string_distance_lv_nn_test ORDER BY id"
+    qt_levenshtein_lv_nn_vector_scalar_empty "SELECT id, levenshtein(s1, '') 
FROM string_distance_lv_nn_test ORDER BY id"
+    qt_levenshtein_lv_nn_scalar_vector_empty "SELECT id, levenshtein('', s1) 
FROM string_distance_lv_nn_test ORDER BY id"
+    qt_levenshtein_lv_nn_vector_scalar_ascii "SELECT id, levenshtein(s1, 
'abcdef') FROM string_distance_lv_nn_test WHERE id IN (1, 2, 3, 4) ORDER BY id"
+    qt_levenshtein_lv_nn_scalar_vector_ascii "SELECT id, levenshtein('abcdef', 
s1) FROM string_distance_lv_nn_test WHERE id IN (1, 2, 3, 4) ORDER BY id"
+    qt_levenshtein_lv_nn_vector_scalar_utf8 "SELECT id, levenshtein(s1, '数据库') 
FROM string_distance_lv_nn_test WHERE id IN (5, 6, 7, 8) ORDER BY id"
+    qt_levenshtein_lv_nn_scalar_vector_utf8 "SELECT id, levenshtein('数据库', s1) 
FROM string_distance_lv_nn_test WHERE id IN (5, 6, 7, 8) ORDER BY id"
+
+    // HAMMING_DISTANCE tests
+    qt_hamming_distance_333 "SELECT hamming_distance('', ''), 
hamming_distance('abc', 'abc'), hamming_distance('abc', 'abd'), 
hamming_distance('你好', '你们');"
+    testFoldConst("SELECT hamming_distance('', ''), hamming_distance('abc', 
'abc'), hamming_distance('abc', 'abd'), hamming_distance('你好', '你们');")
+    qt_hamming_distance_334 "SELECT hamming_distance('abc', 'abc'), 
hamming_distance(NULL, 'abc'), hamming_distance('abc', NULL);"
+    testFoldConst("SELECT hamming_distance('abc', 'abc'), 
hamming_distance(NULL, 'abc'), hamming_distance('abc', NULL);")
+    qt_hamming_distance_335 "SELECT hamming_distance('abcd', 'wxyz'), 
hamming_distance('你好吗', '你们吗'), hamming_distance('数据库', '数库据');"
+    testFoldConst("SELECT hamming_distance('abcd', 'wxyz'), 
hamming_distance('你好吗', '你们吗'), hamming_distance('数据库', '数库据');")
+    qt_hamming_distance_336 "SELECT hamming_distance(NULL, NULL), 
hamming_distance(NULL, 'addd'), hamming_distance('addd', NULL);"
+    testFoldConst("SELECT hamming_distance(NULL, NULL), hamming_distance(NULL, 
'addd'), hamming_distance('addd', NULL);")
+    sql """ set debug_skip_fold_constant = false; """
+    test {
+        sql "SELECT hamming_distance('abc', 'ab');"
+        exception "hamming_distance requires strings of the same length"
+    }
+    test {
+        sql "SELECT hamming_distance('你好', '你');"
+        exception "hamming_distance requires strings of the same length"
+    }
+    sql """ set debug_skip_fold_constant = true; """
+    test {
+        sql "SELECT hamming_distance('abc', 'ab');"
+        exception "hamming_distance requires strings of the same length"
+    }
+    test {
+        sql "SELECT hamming_distance('你好', '你');"
+        exception "hamming_distance requires strings of the same length"
+    }
+    sql """ set debug_skip_fold_constant = false; """
+    sql """DROP TABLE IF EXISTS string_distance_hd_test"""
+    sql """
+        CREATE TABLE IF NOT EXISTS string_distance_hd_test (
+            id int,
+            s1 VARCHAR,
+            s2 VARCHAR
+        )
+        DISTRIBUTED BY HASH(id) BUCKETS 1
+        PROPERTIES ("replication_num"="1")
+    """
+    sql """
+        insert into string_distance_hd_test values
+        (1, 'abc', 'abc'),
+        (2, 'abc', 'abd'),
+        (3, '你好', '你们'),
+        (4, null, 'abc'),
+        (5, 'abcd', 'wxyz'),
+        (6, '你好吗', '你们吗'),
+        (7, '数据库', '数库据')
+    """
+    qt_hamming_distance_tbl "SELECT id, hamming_distance(s1, s2) FROM 
string_distance_hd_test ORDER BY id"
+    qt_hamming_distance_nn_vector_vector "SELECT id, hamming_distance(s1, s2) 
FROM string_distance_nn_test ORDER BY id"
+    qt_hamming_distance_nn_vector_scalar_ascii "SELECT id, 
hamming_distance(s1, 'abc') FROM string_distance_nn_test WHERE id = 1 ORDER BY 
id"
+    qt_hamming_distance_nn_scalar_vector_ascii "SELECT id, 
hamming_distance('abc', s1) FROM string_distance_nn_test WHERE id = 1 ORDER BY 
id"
+    qt_hamming_distance_nn_vector_scalar_utf8 "SELECT id, hamming_distance(s1, 
'你们') FROM string_distance_nn_test WHERE id = 3 ORDER BY id"
+    qt_hamming_distance_nn_scalar_vector_utf8 "SELECT id, 
hamming_distance('你们', s1) FROM string_distance_nn_test WHERE id = 3 ORDER BY 
id"
+    qt_hamming_distance_vector_scalar_nullable "SELECT id, 
hamming_distance(s1, 'abc') FROM string_distance_hd_test WHERE id IN (1, 2, 4) 
ORDER BY id"
+    qt_hamming_distance_scalar_vector_nullable "SELECT id, 
hamming_distance('abc', s1) FROM string_distance_hd_test WHERE id IN (1, 2, 4) 
ORDER BY id"
+    qt_hamming_distance_vector_scalar_nullable_utf8 "SELECT id, 
hamming_distance(s1, '你们') FROM string_distance_hd_test WHERE id = 3 ORDER BY 
id"
+    qt_hamming_distance_scalar_vector_nullable_utf8 "SELECT id, 
hamming_distance('你们', s1) FROM string_distance_hd_test WHERE id = 3 ORDER BY 
id"
+    qt_hamming_distance_left_const_null_nullable "SELECT id, 
hamming_distance(NULL, s1) FROM string_distance_hd_test WHERE id IN (1, 4) 
ORDER BY id"
+    qt_hamming_distance_right_const_null_nullable "SELECT id, 
hamming_distance(s1, NULL) FROM string_distance_hd_test WHERE id IN (1, 4) 
ORDER BY id"
+    qt_hamming_distance_left_cast_null_nullable "SELECT id, 
hamming_distance(CAST(NULL AS STRING), s1) FROM string_distance_hd_test WHERE 
id IN (1, 4) ORDER BY id"
+    qt_hamming_distance_right_cast_null_nullable "SELECT id, 
hamming_distance(s1, CAST(NULL AS STRING)) FROM string_distance_hd_test WHERE 
id IN (1, 4) ORDER BY id"
+    test {
+        sql "SELECT hamming_distance(s1, 'ab') FROM string_distance_hd_test 
WHERE id = 1"
+        exception "hamming_distance requires strings of the same length"
+    }
+    test {
+        sql "SELECT hamming_distance('ab', s1) FROM string_distance_hd_test 
WHERE id = 1"
+        exception "hamming_distance requires strings of the same length"
+    }
+    test {
+        sql "SELECT hamming_distance(s1, '你') FROM string_distance_hd_test 
WHERE id = 3"
+        exception "hamming_distance requires strings of the same length"
+    }
+    test {
+        sql "SELECT hamming_distance(s1, 'ab') FROM string_distance_nn_test 
WHERE id = 1"
+        exception "hamming_distance requires strings of the same length"
+    }
+    test {
+        sql "SELECT hamming_distance('ab', s1) FROM string_distance_nn_test 
WHERE id = 1"
+        exception "hamming_distance requires strings of the same length"
+    }
+    test {
+        sql "SELECT hamming_distance(s1, '你') FROM string_distance_nn_test 
WHERE id = 3"
+        exception "hamming_distance requires strings of the same length"
+    }
+
+    sql """ set enable_nereids_planner=true, 
enable_fallback_to_original_planner=false; """
+    qt_nereids_levenshtein_337 "SELECT levenshtein('kitten', 'sitting'), 
levenshtein(NULL, 'abc'), levenshtein('你好世界', '你好世间');"
+    testFoldConst("SELECT levenshtein('kitten', 'sitting'), levenshtein(NULL, 
'abc'), levenshtein('你好世界', '你好世间');")
+    qt_nereids_hamming_distance_338 "SELECT hamming_distance('abcd', 'abcf'), 
hamming_distance(NULL, 'addd'), hamming_distance('你好', '你们');"
+    testFoldConst("SELECT hamming_distance('abcd', 'abcf'), 
hamming_distance(NULL, 'addd'), hamming_distance('你好', '你们');")
+    test {
+        sql "SELECT hamming_distance('abc', 'ab');"
+        exception "hamming_distance requires strings of the same length"
+    }
+    sql """ set enable_nereids_planner=false, 
enable_fallback_to_original_planner=true; """
+
     // SPACE tests
     qt_space_333 "SELECT space(5);"
     testFoldConst("SELECT space(5);")
@@ -1092,4 +1281,4 @@ suite("string_functions_all") {
     testFoldConst("SELECT xpath_string(NULL, '/a');")
     qt_xpath_string_486 "SELECT xpath_string('<a><!-- comment -->123</a>', 
'/a');"
     testFoldConst("SELECT xpath_string('<a><!-- comment -->123</a>', '/a');")
-}
\ No newline at end of file
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to