KazeBox33 commented on code in PR #60412:
URL: https://github.com/apache/doris/pull/60412#discussion_r2936619651


##########
be/src/exprs/function/function_levenshtein.cpp:
##########
@@ -0,0 +1,209 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cstring>
+#include <vector>
+
+#include "common/status.h"
+#include "core/data_type/data_type_number.h"
+#include "core/string_ref.h"
+#include "exprs/function/function_totype.h"
+#include "exprs/function/simple_function_factory.h"
+#include "util/simd/vstring_function.h"
+
+namespace doris {
+#include "common/compile_check_begin.h"
+
+struct NameLevenshtein {
+    static constexpr auto name = "levenshtein";
+};
+
+template <typename LeftDataType, typename RightDataType>
+struct LevenshteinImpl {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    static Status vector_vector(const ColumnString::Chars& ldata,
+                                const ColumnString::Offsets& loffsets,
+                                const ColumnString::Chars& rdata,
+                                const ColumnString::Offsets& roffsets, 
ResultPaddedPODArray& res) {
+        DCHECK_EQ(loffsets.size(), roffsets.size());
+
+        const size_t size = loffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i),
+                                          string_ref_at(rdata, roffsets, i));
+        }
+        return Status::OK();
+    }
+
+    static Status vector_scalar(const ColumnString::Chars& ldata,
+                                const ColumnString::Offsets& loffsets, const 
StringRef& rdata,
+                                ResultPaddedPODArray& res) {
+        const size_t size = loffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i), 
rdata);
+        }
+        return Status::OK();
+    }
+
+    static Status scalar_vector(const StringRef& ldata, const 
ColumnString::Chars& rdata,
+                                const ColumnString::Offsets& roffsets, 
ResultPaddedPODArray& res) {
+        const size_t size = roffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            res[i] = levenshtein_distance(ldata, string_ref_at(rdata, 
roffsets, i));
+        }
+        return Status::OK();
+    }
+
+private:
+    static StringRef string_ref_at(const ColumnString::Chars& data,
+                                   const ColumnString::Offsets& offsets, 
size_t i) {
+        DCHECK_LT(i, offsets.size());
+        const size_t begin = (i == 0) ? 0 : offsets[i - 1];
+        const size_t end = offsets[i];
+        if (end <= begin || end > data.size()) {
+            return StringRef("", 0);
+        }
+
+        size_t str_size = end - begin;
+        if (str_size > 0 && data[end - 1] == '\0') {
+            --str_size;
+        }
+        return StringRef(reinterpret_cast<const char*>(data.data() + begin), 
str_size);
+    }
+
+    static void utf8_char_offsets(const StringRef& ref, std::vector<size_t>& 
offsets) {
+        offsets.clear();
+        offsets.reserve(ref.size);
+        simd::VStringFunctions::get_char_len(ref.data, ref.size, offsets);
+    }
+
+    static bool utf8_char_equal(const StringRef& left, size_t left_off, size_t 
left_next,
+                                const StringRef& right, size_t right_off, 
size_t right_next) {
+        const size_t left_len = left_next - left_off;
+        const size_t right_len = right_next - right_off;
+        return left_len == right_len &&
+               std::memcmp(left.data + left_off, right.data + right_off, 
left_len) == 0;
+    }
+
+    static Int32 levenshtein_distance_ascii(const StringRef& left, const 
StringRef& right) {
+        const StringRef* left_ref = &left;
+        const StringRef* right_ref = &right;
+        size_t m = left.size;
+        size_t n = right.size;
+
+        if (n > m) {
+            std::swap(left_ref, right_ref);
+            std::swap(m, n);
+        }
+
+        std::vector<Int32> prev(n + 1);
+        std::vector<Int32> curr(n + 1);
+        for (size_t j = 0; j <= n; ++j) {
+            prev[j] = static_cast<Int32>(j);
+        }
+
+        for (size_t i = 1; i <= m; ++i) {
+            curr[0] = static_cast<Int32>(i);
+            const char left_char = left_ref->data[i - 1];
+
+            for (size_t j = 1; j <= n; ++j) {
+                const Int32 cost = left_char == right_ref->data[j - 1] ? 0 : 1;
+                const Int32 insert_cost = curr[j - 1] + 1;
+                const Int32 delete_cost = prev[j] + 1;
+                const Int32 replace_cost = prev[j - 1] + cost;
+                curr[j] = std::min({insert_cost, delete_cost, replace_cost});
+            }
+            std::swap(prev, curr);
+        }
+
+        return prev[n];
+    }
+
+    static Int32 levenshtein_distance(const StringRef& left, const StringRef& 
right) {
+        if (simd::VStringFunctions::is_ascii(left) && 
simd::VStringFunctions::is_ascii(right)) {
+            return levenshtein_distance_ascii(left, right);
+        }
+
+        if (left.size == 0) {
+            return 
static_cast<Int32>(simd::VStringFunctions::get_char_len(right.data, 
right.size));
+        }
+        if (right.size == 0) {
+            return 
static_cast<Int32>(simd::VStringFunctions::get_char_len(left.data, left.size));
+        }
+
+        std::vector<size_t> left_offsets;
+        std::vector<size_t> right_offsets;
+        utf8_char_offsets(left, left_offsets);

Review Comment:
   done



##########
be/src/exprs/function/function_hamming_distance.cpp:
##########
@@ -0,0 +1,271 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cstring>
+#include <vector>
+
+#include "common/status.h"
+#include "core/column/column_nullable.h"
+#include "core/column/column_string.h"
+#include "core/data_type/data_type_number.h"
+#include "core/string_ref.h"
+#include "exprs/function/function_totype.h"
+#include "exprs/function/simple_function_factory.h"
+#include "util/simd/vstring_function.h"
+
+namespace doris {
+#include "common/compile_check_begin.h"
+
+struct NameHammingDistance {
+    static constexpr auto name = "hamming_distance";
+};
+
+template <typename LeftDataType, typename RightDataType>
+struct HammingDistanceImpl {
+    using ResultDataType = DataTypeInt64;
+    using ResultPaddedPODArray = PaddedPODArray<Int64>;
+
+    static Status vector_vector(const ColumnString::Chars& ldata,
+                                const ColumnString::Offsets& loffsets,
+                                const ColumnString::Chars& rdata,
+                                const ColumnString::Offsets& roffsets, 
ResultPaddedPODArray& res) {
+        DCHECK_EQ(loffsets.size(), roffsets.size());
+
+        const size_t size = loffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, i),
+                                             string_ref_at(rdata, roffsets, 
i), res[i], i));
+        }
+        return Status::OK();
+    }
+
+    static Status vector_scalar(const ColumnString::Chars& ldata,
+                                const ColumnString::Offsets& loffsets, const 
StringRef& rdata,
+                                ResultPaddedPODArray& res) {
+        const size_t size = loffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, 
i), rdata, res[i], i));
+        }
+        return Status::OK();
+    }
+
+    static Status scalar_vector(const StringRef& ldata, const 
ColumnString::Chars& rdata,
+                                const ColumnString::Offsets& roffsets, 
ResultPaddedPODArray& res) {
+        const size_t size = roffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            RETURN_IF_ERROR(hamming_distance(ldata, string_ref_at(rdata, 
roffsets, i), res[i], i));
+        }
+        return Status::OK();
+    }
+
+private:
+    static StringRef string_ref_at(const ColumnString::Chars& data,
+                                   const ColumnString::Offsets& offsets, 
size_t i) {
+        DCHECK_LT(i, offsets.size());
+        const size_t begin = (i == 0) ? 0 : offsets[i - 1];
+        const size_t end = offsets[i];
+        if (end <= begin || end > data.size()) {
+            return StringRef("", 0);
+        }
+
+        size_t str_size = end - begin;
+        if (str_size > 0 && data[end - 1] == '\0') {
+            --str_size;
+        }
+        return StringRef(reinterpret_cast<const char*>(data.data() + begin), 
str_size);
+    }
+
+    static void utf8_char_offsets(const StringRef& ref, std::vector<size_t>& 
offsets) {
+        offsets.clear();
+        offsets.reserve(ref.size);
+        simd::VStringFunctions::get_char_len(ref.data, ref.size, offsets);
+    }
+
+    static bool utf8_char_equal(const StringRef& left, size_t left_off, size_t 
left_next,
+                                const StringRef& right, size_t right_off, 
size_t right_next) {
+        const size_t left_len = left_next - left_off;
+        const size_t right_len = right_next - right_off;
+        return left_len == right_len &&
+               std::memcmp(left.data + left_off, right.data + right_off, 
left_len) == 0;
+    }
+
+public:
+    static Status hamming_distance(const StringRef& left, const StringRef& 
right, Int64& result,
+                                   size_t row) {
+        if (simd::VStringFunctions::is_ascii(left) && 
simd::VStringFunctions::is_ascii(right)) {
+            if (left.size != right.size) {
+                return Status::InvalidArgument(
+                        "hamming_distance requires strings of the same length 
at row {}", row);
+            }
+
+            Int64 distance = 0;
+            for (size_t i = 0; i < left.size; ++i) {
+                distance += static_cast<Int64>(left.data[i] != right.data[i]);
+            }
+            result = distance;
+            return Status::OK();
+        }
+
+        std::vector<size_t> left_offsets;
+        std::vector<size_t> right_offsets;
+        utf8_char_offsets(left, left_offsets);
+        utf8_char_offsets(right, right_offsets);

Review Comment:
   Done for both levenshtein and hamming_distance.



##########
be/src/exprs/function/function_hamming_distance.cpp:
##########
@@ -0,0 +1,128 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <cstring>
+#include <vector>
+
+#include "common/status.h"
+#include "util/simd/vstring_function.h"
+#include "vec/columns/column_string.h"
+#include "vec/common/string_ref.h"
+#include "vec/data_types/data_type_number.h"
+#include "vec/functions/function.h"
+#include "vec/functions/simple_function_factory.h"
+
+namespace doris::vectorized {
+#include "common/compile_check_begin.h"
+
+class FunctionHammingDistance : public IFunction {
+public:
+    static constexpr auto name = "hamming_distance";
+
+    static FunctionPtr create() { return 
std::make_shared<FunctionHammingDistance>(); }
+
+    String get_name() const override { return name; }
+
+    size_t get_number_of_arguments() const override { return 2; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const 
override {
+        return std::make_shared<DataTypeInt64>();
+    }
+
+    Status execute_impl(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
+                        uint32_t result, size_t input_rows_count) const 
override {
+        const ColumnPtr left_col = block.get_by_position(arguments[0]).column;
+        const ColumnPtr right_col = block.get_by_position(arguments[1]).column;
+
+        auto res_column = ColumnInt64::create(input_rows_count);
+        auto& res_data = res_column->get_data();
+
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            const StringRef left = left_col->get_data_at(i);
+            const StringRef right = right_col->get_data_at(i);
+            RETURN_IF_ERROR(hamming_distance(left, right, res_data[i], i));
+        }
+
+        block.replace_by_position(result, std::move(res_column));
+        return Status::OK();
+    }
+
+private:
+    static void utf8_char_offsets(const StringRef& ref, std::vector<size_t>& 
offsets) {
+        offsets.clear();
+        offsets.reserve(ref.size);
+        const char* data = ref.data;
+        size_t size = ref.size;
+        size_t i = 0;
+        while (i < size) {
+            offsets.push_back(i);
+            uint8_t char_len = UTF8_BYTE_LENGTH[static_cast<uint8_t>(data[i])];
+            if (char_len == 0) {
+                char_len = 1;
+            }
+            if (i + char_len > size) {
+                char_len = static_cast<uint8_t>(size - i);
+            }
+            i += char_len;
+        }
+    }
+
+    static inline bool utf8_char_equal(const StringRef& left, size_t left_off, 
size_t left_next,
+                                       const StringRef& right, size_t 
right_off,
+                                       size_t right_next) {
+        size_t left_len = left_next - left_off;
+        size_t right_len = right_next - right_off;
+        if (left_len != right_len) {
+            return false;
+        }
+        return std::memcmp(left.data + left_off, right.data + right_off, 
left_len) == 0;
+    }
+
+    static Status hamming_distance(const StringRef& left, const StringRef& 
right, Int64& result,
+                                   size_t row) {
+        std::vector<size_t> left_offsets;
+        std::vector<size_t> right_offsets;
+        utf8_char_offsets(left, left_offsets);

Review Comment:
   I have used `simd::VStringFunctions::is_ascii` here.



##########
be/src/exprs/function/function_levenshtein.cpp:
##########
@@ -0,0 +1,209 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cstring>
+#include <vector>
+
+#include "common/status.h"
+#include "core/data_type/data_type_number.h"
+#include "core/string_ref.h"
+#include "exprs/function/function_totype.h"
+#include "exprs/function/simple_function_factory.h"
+#include "util/simd/vstring_function.h"
+
+namespace doris {
+#include "common/compile_check_begin.h"
+
+struct NameLevenshtein {
+    static constexpr auto name = "levenshtein";
+};
+
+template <typename LeftDataType, typename RightDataType>
+struct LevenshteinImpl {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    static Status vector_vector(const ColumnString::Chars& ldata,
+                                const ColumnString::Offsets& loffsets,
+                                const ColumnString::Chars& rdata,
+                                const ColumnString::Offsets& roffsets, 
ResultPaddedPODArray& res) {
+        DCHECK_EQ(loffsets.size(), roffsets.size());
+
+        const size_t size = loffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i),
+                                          string_ref_at(rdata, roffsets, i));
+        }
+        return Status::OK();
+    }
+
+    static Status vector_scalar(const ColumnString::Chars& ldata,
+                                const ColumnString::Offsets& loffsets, const 
StringRef& rdata,
+                                ResultPaddedPODArray& res) {
+        const size_t size = loffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i), 
rdata);
+        }
+        return Status::OK();
+    }
+
+    static Status scalar_vector(const StringRef& ldata, const 
ColumnString::Chars& rdata,
+                                const ColumnString::Offsets& roffsets, 
ResultPaddedPODArray& res) {
+        const size_t size = roffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            res[i] = levenshtein_distance(ldata, string_ref_at(rdata, 
roffsets, i));
+        }
+        return Status::OK();
+    }
+
+private:
+    static StringRef string_ref_at(const ColumnString::Chars& data,
+                                   const ColumnString::Offsets& offsets, 
size_t i) {
+        DCHECK_LT(i, offsets.size());
+        const size_t begin = (i == 0) ? 0 : offsets[i - 1];
+        const size_t end = offsets[i];
+        if (end <= begin || end > data.size()) {
+            return StringRef("", 0);
+        }
+
+        size_t str_size = end - begin;
+        if (str_size > 0 && data[end - 1] == '\0') {
+            --str_size;
+        }
+        return StringRef(reinterpret_cast<const char*>(data.data() + begin), 
str_size);
+    }
+
+    static void utf8_char_offsets(const StringRef& ref, std::vector<size_t>& 
offsets) {
+        offsets.clear();
+        offsets.reserve(ref.size);
+        simd::VStringFunctions::get_char_len(ref.data, ref.size, offsets);
+    }
+
+    static bool utf8_char_equal(const StringRef& left, size_t left_off, size_t 
left_next,
+                                const StringRef& right, size_t right_off, 
size_t right_next) {
+        const size_t left_len = left_next - left_off;
+        const size_t right_len = right_next - right_off;
+        return left_len == right_len &&
+               std::memcmp(left.data + left_off, right.data + right_off, 
left_len) == 0;
+    }
+
+    static Int32 levenshtein_distance_ascii(const StringRef& left, const 
StringRef& right) {
+        const StringRef* left_ref = &left;
+        const StringRef* right_ref = &right;
+        size_t m = left.size;
+        size_t n = right.size;
+
+        if (n > m) {
+            std::swap(left_ref, right_ref);
+            std::swap(m, n);
+        }
+
+        std::vector<Int32> prev(n + 1);
+        std::vector<Int32> curr(n + 1);
+        for (size_t j = 0; j <= n; ++j) {
+            prev[j] = static_cast<Int32>(j);
+        }
+
+        for (size_t i = 1; i <= m; ++i) {
+            curr[0] = static_cast<Int32>(i);
+            const char left_char = left_ref->data[i - 1];
+
+            for (size_t j = 1; j <= n; ++j) {
+                const Int32 cost = left_char == right_ref->data[j - 1] ? 0 : 1;
+                const Int32 insert_cost = curr[j - 1] + 1;
+                const Int32 delete_cost = prev[j] + 1;
+                const Int32 replace_cost = prev[j - 1] + cost;
+                curr[j] = std::min({insert_cost, delete_cost, replace_cost});

Review Comment:
   done
   



##########
be/src/exprs/function/function_hamming_distance.cpp:
##########
@@ -0,0 +1,271 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <algorithm>
+#include <cstring>
+#include <vector>
+
+#include "common/status.h"
+#include "core/column/column_nullable.h"
+#include "core/column/column_string.h"
+#include "core/data_type/data_type_number.h"
+#include "core/string_ref.h"
+#include "exprs/function/function_totype.h"
+#include "exprs/function/simple_function_factory.h"
+#include "util/simd/vstring_function.h"
+
+namespace doris {
+#include "common/compile_check_begin.h"
+
+struct NameHammingDistance {
+    static constexpr auto name = "hamming_distance";
+};
+
+template <typename LeftDataType, typename RightDataType>
+struct HammingDistanceImpl {
+    using ResultDataType = DataTypeInt64;
+    using ResultPaddedPODArray = PaddedPODArray<Int64>;
+
+    static Status vector_vector(const ColumnString::Chars& ldata,
+                                const ColumnString::Offsets& loffsets,
+                                const ColumnString::Chars& rdata,
+                                const ColumnString::Offsets& roffsets, 
ResultPaddedPODArray& res) {
+        DCHECK_EQ(loffsets.size(), roffsets.size());
+
+        const size_t size = loffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, i),
+                                             string_ref_at(rdata, roffsets, 
i), res[i], i));
+        }
+        return Status::OK();
+    }
+
+    static Status vector_scalar(const ColumnString::Chars& ldata,
+                                const ColumnString::Offsets& loffsets, const 
StringRef& rdata,
+                                ResultPaddedPODArray& res) {
+        const size_t size = loffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, 
i), rdata, res[i], i));
+        }
+        return Status::OK();
+    }
+
+    static Status scalar_vector(const StringRef& ldata, const 
ColumnString::Chars& rdata,
+                                const ColumnString::Offsets& roffsets, 
ResultPaddedPODArray& res) {
+        const size_t size = roffsets.size();
+        res.resize(size);
+        for (size_t i = 0; i < size; ++i) {
+            RETURN_IF_ERROR(hamming_distance(ldata, string_ref_at(rdata, 
roffsets, i), res[i], i));
+        }
+        return Status::OK();
+    }
+
+private:
+    static StringRef string_ref_at(const ColumnString::Chars& data,
+                                   const ColumnString::Offsets& offsets, 
size_t i) {
+        DCHECK_LT(i, offsets.size());
+        const size_t begin = (i == 0) ? 0 : offsets[i - 1];
+        const size_t end = offsets[i];
+        if (end <= begin || end > data.size()) {
+            return StringRef("", 0);
+        }
+
+        size_t str_size = end - begin;
+        if (str_size > 0 && data[end - 1] == '\0') {
+            --str_size;
+        }
+        return StringRef(reinterpret_cast<const char*>(data.data() + begin), 
str_size);
+    }
+
+    static void utf8_char_offsets(const StringRef& ref, std::vector<size_t>& 
offsets) {
+        offsets.clear();
+        offsets.reserve(ref.size);
+        simd::VStringFunctions::get_char_len(ref.data, ref.size, offsets);
+    }
+
+    static bool utf8_char_equal(const StringRef& left, size_t left_off, size_t 
left_next,
+                                const StringRef& right, size_t right_off, 
size_t right_next) {
+        const size_t left_len = left_next - left_off;
+        const size_t right_len = right_next - right_off;
+        return left_len == right_len &&
+               std::memcmp(left.data + left_off, right.data + right_off, 
left_len) == 0;
+    }
+
+public:
+    static Status hamming_distance(const StringRef& left, const StringRef& 
right, Int64& result,
+                                   size_t row) {
+        if (simd::VStringFunctions::is_ascii(left) && 
simd::VStringFunctions::is_ascii(right)) {
+            if (left.size != right.size) {
+                return Status::InvalidArgument(
+                        "hamming_distance requires strings of the same length 
at row {}", row);
+            }
+
+            Int64 distance = 0;
+            for (size_t i = 0; i < left.size; ++i) {
+                distance += static_cast<Int64>(left.data[i] != right.data[i]);
+            }
+            result = distance;
+            return Status::OK();
+        }
+
+        std::vector<size_t> left_offsets;
+        std::vector<size_t> right_offsets;
+        utf8_char_offsets(left, left_offsets);
+        utf8_char_offsets(right, right_offsets);
+
+        if (left_offsets.size() != right_offsets.size()) {
+            return Status::InvalidArgument(
+                    "hamming_distance requires strings of the same length at 
row {}", row);
+        }
+
+        Int64 distance = 0;
+        const size_t len = left_offsets.size();
+        for (size_t i = 0; i + 1 < len; ++i) {
+            const size_t left_off = left_offsets[i];
+            const size_t left_next = left_offsets[i + 1];
+            const size_t right_off = right_offsets[i];
+            const size_t right_next = right_offsets[i + 1];
+            distance += static_cast<Int64>(
+                    !utf8_char_equal(left, left_off, left_next, right, 
right_off, right_next));
+        }
+        if (len > 0) {
+            const size_t left_off = left_offsets[len - 1];
+            const size_t right_off = right_offsets[len - 1];
+            distance += static_cast<Int64>(
+                    !utf8_char_equal(left, left_off, left.size, right, 
right_off, right.size));
+        }
+
+        result = distance;
+        return Status::OK();
+    }
+};
+
+template <template <typename, typename> typename Impl, typename Name>
+class FunctionBinaryStringToTypeWithNull : public IFunction {

Review Comment:
   This is because `hamming_distance` needs different behavior from
`FunctionBinaryToType`: it must return NULL when any argument is NULL, but still
return `InvalidArgument` for non-NULL inputs of unequal length. I used this
wrapper to keep that row-level NULL handling explicit.
   Without this wrapper, `hamming_distance("abc", NULL)` would return
`InvalidArgument` instead of NULL.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to