This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new a59fdf4ead0 [feature](function) support ngram_search function (#38226)
a59fdf4ead0 is described below
commit a59fdf4ead09103f492f49c0d4dfe5dcac7a5dc3
Author: Mryange <[email protected]>
AuthorDate: Fri Jul 26 16:07:22 2024 +0800
[feature](function) support ngram_search function (#38226)
mysql [test]>select ngram_search('123456789' , '12345' , 3);
+---------------------------------------+
| ngram_search('123456789', '12345', 3) |
+---------------------------------------+
| 0.6 |
+---------------------------------------+
1 row in set (0.01 sec)
mysql [test]>select ngram_search("abababab","babababa",2);
+-----------------------------------------+
| ngram_search('abababab', 'babababa', 2) |
+-----------------------------------------+
| 1 |
+-----------------------------------------+
1 row in set (0.01 sec)
```
doc https://github.com/apache/doris-website/pull/899
---
be/src/vec/functions/function_string.cpp | 1 +
be/src/vec/functions/function_string.h | 127 +++++++++++++++++++++
.../doris/catalog/BuiltinScalarFunctions.java | 2 +
.../expressions/functions/scalar/NgramSearch.java | 78 +++++++++++++
.../expressions/visitor/ScalarFunctionVisitor.java | 5 +
.../string_functions/test_string_function.out | Bin 4217 -> 4562 bytes
.../string_functions/test_string_function.groovy | 29 +++++
7 files changed, 242 insertions(+)
diff --git a/be/src/vec/functions/function_string.cpp
b/be/src/vec/functions/function_string.cpp
index 955468f1a16..223d32a5682 100644
--- a/be/src/vec/functions/function_string.cpp
+++ b/be/src/vec/functions/function_string.cpp
@@ -1062,6 +1062,7 @@ void register_function_string(SimpleFunctionFactory&
factory) {
factory.register_function<FunctionSubReplace<SubReplaceFourImpl>>();
factory.register_function<FunctionOverlay>();
factory.register_function<FunctionStrcmp>();
+ factory.register_function<FunctionNgramSearch>();
factory.register_alias(FunctionLeft::name, "strleft");
factory.register_alias(FunctionRight::name, "strright");
diff --git a/be/src/vec/functions/function_string.h
b/be/src/vec/functions/function_string.h
index 5e119e2146c..22eeb93591f 100644
--- a/be/src/vec/functions/function_string.h
+++ b/be/src/vec/functions/function_string.h
@@ -56,6 +56,7 @@
#include "vec/columns/column.h"
#include "vec/columns/column_const.h"
#include "vec/columns/column_vector.h"
+#include "vec/common/hash_table/phmap_fwd_decl.h"
#include "vec/common/int_exp.h"
#include "vec/common/memcmp_small.h"
#include "vec/common/memcpy_small.h"
@@ -3674,4 +3675,130 @@ private:
}
}
};
+
+class FunctionNgramSearch : public IFunction {
+public:
+ static constexpr auto name = "ngram_search";
+ static FunctionPtr create() { return
std::make_shared<FunctionNgramSearch>(); }
+ String get_name() const override { return name; }
+ size_t get_number_of_arguments() const override { return 3; }
+ DataTypePtr get_return_type_impl(const DataTypes& arguments) const
override {
+ return std::make_shared<DataTypeFloat64>();
+ }
+
+ // ngram_search(text,pattern,gram_num)
+ Status execute_impl(FunctionContext* context, Block& block, const
ColumnNumbers& arguments,
+ size_t result, size_t input_rows_count) const override
{
+ CHECK_EQ(arguments.size(), 3);
+ auto col_res = ColumnFloat64::create();
+ bool col_const[3];
+ ColumnPtr argument_columns[3];
+ for (int i = 0; i < 3; ++i) {
+ std::tie(argument_columns[i], col_const[i]) =
+
unpack_if_const(block.get_by_position(arguments[i]).column);
+ }
+ // There is no need to check if the 2-th,3-th parameters are const
here because fe has already checked them.
+ auto pattern = assert_cast<const
ColumnString*>(argument_columns[1].get())->get_data_at(0);
+ auto gram_num = assert_cast<const
ColumnInt32*>(argument_columns[2].get())->get_element(0);
+ const auto* text_col = assert_cast<const
ColumnString*>(argument_columns[0].get());
+
+ if (col_const[0]) {
+ _execute_impl<true>(text_col, pattern, gram_num, *col_res,
input_rows_count);
+ } else {
+ _execute_impl<false>(text_col, pattern, gram_num, *col_res,
input_rows_count);
+ }
+
+ block.replace_by_position(result, std::move(col_res));
+ return Status::OK();
+ }
+
+private:
+ using NgramMap = phmap::flat_hash_map<uint32_t, uint8_t>;
+ // In the map, the key is the CRC32 hash result of a substring in the
string,
+ // and the value indicates whether this hash is found in the text or
pattern.
+ constexpr static auto not_found = 0b00;
+ constexpr static auto found_in_pattern = 0b01;
+ constexpr static auto found_in_text = 0b10;
+ constexpr static auto found_in_pattern_and_text = 0b11;
+
+ uint32_t sub_str_hash(const char* data, int32_t length) const {
+ constexpr static uint32_t seed = 0;
+ return HashUtil::crc_hash(data, length, seed);
+ }
+
+ template <bool column_const>
+ void _execute_impl(const ColumnString* text_col, StringRef& pattern, int
gram_num,
+ ColumnFloat64& res, size_t size) const {
+ auto& res_data = res.get_data();
+ res_data.resize_fill(size, 0);
+ // If the length of the pattern is less than gram_num, return 0.
+ if (pattern.size < gram_num) {
+ return;
+ }
+
+ // Build a map by pattern string, which will be used repeatedly in the
following loop.
+ NgramMap pattern_map;
+ int pattern_count = get_pattern_set(pattern_map, pattern, gram_num);
+ // Each time a loop is executed, the map will be modified, so it needs
to be restored afterward.
+ std::vector<uint32_t> restore_map;
+
+ for (int i = 0; i < size; i++) {
+ auto text =
text_col->get_data_at(index_check_const<column_const>(i));
+ if (text.size < gram_num) {
+ // If the length of the text is less than gram_num, return 0.
+ continue;
+ }
+ restore_map.reserve(text.size);
+ auto [text_count, intersection_count] =
+ get_text_set(text, gram_num, pattern_map, restore_map);
+
+ // 2 * |Intersection| / (|text substr set| + |pattern substr set|)
+ res_data[i] = 2.0 * intersection_count / (text_count +
pattern_count);
+ }
+ }
+
+ size_t get_pattern_set(NgramMap& pattern_map, StringRef& pattern, int
gram_num) const {
+ size_t pattern_count = 0;
+ for (int i = 0; i + gram_num <= pattern.size; i++) {
+ uint32_t cur_hash = sub_str_hash(pattern.data + i, gram_num);
+ if (!pattern_map.contains(cur_hash)) {
+ pattern_map[cur_hash] = found_in_pattern;
+ pattern_count++;
+ }
+ }
+ return pattern_count;
+ }
+
+ pair<size_t, size_t> get_text_set(StringRef& text, int gram_num, NgramMap&
pattern_map,
+ std::vector<uint32_t>& restore_map)
const {
+ restore_map.clear();
+ //intersection_count indicates a substring both in pattern and text.
+ size_t text_count = 0, intersection_count = 0;
+ for (int i = 0; i + gram_num <= text.size; i++) {
+ uint32_t cur_hash = sub_str_hash(text.data + i, gram_num);
+ auto& val = pattern_map[cur_hash];
+ if (val == not_found) {
+ val ^= found_in_text;
+ DCHECK(val == found_in_text);
+ // only found in text
+ text_count++;
+ restore_map.push_back(cur_hash);
+ } else if (val == found_in_pattern) {
+ val ^= found_in_text;
+ DCHECK(val == found_in_pattern_and_text);
+ // found in text and pattern
+ text_count++;
+ intersection_count++;
+ restore_map.push_back(cur_hash);
+ }
+ }
+ // Restore the pattern_map.
+ for (auto& restore_hash : restore_map) {
+ pattern_map[restore_hash] ^= found_in_text;
+ }
+
+ return {text_count, intersection_count};
+ }
+};
+
} // namespace doris::vectorized
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index f57840ffd76..2c792d55904 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -313,6 +313,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.MultiSearchAl
import
org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Negative;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.NgramSearch;
import org.apache.doris.nereids.trees.expressions.functions.scalar.NonNullable;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.NotNullOrEmpty;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Now;
@@ -782,6 +783,7 @@ public class BuiltinScalarFunctions implements
FunctionHelper {
scalar(Negative.class, "negative"),
scalar(NonNullable.class, "non_nullable"),
scalar(NotNullOrEmpty.class, "not_null_or_empty"),
+ scalar(NgramSearch.class, "ngram_search"),
scalar(Now.class, "now", "current_timestamp", "localtime",
"localtimestamp"),
scalar(Nullable.class, "nullable"),
scalar(NullIf.class, "nullif"),
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/NgramSearch.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/NgramSearch.java
new file mode 100644
index 00000000000..8ac713c6a09
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/NgramSearch.java
@@ -0,0 +1,78 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.exceptions.AnalysisException;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.DoubleType;
+import org.apache.doris.nereids.types.IntegerType;
+import org.apache.doris.nereids.types.StringType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'NgramSearch'.
+ */
+public class NgramSearch extends ScalarFunction
+ implements ExplicitlyCastableSignature, PropagateNullable {
+
+ public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+
FunctionSignature.ret(DoubleType.INSTANCE).args(StringType.INSTANCE,
StringType.INSTANCE,
+ IntegerType.INSTANCE));
+
+ /**
+ * constructor with 3 argument.
+ */
+ public NgramSearch(Expression arg0, Expression arg1, Expression arg2) {
+ super("ngram_search", arg0, arg1, arg2);
+ if (!(arg1.isConstant())) {
+ throw new AnalysisException(
+ "ngram_search(text,pattern,gram_num): pattern support
const value only.");
+ }
+ if (!(arg2.isConstant())) {
+ throw new AnalysisException(
+ "ngram_search(text,pattern,gram_num): gram_num support
const value only.");
+ }
+ }
+
+ /**
+ * withChildren.
+ */
+ @Override
+ public NgramSearch withChildren(List<Expression> children) {
+ Preconditions.checkArgument(children.size() == 3);
+ return new NgramSearch(children.get(0), children.get(1),
children.get(2));
+ }
+
+ @Override
+ public List<FunctionSignature> getSignatures() {
+ return SIGNATURES;
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitNgramSearch(this, context);
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index 12fd3f119b3..0f00003fa21 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -314,6 +314,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.MultiSearchAl
import
org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash332;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.MurmurHash364;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Negative;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.NgramSearch;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.NotNullOrEmpty;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Now;
import org.apache.doris.nereids.trees.expressions.functions.scalar.NullIf;
@@ -1614,6 +1615,10 @@ public interface ScalarFunctionVisitor<R, C> {
return visitScalarFunction(negative, context);
}
+ default R visitNgramSearch(NgramSearch ngramSearch, C context) {
+ return visitScalarFunction(ngramSearch, context);
+ }
+
default R visitNotNullOrEmpty(NotNullOrEmpty notNullOrEmpty, C context) {
return visitScalarFunction(notNullOrEmpty, context);
}
diff --git
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
index 69ce99277fa..4cc5f410bbf 100644
Binary files
a/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
and
b/regression-test/data/query_p0/sql_functions/string_functions/test_string_function.out
differ
diff --git
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
index f8fe485f967..566e572bfbd 100644
---
a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
+++
b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_function.groovy
@@ -361,4 +361,33 @@ suite("test_string_function", "arrow_flight_sql") {
qt_strcmp1 """ select strcmp('a', 'abc'); """
qt_strcmp2 """ select strcmp('abc', 'abc'); """
qt_strcmp3 """ select strcmp('abcd', 'abc'); """
+
+ sql "drop table if exists test_function_ngram_search;";
+ sql """ create table test_function_ngram_search (
+ k1 int not null,
+ s string null
+ ) distributed by hash (k1) buckets 1
+ properties ("replication_num"="1");
+ """
+
+ sql """ insert into test_function_ngram_search
values(1,"fffhhhkkkk"),(2,"abc1313131"),(3,'1313131') ,(4,'abc') , (5,null)"""
+
+ qt_ngram_search1 """ select k1, ngram_search(s,'abc1313131',3) as x , s
from test_function_ngram_search order by x ;"""
+
+ qt_ngram_search2 """select ngram_search('abc','abc1313131',3); """
+ qt_ngram_search3 """select ngram_search('abc1313131','abc1313131',3); """
+ qt_ngram_search3 """select ngram_search('1313131','abc1313131',3); """
+
+
+ sql "drop table if exists test_function_ngram_search;";
+ sql """ create table test_function_ngram_search (
+ k1 int not null,
+ s string not null
+ ) distributed by hash (k1) buckets 1
+ properties ("replication_num"="1");
+ """
+
+ sql """ insert into test_function_ngram_search
values(1,"fffhhhkkkk"),(2,"abc1313131"),(3,'1313131') ,(4,'abc') """
+
+ qt_ngram_search1_not_null """ select k1, ngram_search(s,'abc1313131',3) as
x , s from test_function_ngram_search order by x ;"""
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]