This is an automated email from the ASF dual-hosted git repository.
lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 80ce5b924cf [Function](exec) Add Regex_Count function (#51652)
80ce5b924cf is described below
commit 80ce5b924cf0b10968b145b98ea2634aedb87adf
Author: dwdwqfwe <[email protected]>
AuthorDate: Thu Jun 26 15:51:57 2025 +0800
[Function](exec) Add Regex_Count function (#51652)
---
be/src/vec/functions/function_regexp.cpp | 110 +++++++++++++++++++++
.../doris/catalog/BuiltinScalarFunctions.java | 2 +
.../expressions/functions/scalar/RegexpCount.java | 79 +++++++++++++++
.../expressions/visitor/ScalarFunctionVisitor.java | 5 +
.../scalar_function/regexp_count.out | Bin 0 -> 525 bytes
.../scalar_function/regexp_count.groovy | 59 +++++++++++
6 files changed, 255 insertions(+)
diff --git a/be/src/vec/functions/function_regexp.cpp
b/be/src/vec/functions/function_regexp.cpp
index 89258e0a2be..ce64e73bce9 100644
--- a/be/src/vec/functions/function_regexp.cpp
+++ b/be/src/vec/functions/function_regexp.cpp
@@ -42,6 +42,7 @@
#include "vec/core/types.h"
#include "vec/data_types/data_type.h"
#include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/data_type_number.h"
#include "vec/data_types/data_type_string.h"
#include "vec/functions/function.h"
#include "vec/functions/simple_function_factory.h"
@@ -49,6 +50,114 @@
namespace doris::vectorized {
#include "common/compile_check_begin.h"
+/// Row-wise kernel of `regexp_count(str, pattern)`: for every row it counts the
+/// non-overlapping, non-empty matches of `pattern` inside `str`.
+struct RegexpCountImpl {
+    /// Writes the match count of each row in `[0, input_rows_count)` into `result_data`.
+    ///
+    /// @param context           function context; a regex stored as THREAD_LOCAL state is
+    ///                          reused for every row instead of compiling the row's pattern.
+    /// @param argument_columns  [0] subject strings, [1] patterns.
+    /// @param input_rows_count  number of rows to evaluate.
+    /// @param result_data       output container, indexed by row.
+    /// @throws Exception        if the subject column is not a plain ColumnString, or a
+    ///                          per-row pattern is unavailable or fails to compile.
+    static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[],
+                             size_t input_rows_count, ColumnInt32::Container& result_data) {
+        const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get());
+        const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get());
+        // The subject column is dereferenced for every row; fail with a query error
+        // instead of dereferencing a null pointer when the cast did not succeed.
+        if (str_col == nullptr) {
+            throw Exception(Status::InternalError(
+                    "regexp_count: the first argument is not a plain string column"));
+        }
+        // size_t index: avoids the signed/unsigned comparison against input_rows_count.
+        for (size_t row = 0; row < input_rows_count; ++row) {
+            result_data[row] = _execute_inner_loop(context, str_col, pattern_col, row);
+        }
+    }
+
+    /// Counts the matches for the single row `index_now`.
+    ///
+    /// Uses the regex stored in the THREAD_LOCAL function state when there is one;
+    /// otherwise compiles the pattern found at `index_now` in `pattern_col`.
+    ///
+    /// @return number of non-empty matches found while scanning the row left to right.
+    /// @throws Exception  when no regex is cached and `pattern_col` is null, or when the
+    ///                    row's pattern does not compile (a warning is added to `context`
+    ///                    before throwing).
+    static int _execute_inner_loop(FunctionContext* context, const ColumnString* str_col,
+                                   const ColumnString* pattern_col, const size_t index_now) {
+        auto* re = reinterpret_cast<re2::RE2*>(
+                context->get_function_state(FunctionContext::THREAD_LOCAL));
+        // Owns the regex compiled for this row only; stays empty when a cached one exists.
+        std::unique_ptr<re2::RE2> scoped_re;
+        if (re == nullptr) {
+            // A release build must not dereference a missing pattern column.
+            if (pattern_col == nullptr) {
+                throw Exception(Status::InternalError(
+                        "regexp_count: the pattern argument is not a plain string column"));
+            }
+            std::string error_str;
+            const auto pattern = pattern_col->get_data_at(index_now);
+            const bool compiled = StringFunctions::compile_regex(pattern, &error_str, StringRef(),
+                                                                 StringRef(), scoped_re);
+            if (!compiled) {
+                context->add_warning(error_str.c_str());
+                throw Exception(Status::InvalidArgument(error_str));
+            }
+            re = scoped_re.get();
+        }
+
+        const auto str = str_col->get_data_at(index_now);
+        int count = 0;
+        size_t pos = 0;
+        while (pos < str.size) {
+            // Every attempt runs on the remaining suffix as a text of its own, so the
+            // search always starts at offset 0 of `remaining`.
+            const re2::StringPiece remaining(str.data + pos, str.size - pos);
+            re2::StringPiece match;
+
+            if (!re->Match(remaining, 0, remaining.size(), re2::RE2::UNANCHORED, &match, 1)) {
+                break;
+            }
+            if (match.empty()) {
+                // Empty matches are not counted; step one byte so the scan makes progress.
+                ++pos;
+                continue;
+            }
+            ++count;
+            // Resume right after the match, which keeps counted matches non-overlapping.
+            const auto match_start = static_cast<size_t>(match.data() - remaining.data());
+            pos += match_start + match.size();
+        }
+
+        return count;
+    }
+};
+
+/// Scalar function `regexp_count(str, pattern)`; the result column is Int32 and the
+/// per-row work is delegated to RegexpCountImpl.
+class FunctionRegexpCount : public IFunction {
+public:
+    static constexpr auto name = "regexp_count";
+
+    static FunctionPtr create() { return std::make_shared<FunctionRegexpCount>(); }
+
+    String get_name() const override { return name; }
+
+    size_t get_number_of_arguments() const override { return 2; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        return std::make_shared<DataTypeInt32>();
+    }
+
+    /// Pre-compiles the pattern once per THREAD_LOCAL scope when argument 1 is a
+    /// non-empty constant and stores it as function state. For any other scope, a
+    /// non-constant pattern, or an empty constant pattern no state is stored.
+    ///
+    /// @return InvalidArgument (also recorded through `context->set_error`) when the
+    ///         constant pattern does not compile; OK otherwise.
+    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
+        if (scope != FunctionContext::THREAD_LOCAL || !context->is_col_constant(1)) {
+            return Status::OK();
+        }
+        DCHECK(!context->get_function_state(scope));
+        const auto pattern_col = context->get_constant_col(1)->column_ptr;
+        const auto& pattern = pattern_col->get_data_at(0);
+        if (pattern.size == 0) {
+            // No state for an empty pattern: execution compiles it per row instead.
+            return Status::OK();
+        }
+
+        std::string error_str;
+        std::unique_ptr<re2::RE2> scoped_re;
+        const bool compiled = StringFunctions::compile_regex(pattern, &error_str, StringRef(),
+                                                             StringRef(), scoped_re);
+        if (!compiled) {
+            context->set_error(error_str.c_str());
+            return Status::InvalidArgument(error_str);
+        }
+        // Hand ownership over to the shared state without a release()/raw-pointer gap.
+        std::shared_ptr<re2::RE2> re = std::move(scoped_re);
+        context->set_function_state(scope, re);
+        return Status::OK();
+    }
+
+    /// Evaluates the function for `input_rows_count` rows of `block` and stores a new
+    /// Int32 column at position `result`.
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        uint32_t result, size_t input_rows_count) const override {
+        auto result_data_column = ColumnInt32::create(input_rows_count);
+        auto& result_data = result_data_column->get_data();
+
+        ColumnPtr argument_columns[2];
+
+        // The kernel casts its inputs to ColumnString and indexes them by row, so a
+        // constant subject has to be expanded to a full column first.
+        argument_columns[0] =
+                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
+        argument_columns[1] = block.get_by_position(arguments[1]).column;
+        // The pattern column is read per row only when open() stored no regex (this
+        // includes a constant empty pattern); expand it just in that case.
+        if (context->get_function_state(FunctionContext::THREAD_LOCAL) == nullptr) {
+            argument_columns[1] = argument_columns[1]->convert_to_full_column_if_const();
+        }
+        RegexpCountImpl::execute_impl(context, argument_columns, input_rows_count, result_data);
+
+        block.get_by_position(result).column = std::move(result_data_column);
+        return Status::OK();
+    }
+};
struct ThreeParamTypes {
static DataTypes get_variadic_argument_types() {
@@ -605,6 +714,7 @@ void
register_function_regexp_extract(SimpleFunctionFactory& factory) {
factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<true>>>();
factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<false>>>();
factory.register_function<FunctionRegexpFunctionality<RegexpExtractAllImpl>>();
+ factory.register_function<FunctionRegexpCount>();
}
} // namespace doris::vectorized
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index eb053c99531..a4b4485a617 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -361,6 +361,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.Quote;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Radians;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Random;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RandomBytes;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpCount;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull;
@@ -863,6 +864,7 @@ public class BuiltinScalarFunctions implements
FunctionHelper {
scalar(Radians.class, "radians"),
scalar(Random.class, "rand", "random"),
scalar(Regexp.class, "regexp"),
+ scalar(RegexpCount.class, "regexp_count"),
scalar(RegexpExtract.class, "regexp_extract"),
scalar(RegexpExtractAll.class, "regexp_extract_all"),
scalar(RegexpExtractOrNull.class, "regexp_extract_or_null"),
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/RegexpCount.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/RegexpCount.java
new file mode 100644
index 00000000000..dfb1644ca10
--- /dev/null
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/RegexpCount.java
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import
org.apache.doris.nereids.trees.expressions.functions.PropagateNullLiteral;
+import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.IntegerType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'regexp_count'. This class is generated by GenerateFunction.
+ */
+
+public class RegexpCount extends ScalarFunction
+ implements BinaryExpression, ExplicitlyCastableSignature,
PropagateNullLiteral {
+ public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+
FunctionSignature.ret(IntegerType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT,
VarcharType.SYSTEM_DEFAULT),
+
FunctionSignature.ret(IntegerType.INSTANCE).args(StringType.INSTANCE,
StringType.INSTANCE)
+ );
+
+ /**
+ * constructor with 2 arguments.
+ */
+
+ public RegexpCount(Expression arg0, Expression arg1) {
+ super("regexp_count", arg0, arg1);
+ }
+
+ /**
+ * withChildren.
+ */
+
+ @Override
+ public RegexpCount withChildren(List<Expression> children) {
+ Preconditions.checkArgument(children.size() == 2);
+ return new RegexpCount(children.get(0), children.get(1));
+ }
+
+ @Override
+ public List<FunctionSignature> getSignatures() {
+ return SIGNATURES;
+ }
+
+ @Override
+ public boolean nullable() {
+ // Return nullable if any argument is nullable
+ return children().stream().anyMatch(Expression::nullable);
+ }
+
+ @Override
+ public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+ return visitor.visitRegexpCount(this, context);
+ }
+}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index df18f9d7e62..625e3808bb8 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -361,6 +361,7 @@ import
org.apache.doris.nereids.trees.expressions.functions.scalar.Quote;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Radians;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Random;
import org.apache.doris.nereids.trees.expressions.functions.scalar.RandomBytes;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpCount;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll;
import
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull;
@@ -1859,6 +1860,10 @@ public interface ScalarFunctionVisitor<R, C> {
return visitScalarFunction(regexpReplaceOne, context);
}
+ // Visit hook for RegexpCount; the default forwards to visitScalarFunction.
+ default R visitRegexpCount(RegexpCount regexpCount, C context) {
+ return visitScalarFunction(regexpCount, context);
+ }
+
default R visitRepeat(Repeat repeat, C context) {
return visitScalarFunction(repeat, context);
}
diff --git
a/regression-test/data/nereids_function_p0/scalar_function/regexp_count.out
b/regression-test/data/nereids_function_p0/scalar_function/regexp_count.out
new file mode 100644
index 00000000000..e9f60f6adc4
Binary files /dev/null and
b/regression-test/data/nereids_function_p0/scalar_function/regexp_count.out
differ
diff --git
a/regression-test/suites/nereids_function_p0/scalar_function/regexp_count.groovy
b/regression-test/suites/nereids_function_p0/scalar_function/regexp_count.groovy
new file mode 100644
index 00000000000..01d974422da
--- /dev/null
+++
b/regression-test/suites/nereids_function_p0/scalar_function/regexp_count.groovy
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression suite for regexp_count(str, pattern): literal arguments first, then
+// empty-string edge cases, then arguments read from a table.
+suite("test_regexp_count") {
+ // Both arguments are string literals.
+ // NOTE(review): the patterns 's' (count6) and 'd+' (count8) look like '\s' and '\d+'
+ // with their backslashes lost somewhere - confirm against the expected .out file.
+ qt_basic_count1 "SELECT regexp_count('a.b:c;d', '[.:;]');"
+ qt_basic_count2 "SELECT regexp_count('a.b:c;d', '.');"
+ qt_basic_count3 "SELECT regexp_count('a.b:c;d', ':');"
+ qt_basic_count4 "SELECT regexp_count('Hello123World!', '[a-zA-Z]');"
+ qt_basic_count5 "SELECT regexp_count('a1b2c3d', '[^0-9]');"
+ qt_basic_count6 "SELECT regexp_count('Hello World\tJava\nSQL', 's');"
+ qt_basic_count7 "SELECT regexp_count('Hello, World!', '[[:punct:]]');"
+ qt_basic_count8 "SELECT regexp_count('abc123def456', 'd+');"
+ qt_basic_count9 """SELECT regexp_count('Contact us at [email protected] or
[email protected]',
+ '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+.[A-Za-z]{2,}');"""
+ qt_basic_count10 "SELECT regexp_count('An apple a day keeps the doctor
away', '[ae][a-z]*');"
+
+ // Edge cases: empty subject, empty pattern, and both empty.
+ qt_empty_string "SELECT regexp_count('', 'x');"
+ qt_empty_pattern "SELECT regexp_count('abcd', '');"
+ qt_both_empty "SELECT regexp_count('', '');"
+
+ // Table-driven cases: subject and pattern are both read from columns.
+ sql """DROP TABLE IF EXISTS `test_table_for_regexp_count`;"""
+ sql """CREATE TABLE test_table_for_regexp_count (
+ id INT,
+ text_data VARCHAR(500),
+ pattern VARCHAR(100)
+ ) PROPERTIES ("replication_num"="1");"""
+
+ // Ten rows, one pattern per row; id 9 repeats the text and pattern of id 3.
+ sql """ INSERT INTO test_table_for_regexp_count VALUES
+ (1, 'HelloWorld', '[A-Z][a-z]+'),
+ (2, 'apple123', '[a-z]{5}[0-9]'),
+ (3, 'aabbcc', '(aa|bb|cc)'),
+ (4, '123-456-7890', '[0-9][0-9][0-9]'),
+ (5, 'test,data', ','),
+ (6, 'a1b2c3', '[a-z][0-9]'),
+ (7, 'book keeper', 'oo|ee'),
+ (8, 'ababab', '(ab)(ab)(ab)'),
+ (9, 'aabbcc', '(aa|bb|cc)'),
+ (10, 'apple,banana', '[aeiou][a-z]+');
+"""
+
+ // table_basic uses the per-row pattern column; table_fixed_pattern uses the literal
+ // pattern 'e'. No NULL text_data is inserted above, so the IS NOT NULL filter keeps
+ // every row.
+ qt_table_basic "SELECT id, regexp_count(text_data, pattern) as
count_result FROM test_table_for_regexp_count ORDER BY id;"
+ qt_table_fixed_pattern "SELECT id, regexp_count(text_data, 'e') as count_e
FROM test_table_for_regexp_count WHERE text_data IS NOT NULL ORDER BY id;"
+
+ // Clean up the table created by this suite.
+ sql """DROP TABLE IF EXISTS `test_table_for_regexp_count`;"""
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]