This is an automated email from the ASF dual-hosted git repository.

lihaopeng pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 80ce5b924cf [Function](exec) Add Regex_Count function (#51652)
80ce5b924cf is described below

commit 80ce5b924cf0b10968b145b98ea2634aedb87adf
Author: dwdwqfwe <[email protected]>
AuthorDate: Thu Jun 26 15:51:57 2025 +0800

    [Function](exec) Add Regex_Count function (#51652)
---
 be/src/vec/functions/function_regexp.cpp           | 110 +++++++++++++++++++++
 .../doris/catalog/BuiltinScalarFunctions.java      |   2 +
 .../expressions/functions/scalar/RegexpCount.java  |  79 +++++++++++++++
 .../expressions/visitor/ScalarFunctionVisitor.java |   5 +
 .../scalar_function/regexp_count.out               | Bin 0 -> 525 bytes
 .../scalar_function/regexp_count.groovy            |  59 +++++++++++
 6 files changed, 255 insertions(+)

diff --git a/be/src/vec/functions/function_regexp.cpp 
b/be/src/vec/functions/function_regexp.cpp
index 89258e0a2be..ce64e73bce9 100644
--- a/be/src/vec/functions/function_regexp.cpp
+++ b/be/src/vec/functions/function_regexp.cpp
@@ -42,6 +42,7 @@
 #include "vec/core/types.h"
 #include "vec/data_types/data_type.h"
 #include "vec/data_types/data_type_nullable.h"
+#include "vec/data_types/data_type_number.h"
 #include "vec/data_types/data_type_string.h"
 #include "vec/functions/function.h"
 #include "vec/functions/simple_function_factory.h"
@@ -49,6 +50,114 @@
 
 namespace doris::vectorized {
 #include "common/compile_check_begin.h"
+/// Row-wise implementation of `regexp_count(str, pattern)`: for every row, counts the
+/// non-empty, non-overlapping matches of the pattern inside the subject string.
+struct RegexpCountImpl {
+    /// Computes the match count for each of the `input_rows_count` rows.
+    ///
+    /// @param context           function context; its THREAD_LOCAL state may hold a
+    ///                          pre-compiled regex that is then used for every row.
+    /// @param argument_columns  [0] = subject strings, [1] = patterns. Both are cast to
+    ///                          ColumnString; the pattern column is only read when no
+    ///                          pre-compiled regex is available.
+    /// @param input_rows_count  number of rows to evaluate.
+    /// @param result_data       output container; element `i` receives the count for row `i`.
+    static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[],
+                             size_t input_rows_count, ColumnInt32::Container& result_data) {
+        const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get());
+        const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get());
+        // size_t index: avoids a signed/unsigned comparison against input_rows_count.
+        for (size_t i = 0; i < input_rows_count; ++i) {
+            result_data[i] = _execute_inner_loop(context, str_col, pattern_col, i);
+        }
+    }
+
+    /// Counts the matches for the single row `index_now`.
+    ///
+    /// Uses the regex stored in the THREAD_LOCAL function state when present; otherwise
+    /// compiles the pattern found at `index_now` in `pattern_col`.
+    ///
+    /// @return number of non-empty matches found while scanning left to right.
+    /// @throws Exception (InvalidArgument) when the per-row pattern fails to compile.
+    static int _execute_inner_loop(FunctionContext* context, const ColumnString* str_col,
+                                   const ColumnString* pattern_col, const size_t index_now) {
+        auto* re = reinterpret_cast<re2::RE2*>(
+                context->get_function_state(FunctionContext::THREAD_LOCAL));
+        // Owns the regex compiled for this row only; stays empty when a cached one is used.
+        std::unique_ptr<re2::RE2> scoped_re;
+        if (re == nullptr) {
+            std::string error_str;
+            DCHECK(pattern_col);
+            const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, false));
+            bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), StringRef(),
+                                                     scoped_re);
+            if (!st) {
+                context->add_warning(error_str.c_str());
+                throw Exception(Status::InvalidArgument(error_str));
+            }
+            re = scoped_re.get();
+        }
+
+        const auto& str = str_col->get_data_at(index_now);
+        // Always hand RE2 the complete subject and move only the start offset. Matching
+        // against a re-sliced substring would make every resume point look like the
+        // beginning of the text, so `^` / `\A` would match again after each consumed match
+        // and `\b` / `\B` would be evaluated without the preceding character.
+        const re2::StringPiece text(str.data, str.size);
+        int count = 0;
+        size_t pos = 0;
+        while (pos < str.size) {
+            re2::StringPiece match;
+            if (!re->Match(text, pos, str.size, re2::RE2::UNANCHORED, &match, 1)) {
+                break;
+            }
+            if (match.empty()) {
+                // Empty matches are not counted; step one byte forward to guarantee progress.
+                pos += 1;
+                continue;
+            }
+            ++count;
+            // Resume right after the end of this match (offset relative to the full text).
+            pos = static_cast<size_t>(match.data() - text.data()) + match.size();
+        }
+
+        return count;
+    }
+};
+
+/// Scalar function `regexp_count(str, pattern)`: takes two arguments and returns an Int32
+/// column holding, per row, the count computed by RegexpCountImpl.
+class FunctionRegexpCount : public IFunction {
+public:
+    static constexpr auto name = "regexp_count";
+
+    static FunctionPtr create() { return std::make_shared<FunctionRegexpCount>(); }
+
+    String get_name() const override { return name; }
+
+    size_t get_number_of_arguments() const override { return 2; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
+        return std::make_shared<DataTypeInt32>();
+    }
+
+    /// For THREAD_LOCAL scope with a constant, non-empty pattern argument, compiles the
+    /// regex once and stores it as function state so rows do not recompile it.
+    ///
+    /// A constant empty pattern stores nothing; it is then compiled per row at execution.
+    ///
+    /// @return InvalidArgument (and sets the context error) when the constant pattern
+    ///         does not compile, OK otherwise.
+    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override {
+        if (scope == FunctionContext::THREAD_LOCAL) {
+            if (context->is_col_constant(1)) {
+                DCHECK(!context->get_function_state(scope));
+                const auto& pattern_col = context->get_constant_col(1)->column_ptr;
+                const auto& pattern = pattern_col->get_data_at(0);
+                if (pattern.size == 0) {
+                    return Status::OK();
+                }
+
+                std::string error_str;
+                std::unique_ptr<re2::RE2> scoped_re;
+                bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(),
+                                                         StringRef(), scoped_re);
+                if (!st) {
+                    context->set_error(error_str.c_str());
+                    return Status::InvalidArgument(error_str);
+                }
+                // Hand ownership to the shared state without a release()/raw-pointer step.
+                std::shared_ptr<re2::RE2> re(std::move(scoped_re));
+                context->set_function_state(scope, re);
+            }
+        }
+        return Status::OK();
+    }
+
+    /// Evaluates the function over `input_rows_count` rows and stores the resulting Int32
+    /// column at position `result` of `block`.
+    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
+                        uint32_t result, size_t input_rows_count) const override {
+        auto result_data_column = ColumnInt32::create(input_rows_count);
+        auto& result_data = result_data_column->get_data();
+
+        ColumnPtr argument_columns[2];
+
+        // RegexpCountImpl casts its inputs directly to ColumnString and indexes them per row.
+        // NOTE(review): a constant argument next to a non-constant one presumably arrives
+        // here still wrapped as a constant column, which would make that cast yield nullptr;
+        // materialize it first. Confirm against the framework's constant handling.
+        argument_columns[0] =
+                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
+        argument_columns[1] = block.get_by_position(arguments[1]).column;
+        if (context->get_function_state(FunctionContext::THREAD_LOCAL) == nullptr) {
+            // No cached regex (non-constant or empty constant pattern): the pattern column
+            // is read row by row, so it must be a full column as well. With a cached regex
+            // the pattern column is never touched and needs no expansion.
+            argument_columns[1] = argument_columns[1]->convert_to_full_column_if_const();
+        }
+        RegexpCountImpl::execute_impl(context, argument_columns, input_rows_count, result_data);
+
+        block.get_by_position(result).column = std::move(result_data_column);
+        return Status::OK();
+    }
+};
 
 struct ThreeParamTypes {
     static DataTypes get_variadic_argument_types() {
@@ -605,6 +714,7 @@ void 
register_function_regexp_extract(SimpleFunctionFactory& factory) {
     
factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<true>>>();
     
factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<false>>>();
     
factory.register_function<FunctionRegexpFunctionality<RegexpExtractAllImpl>>();
+    factory.register_function<FunctionRegexpCount>();
 }
 
 } // namespace doris::vectorized
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
index eb053c99531..a4b4485a617 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java
@@ -361,6 +361,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Quote;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Radians;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Random;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.RandomBytes;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpCount;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull;
@@ -863,6 +864,7 @@ public class BuiltinScalarFunctions implements 
FunctionHelper {
             scalar(Radians.class, "radians"),
             scalar(Random.class, "rand", "random"),
             scalar(Regexp.class, "regexp"),
+            scalar(RegexpCount.class, "regexp_count"),
             scalar(RegexpExtract.class, "regexp_extract"),
             scalar(RegexpExtractAll.class, "regexp_extract_all"),
             scalar(RegexpExtractOrNull.class, "regexp_extract_or_null"),
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/RegexpCount.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/RegexpCount.java
new file mode 100644
index 00000000000..dfb1644ca10
--- /dev/null
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/RegexpCount.java
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package org.apache.doris.nereids.trees.expressions.functions.scalar;
+
+import org.apache.doris.catalog.FunctionSignature;
+import org.apache.doris.nereids.trees.expressions.Expression;
+import 
org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature;
+import 
org.apache.doris.nereids.trees.expressions.functions.PropagateNullLiteral;
+import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression;
+import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
+import org.apache.doris.nereids.types.IntegerType;
+import org.apache.doris.nereids.types.StringType;
+import org.apache.doris.nereids.types.VarcharType;
+
+import com.google.common.base.Preconditions;
+import com.google.common.collect.ImmutableList;
+
+import java.util.List;
+
+/**
+ * ScalarFunction 'regexp_count'. This class is generated by GenerateFunction.
+ */
+
+public class RegexpCount extends ScalarFunction
+        implements BinaryExpression, ExplicitlyCastableSignature, 
PropagateNullLiteral {
+    public static final List<FunctionSignature> SIGNATURES = ImmutableList.of(
+            
FunctionSignature.ret(IntegerType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT, 
VarcharType.SYSTEM_DEFAULT),
+            
FunctionSignature.ret(IntegerType.INSTANCE).args(StringType.INSTANCE, 
StringType.INSTANCE)
+    );
+
+    /**
+     * constructor with 2 arguments.
+     */
+
+    public RegexpCount(Expression arg0, Expression arg1) {
+        super("regexp_count", arg0, arg1);
+    }
+
+    /**
+     * withChildren.
+     */
+
+    @Override
+    public RegexpCount withChildren(List<Expression> children) {
+        Preconditions.checkArgument(children.size() == 2);
+        return new RegexpCount(children.get(0), children.get(1));
+    }
+
+    @Override
+    public List<FunctionSignature> getSignatures() {
+        return SIGNATURES;
+    }
+
+    @Override
+    public boolean nullable() {
+        // Return nullable if any argument is nullable
+        return children().stream().anyMatch(Expression::nullable);
+    }
+
+    @Override
+    public <R, C> R accept(ExpressionVisitor<R, C> visitor, C context) {
+        return visitor.visitRegexpCount(this, context);
+    }
+}
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
index df18f9d7e62..625e3808bb8 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java
@@ -361,6 +361,7 @@ import 
org.apache.doris.nereids.trees.expressions.functions.scalar.Quote;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Radians;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.Random;
 import org.apache.doris.nereids.trees.expressions.functions.scalar.RandomBytes;
+import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpCount;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractAll;
 import 
org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtractOrNull;
@@ -1859,6 +1860,10 @@ public interface ScalarFunctionVisitor<R, C> {
         return visitScalarFunction(regexpReplaceOne, context);
     }
 
+    // regexp_count has no dedicated default handling: it delegates to visitScalarFunction.
+    default R visitRegexpCount(RegexpCount regexpCount, C context) {
+        return visitScalarFunction(regexpCount, context);
+    }
+
     default R visitRepeat(Repeat repeat, C context) {
         return visitScalarFunction(repeat, context);
     }
diff --git 
a/regression-test/data/nereids_function_p0/scalar_function/regexp_count.out 
b/regression-test/data/nereids_function_p0/scalar_function/regexp_count.out
new file mode 100644
index 00000000000..e9f60f6adc4
Binary files /dev/null and 
b/regression-test/data/nereids_function_p0/scalar_function/regexp_count.out 
differ
diff --git 
a/regression-test/suites/nereids_function_p0/scalar_function/regexp_count.groovy
 
b/regression-test/suites/nereids_function_p0/scalar_function/regexp_count.groovy
new file mode 100644
index 00000000000..01d974422da
--- /dev/null
+++ 
b/regression-test/suites/nereids_function_p0/scalar_function/regexp_count.groovy
@@ -0,0 +1,59 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression suite for regexp_count: literal-argument queries first, then queries
+// against a temporary table that supplies the subject and the pattern per row.
+suite("test_regexp_count") {
+    // Both arguments are literals.
+    // NOTE(review): as written, the patterns 's' (count6) and 'd+' (count8) match the
+    // literal letters s and d; if \s and \d+ were intended the backslashes are missing.
+    // Likewise '.' (count2) matches any character, not just a dot. Confirm against the
+    // expected output file before changing anything.
+    qt_basic_count1 "SELECT regexp_count('a.b:c;d', '[.:;]');"
+    qt_basic_count2 "SELECT regexp_count('a.b:c;d', '.');"
+    qt_basic_count3 "SELECT regexp_count('a.b:c;d', ':');"
+    qt_basic_count4 "SELECT regexp_count('Hello123World!', '[a-zA-Z]');"
+    qt_basic_count5 "SELECT regexp_count('a1b2c3d', '[^0-9]');"
+    qt_basic_count6 "SELECT regexp_count('Hello World\tJava\nSQL', 's');"
+    qt_basic_count7 "SELECT regexp_count('Hello, World!', '[[:punct:]]');"
+    qt_basic_count8 "SELECT regexp_count('abc123def456', 'd+');"
+    qt_basic_count9 """SELECT regexp_count('Contact us at [email protected] or 
[email protected]', 
+                    '[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+.[A-Za-z]{2,}');"""
+    qt_basic_count10 "SELECT regexp_count('An apple a day keeps the doctor 
away', '[ae][a-z]*');"
+
+    // Empty subject, empty pattern, and both empty.
+    qt_empty_string "SELECT regexp_count('', 'x');"
+    qt_empty_pattern "SELECT regexp_count('abcd', '');"
+    qt_both_empty "SELECT regexp_count('', '');"
+
+    // Table-driven cases: the table is dropped and recreated so the suite can be re-run.
+    sql """DROP TABLE IF EXISTS `test_table_for_regexp_count`;"""
+    sql """CREATE TABLE test_table_for_regexp_count (
+        id INT,
+        text_data VARCHAR(500),
+        pattern VARCHAR(100)
+    ) PROPERTIES ("replication_num"="1");"""
+
+    sql  """ INSERT INTO test_table_for_regexp_count VALUES
+    (1, 'HelloWorld', '[A-Z][a-z]+'),    
+    (2, 'apple123', '[a-z]{5}[0-9]'),    
+    (3, 'aabbcc', '(aa|bb|cc)'),         
+    (4, '123-456-7890', '[0-9][0-9][0-9]'), 
+    (5, 'test,data', ','),              
+    (6, 'a1b2c3', '[a-z][0-9]'),         
+    (7, 'book keeper', 'oo|ee'),        
+    (8, 'ababab', '(ab)(ab)(ab)'),       
+    (9, 'aabbcc', '(aa|bb|cc)'),         
+    (10, 'apple,banana', '[aeiou][a-z]+');
+"""
+
+    // table_basic takes the pattern from a column (one pattern per row);
+    // table_fixed_pattern applies the same literal pattern 'e' to every row.
+    qt_table_basic "SELECT id, regexp_count(text_data, pattern) as 
count_result FROM test_table_for_regexp_count ORDER BY id;"
+    qt_table_fixed_pattern "SELECT id, regexp_count(text_data, 'e') as count_e 
FROM test_table_for_regexp_count WHERE text_data IS NOT NULL ORDER BY id;"
+
+    // Clean up the temporary table.
+    sql """DROP TABLE IF EXISTS `test_table_for_regexp_count`;"""
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to