This is an automated email from the ASF dual-hosted git repository.

zhangzc pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new c620f4f3f [GLUTEN-5898][CH] Fix regexp_extract function use bracket 
has diff behaver with spark (#5908)
c620f4f3f is described below

commit c620f4f3fa215906125f6c3e979c76dea8a5b30c
Author: Shuai li <[email protected]>
AuthorDate: Thu May 30 17:04:17 2024 +0800

    [GLUTEN-5898][CH] Fix regexp_extract function use bracket has diff behaver 
with spark (#5908)
    
    [CH] Fix regexp_extract function use bracket has diff behaver with spark
---
 .../execution/GlutenFunctionValidateSuite.scala    | 20 ++++++
 .../scalar_function_parser/regexp_extract.cpp      | 80 +++++++++++++++++++++-
 2 files changed, 97 insertions(+), 3 deletions(-)

diff --git 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
index 7b52a970e..5a1ca6799 100644
--- 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
+++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
@@ -708,4 +708,24 @@ class GlutenFunctionValidateSuite extends 
GlutenClickHouseWholeStageTransformerS
 
   }
 
+  test("GLUTEN-5897: fix regexp_extract with bracket") {  // NOTE(review): the commit subject cites GLUTEN-5898 - confirm which issue number is right
+    withTable("regexp_extract_bracket") {  // all statements below run inside withTable for this table name
+      sql("create table regexp_extract_bracket(a String) using parquet")  // one String column named a
+      sql(  // insert three rows: one with a dot, one plain, one containing a ']'
+        """
+          |insert into regexp_extract_bracket values 
('123.123abc-abc'),('123-LOW'),('123]abc-abc')
+          |""".stripMargin)
+
+      val sql_str =  // NOTE(review): the first two select expressions below are identical - confirm whether a different pattern was intended
+        s"""select
+           |    regexp_extract(a, '([0-9][[\\.][0-9]]*)', 1)
+           |  , regexp_extract(a, '([0-9][[\\.][0-9]]*)', 1)
+           |  , regexp_extract(a, '([0-9][[]]]*)', 1)
+           |  from regexp_extract_bracket
+          """.stripMargin
+
+      runQueryAndCompare(sql_str) { _ => }  // the result callback is a no-op; presumably the helper itself compares against vanilla Spark - confirm
+    }
+  }
+
 }
diff --git 
a/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp 
b/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
index 2c0eeff1c..8f75baf68 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
@@ -15,6 +15,8 @@
  * limitations under the License.
  */
 
+#include <stack>
+
 #include <Parser/FunctionParser.h>
 
 namespace DB
@@ -56,10 +58,11 @@ public:
                 size_t expr_size = expr_str.size();
                 if (expr_str.data()[expr_size - 1] == '$')
                     expr_str.replace(expr_str.find_last_of("$"), 1, 
"(?:(\n)*)$");
-                
-                const auto * regex_expr_node = 
addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), 
expr_str);
+
+                String sparkRegexp = adjustSparkRegexpRule(expr_str);
+                const auto * regex_expr_node = 
addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), 
sparkRegexp);
                 auto parsed_args = parseFunctionArguments(substrait_func, "", 
actions_dag);
-                parsed_args[1] =  regex_expr_node;
+                parsed_args[1] = regex_expr_node;
                 const auto * result_node = toFunctionNode(actions_dag, 
"regexpExtract", parsed_args);
                 return convertNodeTypeIfNeeded(substrait_func, result_node, 
actions_dag);
             }
@@ -69,6 +72,77 @@ public:
         else
             throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {} 
2nd argument's type must be const", getName());
     }
+
+private:
+    String adjustSparkRegexpRule(String & str) const  // Returns the pattern with nested '[' ... ']' groups merged into their outermost bracket pair. NOTE(review): no statement here assigns to str, so it could likely be a const reference.
+    {
+        const auto left_bracket_pos = str.find('[');  // position of the first '[' in the pattern
+        const auto right_bracket_pos = str.find(']');  // position of the first ']' in the pattern
+
+        if (left_bracket_pos == str.npos || right_bracket_pos == str.npos || 
left_bracket_pos >= right_bracket_pos)
+            return str;  // fast path: a bracket kind is missing, or the first ']' comes before the first '[' - pattern returned unchanged
+
+        auto throw_message = [this, &str]() -> void {  // helper: throws ILLEGAL_TYPE_OF_ARGUMENT with this function's name and the pattern
+            throw Exception(
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The value of 
parameter(s) 'regexp' in `{}` is invalid: '{}'", getName(), str);
+        };
+
+        ReadBufferFromString buf(str);  // cursor used to walk the pattern one character at a time
+        std::stack<String> strs;  // one partial output per currently open '[' level; the bottom entry collects text outside any bracket
+        strs.emplace("");
+        bool nead_right_bracket = false;  // true when the most recently closed bracket was a nested (non-outermost) one
+
+        while (!buf.eof())  // single pass over the pattern
+        {
+            if (*buf.position() == '[')  // NOTE(review): there is no escape handling - a backslash-escaped bracket is treated like a bare one; confirm intended
+            {
+                strs.emplace("");  // open a new level that collects this bracket's contents
+            }
+            else if (*buf.position() == ']')
+            {
+                if (strs.size() == 1)  // ']' while no '[' is open: keep it as a literal character
+                {
+                    // "ab]c"
+                    strs.top().append("]");
+                }
+                else
+                {
+                    String back = strs.top();  // contents of the bracket being closed
+                    strs.pop();
+                    if (strs.size() == 1)  // it was the outermost bracket: re-emit it with both brackets
+                    {
+                        // "abc[abc]abc"
+                        strs.top().append("[").append(back).append("]");
+                        nead_right_bracket = false;
+                    }
+                    else  // it was nested: splice its contents into the enclosing bracket without brackets of its own
+                    {
+                        // "abc[a[abc]c]abc"
+                        strs.top().append(back);
+                        nead_right_bracket = true;
+                    }
+                }
+            }
+            else
+            {
+                strs.top() += *buf.position();  // ordinary character: append to the current level
+            }
+
+            ++buf.position();  // advance the cursor by one character
+        }
+
+        if (nead_right_bracket && strs.size() != 1)  // input ended with a bracket still open right after a nested one was closed, e.g. a[b[c]d
+            throw_message();
+
+        while (strs.size() != 1)  // remaining levels are '[' that were never closed: re-emit each with its '[' and no ']'
+        {
+            String back = strs.top();
+            strs.pop();
+            strs.top().append("[").append(back);
+        }
+
+        return strs.top();  // the bottom entry now holds the full rewritten pattern
+    }
 };
 
 static FunctionParserRegister<FunctionParserRegexpExtract> 
register_regexp_extract;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to