This is an automated email from the ASF dual-hosted git repository.

exmy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 547f8b7e64 [GLUTEN-8325][CH] Fix miss matched result for `$` and `.` 
in reg expression (#8345)
547f8b7e64 is described below

commit 547f8b7e64589bad898916351a1f5ba99e1457ab
Author: lgbo <[email protected]>
AuthorDate: Thu Dec 26 15:16:46 2024 +0800

    [GLUTEN-8325][CH] Fix miss matched result for `$` and `.` in reg expression 
(#8345)
    
    align the meaning of . and $ in vanila and ch
---
 .../GlutenClickhouseStringFunctionsSuite.scala     | 24 ++++++++++++++
 .../scalar_function_parser/regexp_extract.cpp      | 38 +++++++++++++++-------
 2 files changed, 50 insertions(+), 12 deletions(-)

diff --git 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseStringFunctionsSuite.scala
 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseStringFunctionsSuite.scala
index aeb07a3b30..c771260667 100644
--- 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseStringFunctionsSuite.scala
+++ 
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseStringFunctionsSuite.scala
@@ -107,6 +107,30 @@ class GlutenClickhouseStringFunctionsSuite extends 
GlutenClickHouseWholeStageTra
     }
   }
 
+  test("GLUTEN-8325 $. missmatched") {
+    withTable("regexp_string_end") {
+      sql("create table regexp_string_end(a String) using parquet")
+      sql("""
+            |insert into regexp_string_end
+            | values ('@abc'),('@abc\n'),('@abc\nsdd'), ('sfsd\n@abc'), 
('sdfsdf\n@abc\n'),
+            | ('sdfsdf\n@abc\nsdf'), ('sdfsdf@abc\nsdf\n'), ('sdfsdf@abc'), 
('sdfsdf@abc\n')
+            |""".stripMargin)
+      runQueryAndCompare("""
+                           |select
+                           |regexp_extract(a, '@(.*?)($)', 1),
+                           |regexp_extract(a, '@(.*?)(f|$)', 1),
+                           |regexp_extract(a, '^@(.*?)(f|$)', 1)
+                           |from regexp_string_end""".stripMargin) { _ => }
+
+      runQueryAndCompare("""
+                           |select
+                           |regexp_extract(a, '@(.*)($)', 1),
+                           |regexp_extract(a, '@(.*)(f|$)', 1),
+                           |regexp_extract(a, '^@(.*?)(f|$)', 1)
+                           |from regexp_string_end""".stripMargin) { _ => }
+    }
+  }
+
   test("replace") {
     val tableName = "replace_table"
     withTable(tableName) {
diff --git 
a/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp 
b/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
index 8b7fe8ac1e..5d7059ae6c 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
@@ -20,13 +20,16 @@
 #include <DataTypes/DataTypeString.h>
 #include <IO/ReadBufferFromString.h>
 #include <Parser/FunctionParser.h>
+#include <Poco/Logger.h>
+#include <Common/logger_useful.h>
+#include <Common/re2.h>
 
 namespace DB
 {
 namespace ErrorCodes
 {
-    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
-    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+extern const int ILLEGAL_TYPE_OF_ARGUMENT;
 }
 }
 
@@ -37,29 +40,26 @@ namespace local_engine
 class FunctionParserRegexpExtract : public FunctionParser
 {
 public:
-    explicit FunctionParserRegexpExtract(ParserContextPtr parser_context_) : 
FunctionParser(parser_context_) {}
+    explicit FunctionParserRegexpExtract(ParserContextPtr parser_context_) : 
FunctionParser(parser_context_) { }
     ~FunctionParserRegexpExtract() override = default;
 
     static constexpr auto name = "regexp_extract";
     String getName() const override { return name; }
 
-    const ActionsDAG::Node * parse(
-        const substrait::Expression_ScalarFunction & substrait_func,
-        ActionsDAG & actions_dag) const override
+    const ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction 
& substrait_func, ActionsDAG & actions_dag) const override
     {
         const auto & args = substrait_func.arguments();
         if (args.size() != 3)
             throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, 
"Function {} requires 3 arguments", getName());
-        
-        if(args[1].value().has_literal())
+
+        if (args[1].value().has_literal())
         {
             const auto & literal_expr = args[1].value().literal();
             if (literal_expr.has_string())
             {
                 std::string expr_str = literal_expr.string();
-                size_t expr_size = expr_str.size();
-                if (expr_str.data()[expr_size - 1] == '$')
-                    expr_str.replace(expr_str.find_last_of("$"), 1, 
"(?:(\n)*)$");
+                /// FIXEDME:  This only works for RE2
+                expr_str = adaptPatternForRE2(expr_str);
 
                 String sparkRegexp = adjustSparkRegexpRule(expr_str);
                 const auto * regex_expr_node = 
addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(), 
sparkRegexp);
@@ -76,6 +76,21 @@ public:
     }
 
 private:
+    String adaptPatternForRE2(const String & pattern_) const
+    {
+        LOG_DEBUG(getLogger("FunctionParserRegexpExtract"), "xxx original 
pattern: {}", pattern_);
+        String res = pattern_;
+        // adaptation for $, see issue #8325. equal two cases in re2: $ and 
\n$, but not include strings which contains \n in middle.
+        static const std::string replaced_str = "($|\\\\n$)";
+        static const re2::RE2 replace_dollar_pattern("([^\\\\])(\\$)");
+        re2::RE2::GlobalReplace(&res, replace_dollar_pattern, "\\1" + 
replaced_str);
+        LOG_DEBUG(getLogger("FunctionParserRegexpExtract"), "xxx adaption for 
$: {}", res);
+
+        // adaption for `.` . Need to remove flag s.
+        res = "(?-s)" + res;
+        return res;
+    }
+
     String adjustSparkRegexpRule(String & str) const
     {
         const auto left_bracket_pos = str.find('[');
@@ -142,7 +157,6 @@ private:
             strs.pop();
             strs.top().append("[").append(back);
         }
-
         return strs.top();
     }
 };


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to