This is an automated email from the ASF dual-hosted git repository.
exmy pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 547f8b7e64 [GLUTEN-8325][CH] Fix miss matched result for `$` and `.`
in reg expression (#8345)
547f8b7e64 is described below
commit 547f8b7e64589bad898916351a1f5ba99e1457ab
Author: lgbo <[email protected]>
AuthorDate: Thu Dec 26 15:16:46 2024 +0800
[GLUTEN-8325][CH] Fix miss matched result for `$` and `.` in reg expression
(#8345)
align the meaning of . and $ in vanila and ch
---
.../GlutenClickhouseStringFunctionsSuite.scala | 24 ++++++++++++++
.../scalar_function_parser/regexp_extract.cpp | 38 +++++++++++++++-------
2 files changed, 50 insertions(+), 12 deletions(-)
diff --git
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseStringFunctionsSuite.scala
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseStringFunctionsSuite.scala
index aeb07a3b30..c771260667 100644
---
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseStringFunctionsSuite.scala
+++
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseStringFunctionsSuite.scala
@@ -107,6 +107,30 @@ class GlutenClickhouseStringFunctionsSuite extends
GlutenClickHouseWholeStageTra
}
}
+ test("GLUTEN-8325 $. missmatched") {
+ withTable("regexp_string_end") {
+ sql("create table regexp_string_end(a String) using parquet")
+ sql("""
+ |insert into regexp_string_end
+ | values ('@abc'),('@abc\n'),('@abc\nsdd'), ('sfsd\n@abc'),
('sdfsdf\n@abc\n'),
+ | ('sdfsdf\n@abc\nsdf'), ('sdfsdf@abc\nsdf\n'), ('sdfsdf@abc'),
('sdfsdf@abc\n')
+ |""".stripMargin)
+ runQueryAndCompare("""
+ |select
+ |regexp_extract(a, '@(.*?)($)', 1),
+ |regexp_extract(a, '@(.*?)(f|$)', 1),
+ |regexp_extract(a, '^@(.*?)(f|$)', 1)
+ |from regexp_string_end""".stripMargin) { _ => }
+
+ runQueryAndCompare("""
+ |select
+ |regexp_extract(a, '@(.*)($)', 1),
+ |regexp_extract(a, '@(.*)(f|$)', 1),
+ |regexp_extract(a, '^@(.*?)(f|$)', 1)
+ |from regexp_string_end""".stripMargin) { _ => }
+ }
+ }
+
test("replace") {
val tableName = "replace_table"
withTable(tableName) {
diff --git
a/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
b/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
index 8b7fe8ac1e..5d7059ae6c 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
@@ -20,13 +20,16 @@
#include <DataTypes/DataTypeString.h>
#include <IO/ReadBufferFromString.h>
#include <Parser/FunctionParser.h>
+#include <Poco/Logger.h>
+#include <Common/logger_useful.h>
+#include <Common/re2.h>
namespace DB
{
namespace ErrorCodes
{
- extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
- extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
}
@@ -37,29 +40,26 @@ namespace local_engine
class FunctionParserRegexpExtract : public FunctionParser
{
public:
- explicit FunctionParserRegexpExtract(ParserContextPtr parser_context_) :
FunctionParser(parser_context_) {}
+ explicit FunctionParserRegexpExtract(ParserContextPtr parser_context_) :
FunctionParser(parser_context_) { }
~FunctionParserRegexpExtract() override = default;
static constexpr auto name = "regexp_extract";
String getName() const override { return name; }
- const ActionsDAG::Node * parse(
- const substrait::Expression_ScalarFunction & substrait_func,
- ActionsDAG & actions_dag) const override
+ const ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction
& substrait_func, ActionsDAG & actions_dag) const override
{
const auto & args = substrait_func.arguments();
if (args.size() != 3)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Function {} requires 3 arguments", getName());
-
- if(args[1].value().has_literal())
+
+ if (args[1].value().has_literal())
{
const auto & literal_expr = args[1].value().literal();
if (literal_expr.has_string())
{
std::string expr_str = literal_expr.string();
- size_t expr_size = expr_str.size();
- if (expr_str.data()[expr_size - 1] == '$')
- expr_str.replace(expr_str.find_last_of("$"), 1,
"(?:(\n)*)$");
+ /// FIXEDME: This only works for RE2
+ expr_str = adaptPatternForRE2(expr_str);
String sparkRegexp = adjustSparkRegexpRule(expr_str);
const auto * regex_expr_node =
addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(),
sparkRegexp);
@@ -76,6 +76,21 @@ public:
}
private:
+ String adaptPatternForRE2(const String & pattern_) const
+ {
+ LOG_DEBUG(getLogger("FunctionParserRegexpExtract"), "xxx original
pattern: {}", pattern_);
+ String res = pattern_;
+ // adaptation for $, see issue #8325. equal two cases in re2: $ and
\n$, but not include strings which contains \n in middle.
+ static const std::string replaced_str = "($|\\\\n$)";
+ static const re2::RE2 replace_dollar_pattern("([^\\\\])(\\$)");
+ re2::RE2::GlobalReplace(&res, replace_dollar_pattern, "\\1" +
replaced_str);
+ LOG_DEBUG(getLogger("FunctionParserRegexpExtract"), "xxx adaption for
$: {}", res);
+
+ // adaption for `.` . Need to remove flag s.
+ res = "(?-s)" + res;
+ return res;
+ }
+
String adjustSparkRegexpRule(String & str) const
{
const auto left_bracket_pos = str.find('[');
@@ -142,7 +157,6 @@ private:
strs.pop();
strs.top().append("[").append(back);
}
-
return strs.top();
}
};
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]