This is an automated email from the ASF dual-hosted git repository.
zhangzc pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new c620f4f3f [GLUTEN-5898][CH] Fix regexp_extract function use bracket
has diff behaver with spark (#5908)
c620f4f3f is described below
commit c620f4f3fa215906125f6c3e979c76dea8a5b30c
Author: Shuai li <[email protected]>
AuthorDate: Thu May 30 17:04:17 2024 +0800
[GLUTEN-5898][CH] Fix regexp_extract function use bracket has diff behaver
with spark (#5908)
[CH] Fix regexp_extract behaving differently from Spark when the pattern contains brackets
---
.../execution/GlutenFunctionValidateSuite.scala | 20 ++++++
.../scalar_function_parser/regexp_extract.cpp | 80 +++++++++++++++++++++-
2 files changed, 97 insertions(+), 3 deletions(-)
diff --git
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
index 7b52a970e..5a1ca6799 100644
---
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
+++
b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala
@@ -708,4 +708,24 @@ class GlutenFunctionValidateSuite extends
GlutenClickHouseWholeStageTransformerS
}
+ test("GLUTEN-5897: fix regexp_extract with bracket") { // regexp_extract over patterns containing nested or stray brackets
+ withTable("regexp_extract_bracket") { // presumably drops the table once the body finishes — confirm against the withTable helper
+ sql("create table regexp_extract_bracket(a String) using parquet") // single string column `a`
+ sql(
+ """
+ |insert into regexp_extract_bracket values
('123.123abc-abc'),('123-LOW'),('123]abc-abc')
+ |""".stripMargin) // three rows: one with a '.', one with neither '.' nor ']', one with a literal ']'
+
+ val sql_str = // NOTE(review): the first two select expressions below are identical — confirm whether a different pattern was intended
+ s"""select
+ | regexp_extract(a, '([0-9][[\\.][0-9]]*)', 1)
+ | , regexp_extract(a, '([0-9][[\\.][0-9]]*)', 1)
+ | , regexp_extract(a, '([0-9][[]]]*)', 1)
+ | from regexp_extract_bracket
+ """.stripMargin
+
+ runQueryAndCompare(sql_str) { _ => } // the callback ignores its argument, so the only checks are whatever runQueryAndCompare performs itself
+ }
+ }
+
}
diff --git
a/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
b/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
index 2c0eeff1c..8f75baf68 100644
--- a/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
+++ b/cpp-ch/local-engine/Parser/scalar_function_parser/regexp_extract.cpp
@@ -15,6 +15,8 @@
* limitations under the License.
*/
+#include <stack>
+
#include <Parser/FunctionParser.h>
namespace DB
@@ -56,10 +58,11 @@ public:
size_t expr_size = expr_str.size();
if (expr_str.data()[expr_size - 1] == '$')
expr_str.replace(expr_str.find_last_of("$"), 1,
"(?:(\n)*)$");
-
- const auto * regex_expr_node =
addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(),
expr_str);
+
+ String sparkRegexp = adjustSparkRegexpRule(expr_str);
+ const auto * regex_expr_node =
addColumnToActionsDAG(actions_dag, std::make_shared<DataTypeString>(),
sparkRegexp);
auto parsed_args = parseFunctionArguments(substrait_func, "",
actions_dag);
- parsed_args[1] = regex_expr_node;
+ parsed_args[1] = regex_expr_node;
const auto * result_node = toFunctionNode(actions_dag,
"regexpExtract", parsed_args);
return convertNodeTypeIfNeeded(substrait_func, result_node,
actions_dag);
}
@@ -69,6 +72,77 @@ public:
else
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Function {}
2nd argument's type must be const", getName());
}
+
+private:
+    // Rewrites a Spark-side regular expression so that brackets nested inside a
+    // character class are flattened into the enclosing class, e.g.
+    //   "abc[a[abc]c]abc" -> "abc[aabcc]abc"
+    // (java.util.regex accepts a nested class as a union; the rewrite assumes the
+    // engine behind `regexpExtract` does not — TODO confirm).
+    //
+    // Rules applied while scanning `str` left to right:
+    //   * a backslash and the character after it are copied through verbatim, so
+    //     escaped brackets ("\[" / "\]") are never treated as class delimiters;
+    //   * a ']' with no open class is kept as a literal ("ab]c");
+    //   * a class closed at the outermost level is emitted as "[...]";
+    //   * a class closed while another one is still open contributes only its
+    //     contents to the enclosing class.
+    //
+    // Returns the rewritten pattern (equal to `str` when there is nothing to flatten).
+    // Throws ILLEGAL_TYPE_OF_ARGUMENT when a nested class was closed but its
+    // enclosing class is still open at the end of the pattern.
+    //
+    // NOTE(review): nested negations / intersections such as "[a-z&&[^b]]" are
+    // flattened like any other nested class, which changes their meaning.
+    String adjustSparkRegexpRule(String & str) const
+    {
+        // If either bracket kind is absent the scan below reproduces the input
+        // unchanged, so skip it. The relative order of the first '[' and first ']'
+        // is deliberately NOT checked: "]x[a[b]c]" still needs flattening.
+        if (str.find('[') == String::npos || str.find(']') == String::npos)
+            return str;
+
+        // Bottom element: rewritten text outside any class.
+        // Every element above it: contents of one currently open '['.
+        std::stack<String> pieces;
+        pieces.emplace("");
+        // True while the most recently closed class was a nested one, i.e. its
+        // enclosing class still has to be terminated by a ']'.
+        bool need_right_bracket = false;
+
+        for (size_t i = 0; i < str.size(); ++i)
+        {
+            const char ch = str[i];
+            if (ch == '\\')
+            {
+                // Escape sequence: keep the backslash and the escaped character
+                // together (a trailing lone backslash is copied as-is).
+                pieces.top() += ch;
+                if (i + 1 < str.size())
+                    pieces.top() += str[++i];
+            }
+            else if (ch == '[')
+            {
+                pieces.emplace("");
+            }
+            else if (ch == ']')
+            {
+                if (pieces.size() == 1)
+                {
+                    // "ab]c": no class is open, keep the literal ']'
+                    pieces.top() += ']';
+                }
+                else
+                {
+                    String inner = pieces.top();
+                    pieces.pop();
+                    if (pieces.size() == 1)
+                    {
+                        // "abc[abc]abc": outermost class closed, emit it with its brackets
+                        pieces.top().append("[").append(inner).append("]");
+                        need_right_bracket = false;
+                    }
+                    else
+                    {
+                        // "abc[a[abc]c]abc": nested class closed, merge its contents into the parent
+                        pieces.top().append(inner);
+                        need_right_bracket = true;
+                    }
+                }
+            }
+            else
+            {
+                pieces.top() += ch;
+            }
+        }
+
+        if (need_right_bracket && pieces.size() != 1)
+            throw Exception(
+                ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The value of parameter(s) 'regexp' in `{}` is invalid: '{}'", getName(), str);
+
+        // Classes that were opened but never closed: put their '[' back so the
+        // text is handed on exactly as written.
+        while (pieces.size() != 1)
+        {
+            String tail = pieces.top();
+            pieces.pop();
+            pieces.top().append("[").append(tail);
+        }
+
+        return pieces.top();
+    }
};
static FunctionParserRegister<FunctionParserRegexpExtract>
register_regexp_extract;
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]