github-actions[bot] commented on code in PR #64064: URL: https://github.com/apache/doris/pull/64064#discussion_r3347078920
########## fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/rules/RegexpFunctionRewrite.java: ########## @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.rules.expression.rules; + +import org.apache.doris.nereids.rules.expression.ExpressionPatternMatcher; +import org.apache.doris.nereids.rules.expression.ExpressionPatternRuleFactory; +import org.apache.doris.nereids.rules.expression.ExpressionRuleType; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract; +import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplace; +import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplaceOne; +import org.apache.doris.nereids.trees.expressions.literal.IntegerLikeLiteral; +import org.apache.doris.nereids.trees.expressions.literal.Literal; +import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; + +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * Rewrites regexp functions to cheaper equivalent forms when the regexp shape proves the rewrite is safe. + */ +public class RegexpFunctionRewrite implements ExpressionPatternRuleFactory { + public static final RegexpFunctionRewrite INSTANCE = new RegexpFunctionRewrite(); + + @Override + public List<ExpressionPatternMatcher<? extends Expression>> buildRules() { + return ImmutableList.of( + matchesType(RegexpReplace.class) + .then(RegexpFunctionRewrite::rewriteRegexpReplace) + .toRule(ExpressionRuleType.REGEXP_FUNCTION_REWRITE), + matchesType(RegexpExtract.class) + .then(RegexpFunctionRewrite::rewriteRegexpExtract) + .toRule(ExpressionRuleType.REGEXP_FUNCTION_REWRITE) + ); + } + + private static Expression rewriteRegexpReplace(RegexpReplace regexpReplace) { + String pattern = getStringLiteral(regexpReplace.child(1)); + if (pattern == null || pattern.isEmpty()) { + return regexpReplace; + } + if (!startsWithUnescapedCaret(pattern) && !endsWithUnescapedDollar(pattern)) { + return regexpReplace; + } + if (hasUnescapedAlternation(pattern)) { + return regexpReplace; + } + + if (regexpReplace.arity() == 3) { + return new RegexpReplaceOne(regexpReplace.child(0), regexpReplace.child(1), regexpReplace.child(2)); + } + return new RegexpReplaceOne(regexpReplace.child(0), regexpReplace.child(1), regexpReplace.child(2), + regexpReplace.child(3)); + } + + private static Expression rewriteRegexpExtract(RegexpExtract regexpExtract) { + String pattern = getStringLiteral(regexpExtract.child(1)); + if (pattern == null || pattern.isEmpty() || !isPositiveGroupIndex(regexpExtract.child(2)) + || !hasCapturingGroup(pattern) || hasUnescapedAlternation(pattern)) { + return regexpExtract; + } + + String trimmedPattern = trimExtractPattern(pattern); + if (trimmedPattern.equals(pattern)) { + return regexpExtract; + } + return new RegexpExtract(regexpExtract.child(0), new VarcharLiteral(trimmedPattern), regexpExtract.child(2)); + } + + private static String trimExtractPattern(String pattern) { + String trimmed = pattern; + if (endsWithUnescapedDotStarDollar(trimmed)) { + trimmed = trimmed.substring(0, trimmed.length() - 3); Review Comment: Trimming a trailing `.*$` is not semantics-preserving when inline flags disable dot-newline matching. Doris sets RE2 dot_nl by default, but RE2 inline flags can override it; for example `regexp_extract(concat('a', char(10), 'b'), '(?-s)^(a).*$', 1)` does not match because `.` cannot consume the newline, whereas the rewritten pattern `(?-s)^(a)` matches and returns `a`. Please guard against inline dot-mode changes, or avoid this rewrite for patterns containing inline flag groups that can affect `.`/`$`. ########## fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/rules/RegexpFunctionRewrite.java: ########## @@ -0,0 +1,182 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.rules.expression.rules; + +import org.apache.doris.nereids.rules.expression.ExpressionPatternMatcher; +import org.apache.doris.nereids.rules.expression.ExpressionPatternRuleFactory; +import org.apache.doris.nereids.rules.expression.ExpressionRuleType; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract; +import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplace; +import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplaceOne; +import org.apache.doris.nereids.trees.expressions.literal.IntegerLikeLiteral; +import org.apache.doris.nereids.trees.expressions.literal.Literal; +import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; + +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * Rewrites regexp functions to cheaper equivalent forms when the regexp shape proves the rewrite is safe. + */ +public class RegexpFunctionRewrite implements ExpressionPatternRuleFactory { + public static final RegexpFunctionRewrite INSTANCE = new RegexpFunctionRewrite(); + + @Override + public List<ExpressionPatternMatcher<? extends Expression>> buildRules() { + return ImmutableList.of( + matchesType(RegexpReplace.class) + .then(RegexpFunctionRewrite::rewriteRegexpReplace) + .toRule(ExpressionRuleType.REGEXP_FUNCTION_REWRITE), + matchesType(RegexpExtract.class) + .then(RegexpFunctionRewrite::rewriteRegexpExtract) + .toRule(ExpressionRuleType.REGEXP_FUNCTION_REWRITE) + ); + } + + private static Expression rewriteRegexpReplace(RegexpReplace regexpReplace) { + String pattern = getStringLiteral(regexpReplace.child(1)); + if (pattern == null || pattern.isEmpty()) { + return regexpReplace; + } + if (!startsWithUnescapedCaret(pattern) && !endsWithUnescapedDollar(pattern)) { Review Comment: This treats a trailing `$` as proving that `regexp_replace` can match at most once, but inline regex flags can make `$` match per line. For example, `regexp_replace('a\na', '(?m)a$', 'x')` is valid RE2 syntax; the original `regexp_replace` uses `RE2::GlobalReplace` and replaces both line-ending matches (`x\nx`), while the rewritten `regexp_replace_one` uses `RE2::Replace` and only replaces the first (`x\na`). Please skip this rewrite when the pattern can enable multiline mode, or otherwise prove the anchor is still single-match under inline flags. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
