github-actions[bot] commented on code in PR #64064: URL: https://github.com/apache/doris/pull/64064#discussion_r3351123560
########## fe/fe-core/src/main/java/org/apache/doris/nereids/rules/expression/rules/RegexpFunctionRewrite.java: ########## @@ -0,0 +1,217 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.rules.expression.rules; + +import org.apache.doris.nereids.rules.expression.ExpressionPatternMatcher; +import org.apache.doris.nereids.rules.expression.ExpressionPatternRuleFactory; +import org.apache.doris.nereids.rules.expression.ExpressionRuleType; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpExtract; +import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplace; +import org.apache.doris.nereids.trees.expressions.functions.scalar.RegexpReplaceOne; +import org.apache.doris.nereids.trees.expressions.literal.IntegerLikeLiteral; +import org.apache.doris.nereids.trees.expressions.literal.Literal; +import org.apache.doris.nereids.trees.expressions.literal.VarcharLiteral; + +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * Rewrites regexp functions to cheaper equivalent forms when the regexp shape proves the rewrite is safe. + */ +public class RegexpFunctionRewrite implements ExpressionPatternRuleFactory { + public static final RegexpFunctionRewrite INSTANCE = new RegexpFunctionRewrite(); + + @Override + public List<ExpressionPatternMatcher<? extends Expression>> buildRules() { + return ImmutableList.of( + matchesType(RegexpReplace.class) + .then(RegexpFunctionRewrite::rewriteRegexpReplace) + .toRule(ExpressionRuleType.REGEXP_FUNCTION_REWRITE), + matchesType(RegexpExtract.class) + .then(RegexpFunctionRewrite::rewriteRegexpExtract) + .toRule(ExpressionRuleType.REGEXP_FUNCTION_REWRITE) + ); + } + + private static Expression rewriteRegexpReplace(RegexpReplace regexpReplace) { + String pattern = getStringLiteral(regexpReplace.child(1)); + if (pattern == null || pattern.isEmpty()) { + return regexpReplace; + } + if (!startsWithUnescapedCaret(pattern) && !endsWithUnescapedDollar(pattern)) { + return regexpReplace; + } + if (hasUnescapedAlternation(pattern) || hasInlineRegexpFlag(pattern, 'm')) { + return regexpReplace; + } + + if (regexpReplace.arity() == 3) { + return new RegexpReplaceOne(regexpReplace.child(0), regexpReplace.child(1), regexpReplace.child(2)); + } + return new RegexpReplaceOne(regexpReplace.child(0), regexpReplace.child(1), regexpReplace.child(2), + regexpReplace.child(3)); + } + + private static Expression rewriteRegexpExtract(RegexpExtract regexpExtract) { + String pattern = getStringLiteral(regexpExtract.child(1)); + if (pattern == null || pattern.isEmpty() || !isPositiveGroupIndex(regexpExtract.child(2)) + || !hasCapturingGroup(pattern) || hasUnescapedAlternation(pattern) + || hasInlineRegexpFlag(pattern, 's')) { Review Comment: This guard still lets the suffix trim run for patterns that force BE's `regexp_extract` into the extended-regex Boost fallback when `enable_extended_regex=true`. That path is distinct from the RE2 inline-flag cases already discussed: `RegexpExtractEngine::compile` first tries RE2 with `dot_nl=true`, but unsupported constructs like look-around fall back to `boost::regex::normal`, where `.` does not match newlines by default. For example, with extended regex enabled, `regexp_extract(concat('fooa', char(10), 'tail'), '(?<=foo)(a).*$', 1)` should not match because the trailing `.*$` cannot consume through the newline in Boost; after this rewrite the pattern becomes `(?<=foo)(a)` and returns `a`. Please either skip this optimization for patterns that may require the Boost fallback, or otherwise prove the runtime engine keeps the same dot/newline semantics before dropping `.*$`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
