This is an automated email from the ASF dual-hosted git repository.

gengliangwang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new dffbe1adb2ae [SPARK-57173][SQL] Simplify regexp pattern-compile codegen by extracting a static Java helper
dffbe1adb2ae is described below

commit dffbe1adb2aecfee868fefde1b3f4c673ecf5a32
Author: Gengliang Wang <[email protected]>
AuthorDate: Tue Jun 2 20:52:07 2026 -0700

    [SPARK-57173][SQL] Simplify regexp pattern-compile codegen by extracting a static Java helper
    
    ### What changes were proposed in this pull request?
    
    Add `ExpressionImplUtils.compileRegexPattern(String regex, int flags, String funcName)`,
    which wraps `Pattern.compile` and maps a `PatternSyntaxException` to the
    user-facing INVALID_PARAMETER_VALUE.PATTERN error. Route both the shared
    codegen (`RegExpUtils.initLastMatcherCode`, used by the whole regexp
    expression family -- `RLike`, `RegExpReplace`, `RegExpExtract`,
    `RegExpExtractAll`, `RegExpInStr`, etc.) and the eval helper
    (`RegExpUtils.getPatternAndLastRegex`) through it.
    
    `initLastMatcherCode` previously emitted a 5-line inline `try { 
Pattern.compile(...) } catch (PatternSyntaxException)` block; it now emits a 
single helper call. The per-stage mutable-state caching (`lastRegex` / 
`pattern`) is preserved in the generated code.
    
    ### Why are the changes needed?
    
    Part of SPARK-56908 (umbrella). This block is emitted by every regexp
    expression in every stage that uses one; collapsing it to a single call
    shrinks the generated Java across the whole family, helping with the JVM
    64KB method / constant-pool limits, Janino compile time, and JIT work.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No. The compiled behavior is identical; only the emitted Java source text 
changes.
    
    ### How was this patch tested?
    
    ```
    build/sbt "catalyst/testOnly *RegexpExpressionsSuite"
    ```
    
    21/21 pass (exercised both with and without whole-stage codegen).
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Code (Opus 4.8)
    
    Closes #56223 from gengliangwang/spark-regexp-compile-codegen.
    
    Authored-by: Gengliang Wang <[email protected]>
    Signed-off-by: Gengliang Wang <[email protected]>
---
 .../catalyst/expressions/ExpressionImplUtils.java    | 17 +++++++++++++++++
 .../sql/catalyst/expressions/regexpExpressions.scala | 20 +++++++-------------
 2 files changed, 24 insertions(+), 13 deletions(-)

diff --git 
a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
 
b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
index 7bad7c430b86..1053650a3709 100644
--- 
a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
+++ 
b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
@@ -25,6 +25,8 @@ import java.text.BreakIterator;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
 import java.util.zip.CRC32;
 import javax.crypto.Cipher;
 import javax.crypto.spec.GCMParameterSpec;
@@ -344,6 +346,21 @@ public class ExpressionImplUtils {
     return UTF8String.fromString(qtChar + sp + qtChar);
   }
 
+  /**
+   * Compiles {@code regex} with the given {@code flags} for the regexp 
expression
+   * family, translating a {@link PatternSyntaxException} into the user-facing
+   * INVALID_PARAMETER_VALUE.PATTERN error. Shared by the regexp eval and 
codegen
+   * paths so the generated Java is a single call instead of an inline 
try/catch
+   * around {@code Pattern.compile}.
+   */
+  public static Pattern compileRegexPattern(String regex, int flags, String 
funcName) {
+    try {
+      return Pattern.compile(regex, flags);
+    } catch (PatternSyntaxException e) {
+      throw QueryExecutionErrors.invalidPatternError(funcName, e.getPattern(), 
e);
+    }
+  }
+
   /**
    * Computes the CRC32 checksum of {@code bytes} for the {@code crc32} 
expression.
    * Shared by the eval and codegen paths so the per-stage generated Java is a
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 5ad360a54e8d..c2c01d2c7815 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -1253,17 +1253,15 @@ object RegExpUtils {
     val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex")
     val termPattern = ctx.addMutableState(classNamePattern, "pattern")
     val collationRegexFlags = 
CollationSupport.collationAwareRegexFlags(collationId)
+    val utils = classOf[ExpressionImplUtils].getName
 
     s"""
        |if (!$regexp.equals($termLastRegex)) {
        |  // regex value changed
-       |  try {
-       |    UTF8String r = $regexp.clone();
-       |    $termPattern = $classNamePattern.compile(r.toString(), 
$collationRegexFlags);
-       |    $termLastRegex = r;
-       |  } catch (java.util.regex.PatternSyntaxException e) {
-       |    throw QueryExecutionErrors.invalidPatternError("$prettyName", 
e.getPattern(), e);
-       |  }
+       |  UTF8String r = $regexp.clone();
+       |  $termPattern =
+       |    $utils.compileRegexPattern(r.toString(), $collationRegexFlags, 
"$prettyName");
+       |  $termLastRegex = r;
        |}
        |java.util.regex.Matcher $matcher = 
$termPattern.matcher($subject.toString());
        |""".stripMargin
@@ -1272,12 +1270,8 @@ object RegExpUtils {
   def getPatternAndLastRegex(p: Any, prettyName: String, collationId: Int): 
(Pattern, UTF8String) =
   {
     val r = p.asInstanceOf[UTF8String].clone()
-    val pattern = try {
-      Pattern.compile(r.toString, 
CollationSupport.collationAwareRegexFlags(collationId))
-    } catch {
-      case e: PatternSyntaxException =>
-        throw QueryExecutionErrors.invalidPatternError(prettyName, 
e.getPattern, e)
-    }
+    val pattern = ExpressionImplUtils.compileRegexPattern(
+      r.toString, CollationSupport.collationAwareRegexFlags(collationId), 
prettyName)
     (pattern, r)
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to