This is an automated email from the ASF dual-hosted git repository.

gengliangwang pushed a commit to branch branch-4.x
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-4.x by this push:
     new 011f6a5990fa [SPARK-57172][SQL] Simplify Crc32 codegen by extracting a static Java helper
011f6a5990fa is described below

commit 011f6a5990fa89302f371e0983ad7d93135b6a94
Author: Gengliang Wang <[email protected]>
AuthorDate: Tue Jun 2 12:50:29 2026 -0700

    [SPARK-57172][SQL] Simplify Crc32 codegen by extracting a static Java helper
    
    ### What changes were proposed in this pull request?
    
    Add `ExpressionImplUtils.crc32(byte[] bytes)` and route `Crc32`'s eval and
    codegen paths through it. `Crc32.doGenCode` previously emitted a 3-line
    allocate / `update` / `getValue` sequence inline; it now emits a single
    `ExpressionImplUtils.crc32(...)` call, and the eval path calls the same helper.
    
    The replaced sequence is a plain, type-independent block (non-ANSI, no
    try/catch), in line with the broadened goal of SPARK-56908 to move fixed
    generated-Java logic into static Java helpers.
    
    ### Why are the changes needed?
    
    This is part of the SPARK-56908 umbrella. Collapsing the inline CRC32
    sequence into one call shrinks the generated Java for every stage that
    computes `crc32`, which helps with the JVM's 64KB method and constant-pool
    limits, Janino compile time, and JIT work.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No. The compiled behavior is identical; only the emitted Java source text
    changes.
    
    ### How was this patch tested?
    
    ```
    build/sbt "catalyst/testOnly *HashExpressionsSuite"
    ```
    
    All 40 tests pass (40/40), including `crc32`, which is exercised both with
    and without whole-stage codegen.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Claude Code (Opus 4.8)
    
    Closes #56222 from gengliangwang/spark-crc32-codegen.
    
    Authored-by: Gengliang Wang <[email protected]>
    Signed-off-by: Gengliang Wang <[email protected]>
    (cherry picked from commit 6db4bf477f95029e338496e6af10e363c4898fb1)
    Signed-off-by: Gengliang Wang <[email protected]>
---
 .../sql/catalyst/expressions/ExpressionImplUtils.java      | 12 ++++++++++++
 .../org/apache/spark/sql/catalyst/expressions/hash.scala   | 14 +++-----------
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git 
a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
 
b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
index a5228edc33c8..7bad7c430b86 100644
--- 
a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
+++ 
b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
@@ -25,6 +25,7 @@ import java.text.BreakIterator;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Locale;
+import java.util.zip.CRC32;
 import javax.crypto.Cipher;
 import javax.crypto.spec.GCMParameterSpec;
 import javax.crypto.spec.IvParameterSpec;
@@ -342,4 +343,15 @@ public class ExpressionImplUtils {
     String sp = str.toString().replaceAll(qtChar, qtCharRep);
     return UTF8String.fromString(qtChar + sp + qtChar);
   }
+
+  /**
+   * Computes the CRC32 checksum of {@code bytes} for the {@code crc32} 
expression.
+   * Shared by the eval and codegen paths so the per-stage generated Java is a
+   * single call rather than an inline allocate / update / getValue sequence.
+   */
+  public static long crc32(byte[] bytes) {
+    CRC32 checksum = new CRC32();
+    checksum.update(bytes, 0, bytes.length);
+    return checksum.getValue();
+  }
 }
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
index 3c1d666c89b3..795eaa22f0f1 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
@@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.expressions
 
 import java.math.{BigDecimal, RoundingMode}
 import java.util.concurrent.TimeUnit._
-import java.util.zip.CRC32
 
 import scala.annotation.tailrec
 
@@ -214,20 +213,13 @@ case class Crc32(child: Expression)
   override def contextIndependentFoldable: Boolean = 
child.contextIndependentFoldable
 
   protected override def nullSafeEval(input: Any): Any = {
-    val checksum = new CRC32
-    checksum.update(input.asInstanceOf[Array[Byte]], 0, 
input.asInstanceOf[Array[Byte]].length)
-    checksum.getValue
+    ExpressionImplUtils.crc32(input.asInstanceOf[Array[Byte]])
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-    val CRC32 = "java.util.zip.CRC32"
-    val checksum = ctx.freshName("checksum")
+    val utils = classOf[ExpressionImplUtils].getName
     nullSafeCodeGen(ctx, ev, value => {
-      s"""
-        $CRC32 $checksum = new $CRC32();
-        $checksum.update($value, 0, $value.length);
-        ${ev.value} = $checksum.getValue();
-      """
+      s"${ev.value} = $utils.crc32($value);"
     })
   }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to