This is an automated email from the ASF dual-hosted git repository.
gengliangwang pushed a commit to branch branch-4.x
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.x by this push:
new 011f6a5990fa [SPARK-57172][SQL] Simplify Crc32 codegen by extracting a
static Java helper
011f6a5990fa is described below
commit 011f6a5990fa89302f371e0983ad7d93135b6a94
Author: Gengliang Wang <[email protected]>
AuthorDate: Tue Jun 2 12:50:29 2026 -0700
[SPARK-57172][SQL] Simplify Crc32 codegen by extracting a static Java helper
### What changes were proposed in this pull request?
Add `ExpressionImplUtils.crc32(byte[] bytes)` and route `Crc32`'s eval and
codegen paths through it. `Crc32.doGenCode` previously emitted a 3-line
allocate / `update` / `getValue` sequence inline; it now emits a single
`ExpressionImplUtils.crc32(...)` call, and the eval path calls the same helper.
This is a plain (non-ANSI, non-try/catch) type-independent block, in line
with the broadened goal of SPARK-56908 to move fixed generated-Java logic into
static Java helpers.
### Why are the changes needed?
Part of SPARK-56908 (umbrella). Collapsing the inline CRC32 sequence to one
call shrinks the generated Java for every stage that computes `crc32`, helping
with the JVM 64KB method / constant-pool limits, Janino compile time, and JIT
work.
### Does this PR introduce _any_ user-facing change?
No. The compiled behavior is identical; only the emitted Java source text
changes.
### How was this patch tested?
```
build/sbt "catalyst/testOnly *HashExpressionsSuite"
```
40/40 pass, including `crc32` (exercised both with and without whole-stage
codegen).
### Was this patch authored or co-authored using generative AI tooling?
Generated-by: Claude Code (Opus 4.8)
Closes #56222 from gengliangwang/spark-crc32-codegen.
Authored-by: Gengliang Wang <[email protected]>
Signed-off-by: Gengliang Wang <[email protected]>
(cherry picked from commit 6db4bf477f95029e338496e6af10e363c4898fb1)
Signed-off-by: Gengliang Wang <[email protected]>
---
.../sql/catalyst/expressions/ExpressionImplUtils.java | 12 ++++++++++++
.../org/apache/spark/sql/catalyst/expressions/hash.scala | 14 +++-----------
2 files changed, 15 insertions(+), 11 deletions(-)
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
index a5228edc33c8..7bad7c430b86 100644
--- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
+++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java
@@ -25,6 +25,7 @@ import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
+import java.util.zip.CRC32;
import javax.crypto.Cipher;
import javax.crypto.spec.GCMParameterSpec;
import javax.crypto.spec.IvParameterSpec;
@@ -342,4 +343,15 @@ public class ExpressionImplUtils {
String sp = str.toString().replaceAll(qtChar, qtCharRep);
return UTF8String.fromString(qtChar + sp + qtChar);
}
+
+ /**
+ * Computes the CRC32 checksum of {@code bytes} for the {@code crc32} expression.
+ * Shared by the eval and codegen paths so the per-stage generated Java is a
+ * single call rather than an inline allocate / update / getValue sequence.
+ */
+ public static long crc32(byte[] bytes) {
+ CRC32 checksum = new CRC32();
+ checksum.update(bytes, 0, bytes.length);
+ return checksum.getValue();
+ }
}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
index 3c1d666c89b3..795eaa22f0f1 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala
@@ -19,7 +19,6 @@ package org.apache.spark.sql.catalyst.expressions
import java.math.{BigDecimal, RoundingMode}
import java.util.concurrent.TimeUnit._
-import java.util.zip.CRC32
import scala.annotation.tailrec
@@ -214,20 +213,13 @@ case class Crc32(child: Expression)
override def contextIndependentFoldable: Boolean =
child.contextIndependentFoldable
protected override def nullSafeEval(input: Any): Any = {
- val checksum = new CRC32
- checksum.update(input.asInstanceOf[Array[Byte]], 0, input.asInstanceOf[Array[Byte]].length)
- checksum.getValue
+ ExpressionImplUtils.crc32(input.asInstanceOf[Array[Byte]])
}
override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
- val CRC32 = "java.util.zip.CRC32"
- val checksum = ctx.freshName("checksum")
+ val utils = classOf[ExpressionImplUtils].getName
nullSafeCodeGen(ctx, ev, value => {
- s"""
- $CRC32 $checksum = new $CRC32();
- $checksum.update($value, 0, $value.length);
- ${ev.value} = $checksum.getValue();
- """
+ s"${ev.value} = $utils.crc32($value);"
})
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]