This is an automated email from the ASF dual-hosted git repository.
zhouyuan pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 5668e14bcf [GLUTEN-12225][CORE] Fix arrow.c shading: exclude
memory/vector packages so public API stays unshaded (#12226)
5668e14bcf is described below
commit 5668e14bcfc3b0fde0a5d02a8d90957c1eaca622
Author: EJ Song <[email protected]>
AuthorDate: Thu Jun 4 05:41:35 2026 -0700
[GLUTEN-12225][CORE] Fix arrow.c shading: exclude memory/vector packages so
public API stays unshaded (#12226)
* [CORE] Fix arrow.c shading: exclude memory/vector packages so public API
stays unshaded
The bundled Arrow C-Data classes (org.apache.arrow.c.*) are correctly
excluded from relocation because their native JNI binds to the original
class names. However, their public API signatures take and return
org.apache.arrow.memory.* and org.apache.arrow.vector.* types, which were
being relocated to org.apache.gluten.shaded.*. The result: bundled
ArrowArrayStream/ArrowSchema/ArrowArray/Data classes are compiled against
the shaded BufferAllocator/VectorSchemaRoot, so any caller passing a
vanilla Apache Arrow allocator gets NoSuchMethodError.
Triggered for any Spark workload that combines gluten with another library
using Arrow C-Data (Iceberg's Arrow vector layer, Lance Java's writer,
Snowflake JDBC's Arrow result decoder, etc.) when gluten's bundle wins
classloader resolution against vanilla Arrow.
Fix: extend the relocation excludes to also keep org.apache.arrow.memory.**
and org.apache.arrow.vector.** unshaded. The bundled C-Data API now matches
the public Apache Arrow API.
Adds dev/check-arrow-c-shading.sh which runs javap on the produced bundle
jar and asserts that public method signatures reference unshaded Arrow
types. Wired into package/pom.xml's verify phase via exec-maven-plugin so
regressions are caught in CI. Tested against the upstream
gluten-velox-bundle-spark3.5_2.12-linux_amd64-1.6.0.jar — script exits 1
with a clear diagnosis on the broken bundle.
Closes #12225
* fixup: spotless — execution element order is goals before phase
---------
Co-authored-by: sezruby <[email protected]>
---
dev/check-arrow-c-shading.sh | 89 ++++++++++++++++++++++++++++++++++++++++++++
package/pom.xml | 43 ++++++++++++++++++++-
2 files changed, 131 insertions(+), 1 deletion(-)
diff --git a/dev/check-arrow-c-shading.sh b/dev/check-arrow-c-shading.sh
new file mode 100755
index 0000000000..620407603e
--- /dev/null
+++ b/dev/check-arrow-c-shading.sh
@@ -0,0 +1,89 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Verify the bundled gluten-velox jar's Arrow C-Data classes have method
+# signatures referencing the *unshaded* org.apache.arrow.memory.BufferAllocator
+# and org.apache.arrow.vector.* types — not the gluten-shaded copies.
+#
+# Background: org.apache.arrow.c.* must NOT be relocated (its native JNI binds
+# to the original class names), but its public API methods accept/return
+# org.apache.arrow.memory.* and org.apache.arrow.vector.* types. Those types
+# must therefore also stay unshaded in the bundle, otherwise the bundled
+# ArrowArrayStream/ArrowSchema get re-bound to the shaded BufferAllocator at
+# compile time and any caller passing a vanilla Apache Arrow allocator hits
+# `NoSuchMethodError`. See gluten#12225.
+#
+# Usage:
+# dev/check-arrow-c-shading.sh <path-to-gluten-velox-bundle.jar>
+#
+# Exit codes:
+# 0 — bundle is well-shaded (Arrow C-Data API uses public Apache Arrow types)
+# 1 — bundle is broken (Arrow C-Data API references gluten-shaded types)
+# 2 — usage / setup error
+
+set -euo pipefail
+
+JAR="${1:?usage: $0 <path-to-gluten-velox-bundle.jar>}"
+if [[ ! -f "$JAR" ]]; then
+ echo "error: jar not found: $JAR" >&2
+ exit 2
+fi
+
+if ! command -v javap >/dev/null; then
+ echo "error: javap not found on PATH" >&2
+ exit 2
+fi
+
+WORKDIR=$(mktemp -d)
+trap 'rm -rf "$WORKDIR"' EXIT
+
+# Classes whose public API touches the unshaded boundary.
+CLASSES=(
+ "org/apache/arrow/c/ArrowArrayStream"
+ "org/apache/arrow/c/ArrowSchema"
+ "org/apache/arrow/c/ArrowArray"
+ "org/apache/arrow/c/Data"
+)
+
+failures=0
+for cls in "${CLASSES[@]}"; do
+ if ! unzip -p "$JAR" "${cls}.class" > "$WORKDIR/$(basename "$cls").class"
2>/dev/null; then
+ echo " SKIP $cls (not in bundle)"
+ continue
+ fi
+ signatures=$(javap -p "$WORKDIR/$(basename "$cls").class" 2>/dev/null ||
true)
+ # Any method signature mentioning the gluten-shaded Arrow path is the bug.
+ bad=$(echo "$signatures" | grep -E
"org\.apache\.gluten\.shaded\.org\.apache\.arrow\.(memory|vector)\." || true)
+ if [[ -n "$bad" ]]; then
+ echo " FAIL $cls — public API references gluten-shaded Arrow types:"
+ echo "$bad" | sed 's/^/ /'
+ failures=$((failures + 1))
+ else
+ echo " OK $cls"
+ fi
+done
+
+if (( failures > 0 )); then
+ echo
+ echo "Bundle has $failures Arrow C-Data class(es) with shaded API types."
+ echo "See gluten#12225 for context. Update package/pom.xml's"
+ echo "<relocation org.apache.arrow> excludes to also exclude"
+ echo "org.apache.arrow.memory.** and org.apache.arrow.vector.**."
+ exit 1
+fi
+
+echo
+echo "All Arrow C-Data classes use unshaded public Apache Arrow API. ✓"
diff --git a/package/pom.xml b/package/pom.xml
index 55dec68cdd..709170a50f 100644
--- a/package/pom.xml
+++ b/package/pom.xml
@@ -121,10 +121,22 @@
<relocation>
<pattern>org.apache.arrow</pattern>
<shadedPattern>${gluten.shade.packageName}.org.apache.arrow</shadedPattern>
- <!--arrow's C and dataset wrapper refers to the original class path, so we should not relocate here-->
+ <!--
+ arrow's C and dataset wrappers refer to the original class
+ path, so they must not be relocated. Their public APIs also
+ take and return org.apache.arrow.memory.* and
+ org.apache.arrow.vector.* types, so those packages must also
+ stay unshaded — otherwise the bundled (unshaded)
+ ArrowArrayStream/ArrowSchema get compiled against the
+ relocated BufferAllocator/VectorSchemaRoot, producing
+ `NoSuchMethodError` for any caller passing a vanilla
+ Apache Arrow allocator. See #12225.
+ -->
<excludes>
<exclude>org.apache.arrow.c.*</exclude>
<exclude>org.apache.arrow.c.jni.*</exclude>
+ <exclude>org.apache.arrow.memory.**</exclude>
+ <exclude>org.apache.arrow.vector.**</exclude>
<exclude>org.apache.arrow.dataset.**</exclude>
</excludes>
</relocation>
@@ -287,6 +299,35 @@
</execution>
</executions>
</plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>exec-maven-plugin</artifactId>
+ <executions>
+ <execution>
+ <!--
+ Verify that the bundled Arrow C-Data classes have public method
+ signatures referencing the unshaded Apache Arrow API
+ (org.apache.arrow.memory.*, org.apache.arrow.vector.*) and not
+ the gluten-shaded copies. Catches a regression where shading
+ relocates classes that the unshaded org.apache.arrow.c.* API
+ transitively depends on, producing a bundle whose public Arrow
+ C-Data API is incompatible with vanilla Apache Arrow callers
+ (gluten#12225).
+ -->
+ <id>verify-arrow-c-shading</id>
+ <goals>
+ <goal>exec</goal>
+ </goals>
+ <phase>verify</phase>
+ <configuration>
+ <executable>${project.basedir}/../dev/check-arrow-c-shading.sh</executable>
+ <arguments>
+ <argument>${project.build.directory}/${project.build.finalName}.jar</argument>
+ </arguments>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
</plugins>
</build>
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]