This is an automated email from the ASF dual-hosted git repository.
changchen pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/main by this push:
new 7e1c94280f [GLUTEN-11340][CORE][VL][CH] Fix compatibility issues
addressed in Spark 4.1 (#11313)
7e1c94280f is described below
commit 7e1c94280f35af791ca578b85ad1c0c9c6ff0c94
Author: Chang chen <[email protected]>
AuthorDate: Tue Dec 30 20:31:16 2025 +0800
[GLUTEN-11340][CORE][VL][CH] Fix compatibility issues addressed in Spark
4.1 (#11313)
* [Fix] Remove `@NotNull` annotations to resolve dependency issues caused
by the ORC upgrade.
https://github.com/apache/spark/pull/51676
* [Fix] Make `reserveNewColumn` public in `ArrowWritableColumnVector` and
`WritableColumnVectorShim`
In Java, an overriding method's access modifier cannot be more restrictive
than the overridden method. Changing from protected to public is safe and
ensures compatibility before the Spark version upgrade.
see https://github.com/apache/spark/pull/52557
* [Fix] Add GeographyVal and GeometryVal support in ArrowColumnarRow,
BatchCarrierRow and ColumnarToCarrierRowExecBase
see [SPIP: Add geospatial types in
Spark](https://issues.apache.org/jira/browse/SPARK-51658)
* [Fix] Update commons-collections to version 4.5.0.
see https://github.com/apache/spark/pull/52743
* [Fix] Enable SPARK_TESTING environment variable for Spark test jobs
see https://github.com/apache/spark/pull/53344
---
.github/workflows/velox_backend_x86.yml | 18 ++++++++++++++++++
.../java/org/apache/gluten/metrics/NativeMetrics.java | 2 +-
.../execution/datasources/velox/VeloxBlockStripes.java | 3 +--
.../gluten/vectorized/ArrowWritableColumnVector.java | 2 +-
.../apache/gluten/vectorized/ArrowColumnarRow.scala | 4 ++--
.../memory/memtarget/spark/TreeMemoryConsumers.java | 8 +++-----
pom.xml | 7 +++++++
.../org/apache/gluten/execution/BatchCarrierRow.scala | 2 +-
...mpatible.scala => InternalRowSparkCompatible.scala} | 12 +++++++++---
...e.scala => SpecializedGettersSparkCompatible.scala} | 11 ++++++++++-
.../spark/sql/execution/datasources/BlockStripes.java | 3 ---
.../execution/vectorized/WritableColumnVectorShim.java | 2 +-
12 files changed, 54 insertions(+), 20 deletions(-)
diff --git a/.github/workflows/velox_backend_x86.yml
b/.github/workflows/velox_backend_x86.yml
index a8b2c6aa8d..1f3df3eaa8 100644
--- a/.github/workflows/velox_backend_x86.yml
+++ b/.github/workflows/velox_backend_x86.yml
@@ -935,6 +935,8 @@ jobs:
spark-test-spark35:
needs: build-native-lib-centos-7
runs-on: ubuntu-22.04
+ env:
+ SPARK_TESTING: true
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -991,6 +993,8 @@ jobs:
spark-test-spark35-scala213:
needs: build-native-lib-centos-7
runs-on: ubuntu-22.04
+ env:
+ SPARK_TESTING: true
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1041,6 +1045,8 @@ jobs:
spark-test-spark35-slow:
needs: build-native-lib-centos-7
runs-on: ubuntu-22.04
+ env:
+ SPARK_TESTING: true
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1083,6 +1089,8 @@ jobs:
spark-test-spark35-ras:
needs: build-native-lib-centos-7
runs-on: ubuntu-22.04
+ env:
+ SPARK_TESTING: true
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1132,6 +1140,8 @@ jobs:
spark-test-spark35-slow-ras:
needs: build-native-lib-centos-7
runs-on: ubuntu-22.04
+ env:
+ SPARK_TESTING: true
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1173,6 +1183,8 @@ jobs:
spark-test-spark35-smj:
needs: build-native-lib-centos-7
runs-on: ubuntu-22.04
+ env:
+ SPARK_TESTING: true
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1222,6 +1234,8 @@ jobs:
spark-test-spark35-slow-smj:
needs: build-native-lib-centos-7
runs-on: ubuntu-22.04
+ env:
+ SPARK_TESTING: true
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1367,6 +1381,8 @@ jobs:
spark-test-spark40:
needs: build-native-lib-centos-7
runs-on: ubuntu-22.04
+ env:
+ SPARK_TESTING: true
container: apache/gluten:centos-8-jdk17
steps:
- uses: actions/checkout@v2
@@ -1422,6 +1438,8 @@ jobs:
spark-test-spark40-slow:
needs: build-native-lib-centos-7
runs-on: ubuntu-22.04
+ env:
+ SPARK_TESTING: true
container: apache/gluten:centos-8-jdk17
steps:
- uses: actions/checkout@v2
diff --git
a/backends-clickhouse/src/main/java/org/apache/gluten/metrics/NativeMetrics.java
b/backends-clickhouse/src/main/java/org/apache/gluten/metrics/NativeMetrics.java
index 449ef7e2a9..07b8988a72 100644
---
a/backends-clickhouse/src/main/java/org/apache/gluten/metrics/NativeMetrics.java
+++
b/backends-clickhouse/src/main/java/org/apache/gluten/metrics/NativeMetrics.java
@@ -18,7 +18,7 @@ package org.apache.gluten.metrics;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
-import org.apache.commons.collections.CollectionUtils;
+import org.apache.commons.collections4.CollectionUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git
a/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxBlockStripes.java
b/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxBlockStripes.java
index 9aa0f513c9..646164de3a 100644
---
a/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxBlockStripes.java
+++
b/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxBlockStripes.java
@@ -22,7 +22,6 @@ import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
import org.apache.spark.sql.execution.datasources.BlockStripe;
import org.apache.spark.sql.execution.datasources.BlockStripes;
import org.apache.spark.sql.vectorized.ColumnarBatch;
-import org.jetbrains.annotations.NotNull;
import java.util.Iterator;
@@ -34,7 +33,7 @@ public class VeloxBlockStripes extends BlockStripes {
}
@Override
- public @NotNull Iterator<BlockStripe> iterator() {
+ public Iterator<BlockStripe> iterator() {
return new Iterator<BlockStripe>() {
private int index = 0;
diff --git
a/gluten-arrow/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java
b/gluten-arrow/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java
index d00786f3f4..7a4585dce7 100644
---
a/gluten-arrow/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java
+++
b/gluten-arrow/src/main/java/org/apache/gluten/vectorized/ArrowWritableColumnVector.java
@@ -378,7 +378,7 @@ public final class ArrowWritableColumnVector extends
WritableColumnVectorShim {
}
@Override
- protected ArrowWritableColumnVector reserveNewColumn(int capacity, DataType
type) {
+ public ArrowWritableColumnVector reserveNewColumn(int capacity, DataType
type) {
return new ArrowWritableColumnVector(capacity, type);
}
diff --git
a/gluten-arrow/src/main/scala/org/apache/gluten/vectorized/ArrowColumnarRow.scala
b/gluten-arrow/src/main/scala/org/apache/gluten/vectorized/ArrowColumnarRow.scala
index f0e2c4dabf..2ee3d081c3 100644
---
a/gluten-arrow/src/main/scala/org/apache/gluten/vectorized/ArrowColumnarRow.scala
+++
b/gluten-arrow/src/main/scala/org/apache/gluten/vectorized/ArrowColumnarRow.scala
@@ -17,7 +17,7 @@
package org.apache.gluten.vectorized
import org.apache.gluten.exception.GlutenException
-import org.apache.gluten.execution.InternalRowGetVariantCompatible
+import org.apache.gluten.execution.InternalRowSparkCompatible
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
@@ -31,7 +31,7 @@ import java.math.BigDecimal
// ArrowWritableColumnVector. And support string and binary type to write,
// Arrow writer does not need to setNotNull before writing a value.
final class ArrowColumnarRow(writableColumns:
Array[ArrowWritableColumnVector], var rowId: Int = 0)
- extends InternalRowGetVariantCompatible {
+ extends InternalRowSparkCompatible {
private val columns: Array[ArrowWritableColumnVector] = writableColumns
diff --git
a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java
b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java
index 87e8937f53..5207ce46a2 100644
---
a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java
+++
b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java
@@ -21,7 +21,7 @@ import org.apache.gluten.memory.memtarget.Spillers;
import org.apache.gluten.memory.memtarget.TreeMemoryTarget;
import com.google.common.base.Preconditions;
-import org.apache.commons.collections.map.ReferenceMap;
+import org.apache.commons.collections4.map.ReferenceMap;
import org.apache.spark.memory.MemoryMode;
import org.apache.spark.memory.TaskMemoryManager;
import org.apache.spark.util.Utils;
@@ -31,15 +31,13 @@ import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
public final class TreeMemoryConsumers {
- private static final ReferenceMap FACTORIES = new ReferenceMap();
+ private static final ReferenceMap<TaskMemoryManager, Factory> FACTORIES =
new ReferenceMap<>();
private TreeMemoryConsumers() {}
- @SuppressWarnings("unchecked")
public static Factory factory(TaskMemoryManager tmm, MemoryMode mode) {
synchronized (FACTORIES) {
- final Factory factory =
- (Factory) FACTORIES.computeIfAbsent(tmm, m -> new
Factory((TaskMemoryManager) m, mode));
+ final Factory factory = FACTORIES.computeIfAbsent(tmm, m -> new
Factory(m, mode));
final MemoryMode foundMode = factory.sparkConsumer.getMode();
Preconditions.checkState(
foundMode == mode,
diff --git a/pom.xml b/pom.xml
index d84df1236d..1d13dc5747 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1085,6 +1085,7 @@
<hudi.version>0.15.0</hudi.version>
<slf4j.version>1.7.30</slf4j.version>
<log4j.version>1.2.17</log4j.version>
+ <commons.collections4.version>4.5.0</commons.collections4.version>
</properties>
<dependencies>
<dependency>
@@ -1093,6 +1094,12 @@
<version>${log4j.version}</version>
<scope>provided</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-collections4</artifactId>
+ <version>${commons.collections4.version}</version>
+ <scope>compile</scope>
+ </dependency>
</dependencies>
</profile>
<profile>
diff --git
a/shims/common/src/main/scala/org/apache/gluten/execution/BatchCarrierRow.scala
b/shims/common/src/main/scala/org/apache/gluten/execution/BatchCarrierRow.scala
index 00a48c918b..f4fb69586e 100644
---
a/shims/common/src/main/scala/org/apache/gluten/execution/BatchCarrierRow.scala
+++
b/shims/common/src/main/scala/org/apache/gluten/execution/BatchCarrierRow.scala
@@ -35,7 +35,7 @@ import org.apache.spark.unsafe.types.{CalendarInterval,
UTF8String}
* PlaceholderRows, followed by one TerminalRow that actually wraps that
columnar batch. The total
* number of PlaceholderRows + the TerminalRow equates to the size of the
original columnar batch.
*/
-sealed abstract class BatchCarrierRow extends InternalRowGetVariantCompatible {
+sealed abstract class BatchCarrierRow extends InternalRowSparkCompatible {
override def numFields: Int = throw unsupported()
override def setNullAt(i: Int): Unit = throw unsupported()
diff --git
a/shims/common/src/main/scala/org/apache/gluten/execution/InternalRowGetVariantCompatible.scala
b/shims/common/src/main/scala/org/apache/gluten/execution/InternalRowSparkCompatible.scala
similarity index 75%
rename from
shims/common/src/main/scala/org/apache/gluten/execution/InternalRowGetVariantCompatible.scala
rename to
shims/common/src/main/scala/org/apache/gluten/execution/InternalRowSparkCompatible.scala
index 8e54892046..f1b5c7708d 100644
---
a/shims/common/src/main/scala/org/apache/gluten/execution/InternalRowGetVariantCompatible.scala
+++
b/shims/common/src/main/scala/org/apache/gluten/execution/InternalRowSparkCompatible.scala
@@ -16,13 +16,19 @@
*/
package org.apache.gluten.execution
-import org.apache.gluten.expression.SpecializedGettersGetVariantCompatible
+import org.apache.gluten.expression.SpecializedGettersSparkCompatible
import org.apache.spark.sql.catalyst.InternalRow
/** An internal-row base implementation that is compatible with both Spark 3.x
and 4.x. */
-abstract class InternalRowGetVariantCompatible
+abstract class InternalRowSparkCompatible
extends InternalRow
- with SpecializedGettersGetVariantCompatible {
+ with SpecializedGettersSparkCompatible {
override def getVariant(ordinal: Int): Nothing = throw new
UnsupportedOperationException()
+
+ override def getGeography(ordinal: Int): Nothing =
+ throw new UnsupportedOperationException()
+
+ override def getGeometry(ordinal: Int): Nothing =
+ throw new UnsupportedOperationException()
}
diff --git
a/shims/common/src/main/scala/org/apache/gluten/expression/SpecializedGettersGetVariantCompatible.scala
b/shims/common/src/main/scala/org/apache/gluten/expression/SpecializedGettersSparkCompatible.scala
similarity index 74%
rename from
shims/common/src/main/scala/org/apache/gluten/expression/SpecializedGettersGetVariantCompatible.scala
rename to
shims/common/src/main/scala/org/apache/gluten/expression/SpecializedGettersSparkCompatible.scala
index 555a29839c..a957affed5 100644
---
a/shims/common/src/main/scala/org/apache/gluten/expression/SpecializedGettersGetVariantCompatible.scala
+++
b/shims/common/src/main/scala/org/apache/gluten/expression/SpecializedGettersSparkCompatible.scala
@@ -19,9 +19,18 @@ package org.apache.gluten.expression
/**
* A mix-in trait mainly for internal-row's implementations to extend, to
ensure the code is
* compatible with Spark 3.x and 4.x at the same time.
+ *
+ * Provides stub implementations for methods that exist in Spark 4.x but not
in Spark 3.x, including
+ * getVariant, getGeography, and getGeometry.
*/
-trait SpecializedGettersGetVariantCompatible {
+trait SpecializedGettersSparkCompatible {
def getVariant(ordinal: Int): Nothing = {
throw new UnsupportedOperationException()
}
+
+ def getGeography(ordinal: Int): Nothing =
+ throw new UnsupportedOperationException()
+
+ def getGeometry(ordinal: Int): Nothing =
+ throw new UnsupportedOperationException()
}
diff --git
a/shims/common/src/main/scala/org/apache/spark/sql/execution/datasources/BlockStripes.java
b/shims/common/src/main/scala/org/apache/spark/sql/execution/datasources/BlockStripes.java
index 3c4a7d4746..d516e4eead 100644
---
a/shims/common/src/main/scala/org/apache/spark/sql/execution/datasources/BlockStripes.java
+++
b/shims/common/src/main/scala/org/apache/spark/sql/execution/datasources/BlockStripes.java
@@ -16,8 +16,6 @@
*/
package org.apache.spark.sql.execution.datasources;
-import org.jetbrains.annotations.NotNull;
-
import java.util.Iterator;
// FIXME: The abstraction is broken: VL / CH don't rely on the same binary
layout of
@@ -57,7 +55,6 @@ public class BlockStripes implements Iterable<BlockStripe> {
throw new UnsupportedOperationException("subclass of BlockStripe
should implement this");
}
- @NotNull
@Override
public Iterator<BlockStripe> iterator() {
throw new UnsupportedOperationException("subclass of BlockStripe
should implement this");
diff --git
a/shims/spark40/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVectorShim.java
b/shims/spark40/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVectorShim.java
index eb29651faa..513f3d2d92 100644
---
a/shims/spark40/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVectorShim.java
+++
b/shims/spark40/src/main/java/org/apache/spark/sql/execution/vectorized/WritableColumnVectorShim.java
@@ -179,7 +179,7 @@ public class WritableColumnVectorShim extends
WritableColumnVector {
}
@Override
- protected WritableColumnVector reserveNewColumn(int capacity, DataType type)
{
+ public WritableColumnVector reserveNewColumn(int capacity, DataType type) {
return null;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]