This is an automated email from the ASF dual-hosted git repository.
philo pushed a commit to branch branch-1.4
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git
The following commit(s) were added to refs/heads/branch-1.4 by this push:
new 31552c68f1 [branch-1.4] Port PR #9200 #9320 #9368 #9209 #9262 (#9431)
31552c68f1 is described below
commit 31552c68f11f9f134907db6c5463de2a22517842
Author: Wei-Ting Chen <[email protected]>
AuthorDate: Tue Apr 29 13:41:43 2025 +0800
[branch-1.4] Port PR #9200 #9320 #9368 #9209 #9262 (#9431)
* [GLUTEN-9199][VL] Fix error when creating shuffle file: open with O_CREAT
or O_TMPFILE in second argument needs 3 arguments (#9200)
* [DOC] Fix broken links in documents (#9320)
* [GLUTEN-9369][DOC] Fix commands in the Gluten C++ debugging developer doc
(#9368)
* [VL][CI] Change to use JDK-17 for Spark 3.3/3.4/3.5 tests (#9209)
* [VL][CI] Bump to use ubuntu-22.04 runner (#9262)
---------
Co-authored-by: Hongze Zhang <[email protected]>
Co-authored-by: PHILO-HE <[email protected]>
Co-authored-by: Dina Suehiro Jones <[email protected]>
Co-authored-by: Yuan <[email protected]>
---
.github/workflows/build_bundle_package.yml | 4 +-
.github/workflows/velox_backend.yml | 320 +++++++--------------
.github/workflows/velox_backend_cache.yml | 8 +-
.github/workflows/velox_nightly.yml | 2 +-
cpp/core/shuffle/LocalPartitionWriter.cc | 2 +-
docs/developers/HowTo.md | 37 +--
docs/developers/NewToGluten.md | 20 +-
docs/get-started/Velox.md | 8 +-
.../gluten/utils/velox/VeloxTestSettings.scala | 9 +-
9 files changed, 157 insertions(+), 253 deletions(-)
diff --git a/.github/workflows/build_bundle_package.yml
b/.github/workflows/build_bundle_package.yml
index 63f3982905..f4c03482e6 100644
--- a/.github/workflows/build_bundle_package.yml
+++ b/.github/workflows/build_bundle_package.yml
@@ -37,7 +37,7 @@ on:
jobs:
build-native-lib:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Get Ccache
@@ -72,7 +72,7 @@ jobs:
build-bundle-package-centos8:
needs: build-native-lib
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: centos:8
steps:
- uses: actions/checkout@v2
diff --git a/.github/workflows/velox_backend.yml
b/.github/workflows/velox_backend.yml
index 662c77dcd8..86938790a0 100644
--- a/.github/workflows/velox_backend.yml
+++ b/.github/workflows/velox_backend.yml
@@ -71,7 +71,7 @@ concurrency:
jobs:
build-native-lib-centos-7:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Get Ccache
@@ -114,7 +114,7 @@ jobs:
path: .m2/repository/org/apache/arrow/
if-no-files-found: error
- run-tpc-test-ubuntu:
+ tpc-test-ubuntu:
needs: build-native-lib-centos-7
strategy:
fail-fast: false
@@ -140,7 +140,7 @@ jobs:
java: java-17
- os: ubuntu:22.04
java: java-11
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: ${{ matrix.os }}
steps:
- uses: actions/checkout@v2
@@ -185,7 +185,7 @@ jobs:
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak
--off-heap-size=10g -s=1.0 --threads=16 --iterations=1
- run-tpc-test-centos8:
+ tpc-test-centos8:
needs: build-native-lib-centos-7
strategy:
fail-fast: false
@@ -211,7 +211,7 @@ jobs:
java: java-17
- os: centos:7
java: java-11
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: ${{ matrix.os }}
steps:
- uses: actions/checkout@v2
@@ -277,7 +277,7 @@ jobs:
--local --preset=velox --benchmark-type=ds --error-on-memleak
--off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
--extra-conf=spark.gluten.ras.enabled=true
- run-tpc-test-centos7:
+ tpc-test-centos7:
needs: build-native-lib-centos-7
strategy:
fail-fast: false
@@ -285,7 +285,7 @@ jobs:
spark: [ "spark-3.2", "spark-3.3", "spark-3.4", "spark-3.5" ]
java: [ "java-8" ]
# Spark supports JDK17 since 3.3 and later, see
https://issues.apache.org/jira/browse/SPARK-33772
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Download All Native Artifacts
@@ -343,13 +343,13 @@ jobs:
--extra-conf=spark.gluten.ras.enabled=true
"
- run-tpc-test-ubuntu-oom:
+ tpc-test-ubuntu-oom:
needs: build-native-lib-centos-7
strategy:
fail-fast: false
matrix:
spark: [ "spark-3.2" ]
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
steps:
- name: Maximize build disk space
shell: bash
@@ -458,13 +458,13 @@ jobs:
-d=IO_THREADS:12,spark.gluten.sql.columnar.backend.velox.IOThreads=12 \
-d=IO_THREADS:0,spark.gluten.sql.columnar.backend.velox.IOThreads=0
- run-tpc-test-ubuntu-randomkill:
+ tpc-test-ubuntu-randomkill:
needs: build-native-lib-centos-7
strategy:
fail-fast: false
matrix:
spark: [ "spark-3.2" ]
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
steps:
- name: Maximize build disk space
shell: bash
@@ -510,7 +510,7 @@ jobs:
--local --preset=velox --benchmark-type=ds --error-on-memleak
-s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1
\
--data-gen=skip --random-kill-tasks --no-session-reuse
- run-tpc-test-centos8-uniffle:
+ tpc-test-centos8-uniffle:
needs: build-native-lib-centos-7
strategy:
fail-fast: false
@@ -518,7 +518,7 @@ jobs:
spark: [ "spark-3.2" ]
uniffle: [ "0.9.2" ]
hadoop: [ "2.8.5" ]
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -561,14 +561,14 @@ jobs:
GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox-with-uniffle --benchmark-type=h
--error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1
- run-tpc-test-ubuntu-2204-celeborn:
+ tpc-test-ubuntu-2204-celeborn:
needs: build-native-lib-centos-7
strategy:
fail-fast: false
matrix:
spark: [ "spark-3.2" ]
celeborn: [ "celeborn-0.5.4", "celeborn-0.4.3"]
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -615,9 +615,9 @@ jobs:
--extra-conf=spark.celeborn.push.sortMemory.threshold=8m
--benchmark-type=ds --error-on-memleak \
--off-heap-size=10g -s=1.0 --threads=8 --iterations=1
- run-spark-test-spark32:
+ spark-test-spark32:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -647,7 +647,7 @@ jobs:
if: always()
uses: actions/upload-artifact@v4
with:
- name: test-report-spark32
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -659,12 +659,12 @@ jobs:
if: failure()
uses: actions/upload-artifact@v4
with:
- name: golden-files-spark32
+ name: ${{ github.job }}-golden-files
path: /tmp/tpch-approved-plan/**
- run-spark-test-spark32-slow:
+ spark-test-spark32-slow:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -687,7 +687,7 @@ jobs:
if: always()
uses: actions/upload-artifact@v4
with:
- name: test-report-spark32-slow
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -696,9 +696,9 @@ jobs:
name: ${{ github.job }}-test-log
path: "**/target/*.log"
- run-spark-test-spark33:
+ spark-test-spark33:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -723,14 +723,18 @@ jobs:
run: |
cd $GITHUB_WORKSPACE/
export SPARK_SCALA_VERSION=2.12
- $MVN_CMD clean test -Pspark-3.3 -Pbackends-velox -Piceberg -Pdelta
-Phudi -Pspark-ut \
+ yum install -y java-17-openjdk-devel
+ export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
+ export PATH=$JAVA_HOME/bin:$PATH
+ java -version
+ $MVN_CMD clean test -Pspark-3.3 -Pjava-17 -Pbackends-velox -Piceberg
-Pdelta -Phudi -Pspark-ut \
-DargLine="-Dspark.test.home=/opt/shims/spark33/spark_home/" \
-DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags
- name: Upload test report
if: always()
uses: actions/upload-artifact@v4
with:
- name: test-report-spark33
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -742,13 +746,12 @@ jobs:
if: failure()
uses: actions/upload-artifact@v4
with:
- name: golden-files-spark33
+ name: ${{ github.job }}-golden-files
path: /tmp/tpch-approved-plan/**
-
- run-spark-test-spark33-slow:
+ spark-test-spark33-slow:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -766,14 +769,18 @@ jobs:
- name: Build and Run unit test for Spark 3.3.1 (slow tests)
run: |
cd $GITHUB_WORKSPACE/
- $MVN_CMD clean test -Pspark-3.3 -Pbackends-velox -Piceberg -Pdelta
-Phudi -Pspark-ut \
+ yum install -y java-17-openjdk-devel
+ export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
+ export PATH=$JAVA_HOME/bin:$PATH
+ java -version
+ $MVN_CMD clean test -Pspark-3.3 -Pjava-17 -Pbackends-velox -Piceberg
-Pdelta -Phudi -Pspark-ut \
-DargLine="-Dspark.test.home=/opt/shims/spark33/spark_home/" \
-DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest
- name: Upload test report
if: always()
uses: actions/upload-artifact@v4
with:
- name: test-report-spark33-slow
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -782,9 +789,9 @@ jobs:
name: ${{ github.job }}-test-log
path: "**/target/*.log"
- run-spark-test-spark34:
+ spark-test-spark34:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -814,15 +821,15 @@ jobs:
export PATH=$JAVA_HOME/bin:$PATH
java -version
export SPARK_HOME=/opt/shims/spark34/spark_home/
- ls -l /opt/shims/spark34/spark_home/
+ ls -l $SPARK_HOME
$MVN_CMD clean test -Pspark-3.4 -Pjava-17 -Pbackends-velox -Piceberg
-Pdelta -Phudi -Pspark-ut \
-DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags
\
- -DargLine="-Dspark.test.home=/opt/shims/spark34/spark_home/
${EXTRA_FLAGS}"
+ -DargLine="-Dspark.test.home=$SPARK_HOME ${EXTRA_FLAGS}"
- name: Upload test report
if: always()
uses: actions/upload-artifact@v4
with:
- name: test-report-spark34-jdk17
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -834,64 +841,12 @@ jobs:
if: failure()
uses: actions/upload-artifact@v4
with:
- name: golden-files-spark34
+ name: ${{ github.job }}-golden-files
path: /tmp/tpch-approved-plan/**
- run-spark-test-spark34-jdk8:
+ spark-test-spark34-slow:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
- container: apache/gluten:centos-8-jdk8
- steps:
- - uses: actions/checkout@v2
- - name: Download All Artifacts
- uses: actions/download-artifact@v4
- with:
- name: velox-native-lib-centos-7-${{github.sha}}
- path: ./cpp/build/releases
- - name: Download Arrow Jars
- uses: actions/download-artifact@v4
- with:
- name: arrow-jars-centos-7-${{github.sha}}
- path: /root/.m2/repository/org/apache/arrow/
- - name: Prepare spark.test.home for Spark 3.4.4 (other tests)
- run: |
- dnf module -y install python39 && \
- alternatives --set python3 /usr/bin/python3.9 && \
- pip3 install setuptools==77.0.3 && \
- pip3 install pyspark==3.4.4 cython && \
- pip3 install pandas pyarrow
- - name: Build and Run unit test for Spark 3.4.4 (other tests)
- run: |
- cd $GITHUB_WORKSPACE/
- export SPARK_SCALA_VERSION=2.12
- export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
- export SPARK_HOME=/opt/shims/spark34/spark_home/
- ls -l /opt/shims/spark34/spark_home/
- $MVN_CMD clean test -Pspark-3.4 -Pjava-8 -Pbackends-velox -Pdelta
-Phudi -Pspark-ut \
-
-DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags
\
- -DargLine="-Dspark.test.home=/opt/shims/spark34/spark_home/"
- - name: Upload test report
- if: always()
- uses: actions/upload-artifact@v4
- with:
- name: test-report-spark34
- path: '**/surefire-reports/TEST-*.xml'
- - name: Upload unit tests log files
- if: ${{ !success() }}
- uses: actions/upload-artifact@v4
- with:
- name: ${{ github.job }}-test-log
- path: "**/target/*.log"
- - name: Upload golden files
- if: failure()
- uses: actions/upload-artifact@v4
- with:
- name: golden-files-spark34-jdk8
- path: /tmp/tpch-approved-plan/**
-
- run-spark-test-spark34-slow:
- needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -908,101 +863,20 @@ jobs:
- name: Build and Run unit test for Spark 3.4.4 (slow tests)
run: |
cd $GITHUB_WORKSPACE/
- export SPARK_HOME=/opt/shims/spark34/spark_home/
yum install -y java-17-openjdk-devel
export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
export PATH=$JAVA_HOME/bin:$PATH
java -version
- ls -l /opt/shims/spark34/spark_home/
- $MVN_CMD clean test -Pspark-3.4 -Pjava-17 -Pbackends-velox -Piceberg
-Pdelta -Pspark-ut -Phudi \
- -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest \
- -DargLine="-Dspark.test.home=/opt/shims/spark34/spark_home/
${EXTRA_FLAGS}"
- - name: Upload test report
- if: always()
- uses: actions/upload-artifact@v4
- with:
- name: test-report-spark34-slow-jdk17
- path: '**/surefire-reports/TEST-*.xml'
- - name: Upload unit tests log files
- if: ${{ !success() }}
- uses: actions/upload-artifact@v4
- with:
- name: ${{ github.job }}-test-log
- path: "**/target/*.log"
-
- run-spark-test-spark34-slow-jdk8:
- needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
- container: apache/gluten:centos-8-jdk8
- steps:
- - uses: actions/checkout@v2
- - name: Download All Artifacts
- uses: actions/download-artifact@v4
- with:
- name: velox-native-lib-centos-7-${{github.sha}}
- path: ./cpp/build/releases
- - name: Download Arrow Jars
- uses: actions/download-artifact@v4
- with:
- name: arrow-jars-centos-7-${{github.sha}}
- path: /root/.m2/repository/org/apache/arrow/
- - name: Build and Run unit test for Spark 3.4.4 (slow tests)
- run: |
- cd $GITHUB_WORKSPACE/
- export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
export SPARK_HOME=/opt/shims/spark34/spark_home/
- ls -l /opt/shims/spark34/spark_home/
- $MVN_CMD clean test -Pspark-3.4 -Pjava-8 -Pbackends-velox -Pdelta
-Pspark-ut -Phudi \
+ ls -l $SPARK_HOME
+ $MVN_CMD clean test -Pspark-3.4 -Pjava-17 -Pbackends-velox -Piceberg
-Pdelta -Pspark-ut -Phudi \
-DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest \
- -DargLine="-Dspark.test.home=/opt/shims/spark34/spark_home/"
- - name: Upload test report
- if: always()
- uses: actions/upload-artifact@v4
- with:
- name: test-report-spark34-slow-jdk8
- path: '**/surefire-reports/TEST-*.xml'
- - name: Upload unit tests log files
- if: ${{ !success() }}
- uses: actions/upload-artifact@v4
- with:
- name: ${{ github.job }}-test-log
- path: "**/target/*.log"
-
- run-spark-test-spark35:
- needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
- container: apache/gluten:centos-8-jdk8
- steps:
- - uses: actions/checkout@v2
- - name: Download All Artifacts
- uses: actions/download-artifact@v4
- with:
- name: velox-native-lib-centos-7-${{github.sha}}
- path: ./cpp/build/releases
- - name: Download Arrow Jars
- uses: actions/download-artifact@v4
- with:
- name: arrow-jars-centos-7-${{github.sha}}
- path: /root/.m2/repository/org/apache/arrow/
- - name: Prepare
- run: |
- dnf module -y install python39 && \
- alternatives --set python3 /usr/bin/python3.9 && \
- pip3 install setuptools==77.0.3 && \
- pip3 install pyspark==3.5.2 cython && \
- pip3 install pandas pyarrow
- - name: Build and Run unit test for Spark 3.5.2 (other tests)
- run: |
- cd $GITHUB_WORKSPACE/
- export SPARK_SCALA_VERSION=2.12
- $MVN_CMD clean test -Pspark-3.5 -Pbackends-velox -Piceberg -Pdelta
-Phudi -Pspark-ut \
- -DargLine="-Dspark.test.home=/opt/shims/spark35/spark_home/" \
-
-DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags
+ -DargLine="-Dspark.test.home=$SPARK_HOME ${EXTRA_FLAGS}"
- name: Upload test report
if: always()
uses: actions/upload-artifact@v4
with:
- name: test-report-spark35
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -1010,16 +884,10 @@ jobs:
with:
name: ${{ github.job }}-test-log
path: "**/target/*.log"
- - name: Upload golden files
- if: failure()
- uses: actions/upload-artifact@v4
- with:
- name: golden-files-spark35
- path: /tmp/tpch-approved-plan/**
- run-spark-test-spark35-jdk17:
+ spark-test-spark35:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1055,7 +923,7 @@ jobs:
if: always()
uses: actions/upload-artifact@v4
with:
- name: test-report-spark35-jdk17
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -1067,12 +935,12 @@ jobs:
if: failure()
uses: actions/upload-artifact@v4
with:
- name: golden-files-spark35
+ name: ${{ github.job }}-golden-files
path: /tmp/tpch-approved-plan/**
- run-spark-test-spark35-scala213:
+ spark-test-spark35-scala213:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1097,14 +965,18 @@ jobs:
run: |
cd $GITHUB_WORKSPACE/
export SPARK_SCALA_VERSION=2.13
- $MVN_CMD clean test -Pspark-3.5 -Pscala-2.13 -Pbackends-velox
-Piceberg \
+ yum install -y java-17-openjdk-devel
+ export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
+ export PATH=$JAVA_HOME/bin:$PATH
+ java -version
+ $MVN_CMD clean test -Pspark-3.5 -Pscala-2.13 -Pjava-17
-Pbackends-velox -Piceberg \
-Pdelta -Pspark-ut
-DargLine="-Dspark.test.home=/opt/shims/spark35-scala-2.13/spark_home/" \
-DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags
- name: Upload test report
if: always()
uses: actions/upload-artifact@v4
with:
- name: test-report-spark35-scala213
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -1113,9 +985,9 @@ jobs:
name: ${{ github.job }}-test-log
path: "**/target/*.log"
- run-spark-test-spark35-slow:
+ spark-test-spark35-slow:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1132,14 +1004,18 @@ jobs:
- name: Build and Run unit test for Spark 3.5.2 (slow tests)
run: |
cd $GITHUB_WORKSPACE/
- $MVN_CMD clean test -Pspark-3.5 -Pbackends-velox -Piceberg -Pdelta
-Phudi -Pspark-ut \
+ yum install -y java-17-openjdk-devel
+ export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
+ export PATH=$JAVA_HOME/bin:$PATH
+ java -version
+ $MVN_CMD clean test -Pspark-3.5 -Pjava-17 -Pbackends-velox -Piceberg
-Pdelta -Phudi -Pspark-ut \
-DargLine="-Dspark.test.home=/opt/shims/spark35/spark_home/" \
-DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest
- name: Upload test report
if: always()
uses: actions/upload-artifact@v4
with:
- name: test-report-spark35-slow
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -1148,9 +1024,9 @@ jobs:
name: ${{ github.job }}-test-log
path: "**/target/*.log"
- run-spark-test-spark35-ras:
+ spark-test-spark35-ras:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1175,13 +1051,17 @@ jobs:
run: |
cd $GITHUB_WORKSPACE/
export SPARK_SCALA_VERSION=2.12
- $MVN_CMD clean test -Pspark-3.5 -Pbackends-velox -Piceberg -Pdelta
-Pspark-ut \
+ yum install -y java-17-openjdk-devel
+ export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
+ export PATH=$JAVA_HOME/bin:$PATH
+ java -version
+ $MVN_CMD clean test -Pspark-3.5 -Pjava-17 -Pbackends-velox -Piceberg
-Pdelta -Pspark-ut \
-DargLine="-Dspark.test.home=/opt/shims/spark35/spark_home/
-Dspark.gluten.ras.enabled=true" \
-DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags
- name: Upload test report
uses: actions/upload-artifact@v4
with:
- name: test-report-spark35-ras
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -1190,9 +1070,9 @@ jobs:
name: ${{ github.job }}-test-log
path: "**/target/*.log"
- run-spark-test-spark35-slow-ras:
+ spark-test-spark35-slow-ras:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1209,13 +1089,17 @@ jobs:
- name: Build and Run unit test for Spark 3.5.2 (slow tests)
run: |
cd $GITHUB_WORKSPACE/
- $MVN_CMD clean test -Pspark-3.5 -Pbackends-velox -Piceberg -Pdelta
-Pspark-ut \
+ yum install -y java-17-openjdk-devel
+ export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
+ export PATH=$JAVA_HOME/bin:$PATH
+ java -version
+ $MVN_CMD clean test -Pspark-3.5 -Pjava-17 -Pbackends-velox -Piceberg
-Pdelta -Pspark-ut \
-DargLine="-Dspark.test.home=/opt/shims/spark35/spark_home/
-Dspark.gluten.ras.enabled=true" \
-DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest
- name: Upload test report
uses: actions/upload-artifact@v4
with:
- name: test-report-spark35-slow-ras
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -1224,9 +1108,9 @@ jobs:
name: ${{ github.job }}-test-log
path: "**/target/*.log"
- run-spark-test-spark35-smj:
+ spark-test-spark35-smj:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1251,13 +1135,17 @@ jobs:
run: |
cd $GITHUB_WORKSPACE/
export SPARK_SCALA_VERSION=2.12
- $MVN_CMD clean test -Pspark-3.5 -Pbackends-velox -Piceberg -Pdelta
-Pspark-ut \
+ yum install -y java-17-openjdk-devel
+ export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
+ export PATH=$JAVA_HOME/bin:$PATH
+ java -version
+ $MVN_CMD clean test -Pspark-3.5 -Pjava-17 -Pbackends-velox -Piceberg
-Pdelta -Pspark-ut \
-DargLine="-Dspark.test.home=/opt/shims/spark35/spark_home/
-Dspark.gluten.sql.columnar.forceShuffledHashJoin=false" \
-DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags
- name: Upload test report
uses: actions/upload-artifact@v4
with:
- name: test-report-spark35-smj
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -1266,9 +1154,9 @@ jobs:
name: ${{ github.job }}-test-log
path: "**/target/*.log"
- run-spark-test-spark35-slow-smj:
+ spark-test-spark35-slow-smj:
needs: build-native-lib-centos-7
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1285,13 +1173,17 @@ jobs:
- name: Build and Run unit test for Spark 3.5.2 (slow tests)
run: |
cd $GITHUB_WORKSPACE/
- $MVN_CMD clean test -Pspark-3.5 -Pbackends-velox -Piceberg -Pdelta
-Pspark-ut \
+ yum install -y java-17-openjdk-devel
+ export JAVA_HOME=/usr/lib/jvm/java-17-openjdk
+ export PATH=$JAVA_HOME/bin:$PATH
+ java -version
+ $MVN_CMD clean test -Pspark-3.5 -Pjava-17 -Pbackends-velox -Piceberg
-Pdelta -Pspark-ut \
-DargLine="-Dspark.test.home=/opt/shims/spark35/spark_home/
-Dspark.gluten.sql.columnar.forceShuffledHashJoin=false" \
-DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest
- name: Upload test report
uses: actions/upload-artifact@v4
with:
- name: test-report-spark35-slow-smj
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
@@ -1300,8 +1192,8 @@ jobs:
name: ${{ github.job }}-test-log
path: "**/target/*.log"
- run-cpp-test-udf-test:
- runs-on: ubuntu-20.04
+ cpp-test-udf-test:
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -1343,7 +1235,7 @@ jobs:
- name: Upload test report
uses: actions/upload-artifact@v4
with:
- name: test-report-spark35-udf
+ name: ${{ github.job }}-report
path: '**/surefire-reports/TEST-*.xml'
- name: Upload unit tests log files
if: ${{ !success() }}
diff --git a/.github/workflows/velox_backend_cache.yml
b/.github/workflows/velox_backend_cache.yml
index 5140eb5325..7fc2c149e1 100644
--- a/.github/workflows/velox_backend_cache.yml
+++ b/.github/workflows/velox_backend_cache.yml
@@ -30,7 +30,7 @@ concurrency:
jobs:
cache-native-lib-centos-7:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Get Ccache
@@ -55,7 +55,7 @@ jobs:
key: ccache-centos7-release-default-${{github.sha}}
cache-native-lib-centos-8:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
container: apache/gluten:centos-8-jdk8
steps:
- uses: actions/checkout@v2
@@ -79,7 +79,7 @@ jobs:
key: ccache-centos8-release-default-${{github.sha}}
# ccache-native-lib-ubuntu-velox-ut:
- # runs-on: ubuntu-20.04
+ # runs-on: ubuntu-22.04
# env:
# CCACHE_DIR: "${{ github.workspace }}/.ccache"
# container: ghcr.io/facebookincubator/velox-dev:amd64-ubuntu-22.04-avx
@@ -111,7 +111,7 @@ jobs:
# path: '${{ env.CCACHE_DIR }}'
# key: ccache-ubuntu-release-default
# ccache-native-lib-centos-velox-ut:
-# runs-on: ubuntu-20.04
+# runs-on: ubuntu-22.04
# env:
# CCACHE_DIR: "${{ github.workspace }}/.ccache"
# container: ghcr.io/facebookincubator/velox-dev:centos8
diff --git a/.github/workflows/velox_nightly.yml
b/.github/workflows/velox_nightly.yml
index fbba838bc3..b880a9d329 100644
--- a/.github/workflows/velox_nightly.yml
+++ b/.github/workflows/velox_nightly.yml
@@ -32,7 +32,7 @@ concurrency:
jobs:
build-native-lib:
- runs-on: ubuntu-20.04
+ runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v4
- name: Get Ccache
diff --git a/cpp/core/shuffle/LocalPartitionWriter.cc
b/cpp/core/shuffle/LocalPartitionWriter.cc
index b0d1b49aeb..2cca7f4c69 100644
--- a/cpp/core/shuffle/LocalPartitionWriter.cc
+++ b/cpp/core/shuffle/LocalPartitionWriter.cc
@@ -387,7 +387,7 @@ std::string LocalPartitionWriter::nextSpilledFileDir() {
arrow::Result<std::shared_ptr<arrow::io::OutputStream>>
LocalPartitionWriter::openFile(const std::string& file) {
std::shared_ptr<arrow::io::FileOutputStream> fout;
- auto fd = open(file.c_str(), O_WRONLY | O_CREAT | O_TRUNC);
+ auto fd = open(file.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0000);
// Set the shuffle file permissions to 0644 to keep it consistent with the
permissions of
// the built-in shuffler manager in Spark.
fchmod(fd, 0644);
diff --git a/docs/developers/HowTo.md b/docs/developers/HowTo.md
index 7c954d7e7e..c3b6e91203 100644
--- a/docs/developers/HowTo.md
+++ b/docs/developers/HowTo.md
@@ -44,7 +44,7 @@ You can generate the example files by the following steps:
1. Build Velox and Gluten CPP:
```
-${GLUTEN_HOME}/dev/builddeps-veloxbe.sh --build_tests=ON --build_benchmarks=ON
--build_type=Debug
+${GLUTEN_HOME}/dev/builddeps-veloxbe.sh --build_tests=ON --build_benchmarks=ON
--build_examples=ON --build_type=Debug
```
- Compiling with `--build_type=Debug` is good for debugging.
@@ -54,8 +54,7 @@ ${GLUTEN_HOME}/dev/builddeps-veloxbe.sh --build_tests=ON
--build_benchmarks=ON -
```
cd ${GLUTEN_HOME}
-mvn clean package -Pspark-3.2 -Pbackends-velox -Pceleborn -Puniffle
-mvn test -Pspark-3.2 -Pbackends-velox -Pceleborn -pl backends-velox \
+mvn test -Pspark-3.2 -Pbackends-velox -pl backends-velox \
-am -DtagsToInclude="org.apache.gluten.tags.GenerateExample" \
-Dtest=none -DfailIfNoTests=false \
-Dexec.skip
@@ -68,24 +67,28 @@ mvn test -Pspark-3.2 -Pbackends-velox -Pceleborn -pl
backends-velox \
```shell
$ tree ${GLUTEN_HOME}/backends-velox/generated-native-benchmark/
/some-dir-to-gluten-home/backends-velox/generated-native-benchmark/
-├── example.json
-├── example_lineitem
-│ ├── part-00000-3ec19189-d20e-4240-85ae-88631d46b612-c000.snappy.parquet
-│ └── _SUCCESS
-└── example_orders
- ├── part-00000-1e66fb98-4dd6-47a6-8679-8625dbc437ee-c000.snappy.parquet
- └── _SUCCESS
+|-- conf_12_0.ini
+|-- data_12_0_0.parquet
+|-- data_12_0_1.parquet
+`-- plan_12_0.json
```
3. Now, run benchmarks with GDB
```shell
-cd ${GLUTEN_HOME}/cpp/build/velox/benchmarks/
-gdb generic_benchmark
+cd ${GLUTEN_HOME}
+gdb cpp/build/velox/benchmarks/generic_benchmark
```
-- When GDB load `generic_benchmark` successfully, you can set `breakpoint` on
the `main` function with command `b main`, and then run with command `r`,
- then the process `generic_benchmark` will start and stop at the `main`
function.
+- Once GDB loads `generic_benchmark` successfully, you can set a breakpoint on
the `main` function with the command `b main`, and then run using the `r` command
with
+ arguments for the example files like:
+ ```
+ r --with-shuffle --partitioning hash --threads 1 --iterations 1 \
+ --conf backends-velox/generated-native-benchmark/conf_12_0.ini \
+ --plan backends-velox/generated-native-benchmark/plan_12_0.json \
+ --data
backends-velox/generated-native-benchmark/data_12_0_0.parquet,backends-velox/generated-native-benchmark/data_12_0_1.parquet
+ ```
+ The process `generic_benchmark` will start and stop at the `main` function.
- You can check the variables' state with command `p variable_name`, or
execute the program line by line with command `n`, or step-in the function been
called with command `s`.
- Actually, you can debug `generic_benchmark` with any gdb commands as
debugging normal C++ program, because the `generic_benchmark` is a pure C++
@@ -95,9 +98,7 @@ gdb generic_benchmark
[gdb-tui](https://sourceware.org/gdb/onlinedocs/gdb/TUI.html)
5. You can start `generic_benchmark` with specific JSON plan and input files
-- If you omit them, the `example.json, example_lineitem + example_orders`
under the directory of
`${GLUTEN_HOME}/backends-velox/generated-native-benchmark`
- will be used as default.
-- You can also edit the file `example.json` to custom the Substrait plan or
specify the inputs files placed in the other directory.
+- You can also edit the file `plan_12_0.json` to customize the Substrait plan or
to specify input files placed in another directory.
6. Get more detail information about benchmarks from
[MicroBenchmarks](./MicroBenchmarks.md)
@@ -173,7 +174,7 @@ Here we will explain how to run TPC-H on Velox backend with
the Parquet file for
```
- Modify `${GLUTEN_HOME}/tools/workload/tpch/run_tpch/tpch_parquet.sh`.
- - Set `GLUTEN_JAR` correctly. Please refer to the section of [Build Gluten
with Velox Backend](../get-started/Velox.md/#2-build-gluten-with-velox-backend)
+ - Set `GLUTEN_JAR` correctly. Please refer to the section of [Build Gluten
with Velox Backend](../get-started/Velox.md#build-gluten-with-velox-backend)
- Set `SPARK_HOME` correctly.
- Set the memory configurations appropriately.
- Execute `tpch_parquet.sh` using the below command.
diff --git a/docs/developers/NewToGluten.md b/docs/developers/NewToGluten.md
index 26418a45e5..1c6a22b651 100644
--- a/docs/developers/NewToGluten.md
+++ b/docs/developers/NewToGluten.md
@@ -10,15 +10,23 @@ Help users to debug and test with gluten.
Now gluten supports Ubuntu20.04, Ubuntu22.04, centos8, centos7 and macOS.
-## OpenJDK 8
+## JDK
-### Environment Setting
+Currently, Gluten supports JDK 8 for Spark 3.2/3.3/3.4/3.5. For Spark 3.3 and
higher versions, Gluten
+also supports JDK 11 and 17. Please note that starting with Spark 4.0, JDK 8 will
no longer be supported. We therefore recommend that Velox
+backend users move to a higher JDK version now to ease the migration when
deploying Gluten with Spark 4.0
+in the future. In addition, we may upgrade Arrow from 15.0.0 to a higher
version, which would also require
+JDK 11 as the minimum version.
+
+### JDK 8
+
+#### Environment Setting
For root user, the environment variables file is `/etc/profile`, it will take
effect for all the users.
For other user, you can set in `~/.bashrc`.
-### Guide for Ubuntu
+#### Guide for Ubuntu
The default JDK version in ubuntu is java11, we need to set to java8.
@@ -41,9 +49,9 @@ export PATH="$PATH:$JAVA_HOME/bin"
> Must set PATH with double quote in ubuntu.
-## OpenJDK 17
+### JDK 11/17
-By default, Gluten compiles package using JDK8. Enable maven profile by
`-Pjava-17` to use JDK17 or `-Pjava-11` to use JDK 11, and please make sure
your JAVA_HOME points to jdk17 or jdk11 respectively.
+By default, Gluten compiles packages using JDK 8. Enable the Maven profile
`-Pjava-17` to use JDK 17 or `-Pjava-11` to use JDK 11, and please make sure
your JAVA_HOME is set correctly.
Apache Spark and Arrow requires setting java args
`-Dio.netty.tryReflectionSetAccessible=true`, see
[SPARK-29924](https://issues.apache.org/jira/browse/SPARK-29924) and
[ARROW-6206](https://issues.apache.org/jira/browse/ARROW-6206).
So please add following configs in `spark-defaults.conf`:
@@ -457,7 +465,7 @@ valgrind --leak-check=yes ./exec_backend_test
# Run TPC-H and TPC-DS
We supply `<gluten_home>/tools/gluten-it` to execute these queries
-Refer to
[velox_be.yml](https://github.com/apache/incubator-gluten/blob/main/.github/workflows/velox_be.yml)
+Refer to
[velox_backend.yml](https://github.com/apache/incubator-gluten/blob/main/.github/workflows/velox_backend.yml)
# Run gluten+velox on clean machine
diff --git a/docs/get-started/Velox.md b/docs/get-started/Velox.md
index 582a18a0ed..533ed3cf62 100644
--- a/docs/get-started/Velox.md
+++ b/docs/get-started/Velox.md
@@ -431,7 +431,7 @@ Using the following configuration options to customize
spilling:
# Velox User-Defined Functions (UDF) and User-Defined Aggregate Functions
(UDAF)
-Please check the [VeloxNativeUDF.md](../developers/VeloxNativeUDF.md) for more
detailed usage and configurations.
+Please check [VeloxUDF.md](../developers/VeloxUDF.md) for more
detailed usage and configurations.
# Test TPC-H or TPC-DS on Gluten with Velox backend
@@ -445,10 +445,6 @@ The data generation scripts are [TPC-H dategen
script](../../tools/workload/tpch
The used TPC-H and TPC-DS queries are the original ones, and can be accessed
from [TPC-DS
queries](../../tools/gluten-it/common/src/main/resources/tpcds-queries)
and [TPC-H
queries](../../tools/gluten-it/common/src/main/resources/tpch-queries).
-Some other versions of TPC-DS queries are also provided, but are **not**
recommended for testing, including:
-
-- the modified TPC-DS queries with "Decimal-to-Double": [TPC-DS non-decimal
queries](../../gluten-core/src/test/resources/tpcds-queries/tpcds.queries.no-decimal)
(outdated).
-
## Submit the Spark SQL job
Submit test script from spark-shell. You can find the scala code to [Run
TPC-H](../../tools/workload/tpch/run_tpch/tpch_parquet.scala) as an example.
Please remember to modify
@@ -547,7 +543,7 @@ I20231121 10:19:42.348845 90094332
WholeStageResultIterator.cc:220] Native Plan
## Using Stage-Level Resource Adjustment to Avoid OOM(Experimental)
- see more [here](../VeloxStageResourceAdj.md)
+ see more [here](./VeloxStageResourceAdj.md)
## Broadcast Build Relations to Off-Heap(Experimental)
diff --git
a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
index 20dea16267..987e2db163 100644
---
a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
+++
b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala
@@ -652,7 +652,10 @@ class VeloxTestSettings extends BackendTestSettings {
// for ObjectHashAggregateExec will fail.
"SPARK-22223: ObjectHashAggregate should not introduce unnecessary
shuffle",
"SPARK-31620: agg with subquery (whole-stage-codegen = true)",
- "SPARK-31620: agg with subquery (whole-stage-codegen = false)"
+ "SPARK-31620: agg with subquery (whole-stage-codegen = false)",
+ // The below test just verifies Spark's scala code. The involved toString
+ // implementation has different result on Java 17.
+ "SPARK-24788: RelationalGroupedDataset.toString with unresolved exprs
should not fail"
)
enableSuite[GlutenDataFrameAsOfJoinSuite]
enableSuite[GlutenDataFrameComplexTypeSuite]
@@ -741,6 +744,10 @@ class VeloxTestSettings extends BackendTestSettings {
// Rewrite the following two tests in GlutenDatasetSuite.
.exclude("dropDuplicates: columns with same column name")
.exclude("groupBy.as")
+ // The below two tests just verify Spark's scala code. The involved
toString
+ // implementation has different result on Java 17.
+ .exclude("Check RelationalGroupedDataset toString: Single data")
+ .exclude("Check RelationalGroupedDataset toString: over length schema ")
enableSuite[GlutenDateFunctionsSuite]
// The below two are replaced by two modified versions.
.exclude("unix_timestamp")
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]