This is an automated email from the ASF dual-hosted git repository.

yuanzhou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/incubator-gluten.git


The following commit(s) were added to refs/heads/main by this push:
     new 4e35abc48a [GLUTEN-11302][VL] Fix Gluten-GPU build by bumping to 
CUDA-13.1 (#11275)
4e35abc48a is described below

commit 4e35abc48ac5b7f3549d959dbbf72b6d5aee9edc
Author: Yuan <[email protected]>
AuthorDate: Thu Dec 18 21:55:00 2025 +0800

    [GLUTEN-11302][VL] Fix Gluten-GPU build by bumping to CUDA-13.1 (#11275)
    
    Velox upgraded to CUDF-25.12; this patch fixes the Gluten-GPU build by:
    - switching to gcc-14
    - bumping to cuda-toolkit-13.1
    
    The new cuda-toolkit-13.1 requires more disk space, so this patch also 
modifies the GHA workflows to clean up disk space first
    
    ---------
    
    Signed-off-by: Yuan <[email protected]>
---
 .github/workflows/velox_backend_cache.yml | 100 +++++++-----------------------
 .github/workflows/velox_backend_x86.yml   |  30 ++++++---
 cpp/velox/CMakeLists.txt                  |   3 +-
 dev/docker/cudf/Dockerfile                |   5 +-
 ep/build-velox/src/build-velox.sh         |   4 +-
 5 files changed, 53 insertions(+), 89 deletions(-)

diff --git a/.github/workflows/velox_backend_cache.yml 
b/.github/workflows/velox_backend_cache.yml
index eb060c3a90..8d0a3da0c9 100644
--- a/.github/workflows/velox_backend_cache.yml
+++ b/.github/workflows/velox_backend_cache.yml
@@ -145,9 +145,13 @@ jobs:
     strategy:
       matrix:
         os: [ ubuntu-22.04 ]
-    container: apache/gluten:centos-9-jdk8-cudf
     steps:
-      - uses: actions/checkout@v2
+      - name: "node-cleanup" # by default the free runner does not have enough 
disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc 
/opt/hostedtoolcache/CodeQL
+          sudo docker image prune --all --force
+          sudo docker builder prune -a
+      - uses: actions/checkout@v4
       - name: Get Ccache
         uses: actions/cache/restore@v3
         with:
@@ -157,14 +161,26 @@ jobs:
             ccache-centos9-release-shared-${{runner.arch}}
       - name: Build Gluten shared libraries
         run: |
-          export CCACHE_MAXSIZE=1G
-          dnf autoremove -y
+          docker run -v $GITHUB_WORKSPACE:/work -w /work 
apache/gluten:centos-9-jdk8-cudf bash -c "
+          rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 
/opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build 
script later
           df -a
-          rm -rf /opt/rh/gcc-toolset-12 && cp -r /opt/rh/gcc-toolset-14 
/opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build 
script later
+          dnf autoremove -y && dnf clean all
+          dnf remove -y cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1
+          ls -l /usr/local/
           source /opt/rh/gcc-toolset-12/enable
+
+          export CMAKE_BUILD_PARALLEL_LEVEL=4
           export NUM_THREADS=4
-          bash dev/builddeps-veloxbe.sh --run_setup_script=OFF 
--build_arrow=OFF --build_tests=OFF --build_benchmarks=ON --enable_gpu=ON # 
TODO: re-enable tests with more disk space
+          export CCACHE_MAXSIZE=1G
+          export CCACHE_DIR=/work/.ccache
+          mkdir -p /work/.ccache
+          
+          cd /work
+          bash dev/builddeps-veloxbe.sh --run_setup_script=OFF 
--build_arrow=OFF --build_tests=ON --build_benchmarks=ON --enable_gpu=ON # 
TODO: re-enable tests with more disk space
           rm -rf ep/build-velox/build/velox_ep
+          mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests
+          ccache -s
+          "
       - name: Save Ccache
         if: always()
         uses: actions/cache/save@v3
@@ -172,75 +188,3 @@ jobs:
         with:
           path: '${{ env.CCACHE_DIR }}'
           key: ccache-centos9-release-shared-${{runner.arch}}-${{github.sha}}
-
-  # ccache-native-lib-ubuntu-velox-ut:
-  #   runs-on: ubuntu-22.04
-  #   env:
-  #     CCACHE_DIR: "${{ github.workspace }}/.ccache"
-  #   container: ghcr.io/facebookincubator/velox-dev:amd64-ubuntu-22.04-avx
-  #   steps:
-  #     - uses: actions/checkout@v2
-  #     - name: Get Ccache
-  #       uses: actions/cache/restore@v3
-  #       with:
-  #         path: '${{ env.CCACHE_DIR }}'
-  #         key: ccache-ubuntu-release-default
-  #     - name: Ensure Cache Dirs Exists
-  #       working-directory: ${{ github.workspace }}
-  #       run: |
-  #         mkdir -p '${{ env.CCACHE_DIR }}'
-  #     - name: Build Gluten native libraries
-  #       run: |
-  #         rm -rf /opt/miniconda-for-velox/
-  #         cd ep/build-velox/src && \
-  #         ./get-velox.sh
-  #         cd ../build/velox_ep/
-  #         make EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_PARQUET=ON 
-DVELOX_BUILD_TESTING=ON -DVELOX_BUILD_TEST_UTILS=ON" 
-
-  #     - name: CCache after
-  #       run: |
-  #         ccache -vs
-
-  #     - uses: actions/cache/save@v3
-  #       with:
-  #         path: '${{ env.CCACHE_DIR }}'
-  #         key: ccache-ubuntu-release-default
-#  ccache-native-lib-centos-velox-ut:
-#    runs-on: ubuntu-22.04
-#    env:
-#      CCACHE_DIR: "${{ github.workspace }}/.ccache"
-#    container: ghcr.io/facebookincubator/velox-dev:centos8
-#    steps:
-#      - uses: actions/checkout@v2
-#      - name: Setup java and maven
-#        run: |
-#          yum install sudo patch java-1.8.0-openjdk-devel wget -y && \
-#          wget 
https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
-#          tar -xvf apache-maven-3.8.8-bin.tar.gz
-#          mv apache-maven-3.8.8 /usr/lib/maven
-#      - name: Get Ccache
-#        uses: actions/cache/restore@v3
-#        with:
-#          path: '${{ env.CCACHE_DIR }}'
-#          key: ccache-centos-release-default
-#      - name: Ensure Cache Dirs Exists
-#        working-directory: ${{ github.workspace }}
-#        run: |
-#          mkdir -p '${{ env.CCACHE_DIR }}'
-#      - name: Build Gluten native libraries
-#        run: |
-#          rm -rf /opt/miniconda-for-velox/
-#          cd ep/build-velox/src && \
-#          ./get-velox.sh
-#          cd ../build/velox_ep/
-#          source /opt/rh/gcc-toolset-9/enable
-#          make EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_PARQUET=ON 
-DVELOX_BUILD_TESTING=ON -DVELOX_BUILD_TEST_UTILS=ON"
-#
-#      - name: CCache after
-#        run: |
-#          ccache -s
-#
-#      - uses: actions/cache/save@v3
-#        with:
-#          path: '${{ env.CCACHE_DIR }}'
-#          key: ccache-centos-release-default
diff --git a/.github/workflows/velox_backend_x86.yml 
b/.github/workflows/velox_backend_x86.yml
index 5618aba857..1bcf1f97ef 100644
--- a/.github/workflows/velox_backend_x86.yml
+++ b/.github/workflows/velox_backend_x86.yml
@@ -1314,9 +1314,15 @@ jobs:
 
   build-cudf-centos-9:
     runs-on: ubuntu-22.04
-    container: apache/gluten:centos-9-jdk8-cudf
     steps:
-      - uses: actions/checkout@v2
+      - name: "node-cleanup" # by default the free runner does not have enough 
disk space
+        run: |
+          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc 
/opt/hostedtoolcache/CodeQL
+          sudo docker image prune --all --force
+          sudo docker builder prune -a
+      - run: df -h | sort -k 5 -nr # check disk space for debug
+
+      - uses: actions/checkout@v4
       - name: Get Ccache
         uses: actions/cache/restore@v4
         with:
@@ -1326,15 +1332,25 @@ jobs:
             ccache-centos9-release-shared-${{runner.arch}}
       - name: Build Gluten native libraries
         run: |
-          rm -rf /opt/rh/gcc-toolset-12 && cp -r /opt/rh/gcc-toolset-14 
/opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build 
script later
-          dnf autoremove -y
+          docker run -v $GITHUB_WORKSPACE:/work -w /work 
apache/gluten:centos-9-jdk8-cudf bash -c "
+          rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 
/opt/rh/gcc-toolset-12 # hack to use gcc 14, should upgrade in Velox build 
script later
           df -a
+          dnf autoremove -y && dnf clean all
+          dnf remove -y cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1
+          ls -l /usr/local/
           source /opt/rh/gcc-toolset-12/enable
+
+          export CMAKE_BUILD_PARALLEL_LEVEL=4
           export NUM_THREADS=4
-          # bash dev/builddeps-veloxbe.sh --run_setup_script=OFF 
--build_arrow=OFF --build_tests=OFF --build_benchmarks=ON --enable_gpu=ON # 
TODO: re-enable tests with more disk space
-          # rm -rf ep/build-velox/build/velox_ep
-          # mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests
+          export CCACHE_DIR=/work/.ccache
+          mkdir -p /work/.ccache
+
+          cd /work
+          bash dev/builddeps-veloxbe.sh --run_setup_script=OFF 
--build_arrow=OFF --build_tests=ON --build_benchmarks=ON --enable_gpu=ON # 
TODO: re-enable tests with more disk space
+          rm -rf ep/build-velox/build/velox_ep
+          mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests
           ccache -s
+          "
 
   spark-test-spark40:
     needs: build-native-lib-centos-7
diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt
index e389113c4e..9547cde0e2 100644
--- a/cpp/velox/CMakeLists.txt
+++ b/cpp/velox/CMakeLists.txt
@@ -439,6 +439,7 @@ if(ENABLE_GPU)
             ${VELOX_BUILD_PATH}/_deps/nvtx3-src/c/include
             ${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/include
             ${VELOX_BUILD_PATH}/_deps/rapids_logger-src/include
+            /usr/local/cuda/include/cccl
             /usr/local/cuda/include)
 
   target_compile_definitions(
@@ -458,7 +459,7 @@ if(ENABLE_GPU)
       
${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/lib64/libnvcomp.so
       
${VELOX_BUILD_PATH}/_deps/nvcomp_proprietary_binary-src/lib64/libnvcomp_cpu.so
       ${VELOX_BUILD_PATH}/_deps/rapids_logger-build/librapids_logger.so
-      /usr/local/cuda-12.8/lib64/libcudart.so.12)
+      /usr/local/cuda/lib64/libcudart.so)
 endif()
 
 add_custom_command(
diff --git a/dev/docker/cudf/Dockerfile b/dev/docker/cudf/Dockerfile
index 42258a69c9..c900c632ea 100644
--- a/dev/docker/cudf/Dockerfile
+++ b/dev/docker/cudf/Dockerfile
@@ -28,7 +28,10 @@ ENV CUDA_ARCHITECTURES=70
 
 
 WORKDIR /opt/gluten
-RUN rm -rf /opt/rh/gcc-toolset-12 && cp -r /opt/rh/gcc-toolset-14 
/opt/rh/gcc-toolset-12; \
+RUN rm -rf /opt/rh/gcc-toolset-12 && ln -s /opt/rh/gcc-toolset-14 
/opt/rh/gcc-toolset-12; \
+    dnf remove -y cuda-toolkit-12* && dnf install -y cuda-toolkit-13-1; \
+    dnf autoremove -y && dnf clean all; \
+    source /opt/rh/gcc-toolset-12/enable; \
     bash ./dev/buildbundle-veloxbe.sh --run_setup_script=OFF --build_arrow=ON 
--spark_version=3.4 --build_tests=ON --build_benchmarks=ON --enable_gpu=ON && 
rm -rf /opt/gluten
 
 # You can try the data in folder 
backends-velox/src/test/resources/tpch-data-parquet
diff --git a/ep/build-velox/src/build-velox.sh 
b/ep/build-velox/src/build-velox.sh
index 3e0f6be4fb..ac62f8fc27 100755
--- a/ep/build-velox/src/build-velox.sh
+++ b/ep/build-velox/src/build-velox.sh
@@ -134,8 +134,8 @@ function compile {
   if [ $ENABLE_GPU == "ON" ]; then
     # the cuda default options are for Centos9 image from Meta
     echo "enable GPU support."
-    COMPILE_OPTION="$COMPILE_OPTION -DVELOX_ENABLE_GPU=ON 
-DVELOX_ENABLE_CUDF=ON -DCMAKE_CUDA_ARCHITECTURES=70 \
-        -DCMAKE_CUDA_COMPILER=/usr/local/cuda-12.8/bin/nvcc"
+    COMPILE_OPTION="$COMPILE_OPTION -DVELOX_ENABLE_GPU=ON 
-DVELOX_ENABLE_CUDF=ON -DCMAKE_CUDA_ARCHITECTURES=75 \
+        -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc"
   fi
   if [ -n "${GLUTEN_VCPKG_ENABLED:-}" ]; then
     COMPILE_OPTION="$COMPILE_OPTION -DVELOX_GFLAGS_TYPE=static"


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to