This is an automated email from the ASF dual-hosted git repository.

sunchao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-comet.git


The following commit(s) were added to refs/heads/main by this push:
     new e2a6aca  build: Separate and speedup TPC-DS benchmark (#130)
e2a6aca is described below

commit e2a6acaf9b1a318f6ffee2f0a2a0d5899be1e348
Author: advancedxy <[email protected]>
AuthorDate: Thu Feb 29 08:56:16 2024 +0800

    build: Separate and speedup TPC-DS benchmark (#130)
---
 .github/workflows/{pr_build.yml => benchmark.yml} | 169 +++++++++-------------
 .github/workflows/pr_build.yml                    |  61 --------
 2 files changed, 66 insertions(+), 164 deletions(-)

diff --git a/.github/workflows/pr_build.yml b/.github/workflows/benchmark.yml
similarity index 53%
copy from .github/workflows/pr_build.yml
copy to .github/workflows/benchmark.yml
index fe4dd04..adfa1ae 100644
--- a/.github/workflows/pr_build.yml
+++ b/.github/workflows/benchmark.yml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: PR Build
+name: Run TPC-DS Benchmark
 
 concurrency:
   group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ 
github.workflow }}
@@ -38,103 +38,8 @@ env:
   RUST_VERSION: nightly
 
 jobs:
-  linux-test:
-    strategy:
-      matrix:
-        os: [ubuntu-latest]
-        java_version: [8, 11, 17]
-        test-target: [rust, java]
-        is_push_event:
-          - ${{ github.event_name == 'push' }}
-        exclude: # exclude java 11 for pull_request event
-          - java_version: 11
-            is_push_event: false
-      fail-fast: false
-    name: ${{ matrix.test-target }} test on ${{ matrix.os }} with java ${{ 
matrix.java_version }}
-    runs-on: ${{ matrix.os }}
-    container:
-      image: amd64/rust
-    env:
-      JAVA_VERSION: ${{ matrix.java_version == 8 && '1.8' || format('{0}', 
matrix.java_version) }}
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Rust & Java toolchain
-        uses: ./.github/actions/setup-builder
-        with:
-          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: ${{ matrix.java_version }}
-
-      - uses: actions/checkout@v4
-      - if: matrix.test-target == 'rust'
-        name: Rust test steps
-        uses: ./.github/actions/rust-test
-      - if: matrix.test-target == 'java'
-        name: Java test steps
-        uses: ./.github/actions/java-test
-
-  macos-test:
-    strategy:
-      matrix:
-        os: [macos-13]
-        java_version: [8, 11, 17]
-        test-target: [rust, java]
-      fail-fast: false
-    if: github.event_name == 'push'
-    name: ${{ matrix.test-target }} test on ${{ matrix.os }} with java ${{ 
matrix.java_version }}
-    runs-on: ${{ matrix.os }}
-    env:
-      JAVA_VERSION: ${{ matrix.java_version == 8 && '1.8' || format('{0}', 
matrix.java_version) }}
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Rust & Java toolchain
-        uses: ./.github/actions/setup-macos-builder
-        with:
-          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: ${{ matrix.java_version }}
-
-      - uses: actions/checkout@v4
-      - if: matrix.test-target == 'rust'
-        name: Rust test steps
-        uses: ./.github/actions/rust-test
-      - if: matrix.test-target == 'java'
-        name: Java test steps
-        uses: ./.github/actions/java-test
-
-  macos-aarch64-test:
-    strategy:
-      matrix:
-        java_version: [8, 11, 17]
-        test-target: [rust, java]
-        is_push_event:
-          - ${{ github.event_name == 'push' }}
-        exclude: # exclude java 11 for pull_request event
-          - java_version: 11
-            is_push_event: false
-      fail-fast: false
-    name: ${{ matrix.test-target }} test on macos-aarch64 with java ${{ 
matrix.java_version }}
-    runs-on: macos-14
-    env:
-      JAVA_VERSION: ${{ matrix.java_version == 8 && '1.8' || format('{0}', 
matrix.java_version) }}
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Rust & Java toolchain
-        uses: ./.github/actions/setup-macos-builder
-        with:
-          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: ${{ matrix.java_version }}
-          jdk-architecture: aarch64
-          protoc-architecture: aarch_64
-
-      - uses: actions/checkout@v4
-      - if: matrix.test-target == 'rust'
-        name: Rust test steps
-        uses: ./.github/actions/rust-test
-      - if: matrix.test-target == 'java'
-        name: Java test steps
-        uses: ./.github/actions/java-test
-
-  tpcds-1g:
-    name: Run TPC-DS queries with SF=1
+  prepare:
+    name: Build native lib and prepare TPC-DS data
     runs-on: ubuntu-latest
     container:
       image: amd64/rust
@@ -147,13 +52,22 @@ jobs:
         with:
           rust-version: ${{env.RUST_VERSION}}
           jdk-version: 11
+      - name: Cache Maven dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.m2/repository
+            /root/.m2/repository
+          key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}
+          restore-keys: |
+            ${{ runner.os }}-java-maven-
 
       - name: Cache TPC-DS generated data
         id: cache-tpcds-sf-1
         uses: actions/cache@v4
         with:
           path: ./tpcds-sf-1
-          key: tpcds-${{ hashFiles('.github/workflows/pr_build.yml') }}
+          key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}
       - name: Checkout tpcds-kit repository
         if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
         uses: actions/checkout@v4
@@ -162,6 +76,15 @@ jobs:
           path: ./tpcds-kit
       - name: Build Comet
         run: make release
+      - name: Upload Comet native lib
+        uses: actions/upload-artifact@v4
+        with:
+          name: libcomet-${{ github.run_id }}
+          path: |
+            core/target/release/libcomet.so
+            core/target/release/libcomet.dylib
+          retention-days: 1 # keep the artifact for only 1 day; it is only used within this workflow
+          overwrite: true
       - name: Build tpcds-kit
         if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
         run: |
@@ -172,24 +95,64 @@ jobs:
         run: |
           cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw exec:java 
-Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" 
-Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" 
-Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location 
`pwd`/../tpcds-sf-1 --scaleFactor 1  --numPartitions 1"
           cd ..
+
+  benchmark:
+    name: Run TPC-DS benchmark
+    runs-on: ubuntu-latest
+    needs: [prepare]
+    container:
+      image: amd64/rust
+    strategy:
+      matrix:
+        join: [sort_merge, broadcast, hash]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup Rust & Java toolchain
+        uses: ./.github/actions/setup-builder
+        with:
+          rust-version: ${{env.RUST_VERSION}}
+          jdk-version: 11
+      - name: Cache Maven dependencies
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.m2/repository
+            /root/.m2/repository
+          key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}
+          restore-keys: |
+            ${{ runner.os }}-java-maven-
+      - name: Restore TPC-DS generated data
+        id: cache-tpcds-sf-1
+        uses: actions/cache@v4
+        with:
+          path: ./tpcds-sf-1
+          key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}
+          fail-on-cache-miss: true # should always hit, since the prepare job generates the data if it does not already exist
+      - name: Download Comet native lib
+        uses: actions/download-artifact@v4
+        with:
+          name: libcomet-${{ github.run_id }}
+          path: core/target/release
       - name: Run TPC-DS queries (Sort merge join)
+        if: matrix.join == 'sort_merge'
         run: |
-          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw 
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
+          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B 
-Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
         env:
           SPARK_TPCDS_JOIN_CONF: |
             spark.sql.autoBroadcastJoinThreshold=-1
             spark.sql.join.preferSortMergeJoin=true
       - name: Run TPC-DS queries (Broadcast hash join)
+        if: matrix.join == 'broadcast'
         run: |
-          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw 
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
+          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B 
-Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
         env:
           SPARK_TPCDS_JOIN_CONF: |
             spark.sql.autoBroadcastJoinThreshold=10485760
       - name: Run TPC-DS queries (Shuffled hash join)
+        if: matrix.join == 'hash'
         run: |
-          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw 
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
+          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B 
-Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
         env:
           SPARK_TPCDS_JOIN_CONF: |
             spark.sql.autoBroadcastJoinThreshold=-1
             spark.sql.join.forceApplyShuffledHashJoin=true
-
diff --git a/.github/workflows/pr_build.yml b/.github/workflows/pr_build.yml
index fe4dd04..316532c 100644
--- a/.github/workflows/pr_build.yml
+++ b/.github/workflows/pr_build.yml
@@ -132,64 +132,3 @@ jobs:
       - if: matrix.test-target == 'java'
         name: Java test steps
         uses: ./.github/actions/java-test
-
-  tpcds-1g:
-    name: Run TPC-DS queries with SF=1
-    runs-on: ubuntu-latest
-    container:
-      image: amd64/rust
-    env:
-      JAVA_VERSION: 11
-    steps:
-      - uses: actions/checkout@v4
-      - name: Setup Rust & Java toolchain
-        uses: ./.github/actions/setup-builder
-        with:
-          rust-version: ${{env.RUST_VERSION}}
-          jdk-version: 11
-
-      - name: Cache TPC-DS generated data
-        id: cache-tpcds-sf-1
-        uses: actions/cache@v4
-        with:
-          path: ./tpcds-sf-1
-          key: tpcds-${{ hashFiles('.github/workflows/pr_build.yml') }}
-      - name: Checkout tpcds-kit repository
-        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
-        uses: actions/checkout@v4
-        with:
-          repository: databricks/tpcds-kit
-          path: ./tpcds-kit
-      - name: Build Comet
-        run: make release
-      - name: Build tpcds-kit
-        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
-        run: |
-          apt-get install -y yacc bison flex
-          cd tpcds-kit/tools && make OS=LINUX
-      - name: Generate TPC-DS (SF=1) table data
-        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
-        run: |
-          cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw exec:java 
-Dexec.mainClass="org.apache.spark.sql.GenTPCDSData" 
-Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false" 
-Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location 
`pwd`/../tpcds-sf-1 --scaleFactor 1  --numPartitions 1"
-          cd ..
-      - name: Run TPC-DS queries (Sort merge join)
-        run: |
-          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw 
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
-        env:
-          SPARK_TPCDS_JOIN_CONF: |
-            spark.sql.autoBroadcastJoinThreshold=-1
-            spark.sql.join.preferSortMergeJoin=true
-      - name: Run TPC-DS queries (Broadcast hash join)
-        run: |
-          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw 
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
-        env:
-          SPARK_TPCDS_JOIN_CONF: |
-            spark.sql.autoBroadcastJoinThreshold=10485760
-      - name: Run TPC-DS queries (Shuffled hash join)
-        run: |
-          SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw 
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
-        env:
-          SPARK_TPCDS_JOIN_CONF: |
-            spark.sql.autoBroadcastJoinThreshold=-1
-            spark.sql.join.forceApplyShuffledHashJoin=true
-

Reply via email to