This is an automated email from the ASF dual-hosted git repository.
sunchao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion-comet.git
The following commit(s) were added to refs/heads/main by this push:
new e2a6aca build: Separate and speedup TPC-DS benchmark (#130)
e2a6aca is described below
commit e2a6acaf9b1a318f6ffee2f0a2a0d5899be1e348
Author: advancedxy <[email protected]>
AuthorDate: Thu Feb 29 08:56:16 2024 +0800
build: Separate and speedup TPC-DS benchmark (#130)
---
.github/workflows/{pr_build.yml => benchmark.yml} | 169 +++++++++-------------
.github/workflows/pr_build.yml | 61 --------
2 files changed, 66 insertions(+), 164 deletions(-)
diff --git a/.github/workflows/pr_build.yml b/.github/workflows/benchmark.yml
similarity index 53%
copy from .github/workflows/pr_build.yml
copy to .github/workflows/benchmark.yml
index fe4dd04..adfa1ae 100644
--- a/.github/workflows/pr_build.yml
+++ b/.github/workflows/benchmark.yml
@@ -15,7 +15,7 @@
# specific language governing permissions and limitations
# under the License.
-name: PR Build
+name: Run TPC-DS Benchmark
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{
github.workflow }}
@@ -38,103 +38,8 @@ env:
RUST_VERSION: nightly
jobs:
- linux-test:
- strategy:
- matrix:
- os: [ubuntu-latest]
- java_version: [8, 11, 17]
- test-target: [rust, java]
- is_push_event:
- - ${{ github.event_name == 'push' }}
- exclude: # exclude java 11 for pull_request event
- - java_version: 11
- is_push_event: false
- fail-fast: false
- name: ${{ matrix.test-target }} test on ${{ matrix.os }} with java ${{
matrix.java_version }}
- runs-on: ${{ matrix.os }}
- container:
- image: amd64/rust
- env:
- JAVA_VERSION: ${{ matrix.java_version == 8 && '1.8' || format('{0}',
matrix.java_version) }}
- steps:
- - uses: actions/checkout@v4
- - name: Setup Rust & Java toolchain
- uses: ./.github/actions/setup-builder
- with:
- rust-version: ${{env.RUST_VERSION}}
- jdk-version: ${{ matrix.java_version }}
-
- - uses: actions/checkout@v4
- - if: matrix.test-target == 'rust'
- name: Rust test steps
- uses: ./.github/actions/rust-test
- - if: matrix.test-target == 'java'
- name: Java test steps
- uses: ./.github/actions/java-test
-
- macos-test:
- strategy:
- matrix:
- os: [macos-13]
- java_version: [8, 11, 17]
- test-target: [rust, java]
- fail-fast: false
- if: github.event_name == 'push'
- name: ${{ matrix.test-target }} test on ${{ matrix.os }} with java ${{
matrix.java_version }}
- runs-on: ${{ matrix.os }}
- env:
- JAVA_VERSION: ${{ matrix.java_version == 8 && '1.8' || format('{0}',
matrix.java_version) }}
- steps:
- - uses: actions/checkout@v4
- - name: Setup Rust & Java toolchain
- uses: ./.github/actions/setup-macos-builder
- with:
- rust-version: ${{env.RUST_VERSION}}
- jdk-version: ${{ matrix.java_version }}
-
- - uses: actions/checkout@v4
- - if: matrix.test-target == 'rust'
- name: Rust test steps
- uses: ./.github/actions/rust-test
- - if: matrix.test-target == 'java'
- name: Java test steps
- uses: ./.github/actions/java-test
-
- macos-aarch64-test:
- strategy:
- matrix:
- java_version: [8, 11, 17]
- test-target: [rust, java]
- is_push_event:
- - ${{ github.event_name == 'push' }}
- exclude: # exclude java 11 for pull_request event
- - java_version: 11
- is_push_event: false
- fail-fast: false
- name: ${{ matrix.test-target }} test on macos-aarch64 with java ${{
matrix.java_version }}
- runs-on: macos-14
- env:
- JAVA_VERSION: ${{ matrix.java_version == 8 && '1.8' || format('{0}',
matrix.java_version) }}
- steps:
- - uses: actions/checkout@v4
- - name: Setup Rust & Java toolchain
- uses: ./.github/actions/setup-macos-builder
- with:
- rust-version: ${{env.RUST_VERSION}}
- jdk-version: ${{ matrix.java_version }}
- jdk-architecture: aarch64
- protoc-architecture: aarch_64
-
- - uses: actions/checkout@v4
- - if: matrix.test-target == 'rust'
- name: Rust test steps
- uses: ./.github/actions/rust-test
- - if: matrix.test-target == 'java'
- name: Java test steps
- uses: ./.github/actions/java-test
-
- tpcds-1g:
- name: Run TPC-DS queries with SF=1
+ prepare:
+ name: Build native lib and prepare TPC-DS data
runs-on: ubuntu-latest
container:
image: amd64/rust
@@ -147,13 +52,22 @@ jobs:
with:
rust-version: ${{env.RUST_VERSION}}
jdk-version: 11
+ - name: Cache Maven dependencies
+ uses: actions/cache@v4
+ with:
+ path: |
+ ~/.m2/repository
+ /root/.m2/repository
+ key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-java-maven-
- name: Cache TPC-DS generated data
id: cache-tpcds-sf-1
uses: actions/cache@v4
with:
path: ./tpcds-sf-1
- key: tpcds-${{ hashFiles('.github/workflows/pr_build.yml') }}
+ key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}
- name: Checkout tpcds-kit repository
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
uses: actions/checkout@v4
@@ -162,6 +76,15 @@ jobs:
path: ./tpcds-kit
- name: Build Comet
run: make release
+ - name: Upload Comet native lib
+ uses: actions/upload-artifact@v4
+ with:
+ name: libcomet-${{ github.run_id }}
+ path: |
+ core/target/release/libcomet.so
+ core/target/release/libcomet.dylib
+ retention-days: 1 # delete the artifact after 1 day; it is only
used within this workflow run
+ overwrite: true
- name: Build tpcds-kit
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
run: |
@@ -172,24 +95,64 @@ jobs:
run: |
cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw exec:java
-Dexec.mainClass="org.apache.spark.sql.GenTPCDSData"
-Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false"
-Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location
`pwd`/../tpcds-sf-1 --scaleFactor 1 --numPartitions 1"
cd ..
+
+ benchmark:
+ name: Run TPC-DS benchmark
+ runs-on: ubuntu-latest
+ needs: [prepare]
+ container:
+ image: amd64/rust
+ strategy:
+ matrix:
+ join: [sort_merge, broadcast, hash]
+ steps:
+ - uses: actions/checkout@v4
+ - name: Setup Rust & Java toolchain
+ uses: ./.github/actions/setup-builder
+ with:
+ rust-version: ${{env.RUST_VERSION}}
+ jdk-version: 11
+ - name: Cache Maven dependencies
+ uses: actions/cache@v4
+ with:
+ path: |
+ ~/.m2/repository
+ /root/.m2/repository
+ key: ${{ runner.os }}-java-maven-${{ hashFiles('**/pom.xml') }}
+ restore-keys: |
+ ${{ runner.os }}-java-maven-
+ - name: Restore TPC-DS generated data
+ id: cache-tpcds-sf-1
+ uses: actions/cache@v4
+ with:
+ path: ./tpcds-sf-1
+ key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml') }}
+ fail-on-cache-miss: true # it should always be cached, as it is
generated by the prepare step if it does not exist
+ - name: Download Comet native lib
+ uses: actions/download-artifact@v4
+ with:
+ name: libcomet-${{ github.run_id }}
+ path: core/target/release
- name: Run TPC-DS queries (Sort merge join)
+ if: matrix.join == 'sort_merge'
run: |
- SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
+ SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B
-Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
env:
SPARK_TPCDS_JOIN_CONF: |
spark.sql.autoBroadcastJoinThreshold=-1
spark.sql.join.preferSortMergeJoin=true
- name: Run TPC-DS queries (Broadcast hash join)
+ if: matrix.join == 'broadcast'
run: |
- SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
+ SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B
-Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
env:
SPARK_TPCDS_JOIN_CONF: |
spark.sql.autoBroadcastJoinThreshold=10485760
- name: Run TPC-DS queries (Shuffled hash join)
+ if: matrix.join == 'hash'
run: |
- SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
+ SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw -B
-Prelease -Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
env:
SPARK_TPCDS_JOIN_CONF: |
spark.sql.autoBroadcastJoinThreshold=-1
spark.sql.join.forceApplyShuffledHashJoin=true
-
diff --git a/.github/workflows/pr_build.yml b/.github/workflows/pr_build.yml
index fe4dd04..316532c 100644
--- a/.github/workflows/pr_build.yml
+++ b/.github/workflows/pr_build.yml
@@ -132,64 +132,3 @@ jobs:
- if: matrix.test-target == 'java'
name: Java test steps
uses: ./.github/actions/java-test
-
- tpcds-1g:
- name: Run TPC-DS queries with SF=1
- runs-on: ubuntu-latest
- container:
- image: amd64/rust
- env:
- JAVA_VERSION: 11
- steps:
- - uses: actions/checkout@v4
- - name: Setup Rust & Java toolchain
- uses: ./.github/actions/setup-builder
- with:
- rust-version: ${{env.RUST_VERSION}}
- jdk-version: 11
-
- - name: Cache TPC-DS generated data
- id: cache-tpcds-sf-1
- uses: actions/cache@v4
- with:
- path: ./tpcds-sf-1
- key: tpcds-${{ hashFiles('.github/workflows/pr_build.yml') }}
- - name: Checkout tpcds-kit repository
- if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
- uses: actions/checkout@v4
- with:
- repository: databricks/tpcds-kit
- path: ./tpcds-kit
- - name: Build Comet
- run: make release
- - name: Build tpcds-kit
- if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
- run: |
- apt-get install -y yacc bison flex
- cd tpcds-kit/tools && make OS=LINUX
- - name: Generate TPC-DS (SF=1) table data
- if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
- run: |
- cd spark && MAVEN_OPTS='-Xmx20g' ../mvnw exec:java
-Dexec.mainClass="org.apache.spark.sql.GenTPCDSData"
-Dexec.classpathScope="test" -Dexec.cleanupDaemonThreads="false"
-Dexec.args="--dsdgenDir `pwd`/../tpcds-kit/tools --location
`pwd`/../tpcds-sf-1 --scaleFactor 1 --numPartitions 1"
- cd ..
- - name: Run TPC-DS queries (Sort merge join)
- run: |
- SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
- env:
- SPARK_TPCDS_JOIN_CONF: |
- spark.sql.autoBroadcastJoinThreshold=-1
- spark.sql.join.preferSortMergeJoin=true
- - name: Run TPC-DS queries (Broadcast hash join)
- run: |
- SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
- env:
- SPARK_TPCDS_JOIN_CONF: |
- spark.sql.autoBroadcastJoinThreshold=10485760
- - name: Run TPC-DS queries (Shuffled hash join)
- run: |
- SPARK_HOME=`pwd` SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 ./mvnw
-Dsuites=org.apache.spark.sql.CometTPCDSQuerySuite test
- env:
- SPARK_TPCDS_JOIN_CONF: |
- spark.sql.autoBroadcastJoinThreshold=-1
- spark.sql.join.forceApplyShuffledHashJoin=true
-