HyukjinKwon commented on a change in pull request #33544:
URL: https://github.com/apache/spark/pull/33544#discussion_r678766666
##########
File path: .github/workflows/benchmark.yml
##########
@@ -19,32 +19,108 @@ on:
description: 'Number of job splits'
required: true
default: '1'
+ run-tcp-ds-query-benchmark:
+ description: 'Run TPCDSQueryBenchmark: true or false'
+ required: false
+ default: 'false'
jobs:
matrix-gen:
name: Generate matrix for job splits
runs-on: ubuntu-20.04
outputs:
- matrix: ${{ steps.set-matrix.outputs.matrix }}
+ class: ${{ steps.set-matrix.outputs.class }}
+ split: ${{ steps.set-matrix.outputs.split }}
+ num-split: ${{ steps.set-matrix.outputs.num-split }}
env:
+ SPARK_BENCHMARK_CLASS: ${{ github.event.inputs.class }}
SPARK_BENCHMARK_NUM_SPLITS: ${{ github.event.inputs.num-splits }}
steps:
- name: Generate matrix
id: set-matrix
- run: echo "::set-output name=matrix::["`seq -s, 1
$SPARK_BENCHMARK_NUM_SPLITS`"]"
+ run: |
+ echo "::set-output name=class::['"$SPARK_BENCHMARK_CLASS"']"
+ echo "::set-output name=split::["`seq -s, 1
$SPARK_BENCHMARK_NUM_SPLITS`"]"
+ echo "::set-output name=num-split::['"$SPARK_BENCHMARK_NUM_SPLITS"']"
+
+ # Any TPC-DS related updates on this job need to be applied to tpcds-1g job
of build_and_test.yml as well
+ generate-tpc-ds-dataset:
+ name: "Generate an input dataset for TPCDSQueryBenchmark"
+ if: github.event.inputs.run-tcp-ds-query-benchmark == 'true'
+ runs-on: ubuntu-20.04
+ env:
+ SPARK_GENERATE_BENCHMARK_FILES: 1
+ SPARK_LOCAL_IP: localhost
+ steps:
+ - name: Checkout Spark repository
+ uses: actions/checkout@v2
+ # In order to get diff files
+ with:
+ fetch-depth: 0
+ - name: Cache Scala, SBT and Maven
+ uses: actions/cache@v2
+ with:
+ path: |
+ build/apache-maven-*
+ build/scala-*
+ build/*.jar
+ ~/.sbt
+ key: build-${{ hashFiles('**/pom.xml', 'project/build.properties',
'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash',
'build/spark-build-info') }}
+ restore-keys: |
+ build-
+ - name: Cache Coursier local repository
+ uses: actions/cache@v2
+ with:
+ path: ~/.cache/coursier
+ key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{
hashFiles('**/pom.xml', '**/plugins.sbt') }}
+ restore-keys: |
+ benchmark-coursier-${{ github.event.inputs.jdk }}
+ - name: Install Java ${{ github.event.inputs.jdk }}
+ uses: actions/setup-java@v1
+ with:
+ java-version: ${{ github.event.inputs.jdk }}
+ - name: Cache TPC-DS generated data
+ id: cache-tpcds-sf-1
+ uses: actions/cache@v2
+ with:
+ path: ./tpcds-sf-1
+ key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml',
'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
+ - name: Checkout tpcds-kit repository
+ if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
+ uses: actions/checkout@v2
+ with:
+ repository: databricks/tpcds-kit
+ ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069
+ path: ./tpcds-kit
+ - name: Build tpcds-kit
+ if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
+ run: cd tpcds-kit/tools && make OS=LINUX
+ - name: Generate TPC-DS (SF=1) table data
+ if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
+ run: build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData
--dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1
--numPartitions 1 --overwrite"
benchmark:
- name: "Run benchmarks: ${{ github.event.inputs.class }} (JDK ${{
github.event.inputs.jdk }}, ${{ matrix.split }} out of ${{
github.event.inputs.num-splits }} splits)"
- needs: matrix-gen
+ name: "Run benchmarks: ${{ matrix.class }} (JDK ${{
github.event.inputs.jdk }}, ${{ matrix.split }} out of ${{
github.event.inputs.num-splits }} splits)"
+ if: always()
+ needs: [matrix-gen, generate-tpc-ds-dataset]
# Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
- split: ${{fromJSON(needs.matrix-gen.outputs.matrix)}}
+ class: ${{ fromJSON(needs.matrix-gen.outputs.class) }}
Review comment:
Let's also not forget to update `object Benchmarks`'s documentation. We
can mention, for example, that defining the `TPC_DS_DATA_LOC` environment
variable with a proper data location triggers TPC-DS as well.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]