This is an automated email from the ASF dual-hosted git repository. JackieTien97 pushed a commit to branch speedup-windows-ci in repository https://gitbox.apache.org/repos/asf/iotdb.git
commit 89748f1ff6c6d0d5b02393dabf1ad83e3effbf77 Author: JackieTien97 <[email protected]> AuthorDate: Sun May 17 08:49:41 2026 +0800 Shard Windows IT jobs to speed up 1C1D and Table 1C1D CI The Windows runners for Cluster IT - 1C1D and Table Cluster IT - 1C1D are 67-77% slower than their Ubuntu counterparts, making them the bottleneck of the entire PR check pipeline (87 min and 65 min wall clock respectively). Split each pipeline's Windows job into 3 parallel matrix shards: - LocalStandaloneIT test classes (276) split for Cluster IT - 1C1D - TableLocalStandaloneIT test classes (231) split for Table Cluster IT - 1C1D Each shard uses failsafe.includesFile reading from a generated file, avoiding command-line length limits regardless of how the test suite grows. Ubuntu jobs stay as a single job since they were already fast enough. Expected wall clock reduction: - Cluster IT - 1C1D: 87 min -> ~49 min (capped by Ubuntu) - Table Cluster IT - 1C1D: 65 min -> ~39 min (capped by Ubuntu) --- .github/workflows/cluster-it-1c1d.yml | 78 +++++++++++++++++++++++------ .github/workflows/table-cluster-it-1c1d.yml | 78 +++++++++++++++++++++++------ 2 files changed, 126 insertions(+), 30 deletions(-) diff --git a/.github/workflows/cluster-it-1c1d.yml b/.github/workflows/cluster-it-1c1d.yml index 4ab201450e6..477cf33e21e 100644 --- a/.github/workflows/cluster-it-1c1d.yml +++ b/.github/workflows/cluster-it-1c1d.yml @@ -30,13 +30,52 @@ env: DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }} jobs: - Simple: + # Ubuntu runs all ITs in a single job (already fast at ~49 min) + Ubuntu: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v5 + - name: Set up JDK + uses: actions/setup-java@v5 + with: + distribution: corretto + java-version: 17 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Cache Maven packages + uses: actions/cache@v5 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} + restore-keys: ${{ runner.os }}-m2- + - name: 
Adjust Linux kernel somaxconn + shell: bash + run: sudo sysctl -w net.core.somaxconn=65535 + - name: IT/UT Test + shell: bash + run: | + mvn clean verify \ + -P with-integration-tests \ + -DskipUTs \ + -DintegrationTest.forkCount=2 \ + -pl integration-test \ + -am + - name: Upload Artifact + if: failure() + uses: actions/upload-artifact@v6 + with: + name: standalone-log-Linux + path: integration-test/target/cluster-logs + retention-days: 1 + + # Windows is ~77% slower than Ubuntu, so split into 3 shards to parallelize + Windows: strategy: fail-fast: false - max-parallel: 15 matrix: - os: [ubuntu-latest, windows-latest] - runs-on: ${{ matrix.os }} + shard: [0, 1, 2] + runs-on: windows-latest steps: - uses: actions/checkout@v5 @@ -54,36 +93,45 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2- - name: Adjust network dynamic TCP ports range - if: ${{ runner.os == 'Windows' }} shell: pwsh run: | netsh int ipv4 set dynamicport tcp start=32768 num=32768 netsh int ipv4 set dynamicport udp start=32768 num=32768 netsh int ipv6 set dynamicport tcp start=32768 num=32768 netsh int ipv6 set dynamicport udp start=32768 num=32768 - - name: Adjust Linux kernel somaxconn - if: ${{ runner.os == 'Linux' }} + - name: Build IT shard list shell: bash - run: sudo sysctl -w net.core.somaxconn=65535 - # - name: Adjust Mac kernel somaxconn - # if: ${{ runner.os == 'macOS' }} - # shell: bash - # run: sudo sysctl -w kern.ipc.somaxconn=65535 + # Distribute LocalStandaloneIT test classes across 3 shards round-robin over the sorted class list (line number modulo shard count). + # The list is written to a file so failsafe.includesFile can read it without command-line length limits. 
+ run: | + set -euo pipefail + SHARD=${{ matrix.shard }} + TOTAL=3 + mkdir -p integration-test + find integration-test/src/test/java -name '*IT.java' -print0 \ + | xargs -0 grep -lE '\bLocalStandaloneIT\b' \ + | awk -F'/' '{print $NF}' | sed 's/\.java$//' \ + | sort \ + | awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \ + > integration-test/it-shard.txt + echo "Shard $SHARD/$TOTAL contains $(wc -l < integration-test/it-shard.txt) test classes" + head -5 integration-test/it-shard.txt - name: IT/UT Test shell: bash - # we do not compile client-cpp for saving time, it is tested in client.yml - # we can skip influxdb-protocol because it has been tested separately in influxdb-protocol.yml run: | mvn clean verify \ -P with-integration-tests \ -DskipUTs \ -DintegrationTest.forkCount=2 \ + -Dfailsafe.includesFile="$(pwd)/integration-test/it-shard.txt" \ + -DfailIfNoTests=false \ + -Dfailsafe.failIfNoSpecifiedTests=false \ -pl integration-test \ -am - name: Upload Artifact if: failure() uses: actions/upload-artifact@v6 with: - name: standalone-log-java${{ matrix.java }}-${{ runner.os }} + name: standalone-log-Windows-shard${{ matrix.shard }} path: integration-test/target/cluster-logs retention-days: 1 diff --git a/.github/workflows/table-cluster-it-1c1d.yml b/.github/workflows/table-cluster-it-1c1d.yml index 782bafa4ddb..149b6a3fd27 100644 --- a/.github/workflows/table-cluster-it-1c1d.yml +++ b/.github/workflows/table-cluster-it-1c1d.yml @@ -31,13 +31,52 @@ env: DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }} jobs: - Simple: + # Ubuntu runs all ITs in a single job (already fast at ~39 min) + Ubuntu: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v5 + - name: Set up JDK + uses: actions/setup-java@v5 + with: + distribution: corretto + java-version: 17 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - name: Cache Maven packages + uses: actions/cache@v5 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} + restore-keys: ${{ 
runner.os }}-m2- + - name: Adjust Linux kernel somaxconn + shell: bash + run: sudo sysctl -w net.core.somaxconn=65535 + - name: IT/UT Test + shell: bash + run: | + mvn clean verify \ + -P with-integration-tests \ + -DskipUTs \ + -DintegrationTest.forkCount=2 -DDataNodeMaxHeapSize=1024 \ + -pl integration-test \ + -am -PTableSimpleIT + - name: Upload Artifact + if: failure() + uses: actions/upload-artifact@v6 + with: + name: table-standalone-log-Linux + path: integration-test/target/cluster-logs + retention-days: 1 + + # Windows is ~67% slower than Ubuntu, so split into 3 shards to parallelize + Windows: strategy: fail-fast: false - max-parallel: 15 matrix: - os: [ubuntu-latest, windows-latest] - runs-on: ${{ matrix.os }} + shard: [0, 1, 2] + runs-on: windows-latest steps: - uses: actions/checkout@v5 @@ -55,36 +94,45 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2- - name: Adjust network dynamic TCP ports range - if: ${{ runner.os == 'Windows' }} shell: pwsh run: | netsh int ipv4 set dynamicport tcp start=32768 num=32768 netsh int ipv4 set dynamicport udp start=32768 num=32768 netsh int ipv6 set dynamicport tcp start=32768 num=32768 netsh int ipv6 set dynamicport udp start=32768 num=32768 - - name: Adjust Linux kernel somaxconn - if: ${{ runner.os == 'Linux' }} + - name: Build IT shard list shell: bash - run: sudo sysctl -w net.core.somaxconn=65535 - # - name: Adjust Mac kernel somaxconn - # if: ${{ runner.os == 'macOS' }} - # shell: bash - # run: sudo sysctl -w kern.ipc.somaxconn=65535 + # Distribute TableLocalStandaloneIT test classes across 3 shards round-robin over the sorted class list (line number modulo shard count). + # The list is written to a file so failsafe.includesFile can read it without command-line length limits. 
+ run: | + set -euo pipefail + SHARD=${{ matrix.shard }} + TOTAL=3 + mkdir -p integration-test + find integration-test/src/test/java -name '*IT.java' -print0 \ + | xargs -0 grep -l 'TableLocalStandaloneIT' \ + | awk -F'/' '{print $NF}' | sed 's/\.java$//' \ + | sort \ + | awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \ + > integration-test/it-shard.txt + echo "Shard $SHARD/$TOTAL contains $(wc -l < integration-test/it-shard.txt) test classes" + head -5 integration-test/it-shard.txt - name: IT/UT Test shell: bash - # we do not compile client-cpp for saving time, it is tested in client.yml - # we can skip influxdb-protocol because it has been tested separately in influxdb-protocol.yml run: | mvn clean verify \ -P with-integration-tests \ -DskipUTs \ -DintegrationTest.forkCount=2 -DDataNodeMaxHeapSize=1024 \ + -Dfailsafe.includesFile="$(pwd)/integration-test/it-shard.txt" \ + -DfailIfNoTests=false \ + -Dfailsafe.failIfNoSpecifiedTests=false \ -pl integration-test \ -am -PTableSimpleIT - name: Upload Artifact if: failure() uses: actions/upload-artifact@v6 with: - name: table-standalone-log-java${{ matrix.java }}-${{ runner.os }} + name: table-standalone-log-Windows-shard${{ matrix.shard }} path: integration-test/target/cluster-logs retention-days: 1
