This is an automated email from the ASF dual-hosted git repository.

JackieTien97 pushed a commit to branch speedup-windows-ci
in repository https://gitbox.apache.org/repos/asf/iotdb.git

commit 89748f1ff6c6d0d5b02393dabf1ad83e3effbf77
Author: JackieTien97 <[email protected]>
AuthorDate: Sun May 17 08:49:41 2026 +0800

    Shard Windows IT jobs to speed up 1C1D and Table 1C1D CI
    
    The Windows runners for Cluster IT - 1C1D and Table Cluster IT - 1C1D
    take 67-77% longer than their Ubuntu counterparts, making them the
    bottleneck of the entire PR check pipeline (87 min and 65 min of
    wall-clock time, respectively).
    
    Split each pipeline's Windows job into 3 parallel matrix shards:
    - LocalStandaloneIT test classes (276) split for Cluster IT - 1C1D
    - TableLocalStandaloneIT test classes (231) split for Table Cluster IT - 1C1D
    
    Each shard writes its test class list to a generated file and passes it
    to Failsafe via failsafe.includesFile, which avoids command-line length
    limits regardless of how the test suite grows.
    
    Ubuntu jobs stay as a single job since they were already fast enough.
    
    Expected wall clock reduction:
    - Cluster IT - 1C1D: 87 min -> ~49 min (capped by Ubuntu)
    - Table Cluster IT - 1C1D: 65 min -> ~39 min (capped by Ubuntu)
---
 .github/workflows/cluster-it-1c1d.yml       | 78 +++++++++++++++++++++++------
 .github/workflows/table-cluster-it-1c1d.yml | 78 +++++++++++++++++++++++------
 2 files changed, 126 insertions(+), 30 deletions(-)

diff --git a/.github/workflows/cluster-it-1c1d.yml 
b/.github/workflows/cluster-it-1c1d.yml
index 4ab201450e6..477cf33e21e 100644
--- a/.github/workflows/cluster-it-1c1d.yml
+++ b/.github/workflows/cluster-it-1c1d.yml
@@ -30,13 +30,52 @@ env:
   DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
 
 jobs:
-  Simple:
+  # Ubuntu runs all ITs in a single job (already fast at ~49 min)
+  Ubuntu:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v5
+      - name: Set up JDK
+        uses: actions/setup-java@v5
+        with:
+          distribution: corretto
+          java-version: 17
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Cache Maven packages
+        uses: actions/cache@v5
+        with:
+          path: ~/.m2
+          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
+          restore-keys: ${{ runner.os }}-m2-
+      - name: Adjust Linux kernel somaxconn
+        shell: bash
+        run: sudo sysctl -w net.core.somaxconn=65535
+      - name: IT/UT Test
+        shell: bash
+        run: |
+          mvn clean verify \
+          -P with-integration-tests \
+          -DskipUTs \
+          -DintegrationTest.forkCount=2 \
+          -pl integration-test \
+          -am
+      - name: Upload Artifact
+        if: failure()
+        uses: actions/upload-artifact@v6
+        with:
+          name: standalone-log-Linux
+          path: integration-test/target/cluster-logs
+          retention-days: 1
+
+  # Windows is ~77% slower than Ubuntu, so split into 3 shards to parallelize
+  Windows:
     strategy:
       fail-fast: false
-      max-parallel: 15
       matrix:
-        os: [ubuntu-latest, windows-latest]
-    runs-on: ${{ matrix.os }}
+        shard: [0, 1, 2]
+    runs-on: windows-latest
 
     steps:
       - uses: actions/checkout@v5
@@ -54,36 +93,45 @@ jobs:
           key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
           restore-keys: ${{ runner.os }}-m2-
       - name: Adjust network dynamic TCP ports range
-        if: ${{ runner.os == 'Windows' }}
         shell: pwsh
         run: |
           netsh int ipv4 set dynamicport tcp start=32768 num=32768
           netsh int ipv4 set dynamicport udp start=32768 num=32768
           netsh int ipv6 set dynamicport tcp start=32768 num=32768
           netsh int ipv6 set dynamicport udp start=32768 num=32768
-      - name: Adjust Linux kernel somaxconn
-        if: ${{ runner.os == 'Linux' }}
+      - name: Build IT shard list
         shell: bash
-        run: sudo sysctl -w net.core.somaxconn=65535
-      #      - name: Adjust Mac kernel somaxconn
-      #        if: ${{ runner.os == 'macOS' }}
-      #        shell: bash
-      #        run: sudo sysctl -w kern.ipc.somaxconn=65535
+        # Distribute LocalStandaloneIT test classes across 3 shards round-robin over the sorted class list (line number modulo 3).
+        # The list is written to a file so failsafe.includesFile can read it 
without command-line length limits.
+        run: |
+          set -euo pipefail
+          SHARD=${{ matrix.shard }}
+          TOTAL=3
+          mkdir -p integration-test
+          find integration-test/src/test/java -name '*IT.java' -print0 \
+            | xargs -0 grep -lE '\bLocalStandaloneIT\b' \
+            | awk -F'/' '{print $NF}' | sed 's/\.java$//' \
+            | sort \
+            | awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
+            > integration-test/it-shard.txt
+          echo "Shard $SHARD/$TOTAL contains $(wc -l < 
integration-test/it-shard.txt) test classes"
+          head -5 integration-test/it-shard.txt
       - name: IT/UT Test
         shell: bash
-        # we do not compile client-cpp for saving time, it is tested in 
client.yml
-        # we can skip influxdb-protocol because it has been tested separately 
in influxdb-protocol.yml
         run: |
           mvn clean verify \
           -P with-integration-tests \
           -DskipUTs \
           -DintegrationTest.forkCount=2 \
+          -Dfailsafe.includesFile="$(pwd)/integration-test/it-shard.txt" \
+          -DfailIfNoTests=false \
+          -Dfailsafe.failIfNoSpecifiedTests=false \
           -pl integration-test \
           -am
       - name: Upload Artifact
         if: failure()
         uses: actions/upload-artifact@v6
         with:
-          name: standalone-log-java${{ matrix.java }}-${{ runner.os }}
+          name: standalone-log-Windows-shard${{ matrix.shard }}
           path: integration-test/target/cluster-logs
           retention-days: 1
diff --git a/.github/workflows/table-cluster-it-1c1d.yml 
b/.github/workflows/table-cluster-it-1c1d.yml
index 782bafa4ddb..149b6a3fd27 100644
--- a/.github/workflows/table-cluster-it-1c1d.yml
+++ b/.github/workflows/table-cluster-it-1c1d.yml
@@ -31,13 +31,52 @@ env:
   DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
 
 jobs:
-  Simple:
+  # Ubuntu runs all ITs in a single job (already fast at ~39 min)
+  Ubuntu:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v5
+      - name: Set up JDK
+        uses: actions/setup-java@v5
+        with:
+          distribution: corretto
+          java-version: 17
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Cache Maven packages
+        uses: actions/cache@v5
+        with:
+          path: ~/.m2
+          key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
+          restore-keys: ${{ runner.os }}-m2-
+      - name: Adjust Linux kernel somaxconn
+        shell: bash
+        run: sudo sysctl -w net.core.somaxconn=65535
+      - name: IT/UT Test
+        shell: bash
+        run: |
+          mvn clean verify \
+          -P with-integration-tests \
+          -DskipUTs \
+          -DintegrationTest.forkCount=2 -DDataNodeMaxHeapSize=1024 \
+          -pl integration-test \
+          -am -PTableSimpleIT
+      - name: Upload Artifact
+        if: failure()
+        uses: actions/upload-artifact@v6
+        with:
+          name: table-standalone-log-Linux
+          path: integration-test/target/cluster-logs
+          retention-days: 1
+
+  # Windows is ~67% slower than Ubuntu, so split into 3 shards to parallelize
+  Windows:
     strategy:
       fail-fast: false
-      max-parallel: 15
       matrix:
-        os: [ubuntu-latest, windows-latest]
-    runs-on: ${{ matrix.os }}
+        shard: [0, 1, 2]
+    runs-on: windows-latest
 
     steps:
       - uses: actions/checkout@v5
@@ -55,36 +94,45 @@ jobs:
           key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
           restore-keys: ${{ runner.os }}-m2-
       - name: Adjust network dynamic TCP ports range
-        if: ${{ runner.os == 'Windows' }}
         shell: pwsh
         run: |
           netsh int ipv4 set dynamicport tcp start=32768 num=32768
           netsh int ipv4 set dynamicport udp start=32768 num=32768
           netsh int ipv6 set dynamicport tcp start=32768 num=32768
           netsh int ipv6 set dynamicport udp start=32768 num=32768
-      - name: Adjust Linux kernel somaxconn
-        if: ${{ runner.os == 'Linux' }}
+      - name: Build IT shard list
         shell: bash
-        run: sudo sysctl -w net.core.somaxconn=65535
-      #      - name: Adjust Mac kernel somaxconn
-      #        if: ${{ runner.os == 'macOS' }}
-      #        shell: bash
-      #        run: sudo sysctl -w kern.ipc.somaxconn=65535
+        # Distribute TableLocalStandaloneIT test classes across 3 shards round-robin over the sorted class list (line number modulo 3).
+        # The list is written to a file so failsafe.includesFile can read it 
without command-line length limits.
+        run: |
+          set -euo pipefail
+          SHARD=${{ matrix.shard }}
+          TOTAL=3
+          mkdir -p integration-test
+          find integration-test/src/test/java -name '*IT.java' -print0 \
+            | xargs -0 grep -l 'TableLocalStandaloneIT' \
+            | awk -F'/' '{print $NF}' | sed 's/\.java$//' \
+            | sort \
+            | awk -v s=$SHARD -v t=$TOTAL 'NR%t==s' \
+            > integration-test/it-shard.txt
+          echo "Shard $SHARD/$TOTAL contains $(wc -l < 
integration-test/it-shard.txt) test classes"
+          head -5 integration-test/it-shard.txt
       - name: IT/UT Test
         shell: bash
-        # we do not compile client-cpp for saving time, it is tested in 
client.yml
-        # we can skip influxdb-protocol because it has been tested separately 
in influxdb-protocol.yml
         run: |
           mvn clean verify \
           -P with-integration-tests \
           -DskipUTs \
           -DintegrationTest.forkCount=2 -DDataNodeMaxHeapSize=1024 \
+          -Dfailsafe.includesFile="$(pwd)/integration-test/it-shard.txt" \
+          -DfailIfNoTests=false \
+          -Dfailsafe.failIfNoSpecifiedTests=false \
           -pl integration-test \
           -am -PTableSimpleIT
       - name: Upload Artifact
         if: failure()
         uses: actions/upload-artifact@v6
         with:
-          name: table-standalone-log-java${{ matrix.java }}-${{ runner.os }}
+          name: table-standalone-log-Windows-shard${{ matrix.shard }}
           path: integration-test/target/cluster-logs
           retention-days: 1

Reply via email to