This is an automated email from the ASF dual-hosted git repository.

wusheng pushed a commit to branch fix/ci-e2e-failures
in repository https://gitbox.apache.org/repos/asf/skywalking.git

commit 14e17858ba34027435601c0fa58511b22a44d4fa
Author: Wu Sheng <[email protected]>
AuthorDate: Thu Feb 26 17:24:57 2026 +0800

    Fix ES cgroup v2 crash in Kind-based E2E tests and add diagnostics
    
    The ES 7.17.x bundled JVM fails to detect cgroup controllers in
    cgroup v2 environments (newer GitHub Actions runner images), causing
    ES to crash inside Kind nodes during helm install.
    
    Add `-Djdk.platform.numa.support=true` to ES Java opts in the Istio
    test values as a POC fix. Also add comprehensive Kind cluster
    diagnostics (pod status, ES logs, ulimits, events) to the Istio and
    Istio-ambient CI jobs for better failure visibility.
    
    Co-Authored-By: Claude Opus 4.6 <[email protected]>
---
 .github/workflows/skywalking.yaml   | 64 ++++++++++++++++++++++++++++++++++---
 test/e2e-v2/cases/istio/values.yaml |  1 +
 2 files changed, 61 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/skywalking.yaml b/.github/workflows/skywalking.yaml
index 1b49f2fae8..79e9752aff 100644
--- a/.github/workflows/skywalking.yaml
+++ b/.github/workflows/skywalking.yaml
@@ -853,10 +853,38 @@ jobs:
         with:
           e2e-file: test/e2e-v2/cases/istio/als/e2e.yaml
       - if: ${{ failure() }}
+        name: Diagnose Kind cluster
         run: |
+          echo "=== Runner image ==="
+          cat /etc/os-release | head -5
+          echo "=== Docker info ==="
+          docker version --format 'Docker {{.Server.Version}}'
+          echo "=== Disk & memory ==="
           df -h
-          du -sh .
-          docker images
+          free -h
+          echo "=== Kind cluster node ==="
+          kubectl get nodes -o wide --kubeconfig /tmp/e2e-k8s.config 2>/dev/null || true
+          echo "=== All pods ==="
+          kubectl get pods -A -o wide --kubeconfig /tmp/e2e-k8s.config 2>/dev/null || true
+          echo "=== Pod descriptions (non-Running) ==="
+          kubectl get pods -A --kubeconfig /tmp/e2e-k8s.config --no-headers 2>/dev/null | grep -v Running | while read ns name rest; do
+            echo "--- $ns/$name ---"
+            kubectl describe pod "$name" -n "$ns" --kubeconfig /tmp/e2e-k8s.config 2>/dev/null | tail -30
+          done || true
+          echo "=== Events (warnings) ==="
+          kubectl get events -A --kubeconfig /tmp/e2e-k8s.config --field-selector type=Warning --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+          echo "=== ES pod logs ==="
+          kubectl logs -n istio-system -l app=elasticsearch --tail=80 --kubeconfig /tmp/e2e-k8s.config 2>/dev/null || true
+          echo "=== ES init container logs ==="
+          for pod in $(kubectl get pods -n istio-system -l app=elasticsearch --kubeconfig /tmp/e2e-k8s.config -o name 2>/dev/null); do
+            kubectl logs -n istio-system "$pod" --all-containers --previous --kubeconfig /tmp/e2e-k8s.config 2>/dev/null || true
+          done || true
+          echo "=== ulimit inside Kind node ==="
+          docker exec kind-control-plane sh -c 'ulimit -n; ulimit -u; cat /proc/sys/vm/max_map_count' 2>/dev/null || true
+          echo "=== OAP pod logs ==="
+          kubectl logs -n istio-system -l app=skywalking -c skywalking --tail=30 --kubeconfig /tmp/e2e-k8s.config 2>/dev/null || true
+          echo "=== Node describe (resources) ==="
+          kubectl describe node --kubeconfig /tmp/e2e-k8s.config 2>/dev/null | grep -A 20 "Allocated resources" || true
       - uses: actions/upload-artifact@v4
         if: ${{ failure() }}
         name: Upload Logs
@@ -914,10 +942,38 @@ jobs:
         with:
           e2e-file: test/e2e-v2/cases/istio/ambient-als/e2e.yaml
       - if: ${{ failure() }}
+        name: Diagnose Kind cluster
         run: |
+          echo "=== Runner image ==="
+          cat /etc/os-release | head -5
+          echo "=== Docker info ==="
+          docker version --format 'Docker {{.Server.Version}}'
+          echo "=== Disk & memory ==="
           df -h
-          du -sh .
-          docker images
+          free -h
+          echo "=== Kind cluster node ==="
+          kubectl get nodes -o wide --kubeconfig /tmp/e2e-k8s.config 2>/dev/null || true
+          echo "=== All pods ==="
+          kubectl get pods -A -o wide --kubeconfig /tmp/e2e-k8s.config 2>/dev/null || true
+          echo "=== Pod descriptions (non-Running) ==="
+          kubectl get pods -A --kubeconfig /tmp/e2e-k8s.config --no-headers 2>/dev/null | grep -v Running | while read ns name rest; do
+            echo "--- $ns/$name ---"
+            kubectl describe pod "$name" -n "$ns" --kubeconfig /tmp/e2e-k8s.config 2>/dev/null | tail -30
+          done || true
+          echo "=== Events (warnings) ==="
+          kubectl get events -A --kubeconfig /tmp/e2e-k8s.config --field-selector type=Warning --sort-by='.lastTimestamp' 2>/dev/null | tail -30 || true
+          echo "=== ES pod logs ==="
+          kubectl logs -n istio-system -l app=elasticsearch --tail=80 --kubeconfig /tmp/e2e-k8s.config 2>/dev/null || true
+          echo "=== ES init container logs ==="
+          for pod in $(kubectl get pods -n istio-system -l app=elasticsearch --kubeconfig /tmp/e2e-k8s.config -o name 2>/dev/null); do
+            kubectl logs -n istio-system "$pod" --all-containers --previous --kubeconfig /tmp/e2e-k8s.config 2>/dev/null || true
+          done || true
+          echo "=== ulimit inside Kind node ==="
+          docker exec kind-control-plane sh -c 'ulimit -n; ulimit -u; cat /proc/sys/vm/max_map_count' 2>/dev/null || true
+          echo "=== OAP pod logs ==="
+          kubectl logs -n istio-system -l app=skywalking -c skywalking --tail=30 --kubeconfig /tmp/e2e-k8s.config 2>/dev/null || true
+          echo "=== Node describe (resources) ==="
+          kubectl describe node --kubeconfig /tmp/e2e-k8s.config 2>/dev/null | grep -A 20 "Allocated resources" || true
       - uses: actions/upload-artifact@v4
         if: ${{ failure() }}
         name: Upload Logs
diff --git a/test/e2e-v2/cases/istio/values.yaml b/test/e2e-v2/cases/istio/values.yaml
index 115282057d..f9e85dfb63 100644
--- a/test/e2e-v2/cases/istio/values.yaml
+++ b/test/e2e-v2/cases/istio/values.yaml
@@ -39,6 +39,7 @@ oap:
 
 
 elasticsearch:
+  esJavaOpts: "-Djdk.platform.numa.support=true"
   esConfig:
     elasticsearch.yml: |
       cluster.routing.allocation.disk.threshold_enabled: false

Reply via email to