This is an automated email from the ASF dual-hosted git repository. mrproliu pushed a commit to branch fodc-pressure-profiler in repository https://gitbox.apache.org/repos/asf/skywalking-banyandb-helm.git
commit 266f83cb2c231af09da43ecd38a8d3d88c0ad3a5 Author: mrproliu <[email protected]> AuthorDate: Wed Jul 1 10:49:35 2026 +0800 Add FODC memory-pressure pprof capture for data and liaison nodes --- CHANGES.md | 2 ++ chart/templates/_helpers.tpl | 22 +++++++++++++++++++ chart/templates/cluster_data_statefulset.yaml | 25 ++++++++++++++++++++-- chart/templates/cluster_liaison_statefulset.yaml | 27 ++++++++++++++++++++++-- chart/values.yaml | 17 +++++++++++++++ doc/parameters.md | 6 ++++++ 6 files changed, 95 insertions(+), 4 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 0c2eb0c..8755573 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -13,6 +13,8 @@ Release Notes. - Disable the lifecycle metrics collector when the lifecycle sidecar container is disabled. - Enable FODC panic/crash diagnostics collection by default. Configure via `cluster.fodc.agent.config.crashCollection.{enabled,dir,maxArtifacts,diagnosisMemoryPercent}`. +- Add FODC memory-pressure pprof capture for data and liaison nodes. + Configure via `cluster.fodc.agent.pressureProfiler.*`. 0.6.0 ----------------- diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl index 666b8e6..3d9f3ab 100644 --- a/chart/templates/_helpers.tpl +++ b/chart/templates/_helpers.tpl @@ -367,3 +367,25 @@ Resolve discovery file data key {{- $cm := $file.configMap | default dict }} {{- default "nodes.yaml" $cm.key }} {{- end }} + +{{/* +Convert a human-readable size (string or number) to an integer number of bytes. +Case-insensitive. All suffixes are 1024-based (binary): K/M/G/T, KB/MB/GB/TB, +Ki/Mi/Gi/Ti (also KiB/MiB/GiB/TiB). No suffix (or B) -> plain byte count. Empty, unset or 0 -> 0. Any other suffix fails the render. +*/}} +{{- define "banyandb.toBytes" -}}
+{{- /* Numbers from values.yaml are typically decoded as float64; cast them to int64 before stringifying so large byte counts are not rendered in scientific notation. Unset (nil) input is treated as empty. */ -}}{{- $v := default "" . -}}{{- $s := ternary (toString (int64 $v)) (toString $v) (kindIs "float64" $v) | trim -}} +{{- if or (eq $s "") (eq $s "0") -}} +0 +{{- else -}} +{{- $num := $s | regexFind "^[0-9]+" | int64 -}} +{{- $unit := lower (regexReplaceAll "^[0-9]+" $s "" | trim) -}} +{{- $mult := int64 1 -}} +{{- if or (eq $unit "k") (eq $unit "kb") (eq $unit "ki") (eq $unit "kib") -}}{{- $mult = int64 1024 -}} +{{- else if or (eq $unit "m") (eq $unit "mb") (eq $unit "mi") (eq $unit "mib") -}}{{- $mult = int64 1048576 -}} +{{- else if or (eq $unit "g") (eq $unit "gb") (eq $unit "gi") (eq $unit "gib") -}}{{- $mult = int64 1073741824 -}} +{{- else if or (eq $unit "t") (eq $unit "tb") (eq $unit "ti") (eq $unit "tib") -}}{{- $mult = int64 1099511627776 -}} +{{- /* Reject anything that is not a bare count or a known suffix instead of silently treating it as bytes. */ -}}{{- else if not (or (eq $unit "") (eq $unit "b")) -}}{{- fail (printf "banyandb.toBytes: unsupported size %q" $s) -}}{{- end -}} +{{- mul $num $mult -}} +{{- end -}} +{{- end }} diff --git a/chart/templates/cluster_data_statefulset.yaml b/chart/templates/cluster_data_statefulset.yaml index d4c2778..3c68e55 100644 --- a/chart/templates/cluster_data_statefulset.yaml +++ b/chart/templates/cluster_data_statefulset.yaml @@ -344,6 +344,19 @@ spec: {{- if $.Values.cluster.fodc.agent.config.crashCollection.enabled }} - --crash-source-dir={{ $.Values.cluster.fodc.agent.config.crashCollection.dir }} {{- end }} + {{- with $.Values.cluster.fodc.agent.pressureProfiler }} + {{- if .enabled }} + - --pressure-profiler-enabled=true + - --pressure-profiler-trigger-percent={{ .triggerPercent }} + - --pressure-profiler-pprof-port=6060 + - --pressure-profiler-cooldown={{ .cooldown }} + - --pressure-profiler-dir={{ .dir }} + - --pressure-profiler-max-artifacts={{ .maxArtifacts }} + - --pressure-profiler-max-disk-bytes={{ include "banyandb.toBytes" .maxDiskSize }} + {{- else }} + - --pressure-profiler-enabled=false + {{- end }} + {{- end }} env: - name: POD_NAME valueFrom: @@ -375,7 +388,7 @@ spec: {{- end }} {{- end }} {{- end }} - {{- if or $roleConfig.lifecycleSidecar.enabled (and $.Values.cluster.fodc.enabled $.Values.cluster.fodc.agent.config.crashCollection.enabled) }} + {{- if or $roleConfig.lifecycleSidecar.enabled (and
$.Values.cluster.fodc.enabled $.Values.cluster.fodc.agent.config.crashCollection.enabled) (and $.Values.cluster.fodc.agent.pressureProfiler $.Values.cluster.fodc.agent.pressureProfiler.enabled) }} volumeMounts: {{- if $roleConfig.lifecycleSidecar.enabled }} - name: lifecycle-report-shared @@ -385,6 +398,10 @@ spec: - name: crash-shared mountPath: {{ $.Values.cluster.fodc.agent.config.crashCollection.dir }} {{- end }} + {{- if and $.Values.cluster.fodc.agent.pressureProfiler $.Values.cluster.fodc.agent.pressureProfiler.enabled }} + - name: pressure-profiles + mountPath: {{ $.Values.cluster.fodc.agent.pressureProfiler.dir }} + {{- end }} {{- end }} {{- end }} {{- if $roleConfig.backupSidecar.enabled }} @@ -539,7 +556,7 @@ spec: {{- end }} {{- end }} - {{- if or $roleConfig.tls $schemaClientTls.secretName $schemaServerTls.secretName $nodeDiscoveryFileMode (and $roleConfig.lifecycleSidecar.enabled $.Values.cluster.fodc.enabled) (and $.Values.cluster.fodc.enabled $.Values.cluster.fodc.agent.config.crashCollection.enabled) }} + {{- if or $roleConfig.tls $schemaClientTls.secretName $schemaServerTls.secretName $nodeDiscoveryFileMode (and $roleConfig.lifecycleSidecar.enabled $.Values.cluster.fodc.enabled) (and $.Values.cluster.fodc.enabled $.Values.cluster.fodc.agent.config.crashCollection.enabled) (and $.Values.cluster.fodc.enabled $.Values.cluster.fodc.agent.pressureProfiler $.Values.cluster.fodc.agent.pressureProfiler.enabled) }} volumes: {{- if $roleConfig.tls }} {{- if $roleConfig.tls.grpcSecretName }} @@ -574,6 +591,10 @@ spec: - name: crash-shared emptyDir: {} {{- end }} + {{- if and $.Values.cluster.fodc.enabled $.Values.cluster.fodc.agent.pressureProfiler $.Values.cluster.fodc.agent.pressureProfiler.enabled }} + - name: pressure-profiles + emptyDir: {} + {{- end }} {{- end }} {{- if $roleConfig.tolerations }} diff --git a/chart/templates/cluster_liaison_statefulset.yaml b/chart/templates/cluster_liaison_statefulset.yaml index f0efebf..492eb9b 100644 --- 
a/chart/templates/cluster_liaison_statefulset.yaml +++ b/chart/templates/cluster_liaison_statefulset.yaml @@ -309,6 +309,19 @@ spec: {{- if .Values.cluster.fodc.agent.config.crashCollection.enabled }} - --crash-source-dir={{ .Values.cluster.fodc.agent.config.crashCollection.dir }} {{- end }} + {{- with .Values.cluster.fodc.agent.pressureProfiler }} + {{- if .enabled }} + - --pressure-profiler-enabled=true + - --pressure-profiler-trigger-percent={{ .triggerPercent }} + - --pressure-profiler-pprof-port=6060 + - --pressure-profiler-cooldown={{ .cooldown }} + - --pressure-profiler-dir={{ .dir }} + - --pressure-profiler-max-artifacts={{ .maxArtifacts }} + - --pressure-profiler-max-disk-bytes={{ include "banyandb.toBytes" .maxDiskSize }} + {{- else }} + - --pressure-profiler-enabled=false + {{- end }} + {{- end }} env: - name: POD_NAME valueFrom: @@ -340,14 +353,20 @@ spec: {{- end }} {{- end }} {{- end }} - {{- if .Values.cluster.fodc.agent.config.crashCollection.enabled }} + {{- if or .Values.cluster.fodc.agent.config.crashCollection.enabled (and .Values.cluster.fodc.agent.pressureProfiler .Values.cluster.fodc.agent.pressureProfiler.enabled) }} volumeMounts: + {{- if .Values.cluster.fodc.agent.config.crashCollection.enabled }} - name: crash-shared mountPath: {{ .Values.cluster.fodc.agent.config.crashCollection.dir }} + {{- end }} + {{- if and .Values.cluster.fodc.agent.pressureProfiler .Values.cluster.fodc.agent.pressureProfiler.enabled }} + - name: pressure-profiles + mountPath: {{ .Values.cluster.fodc.agent.pressureProfiler.dir }} + {{- end }} {{- end }} {{- end }} - {{- if or .Values.cluster.liaison.tls $schemaClientTls.secretName .Values.auth.enabled $nodeDiscoveryFileMode (and .Values.cluster.fodc.enabled .Values.cluster.fodc.agent.config.crashCollection.enabled) }} + {{- if or .Values.cluster.liaison.tls $schemaClientTls.secretName .Values.auth.enabled $nodeDiscoveryFileMode (and .Values.cluster.fodc.enabled 
.Values.cluster.fodc.agent.config.crashCollection.enabled) (and .Values.cluster.fodc.enabled .Values.cluster.fodc.agent.pressureProfiler .Values.cluster.fodc.agent.pressureProfiler.enabled) }} volumes: {{- if .Values.cluster.liaison.tls }} {{- if .Values.cluster.liaison.tls.grpcSecretName }} @@ -392,6 +411,10 @@ spec: - name: crash-shared emptyDir: {} {{- end }} + {{- if and .Values.cluster.fodc.enabled .Values.cluster.fodc.agent.pressureProfiler .Values.cluster.fodc.agent.pressureProfiler.enabled }} + - name: pressure-profiles + emptyDir: {} + {{- end }} {{- end }} {{- if .Values.cluster.liaison.tolerations }} diff --git a/chart/values.yaml b/chart/values.yaml index 694fffc..cfb07ff 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -1067,6 +1067,23 @@ cluster: maxArtifacts: 10 ## @param cluster.fodc.agent.config.crashCollection.diagnosisMemoryPercent Set banyand GOMEMLIMIT to this percent of the cgroup memory limit, reserving headroom for post-panic diagnostics (0 disables) diagnosisMemoryPercent: 50 + ## Memory-pressure pprof capture. When a data container's RSS approaches its + ## cgroup memory limit, the agent pulls heap+goroutine pprof from the data + ## container's pprof endpoint (port 6060) onto a shared emptyDir, served by the + ## proxy's HTTP API. Requires a data-container memory limit to be effective. 
+ pressureProfiler: + ## @param cluster.fodc.agent.pressureProfiler.enabled Enable automatic heap+goroutine pprof capture under memory pressure + enabled: true + ## @param cluster.fodc.agent.pressureProfiler.triggerPercent Capture when RSS / cgroup_limit reaches this percentage + triggerPercent: 75 + ## @param cluster.fodc.agent.pressureProfiler.cooldown Minimum interval between two captures + cooldown: 5m + ## @param cluster.fodc.agent.pressureProfiler.dir Directory (on the writable volume) where captured profiles are stored; must equal the pressure-profiles mount path + dir: /tmp/pressure-profiles + ## @param cluster.fodc.agent.pressureProfiler.maxArtifacts Maximum number of capture events to retain (lowest-RSS evicted first) + maxArtifacts: 16 + ## @param cluster.fodc.agent.pressureProfiler.maxDiskSize Maximum total on-disk size for retained events; case-insensitive, all suffixes are 1024-based (K/M/G/T, KB/MB/GB/TB, Ki/Mi/Gi/Ti), or a plain byte count; 0 disables the disk bound + maxDiskSize: 512MB ## Liveness probe for Agent livenessProbe: ## @param cluster.fodc.agent.livenessProbe.initialDelaySeconds Initial delay for Agent liveness probe diff --git a/doc/parameters.md b/doc/parameters.md index d52581d..26b9f1c 100644 --- a/doc/parameters.md +++ b/doc/parameters.md @@ -396,6 +396,12 @@ The content of this document describes the parameters that can be configured in | `cluster.fodc.agent.config.crashCollection.dir` | Shared path where banyand writes panic.json and fodc-agent reads it | `/tmp/crash` | | `cluster.fodc.agent.config.crashCollection.maxArtifacts` | Max crash artifact directories banyand retains (oldest removed first; 0 disables pruning) | `10` | | `cluster.fodc.agent.config.crashCollection.diagnosisMemoryPercent` | Set banyand GOMEMLIMIT to this percent of the cgroup memory limit, reserving headroom for post-panic diagnostics (0 disables) | `50` | +| `cluster.fodc.agent.pressureProfiler.enabled` | Enable automatic heap+goroutine pprof capture under memory
pressure | `true` | +| `cluster.fodc.agent.pressureProfiler.triggerPercent` | Capture when RSS / cgroup_limit reaches this percentage | `75` | +| `cluster.fodc.agent.pressureProfiler.cooldown` | Minimum interval between two captures | `5m` | +| `cluster.fodc.agent.pressureProfiler.dir` | Directory (on the writable volume) where captured profiles are stored; must equal the pressure-profiles mount path | `/tmp/pressure-profiles` | +| `cluster.fodc.agent.pressureProfiler.maxArtifacts` | Maximum number of capture events to retain (lowest-RSS evicted first) | `16` | +| `cluster.fodc.agent.pressureProfiler.maxDiskSize` | Max total on-disk size for retained events; case-insensitive 1024-based suffix (e.g. 512Mi, 1Gi); 0 disables | `512MB` | | `cluster.fodc.agent.livenessProbe.initialDelaySeconds` | Initial delay for Agent liveness probe | `90` | | `cluster.fodc.agent.livenessProbe.periodSeconds` | Probe period for Agent liveness probe | `30` | | `cluster.fodc.agent.livenessProbe.timeoutSeconds` | Timeout in seconds for Agent liveness probe | `5` |
