This is an automated email from the ASF dual-hosted git repository.

hanahmily pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/skywalking-banyandb-helm.git


The following commit(s) were added to refs/heads/master by this push:
     new 0cb1a64  Add FODC memory-pressure pprof capture for data and liaison 
nodes (#65)
0cb1a64 is described below

commit 0cb1a6498e04001384a984c7bfa9172444502c0a
Author: mrproliu <[email protected]>
AuthorDate: Wed Jul 1 11:07:27 2026 +0800

    Add FODC memory-pressure pprof capture for data and liaison nodes (#65)
---
 CHANGES.md                                       |  2 ++
 chart/templates/_helpers.tpl                     | 25 ++++++++++++++++++++++
 chart/templates/cluster_data_statefulset.yaml    | 25 ++++++++++++++++++++--
 chart/templates/cluster_liaison_statefulset.yaml | 27 ++++++++++++++++++++++--
 chart/values.yaml                                | 18 ++++++++++++++++
 doc/parameters.md                                |  6 ++++++
 6 files changed, 99 insertions(+), 4 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 0c2eb0c..8755573 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -13,6 +13,8 @@ Release Notes.
 - Disable the lifecycle metrics collector when the lifecycle sidecar container 
is disabled.
 - Enable FODC panic/crash diagnostics collection by default.
   Configure via 
`cluster.fodc.agent.config.crashCollection.{enabled,dir,maxArtifacts,diagnosisMemoryPercent}`.
+- Add FODC memory-pressure pprof capture for data and liaison nodes.
+  Configure via `cluster.fodc.agent.pressureProfiler.*`.
 
 0.6.0
 -----------------
diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl
index 666b8e6..4c0879c 100644
--- a/chart/templates/_helpers.tpl
+++ b/chart/templates/_helpers.tpl
@@ -367,3 +367,28 @@ Resolve discovery file data key
 {{- $cm := $file.configMap | default dict }}
 {{- default "nodes.yaml" $cm.key }}
 {{- end }}
+
+{{/*
+Convert a human-readable size to an integer number of bytes.
+Case-insensitive. All suffixes are 1024-based (binary): K/M/G/T, KB/MB/GB/TB,
+Ki/Mi/Gi/Ti (also KiB/MiB/GiB/TiB). No suffix -> plain byte count. Unset, empty or 0 -> 0.
+Whole-number floats are accepted (a YAML integer can reach the template as a float64), and the
+digits are always read as decimal, so a leading zero is never taken as an octal prefix.
+*/}}
+{{- define "banyandb.toBytes" -}}
+{{- $raw := . -}}
+{{- if kindIs "invalid" $raw -}}{{- $raw = "" -}}{{- end -}}
+{{- if kindIs "float64" $raw -}}{{- if eq $raw (floor $raw) -}}{{- $raw = int64 $raw -}}{{- end -}}{{- end -}}
+{{- $s := $raw | toString | trim -}}
+{{- if or (eq $s "") (eq $s "0") -}}
+0
+{{- else -}}
+{{- if not (regexMatch "(?i)^[0-9]+(k|kb|ki|kib|m|mb|mi|mib|g|gb|gi|gib|t|tb|ti|tib)?$" $s) -}}
+{{- fail (printf "banyandb.toBytes: invalid size %q; expected an integer optionally followed by K/M/G/T, KB/MB/GB/TB, or Ki/Mi/Gi/Ti (case-insensitive), e.g. 512Mi" $s) -}}
+{{- end -}}
+{{- $num := int64 (regexReplaceAll "^0+([0-9])" (regexFind "^[0-9]+" $s) "${1}") -}}
+{{- $unit := lower (regexFind "[A-Za-z]" $s) -}}
+{{- $mult := get (dict "" 1 "k" 1024 "m" 1048576 "g" 1073741824 "t" 1099511627776) $unit -}}
+{{- mul $num $mult -}}
+{{- end -}}
+{{- end }}
diff --git a/chart/templates/cluster_data_statefulset.yaml 
b/chart/templates/cluster_data_statefulset.yaml
index d4c2778..3c68e55 100644
--- a/chart/templates/cluster_data_statefulset.yaml
+++ b/chart/templates/cluster_data_statefulset.yaml
@@ -344,6 +344,19 @@ spec:
             {{- if $.Values.cluster.fodc.agent.config.crashCollection.enabled 
}}
             - --crash-source-dir={{ 
$.Values.cluster.fodc.agent.config.crashCollection.dir }}
             {{- end }}
+            {{- with $.Values.cluster.fodc.agent.pressureProfiler }}
+            {{- if .enabled }}
+            - --pressure-profiler-enabled=true
+            - --pressure-profiler-trigger-percent={{ .triggerPercent }}
+            - --pressure-profiler-pprof-port=6060
+            - --pressure-profiler-cooldown={{ .cooldown }}
+            - --pressure-profiler-dir={{ .dir }}
+            - --pressure-profiler-max-artifacts={{ .maxArtifacts }}
+            - --pressure-profiler-max-disk-bytes={{ include "banyandb.toBytes" 
.maxDiskSize }}
+            {{- else }}
+            - --pressure-profiler-enabled=false
+            {{- end }}
+            {{- end }}
           env:
             - name: POD_NAME
               valueFrom:
@@ -375,7 +388,7 @@ spec:
               {{- end }}
             {{- end }}
           {{- end }}
-          {{- if or $roleConfig.lifecycleSidecar.enabled (and 
$.Values.cluster.fodc.enabled 
$.Values.cluster.fodc.agent.config.crashCollection.enabled) }}
+          {{- if or $roleConfig.lifecycleSidecar.enabled (and 
$.Values.cluster.fodc.enabled 
$.Values.cluster.fodc.agent.config.crashCollection.enabled) (and 
$.Values.cluster.fodc.agent.pressureProfiler 
$.Values.cluster.fodc.agent.pressureProfiler.enabled) }}
           volumeMounts:
             {{- if $roleConfig.lifecycleSidecar.enabled }}
             - name: lifecycle-report-shared
@@ -385,6 +398,10 @@ spec:
             - name: crash-shared
               mountPath: {{ 
$.Values.cluster.fodc.agent.config.crashCollection.dir }}
             {{- end }}
+            {{- if and $.Values.cluster.fodc.agent.pressureProfiler 
$.Values.cluster.fodc.agent.pressureProfiler.enabled }}
+            - name: pressure-profiles
+              mountPath: {{ $.Values.cluster.fodc.agent.pressureProfiler.dir }}
+            {{- end }}
           {{- end }}
         {{- end }}
         {{- if $roleConfig.backupSidecar.enabled }}
@@ -539,7 +556,7 @@ spec:
           {{- end }}
         {{- end }}
 
-      {{- if or $roleConfig.tls $schemaClientTls.secretName 
$schemaServerTls.secretName $nodeDiscoveryFileMode (and 
$roleConfig.lifecycleSidecar.enabled $.Values.cluster.fodc.enabled) (and 
$.Values.cluster.fodc.enabled 
$.Values.cluster.fodc.agent.config.crashCollection.enabled) }}
+      {{- if or $roleConfig.tls $schemaClientTls.secretName 
$schemaServerTls.secretName $nodeDiscoveryFileMode (and 
$roleConfig.lifecycleSidecar.enabled $.Values.cluster.fodc.enabled) (and 
$.Values.cluster.fodc.enabled 
$.Values.cluster.fodc.agent.config.crashCollection.enabled) (and 
$.Values.cluster.fodc.enabled $.Values.cluster.fodc.agent.pressureProfiler 
$.Values.cluster.fodc.agent.pressureProfiler.enabled) }}
       volumes:
         {{- if $roleConfig.tls }}
         {{- if $roleConfig.tls.grpcSecretName }}
@@ -574,6 +591,10 @@ spec:
         - name: crash-shared
           emptyDir: {}
         {{- end }}
+        {{- if and $.Values.cluster.fodc.enabled 
$.Values.cluster.fodc.agent.pressureProfiler 
$.Values.cluster.fodc.agent.pressureProfiler.enabled }}
+        - name: pressure-profiles
+          emptyDir: {}
+        {{- end }}
       {{- end }}
 
       {{- if $roleConfig.tolerations }}
diff --git a/chart/templates/cluster_liaison_statefulset.yaml 
b/chart/templates/cluster_liaison_statefulset.yaml
index f0efebf..492eb9b 100644
--- a/chart/templates/cluster_liaison_statefulset.yaml
+++ b/chart/templates/cluster_liaison_statefulset.yaml
@@ -309,6 +309,19 @@ spec:
             {{- if .Values.cluster.fodc.agent.config.crashCollection.enabled }}
             - --crash-source-dir={{ 
.Values.cluster.fodc.agent.config.crashCollection.dir }}
             {{- end }}
+            {{- with .Values.cluster.fodc.agent.pressureProfiler }}
+            {{- if .enabled }}
+            - --pressure-profiler-enabled=true
+            - --pressure-profiler-trigger-percent={{ .triggerPercent }}
+            - --pressure-profiler-pprof-port=6060
+            - --pressure-profiler-cooldown={{ .cooldown }}
+            - --pressure-profiler-dir={{ .dir }}
+            - --pressure-profiler-max-artifacts={{ .maxArtifacts }}
+            - --pressure-profiler-max-disk-bytes={{ include "banyandb.toBytes" 
.maxDiskSize }}
+            {{- else }}
+            - --pressure-profiler-enabled=false
+            {{- end }}
+            {{- end }}
           env:
             - name: POD_NAME
               valueFrom:
@@ -340,14 +353,20 @@ spec:
               {{- end }}
             {{- end }}
           {{- end }}
-          {{- if .Values.cluster.fodc.agent.config.crashCollection.enabled }}
+          {{- if or .Values.cluster.fodc.agent.config.crashCollection.enabled 
(and .Values.cluster.fodc.agent.pressureProfiler 
.Values.cluster.fodc.agent.pressureProfiler.enabled) }}
           volumeMounts:
+            {{- if .Values.cluster.fodc.agent.config.crashCollection.enabled }}
             - name: crash-shared
               mountPath: {{ 
.Values.cluster.fodc.agent.config.crashCollection.dir }}
+            {{- end }}
+            {{- if and .Values.cluster.fodc.agent.pressureProfiler 
.Values.cluster.fodc.agent.pressureProfiler.enabled }}
+            - name: pressure-profiles
+              mountPath: {{ .Values.cluster.fodc.agent.pressureProfiler.dir }}
+            {{- end }}
           {{- end }}
         {{- end }}
 
-      {{- if or .Values.cluster.liaison.tls $schemaClientTls.secretName 
.Values.auth.enabled $nodeDiscoveryFileMode (and .Values.cluster.fodc.enabled 
.Values.cluster.fodc.agent.config.crashCollection.enabled) }}
+      {{- if or .Values.cluster.liaison.tls $schemaClientTls.secretName 
.Values.auth.enabled $nodeDiscoveryFileMode (and .Values.cluster.fodc.enabled 
.Values.cluster.fodc.agent.config.crashCollection.enabled) (and 
.Values.cluster.fodc.enabled .Values.cluster.fodc.agent.pressureProfiler 
.Values.cluster.fodc.agent.pressureProfiler.enabled) }}
       volumes:
         {{- if .Values.cluster.liaison.tls }}
         {{- if .Values.cluster.liaison.tls.grpcSecretName }}
@@ -392,6 +411,10 @@ spec:
         - name: crash-shared
           emptyDir: {}
         {{- end }}
+        {{- if and .Values.cluster.fodc.enabled 
.Values.cluster.fodc.agent.pressureProfiler 
.Values.cluster.fodc.agent.pressureProfiler.enabled }}
+        - name: pressure-profiles
+          emptyDir: {}
+        {{- end }}
       {{- end }}
 
       {{- if .Values.cluster.liaison.tolerations }}
diff --git a/chart/values.yaml b/chart/values.yaml
index 694fffc..0585ee4 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -1067,6 +1067,24 @@ cluster:
           maxArtifacts: 10
           ## @param 
cluster.fodc.agent.config.crashCollection.diagnosisMemoryPercent Set banyand 
GOMEMLIMIT to this percent of the cgroup memory limit, reserving headroom for 
post-panic diagnostics (0 disables)
           diagnosisMemoryPercent: 50
+      ## Memory-pressure pprof capture for data and liaison nodes. When the monitored
+      ## container's RSS approaches its cgroup memory limit, the agent pulls
+      ## heap+goroutine pprof from that container's pprof endpoint (port 6060) onto an
+      ## emptyDir mounted into the fodc-agent, served by the proxy's HTTP API. Requires
+      ## a memory limit on the monitored container to be effective.
+      pressureProfiler:
+        ## @param cluster.fodc.agent.pressureProfiler.enabled Enable automatic 
heap+goroutine pprof capture under memory pressure
+        enabled: true
+        ## @param cluster.fodc.agent.pressureProfiler.triggerPercent Capture 
when RSS / cgroup_limit reaches this percentage
+        triggerPercent: 75
+        ## @param cluster.fodc.agent.pressureProfiler.cooldown Minimum 
interval between two captures
+        cooldown: 5m
+        ## @param cluster.fodc.agent.pressureProfiler.dir Directory where captured profiles are stored; the chart mounts the writable pressure-profiles emptyDir volume at this same path
+        dir: /tmp/pressure-profiles
+        ## @param cluster.fodc.agent.pressureProfiler.maxArtifacts Maximum 
number of capture events to retain (lowest-RSS evicted first)
+        maxArtifacts: 16
+        ## @param cluster.fodc.agent.pressureProfiler.maxDiskSize Maximum 
total on-disk size for retained events; case-insensitive, all suffixes are 
1024-based (K/M/G/T, KB/MB/GB/TB, Ki/Mi/Gi/Ti), or a plain byte count; 0 
disables the disk bound
+        maxDiskSize: 512Mi
       ## Liveness probe for Agent
       livenessProbe:
         ## @param cluster.fodc.agent.livenessProbe.initialDelaySeconds Initial 
delay for Agent liveness probe
diff --git a/doc/parameters.md b/doc/parameters.md
index d52581d..63b83d5 100644
--- a/doc/parameters.md
+++ b/doc/parameters.md
@@ -396,6 +396,12 @@ The content of this document describes the parameters that 
can be configured in
 | `cluster.fodc.agent.config.crashCollection.dir`                    | Shared 
path where banyand writes panic.json and fodc-agent reads it                    
                                       | `/tmp/crash`                           
         |
 | `cluster.fodc.agent.config.crashCollection.maxArtifacts`           | Max 
crash artifact directories banyand retains (oldest removed first; 0 disables 
pruning)                                     | `10`                             
               |
 | `cluster.fodc.agent.config.crashCollection.diagnosisMemoryPercent` | Set 
banyand GOMEMLIMIT to this percent of the cgroup memory limit, reserving 
headroom for post-panic diagnostics (0 disables) | `50`                         
                   |
+| `cluster.fodc.agent.pressureProfiler.enabled`                      | Enable 
automatic heap+goroutine pprof capture under memory pressure                    
                                       | `true`                                 
         |
+| `cluster.fodc.agent.pressureProfiler.triggerPercent`               | Capture 
when RSS / cgroup_limit reaches this percentage                                 
                                      | `75`                                    
        |
+| `cluster.fodc.agent.pressureProfiler.cooldown`                     | Minimum 
interval between two captures                                                   
                                      | `5m`                                    
        |
+| `cluster.fodc.agent.pressureProfiler.dir`                          | 
Directory (on the writable volume) where captured profiles are stored; must 
equal the pressure-profiles mount path            | `/tmp/pressure-profiles`    
                    |
+| `cluster.fodc.agent.pressureProfiler.maxArtifacts`                 | Maximum 
number of capture events to retain (lowest-RSS evicted first)                   
                                      | `16`                                    
        |
+| `cluster.fodc.agent.pressureProfiler.maxDiskSize`                  | Max 
total on-disk size for retained events; case-insensitive 1024-based suffix 
(e.g. 512Mi, 1Gi); 0 disables                  | `512Mi`                        
                 |
 | `cluster.fodc.agent.livenessProbe.initialDelaySeconds`             | Initial 
delay for Agent liveness probe                                                  
                                      | `90`                                    
        |
 | `cluster.fodc.agent.livenessProbe.periodSeconds`                   | Probe 
period for Agent liveness probe                                                 
                                        | `30`                                  
          |
 | `cluster.fodc.agent.livenessProbe.timeoutSeconds`                  | Timeout 
in seconds for Agent liveness probe                                             
                                      | `5`                                     
        |

Reply via email to