This is an automated email from the ASF dual-hosted git repository.
hanahmily pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/skywalking-banyandb-helm.git
The following commit(s) were added to refs/heads/master by this push:
new 0cb1a64 Add FODC memory-pressure pprof capture for data and liaison nodes (#65)
0cb1a64 is described below
commit 0cb1a6498e04001384a984c7bfa9172444502c0a
Author: mrproliu <[email protected]>
AuthorDate: Wed Jul 1 11:07:27 2026 +0800
Add FODC memory-pressure pprof capture for data and liaison nodes (#65)
---
CHANGES.md | 2 ++
chart/templates/_helpers.tpl | 25 ++++++++++++++++++++++
chart/templates/cluster_data_statefulset.yaml | 25 ++++++++++++++++++++--
chart/templates/cluster_liaison_statefulset.yaml | 27 ++++++++++++++++++++++--
chart/values.yaml | 18 ++++++++++++++++
doc/parameters.md | 6 ++++++
6 files changed, 99 insertions(+), 4 deletions(-)
diff --git a/CHANGES.md b/CHANGES.md
index 0c2eb0c..8755573 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -13,6 +13,8 @@ Release Notes.
- Disable the lifecycle metrics collector when the lifecycle sidecar container
is disabled.
- Enable FODC panic/crash diagnostics collection by default.
Configure via
`cluster.fodc.agent.config.crashCollection.{enabled,dir,maxArtifacts,diagnosisMemoryPercent}`.
+- Add FODC memory-pressure pprof capture for data and liaison nodes.
+ Configure via `cluster.fodc.agent.pressureProfiler.*`.
0.6.0
-----------------
diff --git a/chart/templates/_helpers.tpl b/chart/templates/_helpers.tpl
index 666b8e6..4c0879c 100644
--- a/chart/templates/_helpers.tpl
+++ b/chart/templates/_helpers.tpl
@@ -367,3 +367,28 @@ Resolve discovery file data key
{{- $cm := $file.configMap | default dict }}
{{- default "nodes.yaml" $cm.key }}
{{- end }}
+
+{{/*
+Convert a human-readable size to an integer number of bytes.
+Case-insensitive. All suffixes are 1024-based (binary): K/M/G/T, KB/MB/GB/TB,
+Ki/Mi/Gi/Ti (also KiB/MiB/GiB/TiB). No suffix -> plain byte count. Empty or 0 -> 0.
+*/}}
+{{- define "banyandb.toBytes" -}}
+{{- $s := . | toString | trim -}}
+{{- if or (eq $s "") (eq $s "0") -}}
+0
+{{- else -}}
+{{- if not (regexMatch
"(?i)^[0-9]+(k|kb|ki|kib|m|mb|mi|mib|g|gb|gi|gib|t|tb|ti|tib)?$" $s) -}}
+{{- fail (printf "banyandb.toBytes: invalid size %q; expected an integer
optionally followed by K/M/G/T, KB/MB/GB/TB, or Ki/Mi/Gi/Ti (case-insensitive),
e.g. 512Mi" $s) -}}
+{{- end -}}
+{{- $num := $s | regexFind "^[0-9]+" | int64 -}}
+{{- $unit := lower (regexReplaceAll "^[0-9]+" $s "" | trim) -}}
+{{- $mult := int64 1 -}}
+{{- if or (eq $unit "k") (eq $unit "kb") (eq $unit "ki") (eq $unit "kib")
-}}{{- $mult = int64 1024 -}}
+{{- else if or (eq $unit "m") (eq $unit "mb") (eq $unit "mi") (eq $unit "mib")
-}}{{- $mult = int64 1048576 -}}
+{{- else if or (eq $unit "g") (eq $unit "gb") (eq $unit "gi") (eq $unit "gib")
-}}{{- $mult = int64 1073741824 -}}
+{{- else if or (eq $unit "t") (eq $unit "tb") (eq $unit "ti") (eq $unit "tib")
-}}{{- $mult = int64 1099511627776 -}}
+{{- end -}}
+{{- mul $num $mult -}}
+{{- end -}}
+{{- end }}
diff --git a/chart/templates/cluster_data_statefulset.yaml b/chart/templates/cluster_data_statefulset.yaml
index d4c2778..3c68e55 100644
--- a/chart/templates/cluster_data_statefulset.yaml
+++ b/chart/templates/cluster_data_statefulset.yaml
@@ -344,6 +344,19 @@ spec:
{{- if $.Values.cluster.fodc.agent.config.crashCollection.enabled
}}
- --crash-source-dir={{
$.Values.cluster.fodc.agent.config.crashCollection.dir }}
{{- end }}
+ {{- with $.Values.cluster.fodc.agent.pressureProfiler }}
+ {{- if .enabled }}
+ - --pressure-profiler-enabled=true
+ - --pressure-profiler-trigger-percent={{ .triggerPercent }}
+ - --pressure-profiler-pprof-port=6060
+ - --pressure-profiler-cooldown={{ .cooldown }}
+ - --pressure-profiler-dir={{ .dir }}
+ - --pressure-profiler-max-artifacts={{ .maxArtifacts }}
+ - --pressure-profiler-max-disk-bytes={{ include "banyandb.toBytes"
.maxDiskSize }}
+ {{- else }}
+ - --pressure-profiler-enabled=false
+ {{- end }}
+ {{- end }}
env:
- name: POD_NAME
valueFrom:
@@ -375,7 +388,7 @@ spec:
{{- end }}
{{- end }}
{{- end }}
- {{- if or $roleConfig.lifecycleSidecar.enabled (and
$.Values.cluster.fodc.enabled
$.Values.cluster.fodc.agent.config.crashCollection.enabled) }}
+ {{- if or $roleConfig.lifecycleSidecar.enabled (and
$.Values.cluster.fodc.enabled
$.Values.cluster.fodc.agent.config.crashCollection.enabled) (and
$.Values.cluster.fodc.agent.pressureProfiler
$.Values.cluster.fodc.agent.pressureProfiler.enabled) }}
volumeMounts:
{{- if $roleConfig.lifecycleSidecar.enabled }}
- name: lifecycle-report-shared
@@ -385,6 +398,10 @@ spec:
- name: crash-shared
mountPath: {{
$.Values.cluster.fodc.agent.config.crashCollection.dir }}
{{- end }}
+ {{- if and $.Values.cluster.fodc.agent.pressureProfiler
$.Values.cluster.fodc.agent.pressureProfiler.enabled }}
+ - name: pressure-profiles
+ mountPath: {{ $.Values.cluster.fodc.agent.pressureProfiler.dir }}
+ {{- end }}
{{- end }}
{{- end }}
{{- if $roleConfig.backupSidecar.enabled }}
@@ -539,7 +556,7 @@ spec:
{{- end }}
{{- end }}
- {{- if or $roleConfig.tls $schemaClientTls.secretName
$schemaServerTls.secretName $nodeDiscoveryFileMode (and
$roleConfig.lifecycleSidecar.enabled $.Values.cluster.fodc.enabled) (and
$.Values.cluster.fodc.enabled
$.Values.cluster.fodc.agent.config.crashCollection.enabled) }}
+ {{- if or $roleConfig.tls $schemaClientTls.secretName
$schemaServerTls.secretName $nodeDiscoveryFileMode (and
$roleConfig.lifecycleSidecar.enabled $.Values.cluster.fodc.enabled) (and
$.Values.cluster.fodc.enabled
$.Values.cluster.fodc.agent.config.crashCollection.enabled) (and
$.Values.cluster.fodc.enabled $.Values.cluster.fodc.agent.pressureProfiler
$.Values.cluster.fodc.agent.pressureProfiler.enabled) }}
volumes:
{{- if $roleConfig.tls }}
{{- if $roleConfig.tls.grpcSecretName }}
@@ -574,6 +591,10 @@ spec:
- name: crash-shared
emptyDir: {}
{{- end }}
+ {{- if and $.Values.cluster.fodc.enabled
$.Values.cluster.fodc.agent.pressureProfiler
$.Values.cluster.fodc.agent.pressureProfiler.enabled }}
+ - name: pressure-profiles
+ emptyDir: {}
+ {{- end }}
{{- end }}
{{- if $roleConfig.tolerations }}
diff --git a/chart/templates/cluster_liaison_statefulset.yaml b/chart/templates/cluster_liaison_statefulset.yaml
index f0efebf..492eb9b 100644
--- a/chart/templates/cluster_liaison_statefulset.yaml
+++ b/chart/templates/cluster_liaison_statefulset.yaml
@@ -309,6 +309,19 @@ spec:
{{- if .Values.cluster.fodc.agent.config.crashCollection.enabled }}
- --crash-source-dir={{
.Values.cluster.fodc.agent.config.crashCollection.dir }}
{{- end }}
+ {{- with .Values.cluster.fodc.agent.pressureProfiler }}
+ {{- if .enabled }}
+ - --pressure-profiler-enabled=true
+ - --pressure-profiler-trigger-percent={{ .triggerPercent }}
+ - --pressure-profiler-pprof-port=6060
+ - --pressure-profiler-cooldown={{ .cooldown }}
+ - --pressure-profiler-dir={{ .dir }}
+ - --pressure-profiler-max-artifacts={{ .maxArtifacts }}
+ - --pressure-profiler-max-disk-bytes={{ include "banyandb.toBytes"
.maxDiskSize }}
+ {{- else }}
+ - --pressure-profiler-enabled=false
+ {{- end }}
+ {{- end }}
env:
- name: POD_NAME
valueFrom:
@@ -340,14 +353,20 @@ spec:
{{- end }}
{{- end }}
{{- end }}
- {{- if .Values.cluster.fodc.agent.config.crashCollection.enabled }}
+ {{- if or .Values.cluster.fodc.agent.config.crashCollection.enabled
(and .Values.cluster.fodc.agent.pressureProfiler
.Values.cluster.fodc.agent.pressureProfiler.enabled) }}
volumeMounts:
+ {{- if .Values.cluster.fodc.agent.config.crashCollection.enabled }}
- name: crash-shared
mountPath: {{
.Values.cluster.fodc.agent.config.crashCollection.dir }}
+ {{- end }}
+ {{- if and .Values.cluster.fodc.agent.pressureProfiler
.Values.cluster.fodc.agent.pressureProfiler.enabled }}
+ - name: pressure-profiles
+ mountPath: {{ .Values.cluster.fodc.agent.pressureProfiler.dir }}
+ {{- end }}
{{- end }}
{{- end }}
- {{- if or .Values.cluster.liaison.tls $schemaClientTls.secretName
.Values.auth.enabled $nodeDiscoveryFileMode (and .Values.cluster.fodc.enabled
.Values.cluster.fodc.agent.config.crashCollection.enabled) }}
+ {{- if or .Values.cluster.liaison.tls $schemaClientTls.secretName
.Values.auth.enabled $nodeDiscoveryFileMode (and .Values.cluster.fodc.enabled
.Values.cluster.fodc.agent.config.crashCollection.enabled) (and
.Values.cluster.fodc.enabled .Values.cluster.fodc.agent.pressureProfiler
.Values.cluster.fodc.agent.pressureProfiler.enabled) }}
volumes:
{{- if .Values.cluster.liaison.tls }}
{{- if .Values.cluster.liaison.tls.grpcSecretName }}
@@ -392,6 +411,10 @@ spec:
- name: crash-shared
emptyDir: {}
{{- end }}
+ {{- if and .Values.cluster.fodc.enabled
.Values.cluster.fodc.agent.pressureProfiler
.Values.cluster.fodc.agent.pressureProfiler.enabled }}
+ - name: pressure-profiles
+ emptyDir: {}
+ {{- end }}
{{- end }}
{{- if .Values.cluster.liaison.tolerations }}
diff --git a/chart/values.yaml b/chart/values.yaml
index 694fffc..0585ee4 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -1067,6 +1067,24 @@ cluster:
maxArtifacts: 10
## @param
cluster.fodc.agent.config.crashCollection.diagnosisMemoryPercent Set banyand
GOMEMLIMIT to this percent of the cgroup memory limit, reserving headroom for
post-panic diagnostics (0 disables)
diagnosisMemoryPercent: 50
+ ## Memory-pressure pprof capture. When a data container's RSS approaches its
+ ## cgroup memory limit, the agent pulls heap+goroutine pprof from the data
+ ## container's pprof endpoint (port 6060) onto an emptyDir mounted into the
+ ## fodc-agent, served by the proxy's HTTP API. Requires a data-container memory
+ ## limit to be effective.
+ pressureProfiler:
+ ## @param cluster.fodc.agent.pressureProfiler.enabled Enable automatic
heap+goroutine pprof capture under memory pressure
+ enabled: true
+ ## @param cluster.fodc.agent.pressureProfiler.triggerPercent Capture
when RSS / cgroup_limit reaches this percentage
+ triggerPercent: 75
+ ## @param cluster.fodc.agent.pressureProfiler.cooldown Minimum
interval between two captures
+ cooldown: 5m
+ ## @param cluster.fodc.agent.pressureProfiler.dir Directory (on the
writable volume) where captured profiles are stored; must equal the
pressure-profiles mount path
+ dir: /tmp/pressure-profiles
+ ## @param cluster.fodc.agent.pressureProfiler.maxArtifacts Maximum
number of capture events to retain (lowest-RSS evicted first)
+ maxArtifacts: 16
+ ## @param cluster.fodc.agent.pressureProfiler.maxDiskSize Maximum
total on-disk size for retained events; case-insensitive, all suffixes are
1024-based (K/M/G/T, KB/MB/GB/TB, Ki/Mi/Gi/Ti), or a plain byte count; 0
disables the disk bound
+ maxDiskSize: 512Mi
## Liveness probe for Agent
livenessProbe:
## @param cluster.fodc.agent.livenessProbe.initialDelaySeconds Initial
delay for Agent liveness probe
diff --git a/doc/parameters.md b/doc/parameters.md
index d52581d..63b83d5 100644
--- a/doc/parameters.md
+++ b/doc/parameters.md
@@ -396,6 +396,12 @@ The content of this document describes the parameters that can be configured in
| `cluster.fodc.agent.config.crashCollection.dir` | Shared
path where banyand writes panic.json and fodc-agent reads it
| `/tmp/crash`
|
| `cluster.fodc.agent.config.crashCollection.maxArtifacts` | Max
crash artifact directories banyand retains (oldest removed first; 0 disables
pruning) | `10`
|
| `cluster.fodc.agent.config.crashCollection.diagnosisMemoryPercent` | Set
banyand GOMEMLIMIT to this percent of the cgroup memory limit, reserving
headroom for post-panic diagnostics (0 disables) | `50`
|
+| `cluster.fodc.agent.pressureProfiler.enabled` | Enable automatic heap+goroutine pprof capture under memory pressure | `true` |
+| `cluster.fodc.agent.pressureProfiler.triggerPercent` | Capture when RSS / cgroup_limit reaches this percentage | `75` |
+| `cluster.fodc.agent.pressureProfiler.cooldown` | Minimum interval between two captures | `5m` |
+| `cluster.fodc.agent.pressureProfiler.dir` | Directory (on the writable volume) where captured profiles are stored; must equal the pressure-profiles mount path | `/tmp/pressure-profiles` |
+| `cluster.fodc.agent.pressureProfiler.maxArtifacts` | Maximum number of capture events to retain (lowest-RSS evicted first) | `16` |
+| `cluster.fodc.agent.pressureProfiler.maxDiskSize` | Max total on-disk size for retained events; case-insensitive 1024-based suffix (e.g. 512Mi, 1Gi); 0 disables | `512Mi` |
| `cluster.fodc.agent.livenessProbe.initialDelaySeconds` | Initial
delay for Agent liveness probe
| `90`
|
| `cluster.fodc.agent.livenessProbe.periodSeconds` | Probe
period for Agent liveness probe
| `30`
|
| `cluster.fodc.agent.livenessProbe.timeoutSeconds` | Timeout
in seconds for Agent liveness probe
| `5`
|