This is an automated email from the ASF dual-hosted git repository. hanahmily pushed a commit to branch vectorized-query in repository https://gitbox.apache.org/repos/asf/skywalking-banyandb.git
commit 0c79de2ab531c96e4f3db376505c98cbe281a544 Author: Hongtao Gao <[email protected]> AuthorDate: Mon May 11 00:50:32 2026 +0000 feat(soak): tapered monitor for unattended G5d runs Add scripts/soak-monitor.sh — polls the most recent dist/soak/<ts>/ run on a tapered cadence (8 ticks at 15 min for the first 2 h, then every 60 min for the remainder of the window). Each tick emits a single OK or ALERT line summarising pprof count, MemoryTracker alert lines, parity-diff failures, banyand.log freshness, and compose health. Exits 0 when summary.json appears with no alerts, 2 if any alert fired during the window — pipe to your notification mechanism (notify-send, telegram, etc). Runbook section "Tapered monitor (recommended for unattended runs)" covers usage and the alert criteria mapping. --- docs/soak/g5d-runbook.md | 27 +++++++- scripts/soak-monitor.sh | 163 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 189 insertions(+), 1 deletion(-) diff --git a/docs/soak/g5d-runbook.md b/docs/soak/g5d-runbook.md index 95c25e9d6..fd4995096 100644 --- a/docs/soak/g5d-runbook.md +++ b/docs/soak/g5d-runbook.md @@ -96,7 +96,32 @@ Key environment variables: | `PPROF_INTERVAL_MIN` | 30 | Minutes between heap/goroutine captures | | `PARITY_INTERVAL_MIN` | 5 | Minutes between replay-and-diff runs | -Monitoring during the run: +### Tapered monitor (recommended for unattended runs) + +In a separate pane, kick off the monitor — it polls every 15 min for the +first 2 hours, then drops to every hour for the rest of the window, and +emits one `OK` or `ALERT` status line per tick: + +```bash +./scripts/soak-monitor.sh # watches the most recent run +./scripts/soak-monitor.sh dist/soak/<ts> # watches a specific run +``` + +`ALERT` fires on any of: +- `banyand.log` untouched for >10 min (container hung) +- `memory-alerts.log` gained a line (criterion 3 violation) +- any `diff-*.json` has `"pass": false` (criterion 2 violation) +- `docker compose` reports an unhealthy/exited service + +The monitor exits with code 0 when `summary.json` appears and no alerts +were emitted, code 2 if any alert fired during the window, or non-zero +on configuration error. Pipe to your notification mechanism of choice: + +```bash +./scripts/soak-monitor.sh; rc=$?; [[ $rc -ne 0 ]] && notify-send "soak alert ($rc)" +``` + +Manual one-off checks (drop into a tail pane if you want raw visibility): ```bash # Live BanyanDB logs diff --git a/scripts/soak-monitor.sh b/scripts/soak-monitor.sh new file mode 100755 index 000000000..f63c4d7ba --- /dev/null +++ b/scripts/soak-monitor.sh @@ -0,0 +1,163 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# G5d soak monitor — polls the most recent run under dist/soak/ on a +# tapered cadence and writes a one-line status per tick. Each line is +# tagged either OK or ALERT; ALERT means at least one of: +# - banyand.log has not been touched in >10 min (container hung) +# - memory-alerts.log gained any line (acceptance criterion 3 violated) +# - any diff-*.json shows "pass": false (acceptance criterion 2 violated) +# - docker compose health probe reports degraded +# +# Cadence (matching the operator request): +# ticks 1..8 — every 15 min (covers the first 2 h) +# ticks 9.. — every 60 min (covers the remainder of the 48 h window) +# +# Exits automatically when the run's summary.json appears (soak +# complete) or on Ctrl-C. Returns non-zero if any ALERT line was +# emitted — so a wrapper can chain to a notification mechanism. +# +# Usage: +# ./scripts/soak-monitor.sh # watch most recent run +# ./scripts/soak-monitor.sh dist/soak/20260512T101010 # specific run +# +# Env overrides (rarely needed): +# FIRST_PHASE_TICKS number of 15-min ticks before slowing down (default 8) +# FAST_INTERVAL_SEC fast cadence in seconds (default 900) +# SLOW_INTERVAL_SEC slow cadence in seconds (default 3600) +# LOG_STALE_SEC ALERT threshold for banyand.log freshness (default 600) + +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +COMPOSE_FILE="${REPO_ROOT}/test/soak/docker-compose.soak.yaml" + +FIRST_PHASE_TICKS="${FIRST_PHASE_TICKS:-8}" +FAST_INTERVAL_SEC="${FAST_INTERVAL_SEC:-900}" +SLOW_INTERVAL_SEC="${SLOW_INTERVAL_SEC:-3600}" +LOG_STALE_SEC="${LOG_STALE_SEC:-600}" + +# Resolve the run directory. +if (( $# >= 1 )); then + RUN="$1" +else + RUN="$(ls -td "${REPO_ROOT}"/dist/soak/2026* 2>/dev/null | head -1 || true)" +fi +if [[ -z "${RUN:-}" || ! -d "${RUN}" ]]; then + echo "[soak-monitor] ERROR: no soak run directory found (looked under dist/soak/2026*)" + exit 1 +fi + +LOG="${RUN}/monitor.log" +echo "[soak-monitor] watching ${RUN}" +echo "[soak-monitor] writing status to ${LOG}" + +# Tee from this point on so the status log persists alongside the run. +exec > >(tee -a "${LOG}") 2>&1 + +count_or_zero() { + local n + n="$(eval "$1" 2>/dev/null | wc -l | tr -d ' ')" + echo "${n:-0}" +} + +# Compose health: true if every service shows "healthy" (or has no +# healthcheck — only those without one report empty). +compose_health() { + local status + status="$(docker compose -f "${COMPOSE_FILE}" ps --format '{{.Name}} {{.State}} {{.Health}}' 2>/dev/null || true)" + if [[ -z "${status}" ]]; then + echo "down" + return + fi + # If anything is "unhealthy" or "exited", flag. + if echo "${status}" | grep -qE 'unhealthy|exited|restarting|dead'; then + echo "degraded" + return + fi + echo "healthy" +} + +alert_count=0 +tick=0 +trap 'echo "[soak-monitor] stopped (ticks=${tick} alerts=${alert_count})"' EXIT + +while true; do + tick=$(( tick + 1 )) + ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + + # banyand.log freshness + last_log_ts="$(stat -c %Y "${RUN}/banyand.log" 2>/dev/null || echo 0)" + log_age=$(( $(date +%s) - last_log_ts )) + + # MemoryTracker exhaustion lines + mem_alerts="$(wc -l < "${RUN}/memory-alerts.log" 2>/dev/null | tr -d ' ')" + mem_alerts="${mem_alerts:-0}" + + # Parity divergence reports + diff_fail="$(count_or_zero "grep -l '\"pass\": *false' ${RUN}/diff-*.json")" + + # pprof captures so far + pprof_n="$(count_or_zero "ls -d ${RUN}/pprof-*")" + + # Summary present means soak finished + summary_present="no" + if [[ -f "${RUN}/summary.json" ]]; then summary_present="yes"; fi + + health="$(compose_health)" + + status="OK" + reasons="" + if (( mem_alerts > 0 )); then + status="ALERT" + reasons="${reasons} memory_alerts=${mem_alerts}" + fi + if (( diff_fail > 0 )); then + status="ALERT" + reasons="${reasons} diff_fail=${diff_fail}" + fi + if (( log_age > LOG_STALE_SEC )) && [[ "${summary_present}" == "no" ]]; then + status="ALERT" + reasons="${reasons} log_stale=${log_age}s" + fi + if [[ "${health}" != "healthy" && "${summary_present}" == "no" ]]; then + status="ALERT" + reasons="${reasons} health=${health}" + fi + if [[ "${status}" == "ALERT" ]]; then + alert_count=$(( alert_count + 1 )) + fi + + printf "[%s] %s tick=%d pprof=%s mem_alerts=%s diff_fail=%s log_age=%ds health=%s summary=%s%s\n" \ + "${ts}" "${status}" "${tick}" "${pprof_n}" "${mem_alerts}" "${diff_fail}" "${log_age}" \ + "${health}" "${summary_present}" "${reasons}" + + if [[ "${summary_present}" == "yes" ]]; then + echo "[soak-monitor] soak complete (summary.json present) — exiting" + if (( alert_count > 0 )); then + exit 2 + fi + exit 0 + fi + + if (( tick < FIRST_PHASE_TICKS )); then + sleep "${FAST_INTERVAL_SEC}" + else + sleep "${SLOW_INTERVAL_SEC}" + fi +done
