On 28 Aug 2023, at 16:15, Aaron Conole wrote:
> Eelco Chaudron <[email protected]> writes: > >> This patch adds an utility that can be used to determine if >> an issue is related to a lack of Linux kernel resources. >> >> This tool is also featured in a Red Hat developers blog article: >> >> >> https://developers.redhat.com/articles/2023/07/24/troubleshooting-open-vswitch-kernel-blame >> >> Signed-off-by: Eelco Chaudron <[email protected]> >> --- > > Hi Eelco, > > Thanks for the writeup and utilities. Thanks see, inlines below, and will sent an updated version. Cheers, Eelco >> utilities/automake.mk | 4 >> utilities/usdt-scripts/kernel_delay.py | 1397 >> +++++++++++++++++++++++++++++++ >> utilities/usdt-scripts/kernel_delay.rst | 594 +++++++++++++ >> 3 files changed, 1995 insertions(+) >> create mode 100755 utilities/usdt-scripts/kernel_delay.py >> create mode 100644 utilities/usdt-scripts/kernel_delay.rst >> >> diff --git a/utilities/automake.mk b/utilities/automake.mk >> index 37d679f82..9a2114df4 100644 >> --- a/utilities/automake.mk >> +++ b/utilities/automake.mk >> @@ -23,6 +23,8 @@ scripts_DATA += utilities/ovs-lib >> usdt_SCRIPTS += \ >> utilities/usdt-scripts/bridge_loop.bt \ >> utilities/usdt-scripts/dpif_nl_exec_monitor.py \ >> + utilities/usdt-scripts/kernel_delay.py \ >> + utilities/usdt-scripts/kernel_delay.rst \ >> utilities/usdt-scripts/reval_monitor.py \ >> utilities/usdt-scripts/upcall_cost.py \ >> utilities/usdt-scripts/upcall_monitor.py >> @@ -70,6 +72,8 @@ EXTRA_DIST += \ >> utilities/docker/debian/build-kernel-modules.sh \ >> utilities/usdt-scripts/bridge_loop.bt \ >> utilities/usdt-scripts/dpif_nl_exec_monitor.py \ >> + utilities/usdt-scripts/kernel_delay.py \ >> + utilities/usdt-scripts/kernel_delay.rst \ >> utilities/usdt-scripts/reval_monitor.py \ >> utilities/usdt-scripts/upcall_cost.py \ >> utilities/usdt-scripts/upcall_monitor.py >> diff --git a/utilities/usdt-scripts/kernel_delay.py >> b/utilities/usdt-scripts/kernel_delay.py >> new file mode 100755 >> index 000000000..bba7bc044 >> --- /dev/null >> +++ b/utilities/usdt-scripts/kernel_delay.py >> @@ -0,0 +1,1397 @@ >> +#!/usr/bin/env python3 >> +# >> +# Copyright (c) 2022 Red Hat, Inc. > > Maybe this should be 2022,2023 ? This is always puzzling to me, Should it only be the first year, or all years? Well to be sure I’ll add 2023 ;) >> +# >> +# Licensed under the Apache License, Version 2.0 (the "License"); >> +# you may not use this file except in compliance with the License. >> +# You may obtain a copy of the License at: >> +# >> +# http://www.apache.org/licenses/LICENSE-2.0 >> +# >> +# Unless required by applicable law or agreed to in writing, software >> +# distributed under the License is distributed on an "AS IS" BASIS, >> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> +# See the License for the specific language governing permissions and >> +# limitations under the License. >> +# >> +# Script information: >> +# ------------------- >> +# This script allows a developer to quickly identify if the issue at hand >> +# might be related to the kernel running out of resources or if it really is >> +# an Open vSwitch issue. >> +# >> +# For documentation see the kernel_delay.rst file. >> +# >> +try: >> + from bcc import BPF, USDT, USDTException >> + from bcc.syscall import syscalls, syscall_name >> +except ModuleNotFoundError: >> + print("ERROR: Can't find the BPF Compiler Collection (BCC) tools!") > > I think this needs to exit in this exception block Thanks, missed it. >> + >> +from enum import IntEnum >> + >> +import argparse >> +import datetime >> +import os >> +import pytz >> +import psutil >> +import re >> +import sys >> +import time >> + >> +import ctypes as ct >> + >> +try: >> + from time import time_ns >> +except ImportError: >> + # For compatibility with Python <= v3.6. >> + def time_ns(): >> + now = datetime.datetime.now() >> + return int(now.timestamp() * 1e9) >> + >> + >> +# >> +# Actual eBPF source code >> +# >> +EBPF_SOURCE = """ >> +#include <linux/irq.h> >> +#include <linux/sched.h> >> + >> +#define MONITOR_PID <MONITOR_PID> >> + >> +enum { >> +<EVENT_ENUM> >> +}; >> + >> +struct event_t { >> + u64 ts; >> + u32 tid; >> + u32 id; >> + >> + int user_stack_id; >> + int kernel_stack_id; >> + >> + u32 syscall; >> + u64 entry_ts; >> + >> +}; >> + >> +BPF_RINGBUF_OUTPUT(events, <BUFFER_PAGE_CNT>); >> +BPF_STACK_TRACE(stack_traces, <STACK_TRACE_SIZE>); >> +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1); >> +BPF_TABLE("percpu_array", uint32_t, uint64_t, trigger_miss, 1); >> + >> +BPF_ARRAY(capture_on, u64, 1); >> +static bool capture_enabled(u64 pid_tgid) { >> + int key = 0; >> + u64 *ret; >> + >> + if ((pid_tgid >> 32) != MONITOR_PID) >> + return false; >> + >> + ret = capture_on.lookup(&key); >> + return ret && *ret == 1; >> +} >> + >> +static inline bool capture_enabled__() { >> + int key = 0; >> + u64 *ret; >> + >> + ret = capture_on.lookup(&key); >> + return ret && *ret == 1; >> +} >> + >> + >> +static struct event_t *get_event(uint32_t id) { >> + struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t)); >> + >> + if (!event) { >> + dropcnt.increment(0); >> + return NULL; >> + } >> + >> + event->id = id; >> + event->ts = bpf_ktime_get_ns(); >> + event->tid = bpf_get_current_pid_tgid(); >> + >> + return event; >> +} >> + >> +static int start_trigger() { >> + int key = 0; >> + u64 *val = capture_on.lookup(&key); >> + >> + /* If the value is -1 we can't start as we are still processing the >> + * results in userspace. */ >> + if (!val || *val != 0) { >> + trigger_miss.increment(0); >> + return 0; >> + } >> + >> + struct event_t *event = get_event(EVENT_START_TRIGGER); >> + if (event) { >> + events.ringbuf_submit(event, 0); >> + *val = 1; >> + } else { >> + trigger_miss.increment(0); >> + } >> + return 0; >> +} >> + >> +static int stop_trigger() { >> + int key = 0; >> + u64 *val = capture_on.lookup(&key); >> + >> + if (!val || *val != 1) >> + return 0; >> + >> + struct event_t *event = get_event(EVENT_STOP_TRIGGER); >> + >> + if (event) >> + events.ringbuf_submit(event, 0); >> + >> + if (val) >> + *val = -1; >> + >> + return 0; >> +} >> + >> +<START_TRIGGER> >> +<STOP_TRIGGER> >> + >> + >> +/* >> + * For the syscall monitor the following probes get installed. >> + */ >> +struct syscall_data_t { >> + u64 count; >> + u64 total_ns; >> + u64 worst_ns; >> +}; >> + >> +struct syscall_data_key_t { >> + u32 pid; >> + u32 tid; >> + u32 syscall; >> +}; >> + >> +BPF_HASH(syscall_start, u64, u64); >> +BPF_HASH(syscall_data, struct syscall_data_key_t, struct syscall_data_t); >> + >> +TRACEPOINT_PROBE(raw_syscalls, sys_enter) { >> + u64 pid_tgid = bpf_get_current_pid_tgid(); >> + >> + if (!capture_enabled(pid_tgid)) >> + return 0; >> + >> + u64 t = bpf_ktime_get_ns(); >> + syscall_start.update(&pid_tgid, &t); >> + >> + return 0; >> +} >> + >> +TRACEPOINT_PROBE(raw_syscalls, sys_exit) { >> + struct syscall_data_t *val, zero = {}; >> + struct syscall_data_key_t key; >> + >> + u64 pid_tgid = bpf_get_current_pid_tgid(); >> + >> + if (!capture_enabled(pid_tgid)) >> + return 0; >> + >> + key.pid = pid_tgid >> 32; >> + key.tid = (u32)pid_tgid; >> + key.syscall = args->id; >> + >> + u64 *start_ns = syscall_start.lookup(&pid_tgid); >> + >> + if (!start_ns) >> + return 0; >> + >> + val = syscall_data.lookup_or_try_init(&key, &zero); >> + if (val) { >> + u64 delta = bpf_ktime_get_ns() - *start_ns; >> + val->count++; >> + val->total_ns += delta; >> + if (val->worst_ns == 0 || delta > val->worst_ns) >> + val->worst_ns = delta; >> + >> + if (<SYSCAL_TRACE_EVENTS>) { >> + struct event_t *event = get_event(EVENT_SYSCALL); >> + if (event) { >> + event->syscall = args->id; >> + event->entry_ts = *start_ns; >> + if (<STACK_TRACE_ENABLED>) { >> + event->user_stack_id = stack_traces.get_stackid( >> + args, BPF_F_USER_STACK); >> + event->kernel_stack_id = stack_traces.get_stackid( >> + args, 0); >> + } >> + events.ringbuf_submit(event, 0); >> + } >> + } >> + } >> + return 0; >> +} >> + >> + >> + >> +/* >> + * For measuring the thread run time, we need the following. >> + */ >> +struct run_time_data_t { >> + u64 count; >> + u64 total_ns; >> + u64 max_ns; >> + u64 min_ns; >> +}; >> + >> +struct pid_tid_key_t { >> + u32 pid; >> + u32 tid; >> +}; >> + >> +BPF_HASH(run_start, u64, u64); >> +BPF_HASH(run_data, struct pid_tid_key_t, struct run_time_data_t); >> + >> +static inline void thread_start_run(u64 pid_tgid, u64 ktime) >> +{ >> + run_start.update(&pid_tgid, &ktime); >> +} >> + >> +static inline void thread_stop_run(u32 pid, u32 tgid, u64 ktime) >> +{ >> + u64 pid_tgid = (u64) tgid << 32 | pid; >> + u64 *start_ns = run_start.lookup(&pid_tgid); >> + >> + if (!start_ns || *start_ns == 0) >> + return; >> + >> + struct run_time_data_t *val, zero = {}; >> + struct pid_tid_key_t key = { .pid = tgid, >> + .tid = pid }; >> + >> + val = run_data.lookup_or_try_init(&key, &zero); >> + if (val) { >> + u64 delta = ktime - *start_ns; >> + val->count++; >> + val->total_ns += delta; >> + if (val->max_ns == 0 || delta > val->max_ns) >> + val->max_ns = delta; >> + if (val->min_ns == 0 || delta < val->min_ns) >> + val->min_ns = delta; >> + } >> + *start_ns = 0; >> +} >> + >> + >> +/* >> + * For measuring the thread-ready delay, we need the following. >> + */ >> +struct ready_data_t { >> + u64 count; >> + u64 total_ns; >> + u64 worst_ns; >> +}; >> + >> +BPF_HASH(ready_start, u64, u64); >> +BPF_HASH(ready_data, struct pid_tid_key_t, struct ready_data_t); >> + >> +static inline int sched_wakeup__(u32 pid, u32 tgid) >> +{ >> + u64 pid_tgid = (u64) tgid << 32 | pid; >> + >> + if (!capture_enabled(pid_tgid)) >> + return 0; >> + >> + u64 t = bpf_ktime_get_ns(); >> + ready_start.update(&pid_tgid, &t); >> + return 0; >> +} >> + >> +RAW_TRACEPOINT_PROBE(sched_wakeup) >> +{ >> + struct task_struct *t = (struct task_struct *)ctx->args[0]; >> + return sched_wakeup__(t->pid, t->tgid); >> +} >> + >> +RAW_TRACEPOINT_PROBE(sched_wakeup_new) >> +{ >> + struct task_struct *t = (struct task_struct *)ctx->args[0]; >> + return sched_wakeup__(t->pid, t->tgid); >> +} >> + >> +RAW_TRACEPOINT_PROBE(sched_switch) >> +{ >> + struct task_struct *prev = (struct task_struct *)ctx->args[1]; >> + struct task_struct *next= (struct task_struct *)ctx->args[2]; >> + u64 ktime = 0; >> + >> + if (!capture_enabled__()) >> + return 0; >> + >> + if (prev-><STATE_FIELD> == TASK_RUNNING && prev->tgid == MONITOR_PID) >> + sched_wakeup__(prev->pid, prev->tgid); >> + >> + if (prev->tgid == MONITOR_PID) { >> + ktime = bpf_ktime_get_ns(); >> + thread_stop_run(prev->pid, prev->tgid, ktime); >> + } >> + >> + u64 pid_tgid = (u64)next->tgid << 32 | next->pid; >> + >> + if (next->tgid != MONITOR_PID) >> + return 0; >> + >> + if (ktime == 0) >> + ktime = bpf_ktime_get_ns(); >> + >> + u64 *start_ns = ready_start.lookup(&pid_tgid); >> + >> + if (start_ns && *start_ns != 0) { >> + >> + struct ready_data_t *val, zero = {}; >> + struct pid_tid_key_t key = { .pid = next->tgid, >> + .tid = next->pid }; >> + >> + val = ready_data.lookup_or_try_init(&key, &zero); >> + if (val) { >> + u64 delta = ktime - *start_ns; >> + val->count++; >> + val->total_ns += delta; >> + if (val->worst_ns == 0 || delta > val->worst_ns) >> + val->worst_ns = delta; >> + } >> + *start_ns = 0; >> + } >> + >> + thread_start_run(pid_tgid, ktime); >> + return 0; >> +} >> + >> +/* >> + * For measuring the hard irq time, we need the following. >> + */ >> +struct hardirq_start_data_t { >> + u64 start_ns; >> + char irq_name[32]; >> +}; >> + >> +struct hardirq_data_t { >> + u64 count; >> + u64 total_ns; >> + u64 worst_ns; >> +}; >> + >> +struct hardirq_data_key_t { >> + u32 pid; >> + u32 tid; >> + char irq_name[32]; >> +}; >> + >> +BPF_HASH(hardirq_start, u64, struct hardirq_start_data_t); >> +BPF_HASH(hardirq_data, struct hardirq_data_key_t, struct hardirq_data_t); >> + >> +TRACEPOINT_PROBE(irq, irq_handler_entry) >> +{ >> + u64 pid_tgid = bpf_get_current_pid_tgid(); >> + >> + if (!capture_enabled(pid_tgid)) >> + return 0; >> + >> + struct hardirq_start_data_t data = {}; >> + >> + data.start_ns = bpf_ktime_get_ns(); >> + TP_DATA_LOC_READ_STR(&data.irq_name, name, sizeof(data.irq_name)); >> + hardirq_start.update(&pid_tgid, &data); >> + return 0; >> +} >> + >> +TRACEPOINT_PROBE(irq, irq_handler_exit) >> +{ >> + u64 pid_tgid = bpf_get_current_pid_tgid(); >> + >> + if (!capture_enabled(pid_tgid)) >> + return 0; >> + >> + struct hardirq_start_data_t *data; >> + data = hardirq_start.lookup(&pid_tgid); >> + if (!data || data->start_ns == 0) >> + return 0; >> + >> + if (args->ret != IRQ_NONE) { >> + struct hardirq_data_t *val, zero = {}; >> + struct hardirq_data_key_t key = { .pid = pid_tgid >> 32, >> + .tid = (u32)pid_tgid }; >> + >> + bpf_probe_read_kernel(&key.irq_name, sizeof(key.irq_name), >> + data->irq_name); >> + val = hardirq_data.lookup_or_try_init(&key, &zero); >> + if (val) { >> + u64 delta = bpf_ktime_get_ns() - data->start_ns; >> + val->count++; >> + val->total_ns += delta; >> + if (val->worst_ns == 0 || delta > val->worst_ns) >> + val->worst_ns = delta; >> + } >> + } >> + >> + data->start_ns = 0; >> + return 0; >> +} >> + >> +/* >> + * For measuring the soft irq time, we need the following. >> + */ >> +struct softirq_start_data_t { >> + u64 start_ns; >> + u32 vec_nr; >> +}; >> + >> +struct softirq_data_t { >> + u64 count; >> + u64 total_ns; >> + u64 worst_ns; >> +}; >> + >> +struct softirq_data_key_t { >> + u32 pid; >> + u32 tid; >> + u32 vec_nr; >> +}; >> + >> +BPF_HASH(softirq_start, u64, struct softirq_start_data_t); >> +BPF_HASH(softirq_data, struct softirq_data_key_t, struct softirq_data_t); >> + >> +TRACEPOINT_PROBE(irq, softirq_entry) >> +{ >> + u64 pid_tgid = bpf_get_current_pid_tgid(); >> + >> + if (!capture_enabled(pid_tgid)) >> + return 0; >> + >> + struct softirq_start_data_t data = {}; >> + >> + data.start_ns = bpf_ktime_get_ns(); >> + data.vec_nr = args->vec; >> + softirq_start.update(&pid_tgid, &data); >> + return 0; >> +} >> + >> +TRACEPOINT_PROBE(irq, softirq_exit) >> +{ >> + u64 pid_tgid = bpf_get_current_pid_tgid(); >> + >> + if (!capture_enabled(pid_tgid)) >> + return 0; >> + >> + struct softirq_start_data_t *data; >> + data = softirq_start.lookup(&pid_tgid); >> + if (!data || data->start_ns == 0) >> + return 0; >> + >> + struct softirq_data_t *val, zero = {}; >> + struct softirq_data_key_t key = { .pid = pid_tgid >> 32, >> + .tid = (u32)pid_tgid, >> + .vec_nr = data->vec_nr}; >> + >> + val = softirq_data.lookup_or_try_init(&key, &zero); >> + if (val) { >> + u64 delta = bpf_ktime_get_ns() - data->start_ns; >> + val->count++; >> + val->total_ns += delta; >> + if (val->worst_ns == 0 || delta > val->worst_ns) >> + val->worst_ns = delta; >> + } >> + >> + data->start_ns = 0; >> + return 0; >> +} >> +""" >> + >> + >> +# >> +# Probe class to use for the start/stop triggers >> +# >> +class Probe(object): >> + ''' >> + The goal for this object is to support as many as possible >> + probe/events as supported by BCC. See >> + >> https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments >> + ''' >> + def __init__(self, probe, pid=None): >> + self.pid = pid >> + self.text_probe = probe >> + self._parse_text_probe() >> + >> + def __str__(self): >> + if self.probe_type == "usdt": >> + return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type, >> + self.usdt_provider, >> self.usdt_probe) >> + elif self.probe_type == "trace": >> + return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type, >> + self.trace_system, >> self.trace_event) >> + elif self.probe_type == "kprobe" or self.probe_type == "kretprobe": >> + return "[{}]; {}:{}".format(self.text_probe, self.probe_type, >> + self.kprobe_function) >> + elif self.probe_type == "uprobe" or self.probe_type == "uretprobe": >> + return "[{}]; {}:{}".format(self.text_probe, self.probe_type, >> + self.uprobe_function) >> + else: >> + return "[{}] <{}:unknown probe>".format(self.text_probe, >> + self.probe_type) >> + >> + def _raise(self, error): >> + raise ValueError("[{}]; {}".format(self.text_probe, error)) >> + >> + def _verify_kprobe_probe(self): >> + # Nothing to verify for now, just return. >> + return >> + >> + def _verify_trace_probe(self): >> + # Nothing to verify for now, just return. >> + return >> + >> + def _verify_uprobe_probe(self): >> + # Nothing to verify for now, just return. >> + return >> + >> + def _verify_usdt_probe(self): >> + if not self.pid: >> + self._raise("USDT probes need a valid PID.") >> + >> + usdt = USDT(pid=self.pid) >> + >> + for probe in usdt.enumerate_probes(): >> + if probe.provider.decode('utf-8') == self.usdt_provider and \ >> + probe.name.decode('utf-8') == self.usdt_probe: >> + return >> + >> + self._raise("Can't find UDST probe >> '{}:{}'".format(self.usdt_provider, >> + self.usdt_probe)) >> + >> + def _parse_text_probe(self): >> + ''' >> + The text probe format is defined as follows: >> + <probe_type>:<probe_specific> >> + >> + Types: >> + USDT: u|usdt:<provider>:<probe> >> + TRACE: t|trace:<system>:<event> >> + KPROBE: k|kprobe:<kernel_function> >> + KRETPROBE: kr|kretprobe:<kernel_function> >> + UPROBE: up|uprobe:<function> >> + URETPROBE: ur|uretprobe:<function> >> + ''' >> + args = self.text_probe.split(":") >> + if len(args) <= 1: >> + self._raise("Can't extract probe type.") >> + >> + if args[0] not in ["k", "kprobe", "kr", "kretprobe", "t", "trace", >> + "u", "usdt", "up", "uprobe", "ur", "uretprobe"]: >> + self._raise("Invalid probe type '{}'".format(args[0])) >> + >> + self.probe_type = "kprobe" if args[0] == "k" else args[0] >> + self.probe_type = "kretprobe" if args[0] == "kr" else >> self.probe_type >> + self.probe_type = "trace" if args[0] == "t" else self.probe_type >> + self.probe_type = "usdt" if args[0] == "u" else self.probe_type >> + self.probe_type = "uprobe" if args[0] == "up" else self.probe_type >> + self.probe_type = "uretprobe" if args[0] == "ur" else >> self.probe_type >> + >> + if self.probe_type == "usdt": >> + if len(args) != 3: >> + self._raise("Invalid number of arguments for USDT") >> + >> + self.usdt_provider = args[1] >> + self.usdt_probe = args[2] >> + self._verify_usdt_probe() >> + >> + elif self.probe_type == "trace": >> + if len(args) != 3: >> + self._raise("Invalid number of arguments for TRACE") >> + >> + self.trace_system = args[1] >> + self.trace_event = args[2] >> + self._verify_trace_probe() >> + >> + elif self.probe_type == "kprobe" or self.probe_type == "kretprobe": >> + if len(args) != 2: >> + self._raise("Invalid number of arguments for K(RET)PROBE") >> + self.kprobe_function = args[1] >> + self._verify_kprobe_probe() >> + >> + elif self.probe_type == "uprobe" or self.probe_type == "uretprobe": >> + if len(args) != 2: >> + self._raise("Invalid number of arguments for U(RET)PROBE") >> + self.uprobe_function = args[1] >> + self._verify_uprobe_probe() >> + >> + def _get_kprobe_c_code(self, function_name, function_content): >> + # >> + # The kprobe__* do not require a function name, so it's >> + # ignored in the code generation. >> + # >> + return """ >> +int {}__{}(struct pt_regs *ctx) {{ >> + {} >> +}} >> +""".format(self.probe_type, self.kprobe_function, function_content) >> + >> + def _get_trace_c_code(self, function_name, function_content): >> + # >> + # The TRACEPOINT_PROBE() do not require a function name, so it's >> + # ignored in the code generation. >> + # >> + return """ >> +TRACEPOINT_PROBE({},{}) {{ >> + {} >> +}} >> +""".format(self.trace_system, self.trace_event, function_content) >> + >> + def _get_uprobe_c_code(self, function_name, function_content): >> + return """ >> +int {}(struct pt_regs *ctx) {{ >> + {} >> +}} >> +""".format(function_name, function_content) >> + >> + def _get_usdt_c_code(self, function_name, function_content): >> + return """ >> +int {}(struct pt_regs *ctx) {{ >> + {} >> +}} >> +""".format(function_name, function_content) >> + >> + def get_c_code(self, function_name, function_content): >> + if self.probe_type == 'kprobe' or self.probe_type == 'kretprobe': >> + return self._get_kprobe_c_code(function_name, function_content) >> + elif self.probe_type == 'trace': >> + return self._get_trace_c_code(function_name, function_content) >> + elif self.probe_type == 'uprobe' or self.probe_type == 'uretprobe': >> + return self._get_uprobe_c_code(function_name, function_content) >> + elif self.probe_type == 'usdt': >> + return self._get_usdt_c_code(function_name, function_content) >> + >> + return "" >> + >> + def probe_name(self): >> + if self.probe_type == 'kprobe' or self.probe_type == 'kretprobe': >> + return "{}".format(self.kprobe_function) >> + elif self.probe_type == 'trace': >> + return "{}:{}".format(self.trace_system, >> + self.trace_event) >> + elif self.probe_type == 'uprobe' or self.probe_type == 'uretprobe': >> + return "{}".format(self.uprobe_function) >> + elif self.probe_type == 'usdt': >> + return "{}:{}".format(self.usdt_provider, >> + self.usdt_probe) >> + >> + return "" >> + >> + >> +# >> +# event_to_dict() >> +# >> +def event_to_dict(event): >> + return dict([(field, getattr(event, field)) >> + for (field, _) in event._fields_ >> + if isinstance(getattr(event, field), (int, bytes))]) >> + >> + >> +# >> +# Event enum >> +# >> +Event = IntEnum("Event", ["SYSCALL", "START_TRIGGER", "STOP_TRIGGER", >> + ], start=0) >> + >> + >> +# >> +# process_event() >> +# >> +def process_event(ctx, data, size): >> + global start_trigger_ts >> + global stop_trigger_ts >> + >> + event = bpf['events'].event(data) >> + if event.id == Event.SYSCALL: >> + syscall_events.append({"tid": event.tid, >> + "ts_entry": event.entry_ts, >> + "ts_exit": event.ts, >> + "syscall": event.syscall, >> + "user_stack_id": event.user_stack_id, >> + "kernel_stack_id": event.kernel_stack_id}) >> + elif event.id == Event.START_TRIGGER: >> + # >> + # This event would have start the trigger already, so all we need to >> + # do is record the start timestamp. >> + # >> + start_trigger_ts = event.ts >> + >> + elif event.id == Event.STOP_TRIGGER: >> + # >> + # This event would have stopped the trigger already, so all we need >> to >> + # do is record the start timestamp. >> + stop_trigger_ts = event.ts >> + >> + >> +# >> +# next_power_of_two() >> +# >> +def next_power_of_two(val): >> + np = 1 >> + while np < val: >> + np *= 2 >> + return np >> + >> + >> +# >> +# unsigned_int() >> +# >> +def unsigned_int(value): >> + try: >> + value = int(value) >> + except ValueError: >> + raise argparse.ArgumentTypeError("must be an integer") >> + >> + if value < 0: >> + raise argparse.ArgumentTypeError("must be positive") >> + return value >> + >> + >> +# >> +# unsigned_nonzero_int() >> +# >> +def unsigned_nonzero_int(value): >> + value = unsigned_int(value) >> + if value == 0: >> + raise argparse.ArgumentTypeError("must be nonzero") >> + return value >> + >> + >> +# >> +# get_thread_name() >> +# >> +def get_thread_name(pid, tid): >> + try: >> + with open(f"/proc/{pid}/task/{tid}/comm", encoding="utf8") as f: >> + return f.readline().strip("\n") >> + except FileNotFoundError: >> + pass >> + >> + return f"<unknown:{pid}/{tid}>" >> + >> + >> +# >> +# get_vec_nr_name() >> +# >> +def get_vec_nr_name(vec_nr): >> + known_vec_nr = ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll", >> + "tasklet", "sched", "hrtimer", "rcu"] >> + >> + if vec_nr < 0 or vec_nr > len(known_vec_nr): >> + return f"<unknown:{vec_nr}>" >> + >> + return known_vec_nr[vec_nr] >> + >> + >> +# >> +# start/stop/reset capture >> +# >> +def start_capture(): >> + bpf["capture_on"][ct.c_int(0)] = ct.c_int(1) >> + >> + >> +def stop_capture(force=False): >> + if force: >> + bpf["capture_on"][ct.c_int(0)] = ct.c_int(0xffff) >> + else: >> + bpf["capture_on"][ct.c_int(0)] = ct.c_int(0) >> + >> + >> +def capture_running(): >> + return True if bpf["capture_on"][ct.c_int(0)].value == 1 else False >> + >> + >> +def reset_capture(): >> + bpf["syscall_start"].clear() >> + bpf["syscall_data"].clear() >> + bpf["run_start"].clear() >> + bpf["run_data"].clear() >> + bpf["ready_start"].clear() >> + bpf["ready_data"].clear() >> + bpf["hardirq_start"].clear() >> + bpf["hardirq_data"].clear() >> + bpf["softirq_start"].clear() >> + bpf["softirq_data"].clear() >> + bpf["stack_traces"].clear() >> + >> + >> +# >> +# Display timestamp >> +# >> +def print_timestamp(msg): >> + ltz = datetime.datetime.now() >> + utc = ltz.astimezone(pytz.utc) >> + time_string = "{} @{} ({} UTC)".format( >> + msg, ltz.isoformat(), utc.strftime("%H:%M:%S")) >> + print(time_string) >> + >> + >> +# >> +# process_results() >> +# >> +def process_results(syscal_events=None, trigger_delta=None): >> + if trigger_delta: >> + print_timestamp("# Triggered sample dump, stop-start delta {:,} ns". >> + format(trigger_delta)) >> + else: >> + print_timestamp("# Sample dump") >> + >> + # >> + # First get a list of all threads we need to report on. >> + # >> + threads_syscall = {k.tid for k, _ in bpf["syscall_data"].items() >> + if k.syscall != 0xffffffff} >> + >> + threads_run = {k.tid for k, _ in bpf["run_data"].items() >> + if k.pid != 0xffffffff} >> + >> + threads_ready = {k.tid for k, _ in bpf["ready_data"].items() >> + if k.pid != 0xffffffff} >> + >> + threads_hardirq = {k.tid for k, _ in bpf["hardirq_data"].items() >> + if k.pid != 0xffffffff} >> + >> + threads_softirq = {k.tid for k, _ in bpf["softirq_data"].items() >> + if k.pid != 0xffffffff} >> + >> + threads = sorted(threads_syscall | threads_run | threads_ready | >> + threads_hardirq | threads_softirq, >> + key=lambda x: get_thread_name(options.pid, x)) >> + >> + # >> + # Print header... >> + # >> + print("{:10} {:16} {}".format("TID", "THREAD", "<RESOURCE SPECIFIC>")) >> + print("{:10} {:16} {}".format("-" * 10, "-" * 16, "-" * 76)) >> + indent = 28 * " " >> + >> + # >> + # Print all events/statistics per threads. >> + # >> + poll_id = [k for k, v in syscalls.items() if v == b'poll'][0] >> + for thread in threads: >> + >> + if thread != threads[0]: >> + print("") >> + >> + # >> + # SYSCALL_STATISTICS >> + # >> + print("{:10} {:16} {}\n{}{:20} {:>6} {:>10} {:>16} >> {:>16}".format( >> + thread, get_thread_name(options.pid, thread), >> + "[SYSCALL STATISTICS]", indent, >> + "NAME", "NUMBER", "COUNT", "TOTAL ns", "MAX ns")) >> + >> + total_count = 0 >> + total_ns = 0 >> + for k, v in sorted(bpf["syscall_data"].items(), >> + key=lambda kv: -kv[1].total_ns): >> + if k.tid != thread: >> + continue >> + >> + print("{}{:20.20} {:6} {:10} {:16,} {:16,}".format( >> + indent, syscall_name(k.syscall).decode('utf-8'), k.syscall, >> + v.count, v.total_ns, v.worst_ns)) >> + if k.syscall != poll_id: >> + total_count += v.count >> + total_ns += v.total_ns >> + >> + if total_count > 0: >> + print("{}{:20.20} {:6} {:10} {:16,}".format( >> + indent, "TOTAL( - poll):", "", total_count, total_ns)) >> + >> + # >> + # THREAD RUN STATISTICS >> + # >> + print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16} {:>16}".format( >> + "", "", "[THREAD RUN STATISTICS]", indent, >> + "SCHED_CNT", "TOTAL ns", "MIN ns", "MAX ns")) >> + >> + for k, v in bpf["run_data"].items(): >> + if k.tid != thread: >> + continue >> + >> + print("{}{:10} {:16,} {:16,} {:16,}".format( >> + indent, v.count, v.total_ns, v.min_ns, v.max_ns)) >> + >> + # >> + # THREAD READY STATISTICS >> + # >> + print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16}".format( >> + "", "", "[THREAD READY STATISTICS]", indent, >> + "SCHED_CNT", "TOTAL ns", "MAX ns")) >> + >> + for k, v in bpf["ready_data"].items(): >> + if k.tid != thread: >> + continue >> + >> + print("{}{:10} {:16,} {:16,}".format( >> + indent, v.count, v.total_ns, v.worst_ns)) >> + >> + # >> + # HARD IRQ STATISTICS >> + # >> + total_ns = 0 >> + total_count = 0 >> + header_printed = False >> + for k, v in sorted(bpf["hardirq_data"].items(), >> + key=lambda kv: -kv[1].total_ns): >> + if k.tid != thread: >> + continue >> + >> + if not header_printed: >> + print("\n{:10} {:16} {}\n{}{:20} {:>10} {:>16} {:>16}". >> + format("", "", "[HARD IRQ STATISTICS]", indent, >> + "NAME", "COUNT", "TOTAL ns", "MAX ns")) >> + header_printed = True >> + >> + print("{}{:20.20} {:10} {:16,} {:16,}".format( >> + indent, k.irq_name.decode('utf-8'), >> + v.count, v.total_ns, v.worst_ns)) >> + >> + total_count += v.count >> + total_ns += v.total_ns >> + >> + if total_count > 0: >> + print("{}{:20.20} {:10} {:16,}".format( >> + indent, "TOTAL:", total_count, total_ns)) >> + >> + # >> + # SOFT IRQ STATISTICS >> + # >> + total_ns = 0 >> + total_count = 0 >> + header_printed = False >> + for k, v in sorted(bpf["softirq_data"].items(), >> + key=lambda kv: -kv[1].total_ns): >> + if k.tid != thread: >> + continue >> + >> + if not header_printed: >> + print("\n{:10} {:16} {}\n" >> + "{}{:20} {:>7} {:>10} {:>16} {:>16}". >> + format("", "", "[SOFT IRQ STATISTICS]", indent, >> + "NAME", "VECT_NR", "COUNT", "TOTAL ns", "MAX >> ns")) >> + header_printed = True >> + >> + print("{}{:20.20} {:>7} {:10} {:16,} {:16,}".format( >> + indent, get_vec_nr_name(k.vec_nr), k.vec_nr, >> + v.count, v.total_ns, v.worst_ns)) >> + >> + total_count += v.count >> + total_ns += v.total_ns >> + >> + if total_count > 0: >> + print("{}{:20.20} {:7} {:10} {:16,}".format( >> + indent, "TOTAL:", "", total_count, total_ns)) >> + >> + # >> + # Print events >> + # >> + lost_stack_traces = 0 >> + if syscall_events is not None and len(syscall_events) > 0: >> + stack_traces = bpf.get_table("stack_traces") >> + >> + print("\n\n# SYSCALL EVENTS:" >> + "\n{}{:>19} {:>19} {:>10} {:16} {:>10} {}".format( >> + 2 * " ", "ENTRY (ns)", "EXIT (ns)", "TID", "COMM", >> + "DELTA (us)", "SYSCALL")) >> + print("{}{:19} {:19} {:10} {:16} {:10} {}".format( >> + 2 * " ", "-" * 19, "-" * 19, "-" * 10, "-" * 16, >> + "-" * 10, "-" * 16)) >> + for event in syscall_events: >> + print("{}{:19} {:19} {:10} {:16} {:10,} {}".format( >> + " " * 2, >> + event["ts_entry"], event["ts_exit"], event["tid"], >> + get_thread_name(options.pid, event["tid"]), >> + int((event["ts_exit"] - event["ts_entry"]) / 1000), >> + syscall_name(event["syscall"]).decode('utf-8'))) >> + # >> + # Not sure where to put this, but I'll add some info on stack >> + # traces here... Userspace stack traces are very limited due to >> + # the fact that bcc does not support dwarf backtraces. As OVS >> + # gets compiled without frame pointers we will not see much. >> + # If however, OVS does get built with frame pointers, we should >> not >> + # use the BPF_STACK_TRACE_BUILDID as it does not seem to handle >> + # the debug symbols correctly. Also, note that for kernel >> + # traces you should not use BPF_STACK_TRACE_BUILDID, so two >> + # buffers are needed. >> + # >> + # Some info on manual dwarf walk support: >> + # https://github.com/iovisor/bcc/issues/3515 >> + # https://github.com/iovisor/bcc/pull/4463 >> + # >> + if options.stack_trace_size == 0: >> + continue >> + >> + if event['kernel_stack_id'] < 0 or event['user_stack_id'] < 0: >> + lost_stack_traces += 1 >> + >> + kernel_stack = stack_traces.walk(event['kernel_stack_id']) \ >> + if event['kernel_stack_id'] >= 0 else [] >> + user_stack = stack_traces.walk(event['user_stack_id']) \ >> + if event['user_stack_id'] >= 0 else [] >> + >> + for addr in kernel_stack: >> + print("{}{}".format( >> + " " * 10, >> + bpf.ksym(addr, show_module=True, >> + show_offset=True).decode('utf-8', 'replace'))) >> + >> + for addr in user_stack: >> + addr_str = bpf.sym(addr, options.pid, show_module=True, >> + show_offset=True).decode('utf-8', >> 'replace') >> + >> + if addr_str == "[unknown]": >> + addr_str += " 0x{:x}".format(addr) >> + >> + print("{}{}".format(" " * 10, addr_str)) >> + >> + # >> + # Print any footer messages. >> + # >> + if lost_stack_traces > 0: >> + print("\n#WARNING: We where not able to display {} stack traces!\n" >> + "# Consider increasing the stack trace size using\n" >> + "# the '--stack-trace-size' option.\n" >> + "# Note that this can also happen due to a stack id\n" >> + "# collision.".format(lost_stack_traces)) >> + >> + >> +# >> +# main() >> +# >> +def main(): >> + # >> + # Don't like these globals, but ctx passing does not seem to work with >> the >> + # existing open_ring_buffer() API :( >> + # >> + global bpf >> + global options >> + global syscall_events >> + global start_trigger_ts >> + global stop_trigger_ts >> + >> + start_trigger_ts = 0 >> + stop_trigger_ts = 0 >> + >> + # >> + # Argument parsing >> + # >> + parser = argparse.ArgumentParser() >> + >> + parser.add_argument("-D", "--debug", >> + help="Enable eBPF debugging", >> + type=int, const=0x3f, default=0, nargs='?') >> + parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID", >> + help="ovs-vswitch's PID", >> + type=unsigned_int, default=None) >> + parser.add_argument("-s", "--syscall-events", metavar="DURATION_NS", >> + help="Record syscall events, default disabled", >> + type=unsigned_int, const=0, default=None, nargs='?') >> + parser.add_argument("--buffer-page-count", >> + help="Number of BPF ring buffer pages, default >> 1024", >> + type=unsigned_int, default=1024, metavar="NUMBER") >> + parser.add_argument("--sample-count", >> + help="Number of sample runs, default 1", >> + type=unsigned_nonzero_int, default=1, >> metavar="RUNS") >> + parser.add_argument("--sample-interval", >> + help="Delay between sample runs, default 0", >> + type=float, default=0, metavar="SECONDS") >> + parser.add_argument("--sample-time", >> + help="Sample time, default 0.5 seconds", >> + type=float, default=0.5, metavar="SECONDS") >> + parser.add_argument("--skip-syscall-poll-events", >> + help="Skip poll() syscalls with --syscall-events", >> + action="store_true") >> + parser.add_argument("--stack-trace-size", >> + help="Number of unique stack traces that can be " >> + "recorded, default 4096. 0 to disable", >> + type=unsigned_int, default=4096) >> + parser.add_argument("--start-trigger", metavar="TRIGGER", >> + help="Start trigger, see documentation for details", >> + type=str, default=None) >> + parser.add_argument("--stop-trigger", metavar="TRIGGER", >> + help="Stop trigger, see documentation for details", >> + type=str, default=None) >> + parser.add_argument("--trigger-delta", metavar="DURATION_NS", >> + help="Only report event when the trigger duration > >> " >> + "DURATION_NS, default 0 (all events)", >> + type=unsigned_int, const=0, default=0, nargs='?') >> + >> + options = parser.parse_args() >> + >> + # >> + # Find the PID of the ovs-vswitchd daemon if not specified. >> + # >> + if options.pid is None: >> + for proc in psutil.process_iter(): >> + if 'ovs-vswitchd' in proc.name(): >> + if options.pid is not None: >> + print("ERROR: Multiple ovs-vswitchd daemons running, " >> + "use the -p option!") >> + sys.exit(-1) > > We have some inconsistencies here because some utilities we send > "sys.exit(1)" and some we send "sys.exit(-1)" > > I think we switched to using '1' because it is a valid return value on > other OSes. I guess for now there isn't any need to normalize it since > these are eBPF programs intended to run on linux. That said, maybe we > could consider a normalized value anyway just for review purposes. ACK, good idea! I changed all the exit codes using os.EX_???. >> + >> + options.pid = proc.pid >> + >> + # >> + # Error checking on input parameters. >> + # >> + if options.pid is None: >> + print("ERROR: Failed to find ovs-vswitchd's PID!") >> + sys.exit(-1) >> + >> + options.buffer_page_count = next_power_of_two(options.buffer_page_count) >> + >> + # >> + # Make sure we are running as root, or else we can not attach the >> probes. >> + # >> + if os.geteuid() != 0: >> + print("ERROR: We need to run as root to attached probes!") >> + sys.exit(-1) >> + >> + # >> + # Setup any of the start stop triggers >> + # >> + if options.start_trigger is not None: >> + try: >> + start_trigger = Probe(options.start_trigger, pid=options.pid) >> + except ValueError as e: >> + print(f"ERROR: Invalid start trigger {str(e)}") >> + sys.exit(-1) >> + else: >> + start_trigger = None >> + >> + if options.stop_trigger is not None: >> + try: >> + stop_trigger = Probe(options.stop_trigger, pid=options.pid) >> + except ValueError as e: >> + print(f"ERROR: Invalid stop trigger {str(e)}") >> + sys.exit(-1) >> + else: >> + stop_trigger = None >> + >> + # >> + # Attach probe to running process. >> + # >> + source = EBPF_SOURCE.replace("<EVENT_ENUM>", "\n".join( >> + [" EVENT_{} = {},".format( >> + event.name, event.value) for event in Event])) >> + source = source.replace("<BUFFER_PAGE_CNT>", >> + str(options.buffer_page_count)) >> + source = source.replace("<MONITOR_PID>", str(options.pid)) >> + >> + if BPF.kernel_struct_has_field(b'task_struct', b'state') == 1: >> + source = source.replace('<STATE_FIELD>', 'state') >> + else: >> + source = source.replace('<STATE_FIELD>', '__state') >> + >> + poll_id = [k for k, v in syscalls.items() if v == b'poll'][0] >> + if options.syscall_events is None: >> + syscall_trace_events = "false" >> + elif options.syscall_events == 0: >> + if not options.skip_syscall_poll_events: >> + syscall_trace_events = "true" >> + else: >> + syscall_trace_events = f"args->id != {poll_id}" >> + else: >> + syscall_trace_events = "delta > {}".format(options.syscall_events) >> + if options.skip_syscall_poll_events: >> + syscall_trace_events += f" && args->id != {poll_id}" >> + >> + source = source.replace("<SYSCAL_TRACE_EVENTS>", >> + syscall_trace_events) >> + >> + source = source.replace("<STACK_TRACE_SIZE>", >> + str(options.stack_trace_size)) >> + >> + source = source.replace("<STACK_TRACE_ENABLED>", "true" >> + if options.stack_trace_size > 0 else "false") >> + >> + # >> + # Handle start/stop probes >> + # >> + if start_trigger: >> + source = source.replace("<START_TRIGGER>", >> + start_trigger.get_c_code( >> + "start_trigger_probe", >> + "return start_trigger();")) >> + else: >> + source = source.replace("<START_TRIGGER>", "") >> + >> + if stop_trigger: >> + source = source.replace("<STOP_TRIGGER>", >> + stop_trigger.get_c_code( >> + "stop_trigger_probe", >> + "return stop_trigger();")) >> + else: >> + source = source.replace("<STOP_TRIGGER>", "") >> + >> + # >> + # Setup usdt or other probes that need handling trough the BFP class. >> + # >> + usdt = USDT(pid=int(options.pid)) >> + try: >> + if start_trigger and start_trigger.probe_type == 'usdt': >> + usdt.enable_probe(probe=start_trigger.probe_name(), >> + fn_name="start_trigger_probe") >> + if stop_trigger and stop_trigger.probe_type == 'usdt': >> + usdt.enable_probe(probe=stop_trigger.probe_name(), >> + fn_name="stop_trigger_probe") >> + >> + except USDTException as e: >> + print("ERROR: {}".format( >> + (re.sub('^', ' ' * 7, str(e), flags=re.MULTILINE)).strip(). >> + replace("--with-dtrace or --enable-dtrace", >> + "--enable-usdt-probes"))) >> + sys.exit(-1) >> + >> + bpf = BPF(text=source, usdt_contexts=[usdt], debug=options.debug) >> + >> + if start_trigger: >> + try: >> + if start_trigger.probe_type == "uprobe": >> + bpf.attach_uprobe(name=f"/proc/{options.pid}/exe", >> + sym=start_trigger.probe_name(), >> + fn_name="start_trigger_probe", >> + pid=options.pid) >> + >> + if start_trigger.probe_type == "uretprobe": >> + bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe", >> + sym=start_trigger.probe_name(), >> + fn_name="start_trigger_probe", >> + pid=options.pid) >> + except Exception as e: >> + print("ERROR: Failed attaching uprobe start trigger " >> + f"'{start_trigger.probe_name()}';\n {str(e)}") >> + sys.exit(-1) >> + >> + if stop_trigger: >> + try: >> + if stop_trigger.probe_type == "uprobe": >> + bpf.attach_uprobe(name=f"/proc/{options.pid}/exe", >> + sym=stop_trigger.probe_name(), >> + fn_name="stop_trigger_probe", >> + pid=options.pid) >> + >> + if stop_trigger.probe_type == "uretprobe": >> + bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe", >> + sym=stop_trigger.probe_name(), >> + fn_name="stop_trigger_probe", >> + pid=options.pid) >> + except Exception as e: >> + print("ERROR: Failed attaching uprobe stop trigger" >> + f"'{stop_trigger.probe_name()}';\n {str(e)}") >> + sys.exit(-1) >> + >> + # >> + # If no triggers are configured use the delay configuration >> + # >> + bpf['events'].open_ring_buffer(process_event) >> + >> + sample_count = 0 >> + while sample_count < options.sample_count: >> + sample_count += 1 >> + syscall_events = [] >> + >> + if options.start_trigger is None: >> + print_timestamp("# Start sampling") >> + start_capture() >> + stop_time = -1 if options.stop_trigger else \ >> + time_ns() + options.sample_time * 1000000000 >> + else: >> + # For start triggers the stop time depends on the start trigger >> + # time, or depends on the stop trigger if configured. >> + stop_time = -1 if options.stop_trigger else 0 >> + >> + while True: >> + try: >> + last_start_ts = start_trigger_ts >> + last_stop_ts = stop_trigger_ts >> + >> + if stop_time > 0: >> + delay = int((stop_time - time_ns()) / 1000000) >> + if delay <= 0: >> + break >> + else: >> + delay = -1 >> + >> + bpf.ring_buffer_poll(timeout=delay) >> + >> + if stop_time <= 0 and last_start_ts != start_trigger_ts: >> + print_timestamp( >> + "# Start sampling (trigger@{})".format( >> + start_trigger_ts)) >> + >> + if not options.stop_trigger: >> + stop_time = time_ns() + \ >> + options.sample_time * 1000000000 >> + >> + if last_stop_ts != stop_trigger_ts: >> + break >> + >> + except KeyboardInterrupt: >> + break >> + >> + if options.stop_trigger and not capture_running(): >> + print_timestamp("# Stop sampling (trigger@{})".format( >> + stop_trigger_ts)) >> + else: >> + print_timestamp("# Stop sampling") >> + >> + if stop_trigger_ts != 0 and start_trigger_ts != 0: >> + trigger_delta = stop_trigger_ts - start_trigger_ts >> + else: >> + trigger_delta = None >> + >> + if not trigger_delta or trigger_delta >= options.trigger_delta: >> + stop_capture(force=True) # Prevent a new trigger to start. >> + process_results(syscal_events=syscall_events, >> + trigger_delta=trigger_delta) >> + elif trigger_delta: >> + sample_count -= 1 >> + print_timestamp("# Sample dump skipped, delta {:,} ns".format( >> + trigger_delta)) >> + >> + reset_capture() >> + stop_capture() >> + if options.sample_interval > 0: >> + time.sleep(options.sample_interval) >> + >> + # >> + # Report lost events. >> + # >> + dropcnt = bpf.get_table("dropcnt") >> + for k in dropcnt.keys(): >> + count = dropcnt.sum(k).value >> + if k.value == 0 and count > 0: >> + print("\n# WARNING: Not all events were captured, {} were " >> + "dropped!\n# Increase the BPF ring buffer size " >> + "with the --buffer-page-count option.".format(count)) >> + >> + if (options.sample_count > 1): >> + trigger_miss = bpf.get_table("trigger_miss") >> + for k in trigger_miss.keys(): >> + count = trigger_miss.sum(k).value >> + if k.value == 0 and count > 0: >> + print("\n# WARNING: Not all start triggers were successful. >> " >> + "{} were missed due to\n# slow userspace " >> + "processing!".format(count)) >> + >> + >> +# >> +# Start main() as the default entry point... >> +# >> +if __name__ == '__main__': >> + main() >> diff --git a/utilities/usdt-scripts/kernel_delay.rst >> b/utilities/usdt-scripts/kernel_delay.rst >> new file mode 100644 >> index 000000000..95e98db34 >> --- /dev/null >> +++ b/utilities/usdt-scripts/kernel_delay.rst >> @@ -0,0 +1,594 @@ >> +Troubleshooting Open vSwitch: Is the kernel to blame? >> +===================================================== >> +Often, when troubleshooting Open vSwitch (OVS) in the field, you might be >> left >> +wondering if the issue is really OVS-related, or if it's a problem with the >> +kernel being overloaded. The kernel_delay.py tool can help you quickly >> +identify if the focus of your investigation should be OVS or the Linux >> kernel. > > We might want to give additional context here. For example, hints that > the system could be overloaded due to the WARN level log "Unreasonably > long poll..." or uname/uptime output. I know you do reference it later > in the document, but perhaps a section on when the reader can infer that > there could be a problem. Ack, added a sentence to explain what these issues might look like. >> + >> + >> +Introduction >> +------------ >> +``kernel_delay.py`` consists of a Python script that uses the BCC [#BCC]_ >> +framework to install eBPF probes. The data the eBPF probes collect will be >> +analyzed and presented to the user by the Python script. Some of the >> presented >> +data can also be captured by the individual scripts included in the BBC >> [#BCC]_ >> +framework. >> + >> +kernel_delay.py has two modes of operation: >> + >> +- In **time mode**, the tool runs for a specific time and collects the >> + information. >> +- In **trigger mode**, event collection can be started and/or stopped based >> on >> + a specific eBPF probe. Currently, we support the following probes: >> + - USDT probes >> + - Kernel tracepoints >> + - kprobe >> + - kretprobe >> + - uprobe >> + - uretprobe >> + >> + >> +In addition, the option, ``--sample-count``, exists to specify how many >> +iterations you would like to do. When using triggers, you can also ignore >> +samples if they are less than a number of nanoseconds with the >> +``--trigger-delta`` option. The latter might be useful when debugging Linux >> +syscalls which take a long time to complete. More on this later. Finally, >> you >> +can configure the delay between two sample runs with the >> ``--sample-interval`` >> +option. >> + >> +Before getting into more details, let's just run the tool without any >> options >> +to see what the output looks like. Notice that it will try to automatically >> +get the process ID of the running ``ovs-vsdwitchd``. You can overwrite this >> +with the ``--pid`` option. >> + >> +.. code-block:: console >> + >> + $ sudo ./kernel_delay.py >> + # Start sampling @2023-06-08T12:17:22.725127 (10:17:22 UTC) >> + # Stop sampling @2023-06-08T12:17:23.224781 (10:17:23 UTC) >> + # Sample dump @2023-06-08T12:17:23.224855 (10:17:23 UTC) >> + TID THREAD <RESOURCE SPECIFIC> >> + ---------- ---------------- >> ---------------------------------------------------------------------------- >> + 27090 ovs-vswitchd [SYSCALL STATISTICS] >> + <EDIT: REMOVED DATA FOR ovs-vswitchd THREAD> >> + >> + 31741 revalidator122 [SYSCALL STATISTICS] >> + NAME NUMBER COUNT TOTAL ns >> MAX ns >> + poll 7 5 184,193,176 >> 184,191,520 >> + recvmsg 47 494 125,208,756 >> 310,331 >> + futex 202 8 18,768,758 >> 4,023,039 >> + sendto 44 10 375,861 >> 266,867 >> + sendmsg 46 4 43,294 >> 11,213 >> + write 1 1 5,949 >> 5,949 >> + getrusage 98 1 1,424 >> 1,424 >> + read 0 1 1,292 >> 1,292 >> + TOTAL( - poll): 519 144,405,334 >> + >> + [THREAD RUN STATISTICS] >> + SCHED_CNT TOTAL ns MIN ns >> MAX ns >> + 6 136,764,071 1,480 >> 115,146,424 >> + >> + [THREAD READY STATISTICS] >> + SCHED_CNT TOTAL ns MAX ns >> + 7 11,334 6,636 >> + >> + [HARD IRQ STATISTICS] >> + NAME COUNT TOTAL ns >> MAX ns >> + eno8303-rx-1 1 3,586 >> 3,586 >> + TOTAL: 1 3,586 >> + >> + [SOFT IRQ STATISTICS] >> + NAME VECT_NR COUNT TOTAL >> ns MAX ns >> + net_rx 3 1 >> 17,699 17,699 >> + sched 7 6 >> 13,820 3,226 >> + rcu 9 16 >> 13,586 1,554 >> + timer 1 3 >> 10,259 3,815 >> + TOTAL: 26 55,364 >> + >> + >> +By default, the tool will run for half a second in `time mode`. To extend >> this >> +you can use the ``--sample-time`` option. >> + >> + >> +What will it report >> +------------------- >> +The above sample output separates the captured data on a per-thread basis. >> +For this, it displays the thread's id (``TID``) and name (``THREAD``), >> +followed by resource-specific data. Which are: >> + >> +- ``SYSCALL STATISTICS`` >> +- ``THREAD RUN STATISTICS`` >> +- ``THREAD READY STATISTICS`` >> +- ``HARD IRQ STATISTICS`` >> +- ``SOFT IRQ STATISTICS`` >> + >> +The following sections will describe in detail what statistics they report. >> + >> + >> +``SYSCALL STATISTICS`` >> +~~~~~~~~~~~~~~~~~~~~~~ >> +``SYSCALL STATISTICS`` tell you which Linux system calls got executed during >> +the measurement interval. This includes the number of times the syscall was >> +called (``COUNT``), the total time spent in the system calls (``TOTAL ns``), >> +and the worst-case duration of a single call (``MAX ns``). >> + >> +It also shows the total of all system calls, but it excludes the poll system >> +call, as the purpose of this call is to wait for activity on a set of >> sockets, >> +and usually, the thread gets swapped out. >> + >> +Note that it only counts calls that started and stopped during the >> +measurement interval! >> + >> + >> +``THREAD RUN STATISTICS`` >> +~~~~~~~~~~~~~~~~~~~~~~~~~ >> +``THREAD RUN STATISTICS`` tell you how long the thread was running on a CPU >> +during the measurement interval. >> + >> +Note that these statistics only count events where the thread started and >> +stopped running on a CPU during the measurement interval. For example, if >> +this was a PMD thread, you should see zero ``SCHED_CNT`` and ``TOTAL_ns``. >> +If not, there might be a misconfiguration. >> + >> + >> +``THREAD READY STATISTICS`` >> +~~~~~~~~~~~~~~~~~~~~~~~~~~~ >> +``THREAD READY STATISTICS`` tell you the time between the thread being ready >> +to run and it actually running on the CPU. >> + >> +Note that these statistics only count events where the thread was getting >> +ready to run and started running during the measurement interval. >> + >> + >> +``HARD IRQ STATISTICS`` >> +~~~~~~~~~~~~~~~~~~~~~~~ >> +``HARD IRQ STATISTICS`` tell you how much time was spent servicing hard >> +interrupts during the threads run time. >> + >> +It shows the interrupt name (``NAME``), the number of interrupts >> (``COUNT``), >> +the total time spent in the interrupt handler (``TOTAL ns``), and the >> +worst-case duration (``MAX ns``). >> + >> + >> +``SOFT IRQ STATISTICS`` >> +~~~~~~~~~~~~~~~~~~~~~~~ >> +``SOFT IRQ STATISTICS`` tell you how much time was spent servicing soft >> +interrupts during the threads run time. >> + >> +It shows the interrupt name (``NAME``), vector number (``VECT_NR``), the >> +number of interrupts (``COUNT``), the total time spent in the interrupt >> +handler (``TOTAL ns``), and the worst-case duration (``MAX ns``). >> + >> + >> +The ``--syscall-events`` option >> +------------------------------- >> +In addition to reporting global syscall statistics in >> ``SYSCALL_STATISTICS``, >> +the tool can also report each individual syscall. This can be a usefull >> +second step if the ``SYSCALL_STATISTICS`` show high latency numbers. >> + >> +All you need to do is add the ``--syscall-events`` option, with or without >> +the additional ``DURATION_NS`` parameter. The ``DUTATION_NS`` parameter >> +allows you to exclude events that take less than the supplied time. >> + >> +The ``--skip-syscall-poll-events`` option allows you to exclude poll >> +syscalls from the report. >> + >> +Below is an example run, note that I have removed the resource-specific data >> +to highlight the syscall events: >> + >> +.. code-block:: console >> + >> + $ sudo ./kernel_delay.py --syscall-events 50000 >> --skip-syscall-poll-events >> + # Start sampling @2023-06-13T17:10:46.460874 (15:10:46 UTC) >> + # Stop sampling @2023-06-13T17:10:46.960727 (15:10:46 UTC) >> + # Sample dump @2023-06-13T17:10:46.961033 (15:10:46 UTC) >> + TID THREAD <RESOURCE SPECIFIC> >> + ---------- ---------------- >> ---------------------------------------------------------------------------- >> + 3359686 ipf_clean2 [SYSCALL STATISTICS] >> + ... >> + 3359635 ovs-vswitchd [SYSCALL STATISTICS] >> + ... >> + 3359697 revalidator12 [SYSCALL STATISTICS] >> + ... >> + 3359698 revalidator13 [SYSCALL STATISTICS] >> + ... >> + 3359699 revalidator14 [SYSCALL STATISTICS] >> + ... >> + 3359700 revalidator15 [SYSCALL STATISTICS] >> + ... >> + >> + # SYSCALL EVENTS: >> + ENTRY (ns) EXIT (ns) TID COMM DELTA >> (us) SYSCALL >> + ------------------- ------------------- ---------- ---------------- >> ---------- ---------------- >> + 2161821694935486 2161821695031201 3359699 revalidator14 >> 95 futex >> + syscall_exit_to_user_mode_prepare+0x161 [kernel] >> + syscall_exit_to_user_mode_prepare+0x161 [kernel] >> + syscall_exit_to_user_mode+0x9 [kernel] >> + do_syscall_64+0x68 [kernel] >> + entry_SYSCALL_64_after_hwframe+0x72 [kernel] >> + __GI___lll_lock_wait+0x30 [libc.so.6] >> + ovs_mutex_lock_at+0x18 [ovs-vswitchd] >> + [unknown] 0x696c003936313a63 >> + 2161821695276882 2161821695333687 3359698 revalidator13 >> 56 futex >> + syscall_exit_to_user_mode_prepare+0x161 [kernel] >> + syscall_exit_to_user_mode_prepare+0x161 [kernel] >> + syscall_exit_to_user_mode+0x9 [kernel] >> + do_syscall_64+0x68 [kernel] >> + entry_SYSCALL_64_after_hwframe+0x72 [kernel] >> + __GI___lll_lock_wait+0x30 [libc.so.6] >> + ovs_mutex_lock_at+0x18 [ovs-vswitchd] >> + [unknown] 0x696c003134313a63 >> + 2161821695275820 2161821695405733 3359700 revalidator15 >> 129 futex >> + syscall_exit_to_user_mode_prepare+0x161 [kernel] >> + syscall_exit_to_user_mode_prepare+0x161 [kernel] >> + syscall_exit_to_user_mode+0x9 [kernel] >> + do_syscall_64+0x68 [kernel] >> + entry_SYSCALL_64_after_hwframe+0x72 [kernel] >> + __GI___lll_lock_wait+0x30 [libc.so.6] >> + ovs_mutex_lock_at+0x18 [ovs-vswitchd] >> + [unknown] 0x696c003936313a63 >> + 2161821695964969 2161821696052021 3359635 ovs-vswitchd >> 87 accept >> + syscall_exit_to_user_mode_prepare+0x161 [kernel] >> + syscall_exit_to_user_mode_prepare+0x161 [kernel] >> + syscall_exit_to_user_mode+0x9 [kernel] >> + do_syscall_64+0x68 [kernel] >> + entry_SYSCALL_64_after_hwframe+0x72 [kernel] >> + __GI_accept+0x4d [libc.so.6] >> + pfd_accept+0x3a [ovs-vswitchd] >> + [unknown] 0x7fff19f2bd00 >> + [unknown] 0xe4b8001f0f >> + >> +As you can see above, the output also shows the stackback trace. You can >> +disable this using the ``--stack-trace-size 0`` option. >> + >> +As you can see above, the backtrace does not show a lot of useful >> information >> +due to the BCC [#BCC]_ toolkit not supporting dwarf decoding. So to further >> +analyze system call backtraces, you could use perf. The following perf >> +script can do this for you (refer to the embedded instructions): >> + >> +https://github.com/chaudron/perf_scripts/blob/master/analyze_perf_pmd_syscall.py > > Maybe we should consider putting something like that in-tree for use. Good idea, added a todo to my queue :) >> + >> + >> +Using triggers >> +-------------- >> +The tool supports start and, or stop triggers. This will allow you to >> capture >> +statistics triggered by a specific event. First, let's look at what >> +combinations of stop-and-start triggers we can use. >> + >> +If you only use ``--start-trigger``, the inspection start when the trigger >> +happens and runs until the ``--sample-time`` number of seconds has passed. >> +The example below shows all the supported options in this scenario. >> + >> +.. code-block:: console >> + >> + $ sudo ./kernel_delay.py --start-trigger up:bridge_run --sample-time 4 \ >> + --sample-count 4 --sample-interval 1 >> + >> + >> +If you only use ``--stop-trigger``, the inspection starts immediately and >> +stops when the trigger happens. The example below shows all the supported >> +options in this scenario. >> + >> +.. code-block:: console >> + >> + $ sudo ./kernel_delay.py --stop-trigger upr:bridge_run \ >> + --sample-count 4 --sample-interval 1 >> + >> + >> +If you use both ``--start-trigger`` and ``--stop-trigger`` triggers, the >> +statistics are captured between the two first occurrences of these events. >> +The example below shows all the supported options in this scenario. >> + >> +.. code-block:: console >> + >> + $ sudo ./kernel_delay.py --start-trigger up:bridge_run \ >> + --stop-trigger upr:bridge_run \ >> + --sample-count 4 --sample-interval 1 \ >> + --trigger-delta 50000 >> + >> + >> +Now that we know how these triggers can be used, let's investigate what >> +triggers are supported. What we call triggers, BCC [#BCC]_, calls events; >> +these are eBPF tracepoints you can attach to. For more details on the >> +supported tracepoints, check out the BCC documentation [#BCC_EVENT]_. >> + >> +The list below shows the supported triggers and their argument format: >> + >> +**USDT probes:** >> + [u|usdt]:{provider}:{probe} >> +**Kernel tracepoint:** >> + [t:trace]:{system}:{event} >> +**kprobe:** >> + [k:kprobe]:{kernel_function} >> +**kretprobe:** >> + [kr:kretprobe]:{kernel_function} >> +**uprobe:** >> + [up:uprobe]:{function} >> +**uretprobe:** >> + [upr:uretprobe]:{function} >> + >> +Here are a couple of trigger examples, more use-case-specific examples can >> be >> +found in the *Examples* section. >> + >> +.. code-block:: console >> + >> + --start|stop-trigger u:udpif_revalidator:start_dump >> + --start|stop-trigger t:openvswitch:ovs_dp_upcall >> + --start|stop-trigger k:ovs_dp_process_packet >> + --start|stop-trigger kr:ovs_dp_process_packet >> + --start|stop-trigger up:bridge_run >> + --start|stop-trigger upr:bridge_run >> + >> + >> +Examples >> +-------- >> +This section will give some examples of how to use this tool in real-world >> +scenarios. Let's start with the issue where Open vSwitch reports >> +``Unreasonably long XXXXms poll interval`` on your revalidator threads. Note >> +that there is a blog available explaining how the revalidator process works >> +in OVS [#REVAL_BLOG]_. >> + >> +First, let me explain this log message. It gets logged if the time delta >> +between two ``poll_block()`` calls is more than 1 second. In other words, >> +the process was spending a lot of time processing stuff that was made >> +available by the return of the ``poll_block()`` function. >> + >> +Do a run with the tool using the existing USDT revalidator probes as a start >> +and stop trigger (Note that I removed the resource-specific data from the >> none >> +revalidator threads): >> + >> +.. code-block:: console >> + >> + $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump >> --stop-trigger u:udpif_revalidator:sweep_done >> + # Start sampling (trigger@791777093512008) @2023-06-14T14:52:00.110303 >> (12:52:00 UTC) >> + # Stop sampling (trigger@791778281498462) @2023-06-14T14:52:01.297975 >> (12:52:01 UTC) >> + # Triggered sample dump, stop-start delta 1,187,986,454 ns >> @2023-06-14T14:52:01.298021 (12:52:01 UTC) >> + TID THREAD <RESOURCE SPECIFIC> >> + ---------- ---------------- >> ---------------------------------------------------------------------------- >> + 1457761 handler24 [SYSCALL STATISTICS] >> + NAME NUMBER COUNT >> TOTAL ns MAX ns >> + sendmsg 46 6110 >> 123,274,761 41,776 >> + recvmsg 47 136299 >> 99,397,508 49,896 >> + futex 202 51 >> 7,655,832 7,536,776 >> + poll 7 4068 >> 1,202,883 2,907 >> + getrusage 98 2034 >> 586,602 1,398 >> + sendto 44 9 >> 213,682 27,417 >> + TOTAL( - poll): 144503 >> 231,128,385 >> + >> + [THREAD RUN STATISTICS] >> + SCHED_CNT TOTAL ns MIN >> ns MAX ns >> + >> + [THREAD READY STATISTICS] >> + SCHED_CNT TOTAL ns MAX ns >> + 1 1,438 1,438 >> + >> + [SOFT IRQ STATISTICS] >> + NAME VECT_NR COUNT >> TOTAL ns MAX ns >> + sched 7 21 >> 59,145 3,769 >> + rcu 9 50 >> 42,917 2,234 >> + TOTAL: 71 >> 102,062 >> + 1457733 ovs-vswitchd [SYSCALL STATISTICS] >> + ... >> + 1457792 revalidator55 [SYSCALL STATISTICS] >> + NAME NUMBER COUNT >> TOTAL ns MAX ns >> + futex 202 73 >> 572,576,329 19,621,600 >> + recvmsg 47 815 >> 296,697,618 405,338 >> + sendto 44 3 >> 78,302 26,837 >> + sendmsg 46 3 >> 38,712 13,250 >> + write 1 1 >> 5,073 5,073 >> + TOTAL( - poll): 895 >> 869,396,034 >> + >> + [THREAD RUN STATISTICS] >> + SCHED_CNT TOTAL ns MIN >> ns MAX ns >> + 48 394,350,393 >> 1,729 140,455,796 >> + >> + [THREAD READY STATISTICS] >> + SCHED_CNT TOTAL ns MAX ns >> + 49 23,650 1,559 >> + >> + [SOFT IRQ STATISTICS] >> + NAME VECT_NR COUNT >> TOTAL ns MAX ns >> + sched 7 14 >> 26,889 3,041 >> + rcu 9 28 >> 23,024 1,600 >> + TOTAL: 42 >> 49,913 >> + >> + >> +You can see from the start of the output that the trigger took more than a >> +second (1,187,986,454 ns), which we would already know, by looking at the >> +output of the ``ovs-vsctl upcall/show`` command. >> + >> +From the *revalidator55*'s ``SYSCALL STATISTICS`` statistics we can see it >> +spent almost 870ms handling syscalls, and there were no poll() calls being >> +executed. The ``THREAD RUN STATISTICS`` statistics here are a bit >> misleading, >> +as it looks like we only spent 394ms on the CPU. But earlier, we learned >> that >> +this time does not include the time being on the CPU at the start or stop of >> +an event. What is exactly the case here, because we are using USDT probes. >> + >> +From the above data and maybe some ``top`` output, we can determine that the >> +*revalidator55* thread is taking a lot of CPU time, probably because it has >> +to do a lot of revalidator work by itself. The solution here is to increase >> +the number of revalidator threads, so more work could be done in parallel. >> + >> +Let's do another run of the same command in another scenario: >> + >> +.. code-block:: console >> + >> + $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump >> --stop-trigger u:udpif_revalidator:sweep_done >> + # Start sampling (trigger@795160501758971) @2023-06-14T15:48:23.518512 >> (13:48:23 UTC) >> + # Stop sampling (trigger@795160764940201) @2023-06-14T15:48:23.781381 >> (13:48:23 UTC) >> + # Triggered sample dump, stop-start delta 263,181,230 ns >> @2023-06-14T15:48:23.781414 (13:48:23 UTC) >> + TID THREAD <RESOURCE SPECIFIC> >> + ---------- ---------------- >> ---------------------------------------------------------------------------- >> + 1457733 ovs-vswitchd [SYSCALL STATISTICS] >> + ... >> + 1457792 revalidator55 [SYSCALL STATISTICS] >> + NAME NUMBER COUNT >> TOTAL ns MAX ns >> + recvmsg 47 284 >> 193,422,110 46,248,418 >> + sendto 44 2 >> 46,685 23,665 >> + sendmsg 46 2 >> 24,916 12,703 >> + write 1 1 >> 6,534 6,534 >> + TOTAL( - poll): 289 >> 193,500,245 >> + >> + [THREAD RUN STATISTICS] >> + SCHED_CNT TOTAL ns MIN >> ns MAX ns >> + 2 47,333,558 >> 331,516 47,002,042 >> + >> + [THREAD READY STATISTICS] >> + SCHED_CNT TOTAL ns MAX ns >> + 3 87,000,403 45,999,712 >> + >> + [SOFT IRQ STATISTICS] >> + NAME VECT_NR COUNT >> TOTAL ns MAX ns >> + sched 7 2 >> 9,504 5,109 >> + TOTAL: 2 >> 9,504 >> + >> + >> +Here you can see the revalidator run took about 263ms, which does not look >> +odd, however, the ``THREAD READY STATISTICS`` information shows us we were >> +waiting 87ms for a CPU to be run on. This means the revalidator process >> could >> +have finished 87ms faster. Looking at the ``MAX ns`` value, we see a >> +worst-case delay of almost 46ms, which hints at an overloaded system. >> + >> +One final example where we use a ``uprobe`` to get some statistics on >> +a ``bridge_run()`` execution that takes more than 1ms. >> + >> +.. code-block:: console >> + >> + $ sudo ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger >> ur:bridge_run --trigger-delta 1000000 >> + # Start sampling (trigger@2245245432101270) @2023-06-14T16:21:10.467919 >> (14:21:10 UTC) >> + # Stop sampling (trigger@2245245432414656) @2023-06-14T16:21:10.468296 >> (14:21:10 UTC) >> + # Sample dump skipped, delta 313,386 ns @2023-06-14T16:21:10.468419 >> (14:21:10 UTC) >> + # Start sampling (trigger@2245245505301745) @2023-06-14T16:21:10.540970 >> (14:21:10 UTC) >> + # Stop sampling (trigger@2245245506911119) @2023-06-14T16:21:10.542499 >> (14:21:10 UTC) >> + # Triggered sample dump, stop-start delta 1,609,374 ns >> @2023-06-14T16:21:10.542565 (14:21:10 UTC) >> + TID THREAD <RESOURCE SPECIFIC> >> + ---------- ---------------- >> ---------------------------------------------------------------------------- >> + 3371035 <unknown:3366258/3371035> [SYSCALL STATISTICS] >> + ... <REMOVED 7 MORE unknown THREADS> >> + 3371102 handler66 [SYSCALL STATISTICS] >> + ... <REMOVED 7 MORE HANDLER THREADS> >> + 3366258 ovs-vswitchd [SYSCALL STATISTICS] >> + NAME NUMBER COUNT >> TOTAL ns MAX ns >> + futex 202 43 >> 403,469 199,312 >> + clone3 435 13 >> 174,394 30,731 >> + munmap 11 8 >> 115,774 21,861 >> + poll 7 5 >> 92,969 38,307 >> + unlink 87 2 >> 49,918 35,741 >> + mprotect 10 8 >> 47,618 13,201 >> + accept 43 10 >> 31,360 6,976 >> + mmap 9 8 >> 30,279 5,776 >> + write 1 6 >> 27,720 11,774 >> + rt_sigprocmask 14 28 >> 12,281 970 >> + read 0 6 >> 9,478 2,318 >> + recvfrom 45 3 >> 7,024 4,024 >> + sendto 44 1 >> 4,684 4,684 >> + getrusage 98 5 >> 4,594 1,342 >> + close 3 2 >> 2,918 1,627 >> + recvmsg 47 1 >> 2,722 2,722 >> + TOTAL( - poll): 144 >> 924,233 >> + >> + [THREAD RUN STATISTICS] >> + SCHED_CNT TOTAL ns MIN >> ns MAX ns >> + 13 817,605 >> 5,433 524,376 >> + >> + [THREAD READY STATISTICS] >> + SCHED_CNT TOTAL ns MAX ns >> + 14 28,646 11,566 >> + >> + [SOFT IRQ STATISTICS] >> + NAME VECT_NR COUNT >> TOTAL ns MAX ns >> + rcu 9 1 >> 2,838 2,838 >> + TOTAL: 1 >> 2,838 >> + >> + 3371110 revalidator74 [SYSCALL STATISTICS] >> + ... <REMOVED 7 MORE NEW revalidator THREADS> >> + 3366311 urcu3 [SYSCALL STATISTICS] >> + ... >> + >> + >> +We removed some of the threads and their resource-specific data, but based >> +on the ``<unknown:3366258/3371035>`` thread name, you can see that some >> +threads no longer exist. In the ``ovs-vswitchd`` thread, you can see some >> +``clone3`` syscalls, indicating threads were created. In this example, it >> was >> +due to the deletion of a bridge, which resulted in the recreation of the >> +revalidator and handler threads. >> + >> + >> +Use with Openshift >> +------------------ >> +This section describes how you would use the tool on a node in an OpenShift >> +cluster. It assumes you have console access to the node, either directly or >> +through a debug container. >> + >> +We will use a base fedora38 container through podman, as this will allow us >> to >> +install some additional tools and packages we need. >> + >> +The first thing we need to do is to start the container: >> + >> +.. code-block:: console >> + >> + [core@sno-master ~]$ sudo podman run -it --rm \ >> + -e PS1='[(DEBUG)\u@\h \W]\$ ' \ >> + --privileged --network=host --pid=host \ >> + -v /lib/modules:/lib/modules:ro \ >> + -v /sys/kernel/debug:/sys/kernel/debug \ >> + -v /proc:/proc \ >> + -v /:/mnt/rootdir \ >> + quay.io/fedora/fedora:38-x86_64 >> + >> + [(DEBUG)root@sno-master /]# >> + >> + >> +Next add the ``linux_delay.py`` dependencies: >> + >> +.. code-block:: console >> + >> + [(DEBUG)root@sno-master /]# dnf install -y bcc-tools perl-interpreter \ >> + python3-pytz python3-psutil >> + >> + >> +You need to install the devel, debug and source RPMs for your OVS and kernel >> +version: >> + >> +.. code-block:: console >> + >> + [(DEBUG)root@sno-master home]# rpm -i \ >> + openvswitch2.17-debuginfo-2.17.0-67.el8fdp.x86_64.rpm \ >> + openvswitch2.17-debugsource-2.17.0-67.el8fdp.x86_64.rpm \ >> + kernel-devel-4.18.0-372.41.1.el8_6.x86_64.rpm >> + >> + >> +Now we can run the tool. Here we use the above ```bridge_run()`` example: >> + >> +.. code-block:: console >> + >> + [(DEBUG)root@sno-master home]# ./kernel_delay.py --start-trigger >> up:bridge_run --stop-trigger ur:bridge_run >> + # Start sampling (trigger@75279117343513) @2023-06-15T11:44:07.628372 >> (11:44:07 UTC) >> + # Stop sampling (trigger@75279117443980) @2023-06-15T11:44:07.628529 >> (11:44:07 UTC) >> + # Triggered sample dump, stop-start delta 100,467 ns >> @2023-06-15T11:44:07.628569 (11:44:07 UTC) >> + TID THREAD <RESOURCE SPECIFIC> >> + ---------- ---------------- >> ---------------------------------------------------------------------------- >> + 1246 ovs-vswitchd [SYSCALL STATISTICS] >> + NAME NUMBER COUNT >> TOTAL ns MAX ns >> + getdents64 217 2 >> 8,560 8,162 >> + openat 257 1 >> 6,951 6,951 >> + accept 43 4 >> 6,942 3,763 >> + recvfrom 45 1 >> 3,726 3,726 >> + recvmsg 47 2 >> 2,880 2,188 >> + stat 4 2 >> 1,946 1,384 >> + close 3 1 >> 1,393 1,393 >> + fstat 5 1 >> 1,324 1,324 >> + TOTAL( - poll): 14 >> 33,722 >> + >> + [THREAD RUN STATISTICS] >> + SCHED_CNT TOTAL ns MIN >> ns MAX ns >> + >> + [THREAD READY STATISTICS] >> + SCHED_CNT TOTAL ns MAX ns >> + >> + >> +.. rubric:: Footnotes >> + >> +.. [#BCC] https://github.com/iovisor/bcc >> +.. [#BCC_EVENT] >> https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments >> +.. [#REVAL_BLOG] >> https://developers.redhat.com/articles/2022/10/19/open-vswitch-revalidator-process-explained >> >> _______________________________________________ >> dev mailing list >> [email protected] >> https://mail.openvswitch.org/mailman/listinfo/ovs-dev _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
