Re: [ovs-dev] [PATCH] utilities: Add kernel_delay.py script to debug a busy Linux kernel.

Aaron Conole Mon, 28 Aug 2023 07:15:30 -0700

Eelco Chaudron <[email protected]> writes:

> This patch adds an utility that can be used to determine if
> an issue is related to a lack of Linux kernel resources.
>
> This tool is also featured in a Red Hat developers blog article:
>
>   
> https://developers.redhat.com/articles/2023/07/24/troubleshooting-open-vswitch-kernel-blame
>
> Signed-off-by: Eelco Chaudron <[email protected]>
> ---


Hi Eelco,

Thanks for the writeup and utilities.

>  utilities/automake.mk                   |    4 
>  utilities/usdt-scripts/kernel_delay.py  | 1397 
> +++++++++++++++++++++++++++++++
>  utilities/usdt-scripts/kernel_delay.rst |  594 +++++++++++++
>  3 files changed, 1995 insertions(+)
>  create mode 100755 utilities/usdt-scripts/kernel_delay.py
>  create mode 100644 utilities/usdt-scripts/kernel_delay.rst
>
> diff --git a/utilities/automake.mk b/utilities/automake.mk
> index 37d679f82..9a2114df4 100644
> --- a/utilities/automake.mk
> +++ b/utilities/automake.mk
> @@ -23,6 +23,8 @@ scripts_DATA += utilities/ovs-lib
>  usdt_SCRIPTS += \
>       utilities/usdt-scripts/bridge_loop.bt \
>       utilities/usdt-scripts/dpif_nl_exec_monitor.py \
> +     utilities/usdt-scripts/kernel_delay.py \
> +     utilities/usdt-scripts/kernel_delay.rst \
>       utilities/usdt-scripts/reval_monitor.py \
>       utilities/usdt-scripts/upcall_cost.py \
>       utilities/usdt-scripts/upcall_monitor.py
> @@ -70,6 +72,8 @@ EXTRA_DIST += \
>       utilities/docker/debian/build-kernel-modules.sh \
>       utilities/usdt-scripts/bridge_loop.bt \
>       utilities/usdt-scripts/dpif_nl_exec_monitor.py \
> +     utilities/usdt-scripts/kernel_delay.py \
> +     utilities/usdt-scripts/kernel_delay.rst \
>       utilities/usdt-scripts/reval_monitor.py \
>       utilities/usdt-scripts/upcall_cost.py \
>       utilities/usdt-scripts/upcall_monitor.py
> diff --git a/utilities/usdt-scripts/kernel_delay.py 
> b/utilities/usdt-scripts/kernel_delay.py
> new file mode 100755
> index 000000000..bba7bc044
> --- /dev/null
> +++ b/utilities/usdt-scripts/kernel_delay.py
> @@ -0,0 +1,1397 @@
> +#!/usr/bin/env python3
> +#
> +# Copyright (c) 2022 Red Hat, Inc.

Maybe this should be 2022,2023 ?

> +#
> +# Licensed under the Apache License, Version 2.0 (the "License");
> +# you may not use this file except in compliance with the License.
> +# You may obtain a copy of the License at:
> +#
> +#     http://www.apache.org/licenses/LICENSE-2.0
> +#
> +# Unless required by applicable law or agreed to in writing, software
> +# distributed under the License is distributed on an "AS IS" BASIS,
> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> +# See the License for the specific language governing permissions and
> +# limitations under the License.
> +#
> +# Script information:
> +# -------------------
> +# This script allows a developer to quickly identify if the issue at hand
> +# might be related to the kernel running out of resources or if it really is
> +# an Open vSwitch issue.
> +#
> +# For documentation see the kernel_delay.rst file.
> +#
> +try:
> +    from bcc import BPF, USDT, USDTException
> +    from bcc.syscall import syscalls, syscall_name
> +except ModuleNotFoundError:
> +    print("ERROR: Can't find the BPF Compiler Collection (BCC) tools!")

I think this needs to exit in this exception block

> +
> +from enum import IntEnum
> +
> +import argparse
> +import datetime
> +import os
> +import pytz
> +import psutil
> +import re
> +import sys
> +import time
> +
> +import ctypes as ct
> +
> +try:
> +    from time import time_ns
> +except ImportError:
> +    # For compatibility with Python <= v3.6.
> +    def time_ns():
> +        now = datetime.datetime.now()
> +        return int(now.timestamp() * 1e9)
> +
> +
> +#
> +# Actual eBPF source code
> +#
> +EBPF_SOURCE = """
> +#include <linux/irq.h>
> +#include <linux/sched.h>
> +
> +#define MONITOR_PID <MONITOR_PID>
> +
> +enum {
> +<EVENT_ENUM>
> +};
> +
> +struct event_t {
> +    u64 ts;
> +    u32 tid;
> +    u32 id;
> +
> +    int user_stack_id;
> +    int kernel_stack_id;
> +
> +    u32 syscall;
> +    u64 entry_ts;
> +
> +};
> +
> +BPF_RINGBUF_OUTPUT(events, <BUFFER_PAGE_CNT>);
> +BPF_STACK_TRACE(stack_traces, <STACK_TRACE_SIZE>);
> +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1);
> +BPF_TABLE("percpu_array", uint32_t, uint64_t, trigger_miss, 1);
> +
> +BPF_ARRAY(capture_on, u64, 1);
> +static bool capture_enabled(u64 pid_tgid) {
> +    int key = 0;
> +    u64 *ret;
> +
> +    if ((pid_tgid >> 32) != MONITOR_PID)
> +        return false;
> +
> +    ret = capture_on.lookup(&key);
> +    return ret && *ret == 1;
> +}
> +
> +static inline bool capture_enabled__() {
> +    int key = 0;
> +    u64 *ret;
> +
> +    ret = capture_on.lookup(&key);
> +    return ret && *ret == 1;
> +}
> +
> +
> +static struct event_t *get_event(uint32_t id) {
> +    struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t));
> +
> +    if (!event) {
> +        dropcnt.increment(0);
> +        return NULL;
> +    }
> +
> +    event->id = id;
> +    event->ts = bpf_ktime_get_ns();
> +    event->tid = bpf_get_current_pid_tgid();
> +
> +    return event;
> +}
> +
> +static int start_trigger() {
> +    int key = 0;
> +    u64 *val = capture_on.lookup(&key);
> +
> +    /* If the value is -1 we can't start as we are still processing the
> +     * results in userspace. */
> +    if (!val || *val != 0) {
> +        trigger_miss.increment(0);
> +        return 0;
> +    }
> +
> +    struct event_t *event = get_event(EVENT_START_TRIGGER);
> +    if (event) {
> +       events.ringbuf_submit(event, 0);
> +       *val = 1;
> +    } else {
> +        trigger_miss.increment(0);
> +    }
> +    return 0;
> +}
> +
> +static int stop_trigger() {
> +    int key = 0;
> +    u64 *val = capture_on.lookup(&key);
> +
> +    if (!val || *val != 1)
> +        return 0;
> +
> +    struct event_t *event = get_event(EVENT_STOP_TRIGGER);
> +
> +    if (event)
> +       events.ringbuf_submit(event, 0);
> +
> +    if (val)
> +        *val = -1;
> +
> +    return 0;
> +}
> +
> +<START_TRIGGER>
> +<STOP_TRIGGER>
> +
> +
> +/*
> + * For the syscall monitor the following probes get installed.
> + */
> +struct syscall_data_t {
> +    u64 count;
> +    u64 total_ns;
> +    u64 worst_ns;
> +};
> +
> +struct syscall_data_key_t {
> +    u32 pid;
> +    u32 tid;
> +    u32 syscall;
> +};
> +
> +BPF_HASH(syscall_start, u64, u64);
> +BPF_HASH(syscall_data, struct syscall_data_key_t, struct syscall_data_t);
> +
> +TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
> +    u64 pid_tgid = bpf_get_current_pid_tgid();
> +
> +    if (!capture_enabled(pid_tgid))
> +       return 0;
> +
> +    u64 t = bpf_ktime_get_ns();
> +    syscall_start.update(&pid_tgid, &t);
> +
> +    return 0;
> +}
> +
> +TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
> +    struct syscall_data_t *val, zero = {};
> +    struct syscall_data_key_t key;
> +
> +    u64 pid_tgid = bpf_get_current_pid_tgid();
> +
> +    if (!capture_enabled(pid_tgid))
> +       return 0;
> +
> +    key.pid = pid_tgid >> 32;
> +    key.tid = (u32)pid_tgid;
> +    key.syscall = args->id;
> +
> +    u64 *start_ns = syscall_start.lookup(&pid_tgid);
> +
> +    if (!start_ns)
> +        return 0;
> +
> +    val = syscall_data.lookup_or_try_init(&key, &zero);
> +    if (val) {
> +        u64 delta = bpf_ktime_get_ns() - *start_ns;
> +        val->count++;
> +        val->total_ns += delta;
> +        if (val->worst_ns == 0 || delta > val->worst_ns)
> +            val->worst_ns = delta;
> +
> +        if (<SYSCAL_TRACE_EVENTS>) {
> +            struct event_t *event = get_event(EVENT_SYSCALL);
> +            if (event) {
> +                event->syscall = args->id;
> +                event->entry_ts = *start_ns;
> +                if (<STACK_TRACE_ENABLED>) {
> +                    event->user_stack_id = stack_traces.get_stackid(
> +                        args, BPF_F_USER_STACK);
> +                    event->kernel_stack_id = stack_traces.get_stackid(
> +                        args, 0);
> +                }
> +                events.ringbuf_submit(event, 0);
> +            }
> +        }
> +    }
> +    return 0;
> +}
> +
> +
> +
> +/*
> + * For measuring the thread run time, we need the following.
> + */
> +struct run_time_data_t {
> +    u64 count;
> +    u64 total_ns;
> +    u64 max_ns;
> +    u64 min_ns;
> +};
> +
> +struct pid_tid_key_t {
> +    u32  pid;
> +    u32  tid;
> +};
> +
> +BPF_HASH(run_start, u64, u64);
> +BPF_HASH(run_data, struct pid_tid_key_t, struct run_time_data_t);
> +
> +static inline void thread_start_run(u64 pid_tgid, u64 ktime)
> +{
> +    run_start.update(&pid_tgid, &ktime);
> +}
> +
> +static inline void thread_stop_run(u32 pid, u32 tgid, u64 ktime)
> +{
> +    u64 pid_tgid = (u64) tgid << 32 | pid;
> +    u64 *start_ns = run_start.lookup(&pid_tgid);
> +
> +    if (!start_ns || *start_ns == 0)
> +        return;
> +
> +    struct run_time_data_t *val, zero = {};
> +    struct pid_tid_key_t key = { .pid = tgid,
> +                                 .tid = pid };
> +
> +    val = run_data.lookup_or_try_init(&key, &zero);
> +    if (val) {
> +        u64 delta = ktime - *start_ns;
> +        val->count++;
> +        val->total_ns += delta;
> +        if (val->max_ns == 0 || delta > val->max_ns)
> +            val->max_ns = delta;
> +        if (val->min_ns == 0 || delta < val->min_ns)
> +            val->min_ns = delta;
> +    }
> +    *start_ns = 0;
> +}
> +
> +
> +/*
> + * For measuring the thread-ready delay, we need the following.
> + */
> +struct ready_data_t {
> +    u64 count;
> +    u64 total_ns;
> +    u64 worst_ns;
> +};
> +
> +BPF_HASH(ready_start, u64, u64);
> +BPF_HASH(ready_data, struct pid_tid_key_t, struct ready_data_t);
> +
> +static inline int sched_wakeup__(u32 pid, u32 tgid)
> +{
> +    u64 pid_tgid = (u64) tgid << 32 | pid;
> +
> +    if (!capture_enabled(pid_tgid))
> +        return 0;
> +
> +    u64 t = bpf_ktime_get_ns();
> +    ready_start.update(&pid_tgid, &t);
> +    return 0;
> +}
> +
> +RAW_TRACEPOINT_PROBE(sched_wakeup)
> +{
> +    struct task_struct *t = (struct task_struct *)ctx->args[0];
> +    return sched_wakeup__(t->pid, t->tgid);
> +}
> +
> +RAW_TRACEPOINT_PROBE(sched_wakeup_new)
> +{
> +    struct task_struct *t = (struct task_struct *)ctx->args[0];
> +    return sched_wakeup__(t->pid, t->tgid);
> +}
> +
> +RAW_TRACEPOINT_PROBE(sched_switch)
> +{
> +    struct task_struct *prev = (struct task_struct *)ctx->args[1];
> +    struct task_struct *next= (struct task_struct *)ctx->args[2];
> +    u64 ktime = 0;
> +
> +    if (!capture_enabled__())
> +        return 0;
> +
> +    if (prev-><STATE_FIELD> == TASK_RUNNING && prev->tgid == MONITOR_PID)
> +        sched_wakeup__(prev->pid, prev->tgid);
> +
> +    if (prev->tgid == MONITOR_PID) {
> +        ktime = bpf_ktime_get_ns();
> +        thread_stop_run(prev->pid, prev->tgid, ktime);
> +    }
> +
> +    u64 pid_tgid = (u64)next->tgid << 32 | next->pid;
> +
> +    if (next->tgid != MONITOR_PID)
> +        return 0;
> +
> +    if (ktime == 0)
> +        ktime = bpf_ktime_get_ns();
> +
> +    u64 *start_ns = ready_start.lookup(&pid_tgid);
> +
> +    if (start_ns && *start_ns != 0) {
> +
> +        struct ready_data_t *val, zero = {};
> +        struct pid_tid_key_t key = { .pid = next->tgid,
> +                                     .tid = next->pid };
> +
> +        val = ready_data.lookup_or_try_init(&key, &zero);
> +        if (val) {
> +            u64 delta = ktime - *start_ns;
> +            val->count++;
> +            val->total_ns += delta;
> +            if (val->worst_ns == 0 || delta > val->worst_ns)
> +                val->worst_ns = delta;
> +        }
> +        *start_ns = 0;
> +    }
> +
> +    thread_start_run(pid_tgid, ktime);
> +    return 0;
> +}
> +
> +/*
> + * For measuring the hard irq time, we need the following.
> + */
> +struct hardirq_start_data_t {
> +    u64  start_ns;
> +    char irq_name[32];
> +};
> +
> +struct hardirq_data_t {
> +    u64 count;
> +    u64 total_ns;
> +    u64 worst_ns;
> +};
> +
> +struct hardirq_data_key_t {
> +    u32 pid;
> +    u32 tid;
> +    char irq_name[32];
> +};
> +
> +BPF_HASH(hardirq_start, u64, struct hardirq_start_data_t);
> +BPF_HASH(hardirq_data, struct hardirq_data_key_t, struct hardirq_data_t);
> +
> +TRACEPOINT_PROBE(irq, irq_handler_entry)
> +{
> +    u64 pid_tgid = bpf_get_current_pid_tgid();
> +
> +    if (!capture_enabled(pid_tgid))
> +        return 0;
> +
> +    struct hardirq_start_data_t data = {};
> +
> +    data.start_ns = bpf_ktime_get_ns();
> +    TP_DATA_LOC_READ_STR(&data.irq_name, name, sizeof(data.irq_name));
> +    hardirq_start.update(&pid_tgid, &data);
> +    return 0;
> +}
> +
> +TRACEPOINT_PROBE(irq, irq_handler_exit)
> +{
> +    u64 pid_tgid = bpf_get_current_pid_tgid();
> +
> +    if (!capture_enabled(pid_tgid))
> +        return 0;
> +
> +    struct hardirq_start_data_t *data;
> +    data = hardirq_start.lookup(&pid_tgid);
> +    if (!data || data->start_ns == 0)
> +        return 0;
> +
> +    if (args->ret != IRQ_NONE) {
> +        struct hardirq_data_t *val, zero = {};
> +        struct hardirq_data_key_t key = { .pid = pid_tgid >> 32,
> +                                          .tid = (u32)pid_tgid };
> +
> +        bpf_probe_read_kernel(&key.irq_name, sizeof(key.irq_name),
> +                              data->irq_name);
> +        val = hardirq_data.lookup_or_try_init(&key, &zero);
> +        if (val) {
> +            u64 delta = bpf_ktime_get_ns() - data->start_ns;
> +            val->count++;
> +            val->total_ns += delta;
> +            if (val->worst_ns == 0 || delta > val->worst_ns)
> +                val->worst_ns = delta;
> +        }
> +    }
> +
> +    data->start_ns = 0;
> +    return 0;
> +}
> +
> +/*
> + * For measuring the soft irq time, we need the following.
> + */
> +struct softirq_start_data_t {
> +    u64 start_ns;
> +    u32 vec_nr;
> +};
> +
> +struct softirq_data_t {
> +    u64 count;
> +    u64 total_ns;
> +    u64 worst_ns;
> +};
> +
> +struct softirq_data_key_t {
> +    u32 pid;
> +    u32 tid;
> +    u32 vec_nr;
> +};
> +
> +BPF_HASH(softirq_start, u64, struct softirq_start_data_t);
> +BPF_HASH(softirq_data, struct softirq_data_key_t, struct softirq_data_t);
> +
> +TRACEPOINT_PROBE(irq, softirq_entry)
> +{
> +    u64 pid_tgid = bpf_get_current_pid_tgid();
> +
> +    if (!capture_enabled(pid_tgid))
> +        return 0;
> +
> +    struct softirq_start_data_t data = {};
> +
> +    data.start_ns = bpf_ktime_get_ns();
> +    data.vec_nr = args->vec;
> +    softirq_start.update(&pid_tgid, &data);
> +    return 0;
> +}
> +
> +TRACEPOINT_PROBE(irq, softirq_exit)
> +{
> +    u64 pid_tgid = bpf_get_current_pid_tgid();
> +
> +    if (!capture_enabled(pid_tgid))
> +        return 0;
> +
> +    struct softirq_start_data_t *data;
> +    data = softirq_start.lookup(&pid_tgid);
> +    if (!data || data->start_ns == 0)
> +        return 0;
> +
> +    struct softirq_data_t *val, zero = {};
> +    struct softirq_data_key_t key = { .pid = pid_tgid >> 32,
> +                                      .tid = (u32)pid_tgid,
> +                                      .vec_nr = data->vec_nr};
> +
> +    val = softirq_data.lookup_or_try_init(&key, &zero);
> +    if (val) {
> +        u64 delta = bpf_ktime_get_ns() - data->start_ns;
> +        val->count++;
> +        val->total_ns += delta;
> +        if (val->worst_ns == 0 || delta > val->worst_ns)
> +            val->worst_ns = delta;
> +    }
> +
> +    data->start_ns = 0;
> +    return 0;
> +}
> +"""
> +
> +
> +#
> +# Probe class to use for the start/stop triggers
> +#
> +class Probe(object):
> +    '''
> +    The goal for this object is to support as many as possible
> +    probe/events as supported by BCC. See
> +       
> https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments
> +    '''
> +    def __init__(self, probe, pid=None):
> +        self.pid = pid
> +        self.text_probe = probe
> +        self._parse_text_probe()
> +
> +    def __str__(self):
> +        if self.probe_type == "usdt":
> +            return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type,
> +                                           self.usdt_provider, 
> self.usdt_probe)
> +        elif self.probe_type == "trace":
> +            return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type,
> +                                           self.trace_system, 
> self.trace_event)
> +        elif self.probe_type == "kprobe" or self.probe_type == "kretprobe":
> +            return "[{}]; {}:{}".format(self.text_probe, self.probe_type,
> +                                        self.kprobe_function)
> +        elif self.probe_type == "uprobe" or self.probe_type == "uretprobe":
> +            return "[{}]; {}:{}".format(self.text_probe, self.probe_type,
> +                                        self.uprobe_function)
> +        else:
> +            return "[{}] <{}:unknown probe>".format(self.text_probe,
> +                                                    self.probe_type)
> +
> +    def _raise(self, error):
> +        raise ValueError("[{}]; {}".format(self.text_probe, error))
> +
> +    def _verify_kprobe_probe(self):
> +        # Nothing to verify for now, just return.
> +        return
> +
> +    def _verify_trace_probe(self):
> +        # Nothing to verify for now, just return.
> +        return
> +
> +    def _verify_uprobe_probe(self):
> +        # Nothing to verify for now, just return.
> +        return
> +
> +    def _verify_usdt_probe(self):
> +        if not self.pid:
> +            self._raise("USDT probes need a valid PID.")
> +
> +        usdt = USDT(pid=self.pid)
> +
> +        for probe in usdt.enumerate_probes():
> +            if probe.provider.decode('utf-8') == self.usdt_provider and \
> +               probe.name.decode('utf-8') == self.usdt_probe:
> +                return
> +
> +        self._raise("Can't find UDST probe 
> '{}:{}'".format(self.usdt_provider,
> +                                                           self.usdt_probe))
> +
> +    def _parse_text_probe(self):
> +        '''
> +        The text probe format is defined as follows:
> +          <probe_type>:<probe_specific>
> +
> +        Types:
> +          USDT:      u|usdt:<provider>:<probe>
> +          TRACE:     t|trace:<system>:<event>
> +          KPROBE:    k|kprobe:<kernel_function>
> +          KRETPROBE: kr|kretprobe:<kernel_function>
> +          UPROBE:    up|uprobe:<function>
> +          URETPROBE: ur|uretprobe:<function>
> +        '''
> +        args = self.text_probe.split(":")
> +        if len(args) <= 1:
> +            self._raise("Can't extract probe type.")
> +
> +        if args[0] not in ["k", "kprobe", "kr", "kretprobe", "t", "trace",
> +                           "u", "usdt", "up", "uprobe", "ur", "uretprobe"]:
> +            self._raise("Invalid probe type '{}'".format(args[0]))
> +
> +        self.probe_type = "kprobe" if args[0] == "k" else args[0]
> +        self.probe_type = "kretprobe" if args[0] == "kr" else self.probe_type
> +        self.probe_type = "trace" if args[0] == "t" else self.probe_type
> +        self.probe_type = "usdt" if args[0] == "u" else self.probe_type
> +        self.probe_type = "uprobe" if args[0] == "up" else self.probe_type
> +        self.probe_type = "uretprobe" if args[0] == "ur" else self.probe_type
> +
> +        if self.probe_type == "usdt":
> +            if len(args) != 3:
> +                self._raise("Invalid number of arguments for USDT")
> +
> +            self.usdt_provider = args[1]
> +            self.usdt_probe = args[2]
> +            self._verify_usdt_probe()
> +
> +        elif self.probe_type == "trace":
> +            if len(args) != 3:
> +                self._raise("Invalid number of arguments for TRACE")
> +
> +            self.trace_system = args[1]
> +            self.trace_event = args[2]
> +            self._verify_trace_probe()
> +
> +        elif self.probe_type == "kprobe" or self.probe_type == "kretprobe":
> +            if len(args) != 2:
> +                self._raise("Invalid number of arguments for K(RET)PROBE")
> +            self.kprobe_function = args[1]
> +            self._verify_kprobe_probe()
> +
> +        elif self.probe_type == "uprobe" or self.probe_type == "uretprobe":
> +            if len(args) != 2:
> +                self._raise("Invalid number of arguments for U(RET)PROBE")
> +            self.uprobe_function = args[1]
> +            self._verify_uprobe_probe()
> +
> +    def _get_kprobe_c_code(self, function_name, function_content):
> +        #
> +        # The kprobe__* do not require a function name, so it's
> +        # ignored in the code generation.
> +        #
> +        return """
> +int {}__{}(struct pt_regs *ctx) {{
> +    {}
> +}}
> +""".format(self.probe_type, self.kprobe_function, function_content)
> +
> +    def _get_trace_c_code(self, function_name, function_content):
> +        #
> +        # The TRACEPOINT_PROBE() do not require a function name, so it's
> +        # ignored in the code generation.
> +        #
> +        return """
> +TRACEPOINT_PROBE({},{}) {{
> +    {}
> +}}
> +""".format(self.trace_system, self.trace_event, function_content)
> +
> +    def _get_uprobe_c_code(self, function_name, function_content):
> +        return """
> +int {}(struct pt_regs *ctx) {{
> +    {}
> +}}
> +""".format(function_name, function_content)
> +
> +    def _get_usdt_c_code(self, function_name, function_content):
> +        return """
> +int {}(struct pt_regs *ctx) {{
> +    {}
> +}}
> +""".format(function_name, function_content)
> +
> +    def get_c_code(self, function_name, function_content):
> +        if self.probe_type == 'kprobe' or self.probe_type == 'kretprobe':
> +            return self._get_kprobe_c_code(function_name, function_content)
> +        elif self.probe_type == 'trace':
> +            return self._get_trace_c_code(function_name, function_content)
> +        elif self.probe_type == 'uprobe' or self.probe_type == 'uretprobe':
> +            return self._get_uprobe_c_code(function_name, function_content)
> +        elif self.probe_type == 'usdt':
> +            return self._get_usdt_c_code(function_name, function_content)
> +
> +        return ""
> +
> +    def probe_name(self):
> +        if self.probe_type == 'kprobe' or self.probe_type == 'kretprobe':
> +            return "{}".format(self.kprobe_function)
> +        elif self.probe_type == 'trace':
> +            return "{}:{}".format(self.trace_system,
> +                                  self.trace_event)
> +        elif self.probe_type == 'uprobe' or self.probe_type == 'uretprobe':
> +            return "{}".format(self.uprobe_function)
> +        elif self.probe_type == 'usdt':
> +            return "{}:{}".format(self.usdt_provider,
> +                                  self.usdt_probe)
> +
> +        return ""
> +
> +
> +#
> +# event_to_dict()
> +#
> +def event_to_dict(event):
> +    return dict([(field, getattr(event, field))
> +                 for (field, _) in event._fields_
> +                 if isinstance(getattr(event, field), (int, bytes))])
> +
> +
> +#
> +# Event enum
> +#
> +Event = IntEnum("Event", ["SYSCALL", "START_TRIGGER", "STOP_TRIGGER",
> +                          ], start=0)
> +
> +
> +#
> +# process_event()
> +#
> +def process_event(ctx, data, size):
> +    global start_trigger_ts
> +    global stop_trigger_ts
> +
> +    event = bpf['events'].event(data)
> +    if event.id == Event.SYSCALL:
> +        syscall_events.append({"tid": event.tid,
> +                               "ts_entry": event.entry_ts,
> +                               "ts_exit": event.ts,
> +                               "syscall": event.syscall,
> +                               "user_stack_id": event.user_stack_id,
> +                               "kernel_stack_id": event.kernel_stack_id})
> +    elif event.id == Event.START_TRIGGER:
> +        #
> +        # This event would have start the trigger already, so all we need to
> +        # do is record the start timestamp.
> +        #
> +        start_trigger_ts = event.ts
> +
> +    elif event.id == Event.STOP_TRIGGER:
> +        #
> +        # This event would have stopped the trigger already, so all we need 
> to
> +        # do is record the start timestamp.
> +        stop_trigger_ts = event.ts
> +
> +
> +#
> +# next_power_of_two()
> +#
> +def next_power_of_two(val):
> +    np = 1
> +    while np < val:
> +        np *= 2
> +    return np
> +
> +
> +#
> +# unsigned_int()
> +#
> +def unsigned_int(value):
> +    try:
> +        value = int(value)
> +    except ValueError:
> +        raise argparse.ArgumentTypeError("must be an integer")
> +
> +    if value < 0:
> +        raise argparse.ArgumentTypeError("must be positive")
> +    return value
> +
> +
> +#
> +# unsigned_nonzero_int()
> +#
> +def unsigned_nonzero_int(value):
> +    value = unsigned_int(value)
> +    if value == 0:
> +        raise argparse.ArgumentTypeError("must be nonzero")
> +    return value
> +
> +
> +#
> +# get_thread_name()
> +#
> +def get_thread_name(pid, tid):
> +    try:
> +        with open(f"/proc/{pid}/task/{tid}/comm", encoding="utf8") as f:
> +            return f.readline().strip("\n")
> +    except FileNotFoundError:
> +        pass
> +
> +    return f"<unknown:{pid}/{tid}>"
> +
> +
> +#
> +# get_vec_nr_name()
> +#
> +def get_vec_nr_name(vec_nr):
> +    known_vec_nr = ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
> +                    "tasklet", "sched", "hrtimer", "rcu"]
> +
> +    if vec_nr < 0 or vec_nr > len(known_vec_nr):
> +        return f"<unknown:{vec_nr}>"
> +
> +    return known_vec_nr[vec_nr]
> +
> +
> +#
> +# start/stop/reset capture
> +#
> +def start_capture():
> +    bpf["capture_on"][ct.c_int(0)] = ct.c_int(1)
> +
> +
> +def stop_capture(force=False):
> +    if force:
> +        bpf["capture_on"][ct.c_int(0)] = ct.c_int(0xffff)
> +    else:
> +        bpf["capture_on"][ct.c_int(0)] = ct.c_int(0)
> +
> +
> +def capture_running():
> +    return True if bpf["capture_on"][ct.c_int(0)].value == 1 else False
> +
> +
> +def reset_capture():
> +    bpf["syscall_start"].clear()
> +    bpf["syscall_data"].clear()
> +    bpf["run_start"].clear()
> +    bpf["run_data"].clear()
> +    bpf["ready_start"].clear()
> +    bpf["ready_data"].clear()
> +    bpf["hardirq_start"].clear()
> +    bpf["hardirq_data"].clear()
> +    bpf["softirq_start"].clear()
> +    bpf["softirq_data"].clear()
> +    bpf["stack_traces"].clear()
> +
> +
> +#
> +# Display timestamp
> +#
> +def print_timestamp(msg):
> +    ltz = datetime.datetime.now()
> +    utc = ltz.astimezone(pytz.utc)
> +    time_string = "{} @{} ({} UTC)".format(
> +        msg, ltz.isoformat(), utc.strftime("%H:%M:%S"))
> +    print(time_string)
> +
> +
> +#
> +# process_results()
> +#
> +def process_results(syscal_events=None, trigger_delta=None):
> +    if trigger_delta:
> +        print_timestamp("# Triggered sample dump, stop-start delta {:,} ns".
> +                        format(trigger_delta))
> +    else:
> +        print_timestamp("# Sample dump")
> +
> +    #
> +    # First get a list of all threads we need to report on.
> +    #
> +    threads_syscall = {k.tid for k, _ in bpf["syscall_data"].items()
> +                       if k.syscall != 0xffffffff}
> +
> +    threads_run = {k.tid for k, _ in bpf["run_data"].items()
> +                   if k.pid != 0xffffffff}
> +
> +    threads_ready = {k.tid for k, _ in bpf["ready_data"].items()
> +                     if k.pid != 0xffffffff}
> +
> +    threads_hardirq = {k.tid for k, _ in bpf["hardirq_data"].items()
> +                       if k.pid != 0xffffffff}
> +
> +    threads_softirq = {k.tid for k, _ in bpf["softirq_data"].items()
> +                       if k.pid != 0xffffffff}
> +
> +    threads = sorted(threads_syscall | threads_run | threads_ready |
> +                     threads_hardirq | threads_softirq,
> +                     key=lambda x: get_thread_name(options.pid, x))
> +
> +    #
> +    # Print header...
> +    #
> +    print("{:10} {:16} {}".format("TID", "THREAD", "<RESOURCE SPECIFIC>"))
> +    print("{:10} {:16} {}".format("-" * 10, "-" * 16, "-" * 76))
> +    indent = 28 * " "
> +
> +    #
> +    # Print all events/statistics per threads.
> +    #
> +    poll_id = [k for k, v in syscalls.items() if v == b'poll'][0]
> +    for thread in threads:
> +
> +        if thread != threads[0]:
> +            print("")
> +
> +        #
> +        # SYSCALL_STATISTICS
> +        #
> +        print("{:10} {:16} {}\n{}{:20} {:>6}  {:>10}  {:>16}  {:>16}".format(
> +            thread, get_thread_name(options.pid, thread),
> +            "[SYSCALL STATISTICS]", indent,
> +            "NAME", "NUMBER", "COUNT", "TOTAL ns", "MAX ns"))
> +
> +        total_count = 0
> +        total_ns = 0
> +        for k, v in sorted(bpf["syscall_data"].items(),
> +                           key=lambda kv: -kv[1].total_ns):
> +            if k.tid != thread:
> +                continue
> +
> +            print("{}{:20.20} {:6}  {:10}  {:16,}  {:16,}".format(
> +                indent, syscall_name(k.syscall).decode('utf-8'), k.syscall,
> +                v.count, v.total_ns, v.worst_ns))
> +            if k.syscall != poll_id:
> +                total_count += v.count
> +                total_ns += v.total_ns
> +
> +        if total_count > 0:
> +            print("{}{:20.20} {:6}  {:10}  {:16,}".format(
> +                indent, "TOTAL( - poll):", "", total_count, total_ns))
> +
> +        #
> +        # THREAD RUN STATISTICS
> +        #
> +        print("\n{:10} {:16} {}\n{}{:10}  {:>16}  {:>16}  {:>16}".format(
> +            "", "", "[THREAD RUN STATISTICS]", indent,
> +            "SCHED_CNT", "TOTAL ns", "MIN ns", "MAX ns"))
> +
> +        for k, v in bpf["run_data"].items():
> +            if k.tid != thread:
> +                continue
> +
> +            print("{}{:10}  {:16,}  {:16,}  {:16,}".format(
> +                indent, v.count, v.total_ns, v.min_ns, v.max_ns))
> +
> +        #
> +        # THREAD READY STATISTICS
> +        #
> +        print("\n{:10} {:16} {}\n{}{:10}  {:>16}  {:>16}".format(
> +            "", "", "[THREAD READY STATISTICS]", indent,
> +            "SCHED_CNT", "TOTAL ns", "MAX ns"))
> +
> +        for k, v in bpf["ready_data"].items():
> +            if k.tid != thread:
> +                continue
> +
> +            print("{}{:10}  {:16,}  {:16,}".format(
> +                indent, v.count, v.total_ns, v.worst_ns))
> +
> +        #
> +        # HARD IRQ STATISTICS
> +        #
> +        total_ns = 0
> +        total_count = 0
> +        header_printed = False
> +        for k, v in sorted(bpf["hardirq_data"].items(),
> +                           key=lambda kv: -kv[1].total_ns):
> +            if k.tid != thread:
> +                continue
> +
> +            if not header_printed:
> +                print("\n{:10} {:16} {}\n{}{:20}  {:>10}  {:>16}  {:>16}".
> +                      format("", "", "[HARD IRQ STATISTICS]", indent,
> +                             "NAME", "COUNT", "TOTAL ns", "MAX ns"))
> +                header_printed = True
> +
> +            print("{}{:20.20}  {:10}  {:16,}  {:16,}".format(
> +                indent, k.irq_name.decode('utf-8'),
> +                v.count, v.total_ns, v.worst_ns))
> +
> +            total_count += v.count
> +            total_ns += v.total_ns
> +
> +        if total_count > 0:
> +            print("{}{:20.20}  {:10}  {:16,}".format(
> +                indent, "TOTAL:", total_count, total_ns))
> +
> +        #
> +        # SOFT IRQ STATISTICS
> +        #
> +        total_ns = 0
> +        total_count = 0
> +        header_printed = False
> +        for k, v in sorted(bpf["softirq_data"].items(),
> +                           key=lambda kv: -kv[1].total_ns):
> +            if k.tid != thread:
> +                continue
> +
> +            if not header_printed:
> +                print("\n{:10} {:16} {}\n"
> +                      "{}{:20} {:>7}  {:>10}  {:>16}  {:>16}".
> +                      format("", "", "[SOFT IRQ STATISTICS]", indent,
> +                             "NAME", "VECT_NR", "COUNT", "TOTAL ns", "MAX 
> ns"))
> +                header_printed = True
> +
> +            print("{}{:20.20} {:>7}  {:10}  {:16,}  {:16,}".format(
> +                indent, get_vec_nr_name(k.vec_nr), k.vec_nr,
> +                v.count, v.total_ns, v.worst_ns))
> +
> +            total_count += v.count
> +            total_ns += v.total_ns
> +
> +        if total_count > 0:
> +            print("{}{:20.20} {:7}  {:10}  {:16,}".format(
> +                indent, "TOTAL:", "", total_count, total_ns))
> +
> +    #
> +    # Print events
> +    #
> +    lost_stack_traces = 0
> +    if syscall_events is not None and len(syscall_events) > 0:
> +        stack_traces = bpf.get_table("stack_traces")
> +
> +        print("\n\n# SYSCALL EVENTS:"
> +              "\n{}{:>19} {:>19} {:>10} {:16} {:>10}  {}".format(
> +                  2 * " ", "ENTRY (ns)", "EXIT (ns)", "TID", "COMM",
> +                  "DELTA (us)", "SYSCALL"))
> +        print("{}{:19} {:19} {:10} {:16} {:10}  {}".format(
> +            2 * " ", "-" * 19, "-" * 19, "-" * 10, "-" * 16,
> +            "-" * 10, "-" * 16))
> +        for event in syscall_events:
> +            print("{}{:19} {:19} {:10} {:16} {:10,}  {}".format(
> +                " " * 2,
> +                event["ts_entry"], event["ts_exit"], event["tid"],
> +                get_thread_name(options.pid, event["tid"]),
> +                int((event["ts_exit"] - event["ts_entry"]) / 1000),
> +                syscall_name(event["syscall"]).decode('utf-8')))
> +            #
> +            # Not sure where to put this, but I'll add some info on stack
> +            # traces here... Userspace stack traces are very limited due to
> +            # the fact that bcc does not support dwarf backtraces. As OVS
> +            # gets compiled without frame pointers we will not see much.
> +            # If however, OVS does get built with frame pointers, we should 
> not
> +            # use the BPF_STACK_TRACE_BUILDID as it does not seem to handle
> +            # the debug symbols correctly. Also, note that for kernel
> +            # traces you should not use BPF_STACK_TRACE_BUILDID, so two
> +            # buffers are needed.
> +            #
> +            # Some info on manual dwarf walk support:
> +            #   https://github.com/iovisor/bcc/issues/3515
> +            #   https://github.com/iovisor/bcc/pull/4463
> +            #
> +            if options.stack_trace_size == 0:
> +                continue
> +
> +            if event['kernel_stack_id'] < 0 or event['user_stack_id'] < 0:
> +                lost_stack_traces += 1
> +
> +            kernel_stack = stack_traces.walk(event['kernel_stack_id']) \
> +                if event['kernel_stack_id'] >= 0 else []
> +            user_stack = stack_traces.walk(event['user_stack_id']) \
> +                if event['user_stack_id'] >= 0 else []
> +
> +            for addr in kernel_stack:
> +                print("{}{}".format(
> +                    " " * 10,
> +                    bpf.ksym(addr, show_module=True,
> +                             show_offset=True).decode('utf-8', 'replace')))
> +
> +            for addr in user_stack:
> +                addr_str = bpf.sym(addr, options.pid, show_module=True,
> +                                   show_offset=True).decode('utf-8', 
> 'replace')
> +
> +                if addr_str == "[unknown]":
> +                    addr_str += " 0x{:x}".format(addr)
> +
> +                print("{}{}".format(" " * 10, addr_str))
> +
> +    #
> +    # Print any footer messages.
> +    #
> +    if lost_stack_traces > 0:
> +        print("\n#WARNING: We where not able to display {} stack traces!\n"
> +              "#         Consider increasing the stack trace size using\n"
> +              "#         the '--stack-trace-size' option.\n"
> +              "#         Note that this can also happen due to a stack id\n"
> +              "#         collision.".format(lost_stack_traces))
> +
> +
> +#
> +# main()
> +#
> +def main():
> +    #
> +    # Don't like these globals, but ctx passing does not seem to work with 
> the
> +    # existing open_ring_buffer() API :(
> +    #
> +    global bpf
> +    global options
> +    global syscall_events
> +    global start_trigger_ts
> +    global stop_trigger_ts
> +
> +    start_trigger_ts = 0
> +    stop_trigger_ts = 0
> +
> +    #
> +    # Argument parsing
> +    #
> +    parser = argparse.ArgumentParser()
> +
> +    parser.add_argument("-D", "--debug",
> +                        help="Enable eBPF debugging",
> +                        type=int, const=0x3f, default=0, nargs='?')
> +    parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID",
> +                        help="ovs-vswitch's PID",
> +                        type=unsigned_int, default=None)
> +    parser.add_argument("-s", "--syscall-events", metavar="DURATION_NS",
> +                        help="Record syscall events, default disabled",
> +                        type=unsigned_int, const=0, default=None, nargs='?')
> +    parser.add_argument("--buffer-page-count",
> +                        help="Number of BPF ring buffer pages, default 1024",
> +                        type=unsigned_int, default=1024, metavar="NUMBER")
> +    parser.add_argument("--sample-count",
> +                        help="Number of sample runs, default 1",
> +                        type=unsigned_nonzero_int, default=1, metavar="RUNS")
> +    parser.add_argument("--sample-interval",
> +                        help="Delay between sample runs, default 0",
> +                        type=float, default=0, metavar="SECONDS")
> +    parser.add_argument("--sample-time",
> +                        help="Sample time, default 0.5 seconds",
> +                        type=float, default=0.5, metavar="SECONDS")
> +    parser.add_argument("--skip-syscall-poll-events",
> +                        help="Skip poll() syscalls with --syscall-events",
> +                        action="store_true")
> +    parser.add_argument("--stack-trace-size",
> +                        help="Number of unique stack traces that can be "
> +                        "recorded, default 4096. 0 to disable",
> +                        type=unsigned_int, default=4096)
> +    parser.add_argument("--start-trigger", metavar="TRIGGER",
> +                        help="Start trigger, see documentation for details",
> +                        type=str, default=None)
> +    parser.add_argument("--stop-trigger", metavar="TRIGGER",
> +                        help="Stop trigger, see documentation for details",
> +                        type=str, default=None)
> +    parser.add_argument("--trigger-delta", metavar="DURATION_NS",
> +                        help="Only report event when the trigger duration > "
> +                             "DURATION_NS, default 0 (all events)",
> +                        type=unsigned_int, const=0, default=0, nargs='?')
> +
> +    options = parser.parse_args()
> +
> +    #
> +    # Find the PID of the ovs-vswitchd daemon if not specified.
> +    #
> +    if options.pid is None:
> +        for proc in psutil.process_iter():
> +            if 'ovs-vswitchd' in proc.name():
> +                if options.pid is not None:
> +                    print("ERROR: Multiple ovs-vswitchd daemons running, "
> +                          "use the -p option!")
> +                    sys.exit(-1)

We have some inconsistencies here because some utilities we send
"sys.exit(1)" and some we send "sys.exit(-1)"

I think we switched to using '1' because it is a valid return value on
other OSes.  I guess for now there isn't any need to normalize it since
these are eBPF programs intended to run on linux.  That said, maybe we
could consider a normalized value anyway just for review purposes.

> +
> +                options.pid = proc.pid
> +
> +    #
> +    # Error checking on input parameters.
> +    #
> +    if options.pid is None:
> +        print("ERROR: Failed to find ovs-vswitchd's PID!")
> +        sys.exit(-1)
> +
> +    options.buffer_page_count = next_power_of_two(options.buffer_page_count)
> +
> +    #
> +    # Make sure we are running as root, or else we can not attach the probes.
> +    #
> +    if os.geteuid() != 0:
> +        print("ERROR: We need to run as root to attached probes!")
> +        sys.exit(-1)
> +
> +    #
> +    # Setup any of the start stop triggers
> +    #
> +    if options.start_trigger is not None:
> +        try:
> +            start_trigger = Probe(options.start_trigger, pid=options.pid)
> +        except ValueError as e:
> +            print(f"ERROR: Invalid start trigger {str(e)}")
> +            sys.exit(-1)
> +    else:
> +        start_trigger = None
> +
> +    if options.stop_trigger is not None:
> +        try:
> +            stop_trigger = Probe(options.stop_trigger, pid=options.pid)
> +        except ValueError as e:
> +            print(f"ERROR: Invalid stop trigger {str(e)}")
> +            sys.exit(-1)
> +    else:
> +        stop_trigger = None
> +
> +    #
> +    # Attach probe to running process.
> +    #
> +    source = EBPF_SOURCE.replace("<EVENT_ENUM>", "\n".join(
> +        ["    EVENT_{} = {},".format(
> +            event.name, event.value) for event in Event]))
> +    source = source.replace("<BUFFER_PAGE_CNT>",
> +                            str(options.buffer_page_count))
> +    source = source.replace("<MONITOR_PID>", str(options.pid))
> +
> +    if BPF.kernel_struct_has_field(b'task_struct', b'state') == 1:
> +        source = source.replace('<STATE_FIELD>', 'state')
> +    else:
> +        source = source.replace('<STATE_FIELD>', '__state')
> +
> +    poll_id = [k for k, v in syscalls.items() if v == b'poll'][0]
> +    if options.syscall_events is None:
> +        syscall_trace_events = "false"
> +    elif options.syscall_events == 0:
> +        if not options.skip_syscall_poll_events:
> +            syscall_trace_events = "true"
> +        else:
> +            syscall_trace_events = f"args->id != {poll_id}"
> +    else:
> +        syscall_trace_events = "delta > {}".format(options.syscall_events)
> +        if options.skip_syscall_poll_events:
> +            syscall_trace_events += f" && args->id != {poll_id}"
> +
> +    source = source.replace("<SYSCAL_TRACE_EVENTS>",
> +                            syscall_trace_events)
> +
> +    source = source.replace("<STACK_TRACE_SIZE>",
> +                            str(options.stack_trace_size))
> +
> +    source = source.replace("<STACK_TRACE_ENABLED>", "true"
> +                            if options.stack_trace_size > 0 else "false")
> +
> +    #
> +    # Handle start/stop probes
> +    #
> +    if start_trigger:
> +        source = source.replace("<START_TRIGGER>",
> +                                start_trigger.get_c_code(
> +                                    "start_trigger_probe",
> +                                    "return start_trigger();"))
> +    else:
> +        source = source.replace("<START_TRIGGER>", "")
> +
> +    if stop_trigger:
> +        source = source.replace("<STOP_TRIGGER>",
> +                                stop_trigger.get_c_code(
> +                                    "stop_trigger_probe",
> +                                    "return stop_trigger();"))
> +    else:
> +        source = source.replace("<STOP_TRIGGER>", "")
> +
> +    #
> +    # Setup usdt or other probes that need handling trough the BFP class.
> +    #
> +    usdt = USDT(pid=int(options.pid))
> +    try:
> +        if start_trigger and start_trigger.probe_type == 'usdt':
> +            usdt.enable_probe(probe=start_trigger.probe_name(),
> +                              fn_name="start_trigger_probe")
> +        if stop_trigger and stop_trigger.probe_type == 'usdt':
> +            usdt.enable_probe(probe=stop_trigger.probe_name(),
> +                              fn_name="stop_trigger_probe")
> +
> +    except USDTException as e:
> +        print("ERROR: {}".format(
> +            (re.sub('^', ' ' * 7, str(e), flags=re.MULTILINE)).strip().
> +            replace("--with-dtrace or --enable-dtrace",
> +                    "--enable-usdt-probes")))
> +        sys.exit(-1)
> +
> +    bpf = BPF(text=source, usdt_contexts=[usdt], debug=options.debug)
> +
> +    if start_trigger:
> +        try:
> +            if start_trigger.probe_type == "uprobe":
> +                bpf.attach_uprobe(name=f"/proc/{options.pid}/exe",
> +                                  sym=start_trigger.probe_name(),
> +                                  fn_name="start_trigger_probe",
> +                                  pid=options.pid)
> +
> +            if start_trigger.probe_type == "uretprobe":
> +                bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe",
> +                                     sym=start_trigger.probe_name(),
> +                                     fn_name="start_trigger_probe",
> +                                     pid=options.pid)
> +        except Exception as e:
> +            print("ERROR: Failed attaching uprobe start trigger "
> +                  f"'{start_trigger.probe_name()}';\n         {str(e)}")
> +            sys.exit(-1)
> +
> +    if stop_trigger:
> +        try:
> +            if stop_trigger.probe_type == "uprobe":
> +                bpf.attach_uprobe(name=f"/proc/{options.pid}/exe",
> +                                  sym=stop_trigger.probe_name(),
> +                                  fn_name="stop_trigger_probe",
> +                                  pid=options.pid)
> +
> +            if stop_trigger.probe_type == "uretprobe":
> +                bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe",
> +                                     sym=stop_trigger.probe_name(),
> +                                     fn_name="stop_trigger_probe",
> +                                     pid=options.pid)
> +        except Exception as e:
> +            print("ERROR: Failed attaching uprobe stop trigger"
> +                  f"'{stop_trigger.probe_name()}';\n         {str(e)}")
> +            sys.exit(-1)
> +
> +    #
> +    # If no triggers are configured use the delay configuration
> +    #
> +    bpf['events'].open_ring_buffer(process_event)
> +
> +    sample_count = 0
> +    while sample_count < options.sample_count:
> +        sample_count += 1
> +        syscall_events = []
> +
> +        if options.start_trigger is None:
> +            print_timestamp("# Start sampling")
> +            start_capture()
> +            stop_time = -1 if options.stop_trigger else \
> +                time_ns() + options.sample_time * 1000000000
> +        else:
> +            # For start triggers the stop time depends on the start trigger
> +            # time, or depends on the stop trigger if configured.
> +            stop_time = -1 if options.stop_trigger else 0
> +
> +        while True:
> +            try:
> +                last_start_ts = start_trigger_ts
> +                last_stop_ts = stop_trigger_ts
> +
> +                if stop_time > 0:
> +                    delay = int((stop_time - time_ns()) / 1000000)
> +                    if delay <= 0:
> +                        break
> +                else:
> +                    delay = -1
> +
> +                bpf.ring_buffer_poll(timeout=delay)
> +
> +                if stop_time <= 0 and last_start_ts != start_trigger_ts:
> +                    print_timestamp(
> +                        "# Start sampling (trigger@{})".format(
> +                            start_trigger_ts))
> +
> +                    if not options.stop_trigger:
> +                        stop_time = time_ns() + \
> +                            options.sample_time * 1000000000
> +
> +                if last_stop_ts != stop_trigger_ts:
> +                    break
> +
> +            except KeyboardInterrupt:
> +                break
> +
> +        if options.stop_trigger and not capture_running():
> +            print_timestamp("# Stop sampling (trigger@{})".format(
> +                stop_trigger_ts))
> +        else:
> +            print_timestamp("# Stop sampling")
> +
> +        if stop_trigger_ts != 0 and start_trigger_ts != 0:
> +            trigger_delta = stop_trigger_ts - start_trigger_ts
> +        else:
> +            trigger_delta = None
> +
> +        if not trigger_delta or trigger_delta >= options.trigger_delta:
> +            stop_capture(force=True)  # Prevent a new trigger to start.
> +            process_results(syscal_events=syscall_events,
> +                            trigger_delta=trigger_delta)
> +        elif trigger_delta:
> +            sample_count -= 1
> +            print_timestamp("# Sample dump skipped, delta {:,} ns".format(
> +                trigger_delta))
> +
> +        reset_capture()
> +        stop_capture()
> +        if options.sample_interval > 0:
> +            time.sleep(options.sample_interval)
> +
> +    #
> +    # Report lost events.
> +    #
> +    dropcnt = bpf.get_table("dropcnt")
> +    for k in dropcnt.keys():
> +        count = dropcnt.sum(k).value
> +        if k.value == 0 and count > 0:
> +            print("\n# WARNING: Not all events were captured, {} were "
> +                  "dropped!\n#          Increase the BPF ring buffer size "
> +                  "with the --buffer-page-count option.".format(count))
> +
> +    if (options.sample_count > 1):
> +        trigger_miss = bpf.get_table("trigger_miss")
> +        for k in trigger_miss.keys():
> +            count = trigger_miss.sum(k).value
> +            if k.value == 0 and count > 0:
> +                print("\n# WARNING: Not all start triggers were successful. "
> +                      "{} were missed due to\n#          slow userspace "
> +                      "processing!".format(count))
> +
> +
> +#
> +# Start main() as the default entry point...
> +#
> +if __name__ == '__main__':
> +    main()
> diff --git a/utilities/usdt-scripts/kernel_delay.rst 
> b/utilities/usdt-scripts/kernel_delay.rst
> new file mode 100644
> index 000000000..95e98db34
> --- /dev/null
> +++ b/utilities/usdt-scripts/kernel_delay.rst
> @@ -0,0 +1,594 @@
> +Troubleshooting Open vSwitch: Is the kernel to blame?
> +=====================================================
> +Often, when troubleshooting Open vSwitch (OVS) in the field, you might be 
> left
> +wondering if the issue is really OVS-related, or if it's a problem with the
> +kernel being overloaded. The kernel_delay.py tool can help you quickly
> +identify if the focus of your investigation should be OVS or the Linux 
> kernel.

We might want to give additional context here.  For example, hints that
the system could be overloaded due to the WARN level log "Unreasonably
long poll..." or uname/uptime output.  I know you do reference it later
in the document, but perhaps a section on when the reader can infer that
there could be a problem.

> +
> +
> +Introduction
> +------------
> +``kernel_delay.py`` consists of a Python script that uses the BCC [#BCC]_
> +framework to install eBPF probes. The data the eBPF probes collect will be
> +analyzed and presented to the user by the Python script. Some of the 
> presented
> +data can also be captured by the individual scripts included in the BBC 
> [#BCC]_
> +framework.
> +
> +kernel_delay.py has two modes of operation:
> +
> +- In **time mode**, the tool runs for a specific time and collects the
> +  information.
> +- In **trigger mode**, event collection can be started and/or stopped based 
> on
> +  a specific eBPF probe. Currently, we support the following probes:
> +  - USDT probes
> +  - Kernel tracepoints
> +  - kprobe
> +  - kretprobe
> +  - uprobe
> +  - uretprobe
> +
> +
> +In addition, the option, ``--sample-count``, exists to specify how many
> +iterations you would like to do. When using triggers, you can also ignore
> +samples if they are less than a number of nanoseconds with the
> +``--trigger-delta`` option. The latter might be useful when debugging Linux
> +syscalls which take a long time to complete. More on this later. Finally, you
> +can configure the delay between two sample runs with the 
> ``--sample-interval``
> +option.
> +
> +Before getting into more details, let's just run the tool without any options
> +to see what the output looks like. Notice that it will try to automatically
> +get the process ID of the running ``ovs-vsdwitchd``. You can overwrite this
> +with the ``--pid`` option.
> +
> +.. code-block:: console
> +
> +  $ sudo ./kernel_delay.py
> +  # Start sampling @2023-06-08T12:17:22.725127 (10:17:22 UTC)
> +  # Stop sampling @2023-06-08T12:17:23.224781 (10:17:23 UTC)
> +  # Sample dump @2023-06-08T12:17:23.224855 (10:17:23 UTC)
> +  TID        THREAD           <RESOURCE SPECIFIC>
> +  ---------- ---------------- 
> ----------------------------------------------------------------------------
> +       27090 ovs-vswitchd     [SYSCALL STATISTICS]
> +                  <EDIT: REMOVED DATA FOR ovs-vswitchd THREAD>
> +
> +       31741 revalidator122   [SYSCALL STATISTICS]
> +                  NAME                 NUMBER       COUNT          TOTAL ns  
>           MAX ns
> +                  poll                      7           5       184,193,176  
>      184,191,520
> +                  recvmsg                  47         494       125,208,756  
>          310,331
> +                  futex                   202           8        18,768,758  
>        4,023,039
> +                  sendto                   44          10           375,861  
>          266,867
> +                  sendmsg                  46           4            43,294  
>           11,213
> +                  write                     1           1             5,949  
>            5,949
> +                  getrusage                98           1             1,424  
>            1,424
> +                  read                      0           1             1,292  
>            1,292
> +                  TOTAL( - poll):                     519       144,405,334
> +
> +                  [THREAD RUN STATISTICS]
> +                  SCHED_CNT           TOTAL ns            MIN ns            
> MAX ns
> +                       6       136,764,071             1,480       
> 115,146,424
> +
> +                  [THREAD READY STATISTICS]
> +                  SCHED_CNT           TOTAL ns            MAX ns
> +                       7            11,334             6,636
> +
> +                  [HARD IRQ STATISTICS]
> +                  NAME                       COUNT          TOTAL ns         
>    MAX ns
> +                  eno8303-rx-1                   1             3,586         
>     3,586
> +                  TOTAL:                         1             3,586
> +
> +                  [SOFT IRQ STATISTICS]
> +                  NAME                 VECT_NR       COUNT          TOTAL ns 
>            MAX ns
> +                  net_rx                     3           1            17,699 
>            17,699
> +                  sched                      7           6            13,820 
>             3,226
> +                  rcu                        9          16            13,586 
>             1,554
> +                  timer                      1           3            10,259 
>             3,815
> +                  TOTAL:                                26            55,364
> +
> +
> +By default, the tool will run for half a second in `time mode`. To extend 
> this
> +you can use the ``--sample-time`` option.
> +
> +
> +What will it report
> +-------------------
> +The above sample output separates the captured data on a per-thread basis.
> +For this, it displays the thread's id (``TID``) and name (``THREAD``),
> +followed by resource-specific data. Which are:
> +
> +- ``SYSCALL STATISTICS``
> +- ``THREAD RUN STATISTICS``
> +- ``THREAD READY STATISTICS``
> +- ``HARD IRQ STATISTICS``
> +- ``SOFT IRQ STATISTICS``
> +
> +The following sections will describe in detail what statistics they report.
> +
> +
> +``SYSCALL STATISTICS``
> +~~~~~~~~~~~~~~~~~~~~~~
> +``SYSCALL STATISTICS`` tell you which Linux system calls got executed during
> +the measurement interval. This includes the number of times the syscall was
> +called (``COUNT``), the total time spent in the system calls (``TOTAL ns``),
> +and the worst-case duration of a single call (``MAX ns``).
> +
> +It also shows the total of all system calls, but it excludes the poll system
> +call, as the purpose of this call is to wait for activity on a set of 
> sockets,
> +and usually, the thread gets swapped out.
> +
> +Note that it only counts calls that started and stopped during the
> +measurement interval!
> +
> +
> +``THREAD RUN STATISTICS``
> +~~~~~~~~~~~~~~~~~~~~~~~~~
> +``THREAD RUN STATISTICS`` tell you how long the thread was running on a CPU
> +during the measurement interval.
> +
> +Note that these statistics only count events where the thread started and
> +stopped running on a CPU during the measurement interval. For example, if
> +this was a PMD thread, you should see zero ``SCHED_CNT`` and ``TOTAL_ns``.
> +If not, there might be a misconfiguration.
> +
> +
> +``THREAD READY STATISTICS``
> +~~~~~~~~~~~~~~~~~~~~~~~~~~~
> +``THREAD READY STATISTICS`` tell you the time between the thread being ready
> +to run and it actually running on the CPU.
> +
> +Note that these statistics only count events where the thread was getting
> +ready to run and started running during the measurement interval.
> +
> +
> +``HARD IRQ STATISTICS``
> +~~~~~~~~~~~~~~~~~~~~~~~
> +``HARD IRQ STATISTICS`` tell you how much time was spent servicing hard
> +interrupts during the threads run time.
> +
> +It shows the interrupt name (``NAME``), the number of interrupts (``COUNT``),
> +the total time spent in the interrupt handler (``TOTAL ns``), and the
> +worst-case duration (``MAX ns``).
> +
> +
> +``SOFT IRQ STATISTICS``
> +~~~~~~~~~~~~~~~~~~~~~~~
> +``SOFT IRQ STATISTICS`` tell you how much time was spent servicing soft
> +interrupts during the threads run time.
> +
> +It shows the interrupt name (``NAME``), vector number (``VECT_NR``), the
> +number of interrupts (``COUNT``), the total time spent in the interrupt
> +handler (``TOTAL ns``), and the worst-case duration (``MAX ns``).
> +
> +
> +The ``--syscall-events`` option
> +-------------------------------
> +In addition to reporting global syscall statistics in ``SYSCALL_STATISTICS``,
> +the tool can also report each individual syscall. This can be a usefull
> +second step if the ``SYSCALL_STATISTICS`` show high latency numbers.
> +
> +All you need to do is add the ``--syscall-events`` option, with or without
> +the additional ``DURATION_NS`` parameter. The ``DUTATION_NS`` parameter
> +allows you to exclude events that take less than the supplied time.
> +
> +The ``--skip-syscall-poll-events`` option allows you to exclude poll
> +syscalls from the report.
> +
> +Below is an example run, note that I have removed the resource-specific data
> +to highlight the syscall events:
> +
> +.. code-block:: console
> +
> +  $ sudo ./kernel_delay.py  --syscall-events 50000 --skip-syscall-poll-events
> +  # Start sampling @2023-06-13T17:10:46.460874 (15:10:46 UTC)
> +  # Stop sampling @2023-06-13T17:10:46.960727 (15:10:46 UTC)
> +  # Sample dump @2023-06-13T17:10:46.961033 (15:10:46 UTC)
> +  TID        THREAD           <RESOURCE SPECIFIC>
> +  ---------- ---------------- 
> ----------------------------------------------------------------------------
> +     3359686 ipf_clean2       [SYSCALL STATISTICS]
> +     ...
> +     3359635 ovs-vswitchd     [SYSCALL STATISTICS]
> +     ...
> +     3359697 revalidator12    [SYSCALL STATISTICS]
> +     ...
> +     3359698 revalidator13    [SYSCALL STATISTICS]
> +     ...
> +     3359699 revalidator14    [SYSCALL STATISTICS]
> +     ...
> +     3359700 revalidator15    [SYSCALL STATISTICS]
> +     ...
> +
> +  # SYSCALL EVENTS:
> +         ENTRY (ns)           EXIT (ns)        TID COMM             DELTA 
> (us)  SYSCALL
> +    ------------------- ------------------- ---------- ---------------- 
> ----------  ----------------
> +       2161821694935486    2161821695031201    3359699 revalidator14         
>    95  futex
> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
> +        syscall_exit_to_user_mode+0x9 [kernel]
> +        do_syscall_64+0x68 [kernel]
> +        entry_SYSCALL_64_after_hwframe+0x72 [kernel]
> +        __GI___lll_lock_wait+0x30 [libc.so.6]
> +        ovs_mutex_lock_at+0x18 [ovs-vswitchd]
> +        [unknown] 0x696c003936313a63
> +       2161821695276882    2161821695333687    3359698 revalidator13         
>    56  futex
> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
> +        syscall_exit_to_user_mode+0x9 [kernel]
> +        do_syscall_64+0x68 [kernel]
> +        entry_SYSCALL_64_after_hwframe+0x72 [kernel]
> +        __GI___lll_lock_wait+0x30 [libc.so.6]
> +        ovs_mutex_lock_at+0x18 [ovs-vswitchd]
> +        [unknown] 0x696c003134313a63
> +       2161821695275820    2161821695405733    3359700 revalidator15         
>   129  futex
> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
> +        syscall_exit_to_user_mode+0x9 [kernel]
> +        do_syscall_64+0x68 [kernel]
> +        entry_SYSCALL_64_after_hwframe+0x72 [kernel]
> +        __GI___lll_lock_wait+0x30 [libc.so.6]
> +        ovs_mutex_lock_at+0x18 [ovs-vswitchd]
> +        [unknown] 0x696c003936313a63
> +       2161821695964969    2161821696052021    3359635 ovs-vswitchd          
>    87  accept
> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
> +        syscall_exit_to_user_mode+0x9 [kernel]
> +        do_syscall_64+0x68 [kernel]
> +        entry_SYSCALL_64_after_hwframe+0x72 [kernel]
> +        __GI_accept+0x4d [libc.so.6]
> +        pfd_accept+0x3a [ovs-vswitchd]
> +        [unknown] 0x7fff19f2bd00
> +        [unknown] 0xe4b8001f0f
> +
> +As you can see above, the output also shows the stackback trace. You can
> +disable this using the ``--stack-trace-size 0`` option.
> +
> +As you can see above, the backtrace does not show a lot of useful information
> +due to the BCC [#BCC]_ toolkit not supporting dwarf decoding. So to further
> +analyze system call backtraces, you could use perf. The following perf
> +script can do this for you (refer to the embedded instructions):
> +
> +https://github.com/chaudron/perf_scripts/blob/master/analyze_perf_pmd_syscall.py

Maybe we should consider putting something like that in-tree for use.

> +
> +
> +Using triggers
> +--------------
> +The tool supports start and, or stop triggers. This will allow you to capture
> +statistics triggered by a specific event. First, let's look at what
> +combinations of stop-and-start triggers we can use.
> +
> +If you only use ``--start-trigger``, the inspection start when the trigger
> +happens and runs until the ``--sample-time`` number of seconds has passed.
> +The example below shows all the supported options in this scenario.
> +
> +.. code-block:: console
> +
> +  $ sudo ./kernel_delay.py --start-trigger up:bridge_run --sample-time 4 \
> +                           --sample-count 4 --sample-interval 1
> +
> +
> +If you only use ``--stop-trigger``, the inspection starts immediately and
> +stops when the trigger happens.  The example below shows all the supported
> +options in this scenario.
> +
> +.. code-block:: console
> +
> +  $ sudo ./kernel_delay.py --stop-trigger upr:bridge_run \
> +                           --sample-count 4 --sample-interval 1
> +
> +
> +If you use both ``--start-trigger`` and ``--stop-trigger`` triggers, the
> +statistics are captured between the two first occurrences of these events.
> +The example below shows all the supported options in this scenario.
> +
> +.. code-block:: console
> +
> +  $ sudo ./kernel_delay.py --start-trigger up:bridge_run \
> +                           --stop-trigger upr:bridge_run \
> +                           --sample-count 4 --sample-interval 1 \
> +                           --trigger-delta 50000
> +
> +
> +Now that we know how these triggers can be used, let's investigate what
> +triggers are supported. What we call triggers, BCC [#BCC]_, calls events;
> +these are eBPF tracepoints you can attach to. For more details on the
> +supported tracepoints, check out the BCC documentation [#BCC_EVENT]_.
> +
> +The list below shows the supported triggers and their argument format:
> +
> +**USDT probes:**
> +  [u|usdt]:{provider}:{probe}
> +**Kernel tracepoint:**
> +  [t:trace]:{system}:{event}
> +**kprobe:**
> +  [k:kprobe]:{kernel_function}
> +**kretprobe:**
> +  [kr:kretprobe]:{kernel_function}
> +**uprobe:**
> +  [up:uprobe]:{function}
> +**uretprobe:**
> +  [upr:uretprobe]:{function}
> +
> +Here are a couple of trigger examples, more use-case-specific examples can be
> +found in the *Examples* section.
> +
> +.. code-block:: console
> +
> +  --start|stop-trigger u:udpif_revalidator:start_dump
> +  --start|stop-trigger t:openvswitch:ovs_dp_upcall
> +  --start|stop-trigger k:ovs_dp_process_packet
> +  --start|stop-trigger kr:ovs_dp_process_packet
> +  --start|stop-trigger up:bridge_run
> +  --start|stop-trigger upr:bridge_run
> +
> +
> +Examples
> +--------
> +This section will give some examples of how to use this tool in real-world
> +scenarios. Let's start with the issue where Open vSwitch reports
> +``Unreasonably long XXXXms poll interval`` on your revalidator threads. Note
> +that there is a blog available explaining how the revalidator process works
> +in OVS [#REVAL_BLOG]_.
> +
> +First, let me explain this log message. It gets logged if the time delta
> +between two ``poll_block()`` calls is more than 1 second. In other words,
> +the process was spending a lot of time processing stuff that was made
> +available by the return of the ``poll_block()`` function.
> +
> +Do a run with the tool using the existing USDT revalidator probes as a start
> +and stop trigger (Note that I removed the resource-specific data from the 
> none
> +revalidator threads):
> +
> +.. code-block:: console
> +
> +  $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump 
> --stop-trigger u:udpif_revalidator:sweep_done
> +  # Start sampling (trigger@791777093512008) @2023-06-14T14:52:00.110303 
> (12:52:00 UTC)
> +  # Stop sampling (trigger@791778281498462) @2023-06-14T14:52:01.297975 
> (12:52:01 UTC)
> +  # Triggered sample dump, stop-start delta 1,187,986,454 ns 
> @2023-06-14T14:52:01.298021 (12:52:01 UTC)
> +  TID        THREAD           <RESOURCE SPECIFIC>
> +  ---------- ---------------- 
> ----------------------------------------------------------------------------
> +     1457761 handler24        [SYSCALL STATISTICS]
> +                              NAME                 NUMBER       COUNT        
>   TOTAL ns            MAX ns
> +                              sendmsg                  46        6110       
> 123,274,761            41,776
> +                              recvmsg                  47      136299        
> 99,397,508            49,896
> +                              futex                   202          51        
>  7,655,832         7,536,776
> +                              poll                      7        4068        
>  1,202,883             2,907
> +                              getrusage                98        2034        
>    586,602             1,398
> +                              sendto                   44           9        
>    213,682            27,417
> +                              TOTAL( - poll):                  144503       
> 231,128,385
> +
> +                              [THREAD RUN STATISTICS]
> +                              SCHED_CNT           TOTAL ns            MIN ns 
>            MAX ns
> +
> +                              [THREAD READY STATISTICS]
> +                              SCHED_CNT           TOTAL ns            MAX ns
> +                                       1             1,438             1,438
> +
> +                              [SOFT IRQ STATISTICS]
> +                              NAME                 VECT_NR       COUNT       
>    TOTAL ns            MAX ns
> +                              sched                      7          21       
>      59,145             3,769
> +                              rcu                        9          50       
>      42,917             2,234
> +                              TOTAL:                                71       
>     102,062
> +     1457733 ovs-vswitchd     [SYSCALL STATISTICS]
> +     ...
> +     1457792 revalidator55    [SYSCALL STATISTICS]
> +                              NAME                 NUMBER       COUNT        
>   TOTAL ns            MAX ns
> +                              futex                   202          73       
> 572,576,329        19,621,600
> +                              recvmsg                  47         815       
> 296,697,618           405,338
> +                              sendto                   44           3        
>     78,302            26,837
> +                              sendmsg                  46           3        
>     38,712            13,250
> +                              write                     1           1        
>      5,073             5,073
> +                              TOTAL( - poll):                     895       
> 869,396,034
> +
> +                              [THREAD RUN STATISTICS]
> +                              SCHED_CNT           TOTAL ns            MIN ns 
>            MAX ns
> +                                      48       394,350,393             1,729 
>       140,455,796
> +
> +                              [THREAD READY STATISTICS]
> +                              SCHED_CNT           TOTAL ns            MAX ns
> +                                      49            23,650             1,559
> +
> +                              [SOFT IRQ STATISTICS]
> +                              NAME                 VECT_NR       COUNT       
>    TOTAL ns            MAX ns
> +                              sched                      7          14       
>      26,889             3,041
> +                              rcu                        9          28       
>      23,024             1,600
> +                              TOTAL:                                42       
>      49,913
> +
> +
> +You can see from the start of the output that the trigger took more than a
> +second (1,187,986,454 ns), which we would already know,  by looking at the
> +output of the ``ovs-vsctl upcall/show`` command.
> +
> +From the *revalidator55*'s ``SYSCALL STATISTICS`` statistics we can see it
> +spent almost 870ms handling syscalls, and there were no poll() calls being
> +executed. The ``THREAD RUN STATISTICS`` statistics here are a bit misleading,
> +as it looks like we only spent 394ms on the CPU. But earlier, we learned that
> +this time does not include the time being on the CPU at the start or stop of
> +an event. What is exactly the case here, because we are using USDT probes.
> +
> +From the above data and maybe some ``top`` output, we can determine that the
> +*revalidator55* thread is taking a lot of CPU time, probably because it has
> +to do a lot of revalidator work by itself. The solution here is to increase
> +the number of revalidator threads, so more work could be done in parallel.
> +
> +Let's do another run of the same command in another scenario:
> +
> +.. code-block:: console
> +
> +  $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump 
> --stop-trigger u:udpif_revalidator:sweep_done
> +  # Start sampling (trigger@795160501758971) @2023-06-14T15:48:23.518512 
> (13:48:23 UTC)
> +  # Stop sampling (trigger@795160764940201) @2023-06-14T15:48:23.781381 
> (13:48:23 UTC)
> +  # Triggered sample dump, stop-start delta 263,181,230 ns 
> @2023-06-14T15:48:23.781414 (13:48:23 UTC)
> +  TID        THREAD           <RESOURCE SPECIFIC>
> +  ---------- ---------------- 
> ----------------------------------------------------------------------------
> +     1457733 ovs-vswitchd     [SYSCALL STATISTICS]
> +                              ...
> +     1457792 revalidator55    [SYSCALL STATISTICS]
> +                              NAME                 NUMBER       COUNT        
>   TOTAL ns            MAX ns
> +                              recvmsg                  47         284       
> 193,422,110        46,248,418
> +                              sendto                   44           2        
>     46,685            23,665
> +                              sendmsg                  46           2        
>     24,916            12,703
> +                              write                     1           1        
>      6,534             6,534
> +                              TOTAL( - poll):                     289       
> 193,500,245
> +
> +                              [THREAD RUN STATISTICS]
> +                              SCHED_CNT           TOTAL ns            MIN ns 
>            MAX ns
> +                                       2        47,333,558           331,516 
>        47,002,042
> +
> +                              [THREAD READY STATISTICS]
> +                              SCHED_CNT           TOTAL ns            MAX ns
> +                                       3        87,000,403        45,999,712
> +
> +                              [SOFT IRQ STATISTICS]
> +                              NAME                 VECT_NR       COUNT       
>    TOTAL ns            MAX ns
> +                              sched                      7           2       
>       9,504             5,109
> +                              TOTAL:                                 2       
>       9,504
> +
> +
> +Here you can see the revalidator run took about 263ms, which does not look
> +odd, however, the ``THREAD READY STATISTICS`` information shows us we were
> +waiting 87ms for a CPU to be run on. This means the revalidator process could
> +have finished 87ms faster. Looking at the ``MAX ns`` value, we see a
> +worst-case delay of almost 46ms, which hints at an overloaded system.
> +
> +One final example where we use a ``uprobe`` to get some statistics on
> +a ``bridge_run()`` execution that takes more than 1ms.
> +
> +.. code-block:: console
> +
> +  $ sudo ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger 
> ur:bridge_run --trigger-delta 1000000
> +  # Start sampling (trigger@2245245432101270) @2023-06-14T16:21:10.467919 
> (14:21:10 UTC)
> +  # Stop sampling (trigger@2245245432414656) @2023-06-14T16:21:10.468296 
> (14:21:10 UTC)
> +  # Sample dump skipped, delta 313,386 ns @2023-06-14T16:21:10.468419 
> (14:21:10 UTC)
> +  # Start sampling (trigger@2245245505301745) @2023-06-14T16:21:10.540970 
> (14:21:10 UTC)
> +  # Stop sampling (trigger@2245245506911119) @2023-06-14T16:21:10.542499 
> (14:21:10 UTC)
> +  # Triggered sample dump, stop-start delta 1,609,374 ns 
> @2023-06-14T16:21:10.542565 (14:21:10 UTC)
> +  TID        THREAD           <RESOURCE SPECIFIC>
> +  ---------- ---------------- 
> ----------------------------------------------------------------------------
> +     3371035 <unknown:3366258/3371035> [SYSCALL STATISTICS]
> +     ... <REMOVED 7 MORE unknown THREADS>
> +     3371102 handler66        [SYSCALL STATISTICS]
> +     ... <REMOVED 7 MORE HANDLER THREADS>
> +     3366258 ovs-vswitchd     [SYSCALL STATISTICS]
> +                              NAME                 NUMBER       COUNT        
>   TOTAL ns            MAX ns
> +                              futex                   202          43        
>    403,469           199,312
> +                              clone3                  435          13        
>    174,394            30,731
> +                              munmap                   11           8        
>    115,774            21,861
> +                              poll                      7           5        
>     92,969            38,307
> +                              unlink                   87           2        
>     49,918            35,741
> +                              mprotect                 10           8        
>     47,618            13,201
> +                              accept                   43          10        
>     31,360             6,976
> +                              mmap                      9           8        
>     30,279             5,776
> +                              write                     1           6        
>     27,720            11,774
> +                              rt_sigprocmask           14          28        
>     12,281               970
> +                              read                      0           6        
>      9,478             2,318
> +                              recvfrom                 45           3        
>      7,024             4,024
> +                              sendto                   44           1        
>      4,684             4,684
> +                              getrusage                98           5        
>      4,594             1,342
> +                              close                     3           2        
>      2,918             1,627
> +                              recvmsg                  47           1        
>      2,722             2,722
> +                              TOTAL( - poll):                     144        
>    924,233
> +
> +                              [THREAD RUN STATISTICS]
> +                              SCHED_CNT           TOTAL ns            MIN ns 
>            MAX ns
> +                                      13           817,605             5,433 
>           524,376
> +
> +                              [THREAD READY STATISTICS]
> +                              SCHED_CNT           TOTAL ns            MAX ns
> +                                      14            28,646            11,566
> +
> +                              [SOFT IRQ STATISTICS]
> +                              NAME                 VECT_NR       COUNT       
>    TOTAL ns            MAX ns
> +                              rcu                        9           1       
>       2,838             2,838
> +                              TOTAL:                                 1       
>       2,838
> +
> +     3371110 revalidator74    [SYSCALL STATISTICS]
> +     ... <REMOVED 7 MORE NEW revalidator THREADS>
> +     3366311 urcu3            [SYSCALL STATISTICS]
> +     ...
> +
> +
> +We removed some of the threads and their resource-specific data, but based
> +on the ``<unknown:3366258/3371035>`` thread name, you can see that some
> +threads no longer exist. In the ``ovs-vswitchd`` thread, you can see some
> +``clone3`` syscalls, indicating threads were created. In this example, it was
> +due to the deletion of a bridge, which resulted in the recreation of the
> +revalidator and handler threads.
> +
> +
> +Use with Openshift
> +------------------
> +This section describes how you would use the tool on a node in an OpenShift
> +cluster. It assumes you have console access to the node, either directly or
> +through a debug container.
> +
> +We will use a base fedora38 container through podman, as this will allow us 
> to
> +install some additional tools and packages we need.
> +
> +The first thing we need to do is to start the container:
> +
> +.. code-block:: console
> +
> +  [core@sno-master ~]$ sudo podman run -it --rm \
> +     -e PS1='[(DEBUG)\u@\h \W]\$ ' \
> +     --privileged --network=host --pid=host \
> +     -v /lib/modules:/lib/modules:ro \
> +     -v /sys/kernel/debug:/sys/kernel/debug \
> +     -v /proc:/proc \
> +     -v /:/mnt/rootdir \
> +     quay.io/fedora/fedora:38-x86_64
> +
> +  [(DEBUG)root@sno-master /]#
> +
> +
> +Next add the ``linux_delay.py`` dependencies:
> +
> +.. code-block:: console
> +
> +  [(DEBUG)root@sno-master /]# dnf install -y bcc-tools perl-interpreter \
> +       python3-pytz  python3-psutil
> +
> +
> +You need to install the devel, debug and source RPMs for your OVS and kernel
> +version:
> +
> +.. code-block:: console
> +
> +  [(DEBUG)root@sno-master home]# rpm -i \
> +      openvswitch2.17-debuginfo-2.17.0-67.el8fdp.x86_64.rpm \
> +      openvswitch2.17-debugsource-2.17.0-67.el8fdp.x86_64.rpm \
> +      kernel-devel-4.18.0-372.41.1.el8_6.x86_64.rpm
> +
> +
> +Now we can run the tool. Here we use the above ```bridge_run()`` example:
> +
> +.. code-block:: console
> +
> +  [(DEBUG)root@sno-master home]# ./kernel_delay.py --start-trigger 
> up:bridge_run --stop-trigger ur:bridge_run
> +  # Start sampling (trigger@75279117343513) @2023-06-15T11:44:07.628372 
> (11:44:07 UTC)
> +  # Stop sampling (trigger@75279117443980) @2023-06-15T11:44:07.628529 
> (11:44:07 UTC)
> +  # Triggered sample dump, stop-start delta 100,467 ns 
> @2023-06-15T11:44:07.628569 (11:44:07 UTC)
> +  TID        THREAD           <RESOURCE SPECIFIC>
> +  ---------- ---------------- 
> ----------------------------------------------------------------------------
> +        1246 ovs-vswitchd     [SYSCALL STATISTICS]
> +                              NAME                 NUMBER       COUNT        
>   TOTAL ns            MAX ns
> +                              getdents64              217           2        
>      8,560             8,162
> +                              openat                  257           1        
>      6,951             6,951
> +                              accept                   43           4        
>      6,942             3,763
> +                              recvfrom                 45           1        
>      3,726             3,726
> +                              recvmsg                  47           2        
>      2,880             2,188
> +                              stat                      4           2        
>      1,946             1,384
> +                              close                     3           1        
>      1,393             1,393
> +                              fstat                     5           1        
>      1,324             1,324
> +                              TOTAL( - poll):                      14        
>     33,722
> +
> +                              [THREAD RUN STATISTICS]
> +                              SCHED_CNT           TOTAL ns            MIN ns 
>            MAX ns
> +
> +                              [THREAD READY STATISTICS]
> +                              SCHED_CNT           TOTAL ns            MAX ns
> +
> +
> +.. rubric:: Footnotes
> +
> +.. [#BCC] https://github.com/iovisor/bcc
> +.. [#BCC_EVENT] 
> https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments
> +.. [#REVAL_BLOG] 
> https://developers.redhat.com/articles/2022/10/19/open-vswitch-revalidator-process-explained
>
> _______________________________________________
> dev mailing list
> [email protected]
> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Re: [ovs-dev] [PATCH] utilities: Add kernel_delay.py script to debug a busy Linux kernel.

Reply via email to