Re: [ovs-dev] [PATCH] utilities: Add kernel_delay.py script to debug a busy Linux kernel.

Eelco Chaudron Tue, 05 Sep 2023 01:38:14 -0700


On 28 Aug 2023, at 16:15, Aaron Conole wrote:


> Eelco Chaudron <[email protected]> writes:
>
>> This patch adds an utility that can be used to determine if
>> an issue is related to a lack of Linux kernel resources.
>>
>> This tool is also featured in a Red Hat developers blog article:
>>
>>  
>> https://developers.redhat.com/articles/2023/07/24/troubleshooting-open-vswitch-kernel-blame
>>
>> Signed-off-by: Eelco Chaudron <[email protected]>
>> ---
>
> Hi Eelco,
>
> Thanks for the writeup and utilities.

Thanks see, inlines below, and will sent an updated version.

Cheers,

Eelco

>>  utilities/automake.mk                   |    4
>>  utilities/usdt-scripts/kernel_delay.py  | 1397 
>> +++++++++++++++++++++++++++++++
>>  utilities/usdt-scripts/kernel_delay.rst |  594 +++++++++++++
>>  3 files changed, 1995 insertions(+)
>>  create mode 100755 utilities/usdt-scripts/kernel_delay.py
>>  create mode 100644 utilities/usdt-scripts/kernel_delay.rst
>>
>> diff --git a/utilities/automake.mk b/utilities/automake.mk
>> index 37d679f82..9a2114df4 100644
>> --- a/utilities/automake.mk
>> +++ b/utilities/automake.mk
>> @@ -23,6 +23,8 @@ scripts_DATA += utilities/ovs-lib
>>  usdt_SCRIPTS += \
>>      utilities/usdt-scripts/bridge_loop.bt \
>>      utilities/usdt-scripts/dpif_nl_exec_monitor.py \
>> +    utilities/usdt-scripts/kernel_delay.py \
>> +    utilities/usdt-scripts/kernel_delay.rst \
>>      utilities/usdt-scripts/reval_monitor.py \
>>      utilities/usdt-scripts/upcall_cost.py \
>>      utilities/usdt-scripts/upcall_monitor.py
>> @@ -70,6 +72,8 @@ EXTRA_DIST += \
>>      utilities/docker/debian/build-kernel-modules.sh \
>>      utilities/usdt-scripts/bridge_loop.bt \
>>      utilities/usdt-scripts/dpif_nl_exec_monitor.py \
>> +    utilities/usdt-scripts/kernel_delay.py \
>> +    utilities/usdt-scripts/kernel_delay.rst \
>>      utilities/usdt-scripts/reval_monitor.py \
>>      utilities/usdt-scripts/upcall_cost.py \
>>      utilities/usdt-scripts/upcall_monitor.py
>> diff --git a/utilities/usdt-scripts/kernel_delay.py 
>> b/utilities/usdt-scripts/kernel_delay.py
>> new file mode 100755
>> index 000000000..bba7bc044
>> --- /dev/null
>> +++ b/utilities/usdt-scripts/kernel_delay.py
>> @@ -0,0 +1,1397 @@
>> +#!/usr/bin/env python3
>> +#
>> +# Copyright (c) 2022 Red Hat, Inc.
>
> Maybe this should be 2022,2023 ?

This is always puzzling to me, Should it only be the first year, or all years?
Well to be sure I’ll add 2023 ;)

>> +#
>> +# Licensed under the Apache License, Version 2.0 (the "License");
>> +# you may not use this file except in compliance with the License.
>> +# You may obtain a copy of the License at:
>> +#
>> +#     http://www.apache.org/licenses/LICENSE-2.0
>> +#
>> +# Unless required by applicable law or agreed to in writing, software
>> +# distributed under the License is distributed on an "AS IS" BASIS,
>> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> +# See the License for the specific language governing permissions and
>> +# limitations under the License.
>> +#
>> +# Script information:
>> +# -------------------
>> +# This script allows a developer to quickly identify if the issue at hand
>> +# might be related to the kernel running out of resources or if it really is
>> +# an Open vSwitch issue.
>> +#
>> +# For documentation see the kernel_delay.rst file.
>> +#
>> +try:
>> +    from bcc import BPF, USDT, USDTException
>> +    from bcc.syscall import syscalls, syscall_name
>> +except ModuleNotFoundError:
>> +    print("ERROR: Can't find the BPF Compiler Collection (BCC) tools!")
>
> I think this needs to exit in this exception block

Thanks, missed it.

>> +
>> +from enum import IntEnum
>> +
>> +import argparse
>> +import datetime
>> +import os
>> +import pytz
>> +import psutil
>> +import re
>> +import sys
>> +import time
>> +
>> +import ctypes as ct
>> +
>> +try:
>> +    from time import time_ns
>> +except ImportError:
>> +    # For compatibility with Python <= v3.6.
>> +    def time_ns():
>> +        now = datetime.datetime.now()
>> +        return int(now.timestamp() * 1e9)
>> +
>> +
>> +#
>> +# Actual eBPF source code
>> +#
>> +EBPF_SOURCE = """
>> +#include <linux/irq.h>
>> +#include <linux/sched.h>
>> +
>> +#define MONITOR_PID <MONITOR_PID>
>> +
>> +enum {
>> +<EVENT_ENUM>
>> +};
>> +
>> +struct event_t {
>> +    u64 ts;
>> +    u32 tid;
>> +    u32 id;
>> +
>> +    int user_stack_id;
>> +    int kernel_stack_id;
>> +
>> +    u32 syscall;
>> +    u64 entry_ts;
>> +
>> +};
>> +
>> +BPF_RINGBUF_OUTPUT(events, <BUFFER_PAGE_CNT>);
>> +BPF_STACK_TRACE(stack_traces, <STACK_TRACE_SIZE>);
>> +BPF_TABLE("percpu_array", uint32_t, uint64_t, dropcnt, 1);
>> +BPF_TABLE("percpu_array", uint32_t, uint64_t, trigger_miss, 1);
>> +
>> +BPF_ARRAY(capture_on, u64, 1);
>> +static bool capture_enabled(u64 pid_tgid) {
>> +    int key = 0;
>> +    u64 *ret;
>> +
>> +    if ((pid_tgid >> 32) != MONITOR_PID)
>> +        return false;
>> +
>> +    ret = capture_on.lookup(&key);
>> +    return ret && *ret == 1;
>> +}
>> +
>> +static inline bool capture_enabled__() {
>> +    int key = 0;
>> +    u64 *ret;
>> +
>> +    ret = capture_on.lookup(&key);
>> +    return ret && *ret == 1;
>> +}
>> +
>> +
>> +static struct event_t *get_event(uint32_t id) {
>> +    struct event_t *event = events.ringbuf_reserve(sizeof(struct event_t));
>> +
>> +    if (!event) {
>> +        dropcnt.increment(0);
>> +        return NULL;
>> +    }
>> +
>> +    event->id = id;
>> +    event->ts = bpf_ktime_get_ns();
>> +    event->tid = bpf_get_current_pid_tgid();
>> +
>> +    return event;
>> +}
>> +
>> +static int start_trigger() {
>> +    int key = 0;
>> +    u64 *val = capture_on.lookup(&key);
>> +
>> +    /* If the value is -1 we can't start as we are still processing the
>> +     * results in userspace. */
>> +    if (!val || *val != 0) {
>> +        trigger_miss.increment(0);
>> +        return 0;
>> +    }
>> +
>> +    struct event_t *event = get_event(EVENT_START_TRIGGER);
>> +    if (event) {
>> +       events.ringbuf_submit(event, 0);
>> +       *val = 1;
>> +    } else {
>> +        trigger_miss.increment(0);
>> +    }
>> +    return 0;
>> +}
>> +
>> +static int stop_trigger() {
>> +    int key = 0;
>> +    u64 *val = capture_on.lookup(&key);
>> +
>> +    if (!val || *val != 1)
>> +        return 0;
>> +
>> +    struct event_t *event = get_event(EVENT_STOP_TRIGGER);
>> +
>> +    if (event)
>> +       events.ringbuf_submit(event, 0);
>> +
>> +    if (val)
>> +        *val = -1;
>> +
>> +    return 0;
>> +}
>> +
>> +<START_TRIGGER>
>> +<STOP_TRIGGER>
>> +
>> +
>> +/*
>> + * For the syscall monitor the following probes get installed.
>> + */
>> +struct syscall_data_t {
>> +    u64 count;
>> +    u64 total_ns;
>> +    u64 worst_ns;
>> +};
>> +
>> +struct syscall_data_key_t {
>> +    u32 pid;
>> +    u32 tid;
>> +    u32 syscall;
>> +};
>> +
>> +BPF_HASH(syscall_start, u64, u64);
>> +BPF_HASH(syscall_data, struct syscall_data_key_t, struct syscall_data_t);
>> +
>> +TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
>> +    u64 pid_tgid = bpf_get_current_pid_tgid();
>> +
>> +    if (!capture_enabled(pid_tgid))
>> +       return 0;
>> +
>> +    u64 t = bpf_ktime_get_ns();
>> +    syscall_start.update(&pid_tgid, &t);
>> +
>> +    return 0;
>> +}
>> +
>> +TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
>> +    struct syscall_data_t *val, zero = {};
>> +    struct syscall_data_key_t key;
>> +
>> +    u64 pid_tgid = bpf_get_current_pid_tgid();
>> +
>> +    if (!capture_enabled(pid_tgid))
>> +       return 0;
>> +
>> +    key.pid = pid_tgid >> 32;
>> +    key.tid = (u32)pid_tgid;
>> +    key.syscall = args->id;
>> +
>> +    u64 *start_ns = syscall_start.lookup(&pid_tgid);
>> +
>> +    if (!start_ns)
>> +        return 0;
>> +
>> +    val = syscall_data.lookup_or_try_init(&key, &zero);
>> +    if (val) {
>> +        u64 delta = bpf_ktime_get_ns() - *start_ns;
>> +        val->count++;
>> +        val->total_ns += delta;
>> +        if (val->worst_ns == 0 || delta > val->worst_ns)
>> +            val->worst_ns = delta;
>> +
>> +        if (<SYSCAL_TRACE_EVENTS>) {
>> +            struct event_t *event = get_event(EVENT_SYSCALL);
>> +            if (event) {
>> +                event->syscall = args->id;
>> +                event->entry_ts = *start_ns;
>> +                if (<STACK_TRACE_ENABLED>) {
>> +                    event->user_stack_id = stack_traces.get_stackid(
>> +                        args, BPF_F_USER_STACK);
>> +                    event->kernel_stack_id = stack_traces.get_stackid(
>> +                        args, 0);
>> +                }
>> +                events.ringbuf_submit(event, 0);
>> +            }
>> +        }
>> +    }
>> +    return 0;
>> +}
>> +
>> +
>> +
>> +/*
>> + * For measuring the thread run time, we need the following.
>> + */
>> +struct run_time_data_t {
>> +    u64 count;
>> +    u64 total_ns;
>> +    u64 max_ns;
>> +    u64 min_ns;
>> +};
>> +
>> +struct pid_tid_key_t {
>> +    u32  pid;
>> +    u32  tid;
>> +};
>> +
>> +BPF_HASH(run_start, u64, u64);
>> +BPF_HASH(run_data, struct pid_tid_key_t, struct run_time_data_t);
>> +
>> +static inline void thread_start_run(u64 pid_tgid, u64 ktime)
>> +{
>> +    run_start.update(&pid_tgid, &ktime);
>> +}
>> +
>> +static inline void thread_stop_run(u32 pid, u32 tgid, u64 ktime)
>> +{
>> +    u64 pid_tgid = (u64) tgid << 32 | pid;
>> +    u64 *start_ns = run_start.lookup(&pid_tgid);
>> +
>> +    if (!start_ns || *start_ns == 0)
>> +        return;
>> +
>> +    struct run_time_data_t *val, zero = {};
>> +    struct pid_tid_key_t key = { .pid = tgid,
>> +                                 .tid = pid };
>> +
>> +    val = run_data.lookup_or_try_init(&key, &zero);
>> +    if (val) {
>> +        u64 delta = ktime - *start_ns;
>> +        val->count++;
>> +        val->total_ns += delta;
>> +        if (val->max_ns == 0 || delta > val->max_ns)
>> +            val->max_ns = delta;
>> +        if (val->min_ns == 0 || delta < val->min_ns)
>> +            val->min_ns = delta;
>> +    }
>> +    *start_ns = 0;
>> +}
>> +
>> +
>> +/*
>> + * For measuring the thread-ready delay, we need the following.
>> + */
>> +struct ready_data_t {
>> +    u64 count;
>> +    u64 total_ns;
>> +    u64 worst_ns;
>> +};
>> +
>> +BPF_HASH(ready_start, u64, u64);
>> +BPF_HASH(ready_data, struct pid_tid_key_t, struct ready_data_t);
>> +
>> +static inline int sched_wakeup__(u32 pid, u32 tgid)
>> +{
>> +    u64 pid_tgid = (u64) tgid << 32 | pid;
>> +
>> +    if (!capture_enabled(pid_tgid))
>> +        return 0;
>> +
>> +    u64 t = bpf_ktime_get_ns();
>> +    ready_start.update(&pid_tgid, &t);
>> +    return 0;
>> +}
>> +
>> +RAW_TRACEPOINT_PROBE(sched_wakeup)
>> +{
>> +    struct task_struct *t = (struct task_struct *)ctx->args[0];
>> +    return sched_wakeup__(t->pid, t->tgid);
>> +}
>> +
>> +RAW_TRACEPOINT_PROBE(sched_wakeup_new)
>> +{
>> +    struct task_struct *t = (struct task_struct *)ctx->args[0];
>> +    return sched_wakeup__(t->pid, t->tgid);
>> +}
>> +
>> +RAW_TRACEPOINT_PROBE(sched_switch)
>> +{
>> +    struct task_struct *prev = (struct task_struct *)ctx->args[1];
>> +    struct task_struct *next= (struct task_struct *)ctx->args[2];
>> +    u64 ktime = 0;
>> +
>> +    if (!capture_enabled__())
>> +        return 0;
>> +
>> +    if (prev-><STATE_FIELD> == TASK_RUNNING && prev->tgid == MONITOR_PID)
>> +        sched_wakeup__(prev->pid, prev->tgid);
>> +
>> +    if (prev->tgid == MONITOR_PID) {
>> +        ktime = bpf_ktime_get_ns();
>> +        thread_stop_run(prev->pid, prev->tgid, ktime);
>> +    }
>> +
>> +    u64 pid_tgid = (u64)next->tgid << 32 | next->pid;
>> +
>> +    if (next->tgid != MONITOR_PID)
>> +        return 0;
>> +
>> +    if (ktime == 0)
>> +        ktime = bpf_ktime_get_ns();
>> +
>> +    u64 *start_ns = ready_start.lookup(&pid_tgid);
>> +
>> +    if (start_ns && *start_ns != 0) {
>> +
>> +        struct ready_data_t *val, zero = {};
>> +        struct pid_tid_key_t key = { .pid = next->tgid,
>> +                                     .tid = next->pid };
>> +
>> +        val = ready_data.lookup_or_try_init(&key, &zero);
>> +        if (val) {
>> +            u64 delta = ktime - *start_ns;
>> +            val->count++;
>> +            val->total_ns += delta;
>> +            if (val->worst_ns == 0 || delta > val->worst_ns)
>> +                val->worst_ns = delta;
>> +        }
>> +        *start_ns = 0;
>> +    }
>> +
>> +    thread_start_run(pid_tgid, ktime);
>> +    return 0;
>> +}
>> +
>> +/*
>> + * For measuring the hard irq time, we need the following.
>> + */
>> +struct hardirq_start_data_t {
>> +    u64  start_ns;
>> +    char irq_name[32];
>> +};
>> +
>> +struct hardirq_data_t {
>> +    u64 count;
>> +    u64 total_ns;
>> +    u64 worst_ns;
>> +};
>> +
>> +struct hardirq_data_key_t {
>> +    u32 pid;
>> +    u32 tid;
>> +    char irq_name[32];
>> +};
>> +
>> +BPF_HASH(hardirq_start, u64, struct hardirq_start_data_t);
>> +BPF_HASH(hardirq_data, struct hardirq_data_key_t, struct hardirq_data_t);
>> +
>> +TRACEPOINT_PROBE(irq, irq_handler_entry)
>> +{
>> +    u64 pid_tgid = bpf_get_current_pid_tgid();
>> +
>> +    if (!capture_enabled(pid_tgid))
>> +        return 0;
>> +
>> +    struct hardirq_start_data_t data = {};
>> +
>> +    data.start_ns = bpf_ktime_get_ns();
>> +    TP_DATA_LOC_READ_STR(&data.irq_name, name, sizeof(data.irq_name));
>> +    hardirq_start.update(&pid_tgid, &data);
>> +    return 0;
>> +}
>> +
>> +TRACEPOINT_PROBE(irq, irq_handler_exit)
>> +{
>> +    u64 pid_tgid = bpf_get_current_pid_tgid();
>> +
>> +    if (!capture_enabled(pid_tgid))
>> +        return 0;
>> +
>> +    struct hardirq_start_data_t *data;
>> +    data = hardirq_start.lookup(&pid_tgid);
>> +    if (!data || data->start_ns == 0)
>> +        return 0;
>> +
>> +    if (args->ret != IRQ_NONE) {
>> +        struct hardirq_data_t *val, zero = {};
>> +        struct hardirq_data_key_t key = { .pid = pid_tgid >> 32,
>> +                                          .tid = (u32)pid_tgid };
>> +
>> +        bpf_probe_read_kernel(&key.irq_name, sizeof(key.irq_name),
>> +                              data->irq_name);
>> +        val = hardirq_data.lookup_or_try_init(&key, &zero);
>> +        if (val) {
>> +            u64 delta = bpf_ktime_get_ns() - data->start_ns;
>> +            val->count++;
>> +            val->total_ns += delta;
>> +            if (val->worst_ns == 0 || delta > val->worst_ns)
>> +                val->worst_ns = delta;
>> +        }
>> +    }
>> +
>> +    data->start_ns = 0;
>> +    return 0;
>> +}
>> +
>> +/*
>> + * For measuring the soft irq time, we need the following.
>> + */
>> +struct softirq_start_data_t {
>> +    u64 start_ns;
>> +    u32 vec_nr;
>> +};
>> +
>> +struct softirq_data_t {
>> +    u64 count;
>> +    u64 total_ns;
>> +    u64 worst_ns;
>> +};
>> +
>> +struct softirq_data_key_t {
>> +    u32 pid;
>> +    u32 tid;
>> +    u32 vec_nr;
>> +};
>> +
>> +BPF_HASH(softirq_start, u64, struct softirq_start_data_t);
>> +BPF_HASH(softirq_data, struct softirq_data_key_t, struct softirq_data_t);
>> +
>> +TRACEPOINT_PROBE(irq, softirq_entry)
>> +{
>> +    u64 pid_tgid = bpf_get_current_pid_tgid();
>> +
>> +    if (!capture_enabled(pid_tgid))
>> +        return 0;
>> +
>> +    struct softirq_start_data_t data = {};
>> +
>> +    data.start_ns = bpf_ktime_get_ns();
>> +    data.vec_nr = args->vec;
>> +    softirq_start.update(&pid_tgid, &data);
>> +    return 0;
>> +}
>> +
>> +TRACEPOINT_PROBE(irq, softirq_exit)
>> +{
>> +    u64 pid_tgid = bpf_get_current_pid_tgid();
>> +
>> +    if (!capture_enabled(pid_tgid))
>> +        return 0;
>> +
>> +    struct softirq_start_data_t *data;
>> +    data = softirq_start.lookup(&pid_tgid);
>> +    if (!data || data->start_ns == 0)
>> +        return 0;
>> +
>> +    struct softirq_data_t *val, zero = {};
>> +    struct softirq_data_key_t key = { .pid = pid_tgid >> 32,
>> +                                      .tid = (u32)pid_tgid,
>> +                                      .vec_nr = data->vec_nr};
>> +
>> +    val = softirq_data.lookup_or_try_init(&key, &zero);
>> +    if (val) {
>> +        u64 delta = bpf_ktime_get_ns() - data->start_ns;
>> +        val->count++;
>> +        val->total_ns += delta;
>> +        if (val->worst_ns == 0 || delta > val->worst_ns)
>> +            val->worst_ns = delta;
>> +    }
>> +
>> +    data->start_ns = 0;
>> +    return 0;
>> +}
>> +"""
>> +
>> +
>> +#
>> +# Probe class to use for the start/stop triggers
>> +#
>> +class Probe(object):
>> +    '''
>> +    The goal for this object is to support as many as possible
>> +    probe/events as supported by BCC. See
>> +       
>> https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments
>> +    '''
>> +    def __init__(self, probe, pid=None):
>> +        self.pid = pid
>> +        self.text_probe = probe
>> +        self._parse_text_probe()
>> +
>> +    def __str__(self):
>> +        if self.probe_type == "usdt":
>> +            return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type,
>> +                                           self.usdt_provider, 
>> self.usdt_probe)
>> +        elif self.probe_type == "trace":
>> +            return "[{}]; {}:{}:{}".format(self.text_probe, self.probe_type,
>> +                                           self.trace_system, 
>> self.trace_event)
>> +        elif self.probe_type == "kprobe" or self.probe_type == "kretprobe":
>> +            return "[{}]; {}:{}".format(self.text_probe, self.probe_type,
>> +                                        self.kprobe_function)
>> +        elif self.probe_type == "uprobe" or self.probe_type == "uretprobe":
>> +            return "[{}]; {}:{}".format(self.text_probe, self.probe_type,
>> +                                        self.uprobe_function)
>> +        else:
>> +            return "[{}] <{}:unknown probe>".format(self.text_probe,
>> +                                                    self.probe_type)
>> +
>> +    def _raise(self, error):
>> +        raise ValueError("[{}]; {}".format(self.text_probe, error))
>> +
>> +    def _verify_kprobe_probe(self):
>> +        # Nothing to verify for now, just return.
>> +        return
>> +
>> +    def _verify_trace_probe(self):
>> +        # Nothing to verify for now, just return.
>> +        return
>> +
>> +    def _verify_uprobe_probe(self):
>> +        # Nothing to verify for now, just return.
>> +        return
>> +
>> +    def _verify_usdt_probe(self):
>> +        if not self.pid:
>> +            self._raise("USDT probes need a valid PID.")
>> +
>> +        usdt = USDT(pid=self.pid)
>> +
>> +        for probe in usdt.enumerate_probes():
>> +            if probe.provider.decode('utf-8') == self.usdt_provider and \
>> +               probe.name.decode('utf-8') == self.usdt_probe:
>> +                return
>> +
>> +        self._raise("Can't find UDST probe 
>> '{}:{}'".format(self.usdt_provider,
>> +                                                           self.usdt_probe))
>> +
>> +    def _parse_text_probe(self):
>> +        '''
>> +        The text probe format is defined as follows:
>> +          <probe_type>:<probe_specific>
>> +
>> +        Types:
>> +          USDT:      u|usdt:<provider>:<probe>
>> +          TRACE:     t|trace:<system>:<event>
>> +          KPROBE:    k|kprobe:<kernel_function>
>> +          KRETPROBE: kr|kretprobe:<kernel_function>
>> +          UPROBE:    up|uprobe:<function>
>> +          URETPROBE: ur|uretprobe:<function>
>> +        '''
>> +        args = self.text_probe.split(":")
>> +        if len(args) <= 1:
>> +            self._raise("Can't extract probe type.")
>> +
>> +        if args[0] not in ["k", "kprobe", "kr", "kretprobe", "t", "trace",
>> +                           "u", "usdt", "up", "uprobe", "ur", "uretprobe"]:
>> +            self._raise("Invalid probe type '{}'".format(args[0]))
>> +
>> +        self.probe_type = "kprobe" if args[0] == "k" else args[0]
>> +        self.probe_type = "kretprobe" if args[0] == "kr" else 
>> self.probe_type
>> +        self.probe_type = "trace" if args[0] == "t" else self.probe_type
>> +        self.probe_type = "usdt" if args[0] == "u" else self.probe_type
>> +        self.probe_type = "uprobe" if args[0] == "up" else self.probe_type
>> +        self.probe_type = "uretprobe" if args[0] == "ur" else 
>> self.probe_type
>> +
>> +        if self.probe_type == "usdt":
>> +            if len(args) != 3:
>> +                self._raise("Invalid number of arguments for USDT")
>> +
>> +            self.usdt_provider = args[1]
>> +            self.usdt_probe = args[2]
>> +            self._verify_usdt_probe()
>> +
>> +        elif self.probe_type == "trace":
>> +            if len(args) != 3:
>> +                self._raise("Invalid number of arguments for TRACE")
>> +
>> +            self.trace_system = args[1]
>> +            self.trace_event = args[2]
>> +            self._verify_trace_probe()
>> +
>> +        elif self.probe_type == "kprobe" or self.probe_type == "kretprobe":
>> +            if len(args) != 2:
>> +                self._raise("Invalid number of arguments for K(RET)PROBE")
>> +            self.kprobe_function = args[1]
>> +            self._verify_kprobe_probe()
>> +
>> +        elif self.probe_type == "uprobe" or self.probe_type == "uretprobe":
>> +            if len(args) != 2:
>> +                self._raise("Invalid number of arguments for U(RET)PROBE")
>> +            self.uprobe_function = args[1]
>> +            self._verify_uprobe_probe()
>> +
>> +    def _get_kprobe_c_code(self, function_name, function_content):
>> +        #
>> +        # The kprobe__* do not require a function name, so it's
>> +        # ignored in the code generation.
>> +        #
>> +        return """
>> +int {}__{}(struct pt_regs *ctx) {{
>> +    {}
>> +}}
>> +""".format(self.probe_type, self.kprobe_function, function_content)
>> +
>> +    def _get_trace_c_code(self, function_name, function_content):
>> +        #
>> +        # The TRACEPOINT_PROBE() do not require a function name, so it's
>> +        # ignored in the code generation.
>> +        #
>> +        return """
>> +TRACEPOINT_PROBE({},{}) {{
>> +    {}
>> +}}
>> +""".format(self.trace_system, self.trace_event, function_content)
>> +
>> +    def _get_uprobe_c_code(self, function_name, function_content):
>> +        return """
>> +int {}(struct pt_regs *ctx) {{
>> +    {}
>> +}}
>> +""".format(function_name, function_content)
>> +
>> +    def _get_usdt_c_code(self, function_name, function_content):
>> +        return """
>> +int {}(struct pt_regs *ctx) {{
>> +    {}
>> +}}
>> +""".format(function_name, function_content)
>> +
>> +    def get_c_code(self, function_name, function_content):
>> +        if self.probe_type == 'kprobe' or self.probe_type == 'kretprobe':
>> +            return self._get_kprobe_c_code(function_name, function_content)
>> +        elif self.probe_type == 'trace':
>> +            return self._get_trace_c_code(function_name, function_content)
>> +        elif self.probe_type == 'uprobe' or self.probe_type == 'uretprobe':
>> +            return self._get_uprobe_c_code(function_name, function_content)
>> +        elif self.probe_type == 'usdt':
>> +            return self._get_usdt_c_code(function_name, function_content)
>> +
>> +        return ""
>> +
>> +    def probe_name(self):
>> +        if self.probe_type == 'kprobe' or self.probe_type == 'kretprobe':
>> +            return "{}".format(self.kprobe_function)
>> +        elif self.probe_type == 'trace':
>> +            return "{}:{}".format(self.trace_system,
>> +                                  self.trace_event)
>> +        elif self.probe_type == 'uprobe' or self.probe_type == 'uretprobe':
>> +            return "{}".format(self.uprobe_function)
>> +        elif self.probe_type == 'usdt':
>> +            return "{}:{}".format(self.usdt_provider,
>> +                                  self.usdt_probe)
>> +
>> +        return ""
>> +
>> +
>> +#
>> +# event_to_dict()
>> +#
>> +def event_to_dict(event):
>> +    return dict([(field, getattr(event, field))
>> +                 for (field, _) in event._fields_
>> +                 if isinstance(getattr(event, field), (int, bytes))])
>> +
>> +
>> +#
>> +# Event enum
>> +#
>> +Event = IntEnum("Event", ["SYSCALL", "START_TRIGGER", "STOP_TRIGGER",
>> +                          ], start=0)
>> +
>> +
>> +#
>> +# process_event()
>> +#
>> +def process_event(ctx, data, size):
>> +    global start_trigger_ts
>> +    global stop_trigger_ts
>> +
>> +    event = bpf['events'].event(data)
>> +    if event.id == Event.SYSCALL:
>> +        syscall_events.append({"tid": event.tid,
>> +                               "ts_entry": event.entry_ts,
>> +                               "ts_exit": event.ts,
>> +                               "syscall": event.syscall,
>> +                               "user_stack_id": event.user_stack_id,
>> +                               "kernel_stack_id": event.kernel_stack_id})
>> +    elif event.id == Event.START_TRIGGER:
>> +        #
>> +        # This event would have start the trigger already, so all we need to
>> +        # do is record the start timestamp.
>> +        #
>> +        start_trigger_ts = event.ts
>> +
>> +    elif event.id == Event.STOP_TRIGGER:
>> +        #
>> +        # This event would have stopped the trigger already, so all we need 
>> to
>> +        # do is record the start timestamp.
>> +        stop_trigger_ts = event.ts
>> +
>> +
>> +#
>> +# next_power_of_two()
>> +#
>> +def next_power_of_two(val):
>> +    np = 1
>> +    while np < val:
>> +        np *= 2
>> +    return np
>> +
>> +
>> +#
>> +# unsigned_int()
>> +#
>> +def unsigned_int(value):
>> +    try:
>> +        value = int(value)
>> +    except ValueError:
>> +        raise argparse.ArgumentTypeError("must be an integer")
>> +
>> +    if value < 0:
>> +        raise argparse.ArgumentTypeError("must be positive")
>> +    return value
>> +
>> +
>> +#
>> +# unsigned_nonzero_int()
>> +#
>> +def unsigned_nonzero_int(value):
>> +    value = unsigned_int(value)
>> +    if value == 0:
>> +        raise argparse.ArgumentTypeError("must be nonzero")
>> +    return value
>> +
>> +
>> +#
>> +# get_thread_name()
>> +#
>> +def get_thread_name(pid, tid):
>> +    try:
>> +        with open(f"/proc/{pid}/task/{tid}/comm", encoding="utf8") as f:
>> +            return f.readline().strip("\n")
>> +    except FileNotFoundError:
>> +        pass
>> +
>> +    return f"<unknown:{pid}/{tid}>"
>> +
>> +
>> +#
>> +# get_vec_nr_name()
>> +#
>> +def get_vec_nr_name(vec_nr):
>> +    known_vec_nr = ["hi", "timer", "net_tx", "net_rx", "block", "irq_poll",
>> +                    "tasklet", "sched", "hrtimer", "rcu"]
>> +
>> +    if vec_nr < 0 or vec_nr > len(known_vec_nr):
>> +        return f"<unknown:{vec_nr}>"
>> +
>> +    return known_vec_nr[vec_nr]
>> +
>> +
>> +#
>> +# start/stop/reset capture
>> +#
>> +def start_capture():
>> +    bpf["capture_on"][ct.c_int(0)] = ct.c_int(1)
>> +
>> +
>> +def stop_capture(force=False):
>> +    if force:
>> +        bpf["capture_on"][ct.c_int(0)] = ct.c_int(0xffff)
>> +    else:
>> +        bpf["capture_on"][ct.c_int(0)] = ct.c_int(0)
>> +
>> +
>> +def capture_running():
>> +    return True if bpf["capture_on"][ct.c_int(0)].value == 1 else False
>> +
>> +
>> +def reset_capture():
>> +    bpf["syscall_start"].clear()
>> +    bpf["syscall_data"].clear()
>> +    bpf["run_start"].clear()
>> +    bpf["run_data"].clear()
>> +    bpf["ready_start"].clear()
>> +    bpf["ready_data"].clear()
>> +    bpf["hardirq_start"].clear()
>> +    bpf["hardirq_data"].clear()
>> +    bpf["softirq_start"].clear()
>> +    bpf["softirq_data"].clear()
>> +    bpf["stack_traces"].clear()
>> +
>> +
>> +#
>> +# Display timestamp
>> +#
>> +def print_timestamp(msg):
>> +    ltz = datetime.datetime.now()
>> +    utc = ltz.astimezone(pytz.utc)
>> +    time_string = "{} @{} ({} UTC)".format(
>> +        msg, ltz.isoformat(), utc.strftime("%H:%M:%S"))
>> +    print(time_string)
>> +
>> +
>> +#
>> +# process_results()
>> +#
>> +def process_results(syscal_events=None, trigger_delta=None):
>> +    if trigger_delta:
>> +        print_timestamp("# Triggered sample dump, stop-start delta {:,} ns".
>> +                        format(trigger_delta))
>> +    else:
>> +        print_timestamp("# Sample dump")
>> +
>> +    #
>> +    # First get a list of all threads we need to report on.
>> +    #
>> +    threads_syscall = {k.tid for k, _ in bpf["syscall_data"].items()
>> +                       if k.syscall != 0xffffffff}
>> +
>> +    threads_run = {k.tid for k, _ in bpf["run_data"].items()
>> +                   if k.pid != 0xffffffff}
>> +
>> +    threads_ready = {k.tid for k, _ in bpf["ready_data"].items()
>> +                     if k.pid != 0xffffffff}
>> +
>> +    threads_hardirq = {k.tid for k, _ in bpf["hardirq_data"].items()
>> +                       if k.pid != 0xffffffff}
>> +
>> +    threads_softirq = {k.tid for k, _ in bpf["softirq_data"].items()
>> +                       if k.pid != 0xffffffff}
>> +
>> +    threads = sorted(threads_syscall | threads_run | threads_ready |
>> +                     threads_hardirq | threads_softirq,
>> +                     key=lambda x: get_thread_name(options.pid, x))
>> +
>> +    #
>> +    # Print header...
>> +    #
>> +    print("{:10} {:16} {}".format("TID", "THREAD", "<RESOURCE SPECIFIC>"))
>> +    print("{:10} {:16} {}".format("-" * 10, "-" * 16, "-" * 76))
>> +    indent = 28 * " "
>> +
>> +    #
>> +    # Print all events/statistics per threads.
>> +    #
>> +    poll_id = [k for k, v in syscalls.items() if v == b'poll'][0]
>> +    for thread in threads:
>> +
>> +        if thread != threads[0]:
>> +            print("")
>> +
>> +        #
>> +        # SYSCALL_STATISTICS
>> +        #
>> +        print("{:10} {:16} {}\n{}{:20} {:>6}  {:>10}  {:>16}  
>> {:>16}".format(
>> +            thread, get_thread_name(options.pid, thread),
>> +            "[SYSCALL STATISTICS]", indent,
>> +            "NAME", "NUMBER", "COUNT", "TOTAL ns", "MAX ns"))
>> +
>> +        total_count = 0
>> +        total_ns = 0
>> +        for k, v in sorted(bpf["syscall_data"].items(),
>> +                           key=lambda kv: -kv[1].total_ns):
>> +            if k.tid != thread:
>> +                continue
>> +
>> +            print("{}{:20.20} {:6}  {:10}  {:16,}  {:16,}".format(
>> +                indent, syscall_name(k.syscall).decode('utf-8'), k.syscall,
>> +                v.count, v.total_ns, v.worst_ns))
>> +            if k.syscall != poll_id:
>> +                total_count += v.count
>> +                total_ns += v.total_ns
>> +
>> +        if total_count > 0:
>> +            print("{}{:20.20} {:6}  {:10}  {:16,}".format(
>> +                indent, "TOTAL( - poll):", "", total_count, total_ns))
>> +
>> +        #
>> +        # THREAD RUN STATISTICS
>> +        #
>> +        print("\n{:10} {:16} {}\n{}{:10}  {:>16}  {:>16}  {:>16}".format(
>> +            "", "", "[THREAD RUN STATISTICS]", indent,
>> +            "SCHED_CNT", "TOTAL ns", "MIN ns", "MAX ns"))
>> +
>> +        for k, v in bpf["run_data"].items():
>> +            if k.tid != thread:
>> +                continue
>> +
>> +            print("{}{:10}  {:16,}  {:16,}  {:16,}".format(
>> +                indent, v.count, v.total_ns, v.min_ns, v.max_ns))
>> +
>> +        #
>> +        # THREAD READY STATISTICS
>> +        #
>> +        print("\n{:10} {:16} {}\n{}{:10}  {:>16}  {:>16}".format(
>> +            "", "", "[THREAD READY STATISTICS]", indent,
>> +            "SCHED_CNT", "TOTAL ns", "MAX ns"))
>> +
>> +        for k, v in bpf["ready_data"].items():
>> +            if k.tid != thread:
>> +                continue
>> +
>> +            print("{}{:10}  {:16,}  {:16,}".format(
>> +                indent, v.count, v.total_ns, v.worst_ns))
>> +
>> +        #
>> +        # HARD IRQ STATISTICS
>> +        #
>> +        total_ns = 0
>> +        total_count = 0
>> +        header_printed = False
>> +        for k, v in sorted(bpf["hardirq_data"].items(),
>> +                           key=lambda kv: -kv[1].total_ns):
>> +            if k.tid != thread:
>> +                continue
>> +
>> +            if not header_printed:
>> +                print("\n{:10} {:16} {}\n{}{:20}  {:>10}  {:>16}  {:>16}".
>> +                      format("", "", "[HARD IRQ STATISTICS]", indent,
>> +                             "NAME", "COUNT", "TOTAL ns", "MAX ns"))
>> +                header_printed = True
>> +
>> +            print("{}{:20.20}  {:10}  {:16,}  {:16,}".format(
>> +                indent, k.irq_name.decode('utf-8'),
>> +                v.count, v.total_ns, v.worst_ns))
>> +
>> +            total_count += v.count
>> +            total_ns += v.total_ns
>> +
>> +        if total_count > 0:
>> +            print("{}{:20.20}  {:10}  {:16,}".format(
>> +                indent, "TOTAL:", total_count, total_ns))
>> +
>> +        #
>> +        # SOFT IRQ STATISTICS
>> +        #
>> +        total_ns = 0
>> +        total_count = 0
>> +        header_printed = False
>> +        for k, v in sorted(bpf["softirq_data"].items(),
>> +                           key=lambda kv: -kv[1].total_ns):
>> +            if k.tid != thread:
>> +                continue
>> +
>> +            if not header_printed:
>> +                print("\n{:10} {:16} {}\n"
>> +                      "{}{:20} {:>7}  {:>10}  {:>16}  {:>16}".
>> +                      format("", "", "[SOFT IRQ STATISTICS]", indent,
>> +                             "NAME", "VECT_NR", "COUNT", "TOTAL ns", "MAX 
>> ns"))
>> +                header_printed = True
>> +
>> +            print("{}{:20.20} {:>7}  {:10}  {:16,}  {:16,}".format(
>> +                indent, get_vec_nr_name(k.vec_nr), k.vec_nr,
>> +                v.count, v.total_ns, v.worst_ns))
>> +
>> +            total_count += v.count
>> +            total_ns += v.total_ns
>> +
>> +        if total_count > 0:
>> +            print("{}{:20.20} {:7}  {:10}  {:16,}".format(
>> +                indent, "TOTAL:", "", total_count, total_ns))
>> +
>> +    #
>> +    # Print events
>> +    #
>> +    lost_stack_traces = 0
>> +    if syscall_events is not None and len(syscall_events) > 0:
>> +        stack_traces = bpf.get_table("stack_traces")
>> +
>> +        print("\n\n# SYSCALL EVENTS:"
>> +              "\n{}{:>19} {:>19} {:>10} {:16} {:>10}  {}".format(
>> +                  2 * " ", "ENTRY (ns)", "EXIT (ns)", "TID", "COMM",
>> +                  "DELTA (us)", "SYSCALL"))
>> +        print("{}{:19} {:19} {:10} {:16} {:10}  {}".format(
>> +            2 * " ", "-" * 19, "-" * 19, "-" * 10, "-" * 16,
>> +            "-" * 10, "-" * 16))
>> +        for event in syscall_events:
>> +            print("{}{:19} {:19} {:10} {:16} {:10,}  {}".format(
>> +                " " * 2,
>> +                event["ts_entry"], event["ts_exit"], event["tid"],
>> +                get_thread_name(options.pid, event["tid"]),
>> +                int((event["ts_exit"] - event["ts_entry"]) / 1000),
>> +                syscall_name(event["syscall"]).decode('utf-8')))
>> +            #
>> +            # Not sure where to put this, but I'll add some info on stack
>> +            # traces here... Userspace stack traces are very limited due to
>> +            # the fact that bcc does not support dwarf backtraces. As OVS
>> +            # gets compiled without frame pointers we will not see much.
>> +            # If however, OVS does get built with frame pointers, we should 
>> not
>> +            # use the BPF_STACK_TRACE_BUILDID as it does not seem to handle
>> +            # the debug symbols correctly. Also, note that for kernel
>> +            # traces you should not use BPF_STACK_TRACE_BUILDID, so two
>> +            # buffers are needed.
>> +            #
>> +            # Some info on manual dwarf walk support:
>> +            #   https://github.com/iovisor/bcc/issues/3515
>> +            #   https://github.com/iovisor/bcc/pull/4463
>> +            #
>> +            if options.stack_trace_size == 0:
>> +                continue
>> +
>> +            if event['kernel_stack_id'] < 0 or event['user_stack_id'] < 0:
>> +                lost_stack_traces += 1
>> +
>> +            kernel_stack = stack_traces.walk(event['kernel_stack_id']) \
>> +                if event['kernel_stack_id'] >= 0 else []
>> +            user_stack = stack_traces.walk(event['user_stack_id']) \
>> +                if event['user_stack_id'] >= 0 else []
>> +
>> +            for addr in kernel_stack:
>> +                print("{}{}".format(
>> +                    " " * 10,
>> +                    bpf.ksym(addr, show_module=True,
>> +                             show_offset=True).decode('utf-8', 'replace')))
>> +
>> +            for addr in user_stack:
>> +                addr_str = bpf.sym(addr, options.pid, show_module=True,
>> +                                   show_offset=True).decode('utf-8', 
>> 'replace')
>> +
>> +                if addr_str == "[unknown]":
>> +                    addr_str += " 0x{:x}".format(addr)
>> +
>> +                print("{}{}".format(" " * 10, addr_str))
>> +
>> +    #
>> +    # Print any footer messages.
>> +    #
>> +    if lost_stack_traces > 0:
>> +        print("\n#WARNING: We where not able to display {} stack traces!\n"
>> +              "#         Consider increasing the stack trace size using\n"
>> +              "#         the '--stack-trace-size' option.\n"
>> +              "#         Note that this can also happen due to a stack id\n"
>> +              "#         collision.".format(lost_stack_traces))
>> +
>> +
>> +#
>> +# main()
>> +#
>> +def main():
>> +    #
>> +    # Don't like these globals, but ctx passing does not seem to work with 
>> the
>> +    # existing open_ring_buffer() API :(
>> +    #
>> +    global bpf
>> +    global options
>> +    global syscall_events
>> +    global start_trigger_ts
>> +    global stop_trigger_ts
>> +
>> +    start_trigger_ts = 0
>> +    stop_trigger_ts = 0
>> +
>> +    #
>> +    # Argument parsing
>> +    #
>> +    parser = argparse.ArgumentParser()
>> +
>> +    parser.add_argument("-D", "--debug",
>> +                        help="Enable eBPF debugging",
>> +                        type=int, const=0x3f, default=0, nargs='?')
>> +    parser.add_argument("-p", "--pid", metavar="VSWITCHD_PID",
>> +                        help="ovs-vswitch's PID",
>> +                        type=unsigned_int, default=None)
>> +    parser.add_argument("-s", "--syscall-events", metavar="DURATION_NS",
>> +                        help="Record syscall events, default disabled",
>> +                        type=unsigned_int, const=0, default=None, nargs='?')
>> +    parser.add_argument("--buffer-page-count",
>> +                        help="Number of BPF ring buffer pages, default 
>> 1024",
>> +                        type=unsigned_int, default=1024, metavar="NUMBER")
>> +    parser.add_argument("--sample-count",
>> +                        help="Number of sample runs, default 1",
>> +                        type=unsigned_nonzero_int, default=1, 
>> metavar="RUNS")
>> +    parser.add_argument("--sample-interval",
>> +                        help="Delay between sample runs, default 0",
>> +                        type=float, default=0, metavar="SECONDS")
>> +    parser.add_argument("--sample-time",
>> +                        help="Sample time, default 0.5 seconds",
>> +                        type=float, default=0.5, metavar="SECONDS")
>> +    parser.add_argument("--skip-syscall-poll-events",
>> +                        help="Skip poll() syscalls with --syscall-events",
>> +                        action="store_true")
>> +    parser.add_argument("--stack-trace-size",
>> +                        help="Number of unique stack traces that can be "
>> +                        "recorded, default 4096. 0 to disable",
>> +                        type=unsigned_int, default=4096)
>> +    parser.add_argument("--start-trigger", metavar="TRIGGER",
>> +                        help="Start trigger, see documentation for details",
>> +                        type=str, default=None)
>> +    parser.add_argument("--stop-trigger", metavar="TRIGGER",
>> +                        help="Stop trigger, see documentation for details",
>> +                        type=str, default=None)
>> +    parser.add_argument("--trigger-delta", metavar="DURATION_NS",
>> +                        help="Only report event when the trigger duration > 
>> "
>> +                             "DURATION_NS, default 0 (all events)",
>> +                        type=unsigned_int, const=0, default=0, nargs='?')
>> +
>> +    options = parser.parse_args()
>> +
>> +    #
>> +    # Find the PID of the ovs-vswitchd daemon if not specified.
>> +    #
>> +    if options.pid is None:
>> +        for proc in psutil.process_iter():
>> +            if 'ovs-vswitchd' in proc.name():
>> +                if options.pid is not None:
>> +                    print("ERROR: Multiple ovs-vswitchd daemons running, "
>> +                          "use the -p option!")
>> +                    sys.exit(-1)
>
> We have some inconsistencies here because some utilities we send
> "sys.exit(1)" and some we send "sys.exit(-1)"
>
> I think we switched to using '1' because it is a valid return value on
> other OSes.  I guess for now there isn't any need to normalize it since
> these are eBPF programs intended to run on linux.  That said, maybe we
> could consider a normalized value anyway just for review purposes.

ACK, good idea! I changed all the exit codes using os.EX_???.

>> +
>> +                options.pid = proc.pid
>> +
>> +    #
>> +    # Error checking on input parameters.
>> +    #
>> +    if options.pid is None:
>> +        print("ERROR: Failed to find ovs-vswitchd's PID!")
>> +        sys.exit(-1)
>> +
>> +    options.buffer_page_count = next_power_of_two(options.buffer_page_count)
>> +
>> +    #
>> +    # Make sure we are running as root, or else we can not attach the 
>> probes.
>> +    #
>> +    if os.geteuid() != 0:
>> +        print("ERROR: We need to run as root to attached probes!")
>> +        sys.exit(-1)
>> +
>> +    #
>> +    # Setup any of the start stop triggers
>> +    #
>> +    if options.start_trigger is not None:
>> +        try:
>> +            start_trigger = Probe(options.start_trigger, pid=options.pid)
>> +        except ValueError as e:
>> +            print(f"ERROR: Invalid start trigger {str(e)}")
>> +            sys.exit(-1)
>> +    else:
>> +        start_trigger = None
>> +
>> +    if options.stop_trigger is not None:
>> +        try:
>> +            stop_trigger = Probe(options.stop_trigger, pid=options.pid)
>> +        except ValueError as e:
>> +            print(f"ERROR: Invalid stop trigger {str(e)}")
>> +            sys.exit(-1)
>> +    else:
>> +        stop_trigger = None
>> +
>> +    #
>> +    # Attach probe to running process.
>> +    #
>> +    source = EBPF_SOURCE.replace("<EVENT_ENUM>", "\n".join(
>> +        ["    EVENT_{} = {},".format(
>> +            event.name, event.value) for event in Event]))
>> +    source = source.replace("<BUFFER_PAGE_CNT>",
>> +                            str(options.buffer_page_count))
>> +    source = source.replace("<MONITOR_PID>", str(options.pid))
>> +
>> +    if BPF.kernel_struct_has_field(b'task_struct', b'state') == 1:
>> +        source = source.replace('<STATE_FIELD>', 'state')
>> +    else:
>> +        source = source.replace('<STATE_FIELD>', '__state')
>> +
>> +    poll_id = [k for k, v in syscalls.items() if v == b'poll'][0]
>> +    if options.syscall_events is None:
>> +        syscall_trace_events = "false"
>> +    elif options.syscall_events == 0:
>> +        if not options.skip_syscall_poll_events:
>> +            syscall_trace_events = "true"
>> +        else:
>> +            syscall_trace_events = f"args->id != {poll_id}"
>> +    else:
>> +        syscall_trace_events = "delta > {}".format(options.syscall_events)
>> +        if options.skip_syscall_poll_events:
>> +            syscall_trace_events += f" && args->id != {poll_id}"
>> +
>> +    source = source.replace("<SYSCAL_TRACE_EVENTS>",
>> +                            syscall_trace_events)
>> +
>> +    source = source.replace("<STACK_TRACE_SIZE>",
>> +                            str(options.stack_trace_size))
>> +
>> +    source = source.replace("<STACK_TRACE_ENABLED>", "true"
>> +                            if options.stack_trace_size > 0 else "false")
>> +
>> +    #
>> +    # Handle start/stop probes
>> +    #
>> +    if start_trigger:
>> +        source = source.replace("<START_TRIGGER>",
>> +                                start_trigger.get_c_code(
>> +                                    "start_trigger_probe",
>> +                                    "return start_trigger();"))
>> +    else:
>> +        source = source.replace("<START_TRIGGER>", "")
>> +
>> +    if stop_trigger:
>> +        source = source.replace("<STOP_TRIGGER>",
>> +                                stop_trigger.get_c_code(
>> +                                    "stop_trigger_probe",
>> +                                    "return stop_trigger();"))
>> +    else:
>> +        source = source.replace("<STOP_TRIGGER>", "")
>> +
>> +    #
>> +    # Setup usdt or other probes that need handling trough the BFP class.
>> +    #
>> +    usdt = USDT(pid=int(options.pid))
>> +    try:
>> +        if start_trigger and start_trigger.probe_type == 'usdt':
>> +            usdt.enable_probe(probe=start_trigger.probe_name(),
>> +                              fn_name="start_trigger_probe")
>> +        if stop_trigger and stop_trigger.probe_type == 'usdt':
>> +            usdt.enable_probe(probe=stop_trigger.probe_name(),
>> +                              fn_name="stop_trigger_probe")
>> +
>> +    except USDTException as e:
>> +        print("ERROR: {}".format(
>> +            (re.sub('^', ' ' * 7, str(e), flags=re.MULTILINE)).strip().
>> +            replace("--with-dtrace or --enable-dtrace",
>> +                    "--enable-usdt-probes")))
>> +        sys.exit(-1)
>> +
>> +    bpf = BPF(text=source, usdt_contexts=[usdt], debug=options.debug)
>> +
>> +    if start_trigger:
>> +        try:
>> +            if start_trigger.probe_type == "uprobe":
>> +                bpf.attach_uprobe(name=f"/proc/{options.pid}/exe",
>> +                                  sym=start_trigger.probe_name(),
>> +                                  fn_name="start_trigger_probe",
>> +                                  pid=options.pid)
>> +
>> +            if start_trigger.probe_type == "uretprobe":
>> +                bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe",
>> +                                     sym=start_trigger.probe_name(),
>> +                                     fn_name="start_trigger_probe",
>> +                                     pid=options.pid)
>> +        except Exception as e:
>> +            print("ERROR: Failed attaching uprobe start trigger "
>> +                  f"'{start_trigger.probe_name()}';\n         {str(e)}")
>> +            sys.exit(-1)
>> +
>> +    if stop_trigger:
>> +        try:
>> +            if stop_trigger.probe_type == "uprobe":
>> +                bpf.attach_uprobe(name=f"/proc/{options.pid}/exe",
>> +                                  sym=stop_trigger.probe_name(),
>> +                                  fn_name="stop_trigger_probe",
>> +                                  pid=options.pid)
>> +
>> +            if stop_trigger.probe_type == "uretprobe":
>> +                bpf.attach_uretprobe(name=f"/proc/{options.pid}/exe",
>> +                                     sym=stop_trigger.probe_name(),
>> +                                     fn_name="stop_trigger_probe",
>> +                                     pid=options.pid)
>> +        except Exception as e:
>> +            print("ERROR: Failed attaching uprobe stop trigger"
>> +                  f"'{stop_trigger.probe_name()}';\n         {str(e)}")
>> +            sys.exit(-1)
>> +
>> +    #
>> +    # If no triggers are configured use the delay configuration
>> +    #
>> +    bpf['events'].open_ring_buffer(process_event)
>> +
>> +    sample_count = 0
>> +    while sample_count < options.sample_count:
>> +        sample_count += 1
>> +        syscall_events = []
>> +
>> +        if options.start_trigger is None:
>> +            print_timestamp("# Start sampling")
>> +            start_capture()
>> +            stop_time = -1 if options.stop_trigger else \
>> +                time_ns() + options.sample_time * 1000000000
>> +        else:
>> +            # For start triggers the stop time depends on the start trigger
>> +            # time, or depends on the stop trigger if configured.
>> +            stop_time = -1 if options.stop_trigger else 0
>> +
>> +        while True:
>> +            try:
>> +                last_start_ts = start_trigger_ts
>> +                last_stop_ts = stop_trigger_ts
>> +
>> +                if stop_time > 0:
>> +                    delay = int((stop_time - time_ns()) / 1000000)
>> +                    if delay <= 0:
>> +                        break
>> +                else:
>> +                    delay = -1
>> +
>> +                bpf.ring_buffer_poll(timeout=delay)
>> +
>> +                if stop_time <= 0 and last_start_ts != start_trigger_ts:
>> +                    print_timestamp(
>> +                        "# Start sampling (trigger@{})".format(
>> +                            start_trigger_ts))
>> +
>> +                    if not options.stop_trigger:
>> +                        stop_time = time_ns() + \
>> +                            options.sample_time * 1000000000
>> +
>> +                if last_stop_ts != stop_trigger_ts:
>> +                    break
>> +
>> +            except KeyboardInterrupt:
>> +                break
>> +
>> +        if options.stop_trigger and not capture_running():
>> +            print_timestamp("# Stop sampling (trigger@{})".format(
>> +                stop_trigger_ts))
>> +        else:
>> +            print_timestamp("# Stop sampling")
>> +
>> +        if stop_trigger_ts != 0 and start_trigger_ts != 0:
>> +            trigger_delta = stop_trigger_ts - start_trigger_ts
>> +        else:
>> +            trigger_delta = None
>> +
>> +        if not trigger_delta or trigger_delta >= options.trigger_delta:
>> +            stop_capture(force=True)  # Prevent a new trigger to start.
>> +            process_results(syscal_events=syscall_events,
>> +                            trigger_delta=trigger_delta)
>> +        elif trigger_delta:
>> +            sample_count -= 1
>> +            print_timestamp("# Sample dump skipped, delta {:,} ns".format(
>> +                trigger_delta))
>> +
>> +        reset_capture()
>> +        stop_capture()
>> +        if options.sample_interval > 0:
>> +            time.sleep(options.sample_interval)
>> +
>> +    #
>> +    # Report lost events.
>> +    #
>> +    dropcnt = bpf.get_table("dropcnt")
>> +    for k in dropcnt.keys():
>> +        count = dropcnt.sum(k).value
>> +        if k.value == 0 and count > 0:
>> +            print("\n# WARNING: Not all events were captured, {} were "
>> +                  "dropped!\n#          Increase the BPF ring buffer size "
>> +                  "with the --buffer-page-count option.".format(count))
>> +
>> +    if (options.sample_count > 1):
>> +        trigger_miss = bpf.get_table("trigger_miss")
>> +        for k in trigger_miss.keys():
>> +            count = trigger_miss.sum(k).value
>> +            if k.value == 0 and count > 0:
>> +                print("\n# WARNING: Not all start triggers were successful. 
>> "
>> +                      "{} were missed due to\n#          slow userspace "
>> +                      "processing!".format(count))
>> +
>> +
>> +#
>> +# Start main() as the default entry point...
>> +#
>> +if __name__ == '__main__':
>> +    main()
>> diff --git a/utilities/usdt-scripts/kernel_delay.rst 
>> b/utilities/usdt-scripts/kernel_delay.rst
>> new file mode 100644
>> index 000000000..95e98db34
>> --- /dev/null
>> +++ b/utilities/usdt-scripts/kernel_delay.rst
>> @@ -0,0 +1,594 @@
>> +Troubleshooting Open vSwitch: Is the kernel to blame?
>> +=====================================================
>> +Often, when troubleshooting Open vSwitch (OVS) in the field, you might be 
>> left
>> +wondering if the issue is really OVS-related, or if it's a problem with the
>> +kernel being overloaded. The kernel_delay.py tool can help you quickly
>> +identify if the focus of your investigation should be OVS or the Linux 
>> kernel.
>
> We might want to give additional context here.  For example, hints that
> the system could be overloaded due to the WARN level log "Unreasonably
> long poll..." or uname/uptime output.  I know you do reference it later
> in the document, but perhaps a section on when the reader can infer that
> there could be a problem.

Ack, added a sentence to explain what these issues might look like.

>> +
>> +
>> +Introduction
>> +------------
>> +``kernel_delay.py`` consists of a Python script that uses the BCC [#BCC]_
>> +framework to install eBPF probes. The data the eBPF probes collect will be
>> +analyzed and presented to the user by the Python script. Some of the 
>> presented
>> +data can also be captured by the individual scripts included in the BBC 
>> [#BCC]_
>> +framework.
>> +
>> +kernel_delay.py has two modes of operation:
>> +
>> +- In **time mode**, the tool runs for a specific time and collects the
>> +  information.
>> +- In **trigger mode**, event collection can be started and/or stopped based 
>> on
>> +  a specific eBPF probe. Currently, we support the following probes:
>> +  - USDT probes
>> +  - Kernel tracepoints
>> +  - kprobe
>> +  - kretprobe
>> +  - uprobe
>> +  - uretprobe
>> +
>> +
>> +In addition, the option, ``--sample-count``, exists to specify how many
>> +iterations you would like to do. When using triggers, you can also ignore
>> +samples if they are less than a number of nanoseconds with the
>> +``--trigger-delta`` option. The latter might be useful when debugging Linux
>> +syscalls which take a long time to complete. More on this later. Finally, 
>> you
>> +can configure the delay between two sample runs with the 
>> ``--sample-interval``
>> +option.
>> +
>> +Before getting into more details, let's just run the tool without any 
>> options
>> +to see what the output looks like. Notice that it will try to automatically
>> +get the process ID of the running ``ovs-vsdwitchd``. You can overwrite this
>> +with the ``--pid`` option.
>> +
>> +.. code-block:: console
>> +
>> +  $ sudo ./kernel_delay.py
>> +  # Start sampling @2023-06-08T12:17:22.725127 (10:17:22 UTC)
>> +  # Stop sampling @2023-06-08T12:17:23.224781 (10:17:23 UTC)
>> +  # Sample dump @2023-06-08T12:17:23.224855 (10:17:23 UTC)
>> +  TID        THREAD           <RESOURCE SPECIFIC>
>> +  ---------- ---------------- 
>> ----------------------------------------------------------------------------
>> +       27090 ovs-vswitchd     [SYSCALL STATISTICS]
>> +                  <EDIT: REMOVED DATA FOR ovs-vswitchd THREAD>
>> +
>> +       31741 revalidator122   [SYSCALL STATISTICS]
>> +                  NAME                 NUMBER       COUNT          TOTAL ns 
>>            MAX ns
>> +                  poll                      7           5       184,193,176 
>>       184,191,520
>> +                  recvmsg                  47         494       125,208,756 
>>           310,331
>> +                  futex                   202           8        18,768,758 
>>         4,023,039
>> +                  sendto                   44          10           375,861 
>>           266,867
>> +                  sendmsg                  46           4            43,294 
>>            11,213
>> +                  write                     1           1             5,949 
>>             5,949
>> +                  getrusage                98           1             1,424 
>>             1,424
>> +                  read                      0           1             1,292 
>>             1,292
>> +                  TOTAL( - poll):                     519       144,405,334
>> +
>> +                  [THREAD RUN STATISTICS]
>> +                  SCHED_CNT           TOTAL ns            MIN ns            
>> MAX ns
>> +                       6       136,764,071             1,480       
>> 115,146,424
>> +
>> +                  [THREAD READY STATISTICS]
>> +                  SCHED_CNT           TOTAL ns            MAX ns
>> +                       7            11,334             6,636
>> +
>> +                  [HARD IRQ STATISTICS]
>> +                  NAME                       COUNT          TOTAL ns        
>>     MAX ns
>> +                  eno8303-rx-1                   1             3,586        
>>      3,586
>> +                  TOTAL:                         1             3,586
>> +
>> +                  [SOFT IRQ STATISTICS]
>> +                  NAME                 VECT_NR       COUNT          TOTAL 
>> ns            MAX ns
>> +                  net_rx                     3           1            
>> 17,699            17,699
>> +                  sched                      7           6            
>> 13,820             3,226
>> +                  rcu                        9          16            
>> 13,586             1,554
>> +                  timer                      1           3            
>> 10,259             3,815
>> +                  TOTAL:                                26            55,364
>> +
>> +
>> +By default, the tool will run for half a second in `time mode`. To extend 
>> this
>> +you can use the ``--sample-time`` option.
>> +
>> +
>> +What will it report
>> +-------------------
>> +The above sample output separates the captured data on a per-thread basis.
>> +For this, it displays the thread's id (``TID``) and name (``THREAD``),
>> +followed by resource-specific data. Which are:
>> +
>> +- ``SYSCALL STATISTICS``
>> +- ``THREAD RUN STATISTICS``
>> +- ``THREAD READY STATISTICS``
>> +- ``HARD IRQ STATISTICS``
>> +- ``SOFT IRQ STATISTICS``
>> +
>> +The following sections will describe in detail what statistics they report.
>> +
>> +
>> +``SYSCALL STATISTICS``
>> +~~~~~~~~~~~~~~~~~~~~~~
>> +``SYSCALL STATISTICS`` tell you which Linux system calls got executed during
>> +the measurement interval. This includes the number of times the syscall was
>> +called (``COUNT``), the total time spent in the system calls (``TOTAL ns``),
>> +and the worst-case duration of a single call (``MAX ns``).
>> +
>> +It also shows the total of all system calls, but it excludes the poll system
>> +call, as the purpose of this call is to wait for activity on a set of 
>> sockets,
>> +and usually, the thread gets swapped out.
>> +
>> +Note that it only counts calls that started and stopped during the
>> +measurement interval!
>> +
>> +
>> +``THREAD RUN STATISTICS``
>> +~~~~~~~~~~~~~~~~~~~~~~~~~
>> +``THREAD RUN STATISTICS`` tell you how long the thread was running on a CPU
>> +during the measurement interval.
>> +
>> +Note that these statistics only count events where the thread started and
>> +stopped running on a CPU during the measurement interval. For example, if
>> +this was a PMD thread, you should see zero ``SCHED_CNT`` and ``TOTAL_ns``.
>> +If not, there might be a misconfiguration.
>> +
>> +
>> +``THREAD READY STATISTICS``
>> +~~~~~~~~~~~~~~~~~~~~~~~~~~~
>> +``THREAD READY STATISTICS`` tell you the time between the thread being ready
>> +to run and it actually running on the CPU.
>> +
>> +Note that these statistics only count events where the thread was getting
>> +ready to run and started running during the measurement interval.
>> +
>> +
>> +``HARD IRQ STATISTICS``
>> +~~~~~~~~~~~~~~~~~~~~~~~
>> +``HARD IRQ STATISTICS`` tell you how much time was spent servicing hard
>> +interrupts during the threads run time.
>> +
>> +It shows the interrupt name (``NAME``), the number of interrupts 
>> (``COUNT``),
>> +the total time spent in the interrupt handler (``TOTAL ns``), and the
>> +worst-case duration (``MAX ns``).
>> +
>> +
>> +``SOFT IRQ STATISTICS``
>> +~~~~~~~~~~~~~~~~~~~~~~~
>> +``SOFT IRQ STATISTICS`` tell you how much time was spent servicing soft
>> +interrupts during the threads run time.
>> +
>> +It shows the interrupt name (``NAME``), vector number (``VECT_NR``), the
>> +number of interrupts (``COUNT``), the total time spent in the interrupt
>> +handler (``TOTAL ns``), and the worst-case duration (``MAX ns``).
>> +
>> +
>> +The ``--syscall-events`` option
>> +-------------------------------
>> +In addition to reporting global syscall statistics in 
>> ``SYSCALL_STATISTICS``,
>> +the tool can also report each individual syscall. This can be a usefull
>> +second step if the ``SYSCALL_STATISTICS`` show high latency numbers.
>> +
>> +All you need to do is add the ``--syscall-events`` option, with or without
>> +the additional ``DURATION_NS`` parameter. The ``DUTATION_NS`` parameter
>> +allows you to exclude events that take less than the supplied time.
>> +
>> +The ``--skip-syscall-poll-events`` option allows you to exclude poll
>> +syscalls from the report.
>> +
>> +Below is an example run, note that I have removed the resource-specific data
>> +to highlight the syscall events:
>> +
>> +.. code-block:: console
>> +
>> +  $ sudo ./kernel_delay.py  --syscall-events 50000 
>> --skip-syscall-poll-events
>> +  # Start sampling @2023-06-13T17:10:46.460874 (15:10:46 UTC)
>> +  # Stop sampling @2023-06-13T17:10:46.960727 (15:10:46 UTC)
>> +  # Sample dump @2023-06-13T17:10:46.961033 (15:10:46 UTC)
>> +  TID        THREAD           <RESOURCE SPECIFIC>
>> +  ---------- ---------------- 
>> ----------------------------------------------------------------------------
>> +     3359686 ipf_clean2       [SYSCALL STATISTICS]
>> +     ...
>> +     3359635 ovs-vswitchd     [SYSCALL STATISTICS]
>> +     ...
>> +     3359697 revalidator12    [SYSCALL STATISTICS]
>> +     ...
>> +     3359698 revalidator13    [SYSCALL STATISTICS]
>> +     ...
>> +     3359699 revalidator14    [SYSCALL STATISTICS]
>> +     ...
>> +     3359700 revalidator15    [SYSCALL STATISTICS]
>> +     ...
>> +
>> +  # SYSCALL EVENTS:
>> +         ENTRY (ns)           EXIT (ns)        TID COMM             DELTA 
>> (us)  SYSCALL
>> +    ------------------- ------------------- ---------- ---------------- 
>> ----------  ----------------
>> +       2161821694935486    2161821695031201    3359699 revalidator14        
>>     95  futex
>> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
>> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
>> +        syscall_exit_to_user_mode+0x9 [kernel]
>> +        do_syscall_64+0x68 [kernel]
>> +        entry_SYSCALL_64_after_hwframe+0x72 [kernel]
>> +        __GI___lll_lock_wait+0x30 [libc.so.6]
>> +        ovs_mutex_lock_at+0x18 [ovs-vswitchd]
>> +        [unknown] 0x696c003936313a63
>> +       2161821695276882    2161821695333687    3359698 revalidator13        
>>     56  futex
>> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
>> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
>> +        syscall_exit_to_user_mode+0x9 [kernel]
>> +        do_syscall_64+0x68 [kernel]
>> +        entry_SYSCALL_64_after_hwframe+0x72 [kernel]
>> +        __GI___lll_lock_wait+0x30 [libc.so.6]
>> +        ovs_mutex_lock_at+0x18 [ovs-vswitchd]
>> +        [unknown] 0x696c003134313a63
>> +       2161821695275820    2161821695405733    3359700 revalidator15        
>>    129  futex
>> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
>> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
>> +        syscall_exit_to_user_mode+0x9 [kernel]
>> +        do_syscall_64+0x68 [kernel]
>> +        entry_SYSCALL_64_after_hwframe+0x72 [kernel]
>> +        __GI___lll_lock_wait+0x30 [libc.so.6]
>> +        ovs_mutex_lock_at+0x18 [ovs-vswitchd]
>> +        [unknown] 0x696c003936313a63
>> +       2161821695964969    2161821696052021    3359635 ovs-vswitchd         
>>     87  accept
>> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
>> +        syscall_exit_to_user_mode_prepare+0x161 [kernel]
>> +        syscall_exit_to_user_mode+0x9 [kernel]
>> +        do_syscall_64+0x68 [kernel]
>> +        entry_SYSCALL_64_after_hwframe+0x72 [kernel]
>> +        __GI_accept+0x4d [libc.so.6]
>> +        pfd_accept+0x3a [ovs-vswitchd]
>> +        [unknown] 0x7fff19f2bd00
>> +        [unknown] 0xe4b8001f0f
>> +
>> +As you can see above, the output also shows the stackback trace. You can
>> +disable this using the ``--stack-trace-size 0`` option.
>> +
>> +As you can see above, the backtrace does not show a lot of useful 
>> information
>> +due to the BCC [#BCC]_ toolkit not supporting dwarf decoding. So to further
>> +analyze system call backtraces, you could use perf. The following perf
>> +script can do this for you (refer to the embedded instructions):
>> +
>> +https://github.com/chaudron/perf_scripts/blob/master/analyze_perf_pmd_syscall.py
>
> Maybe we should consider putting something like that in-tree for use.

Good idea, added a todo to my queue :)

>> +
>> +
>> +Using triggers
>> +--------------
>> +The tool supports start and, or stop triggers. This will allow you to 
>> capture
>> +statistics triggered by a specific event. First, let's look at what
>> +combinations of stop-and-start triggers we can use.
>> +
>> +If you only use ``--start-trigger``, the inspection start when the trigger
>> +happens and runs until the ``--sample-time`` number of seconds has passed.
>> +The example below shows all the supported options in this scenario.
>> +
>> +.. code-block:: console
>> +
>> +  $ sudo ./kernel_delay.py --start-trigger up:bridge_run --sample-time 4 \
>> +                           --sample-count 4 --sample-interval 1
>> +
>> +
>> +If you only use ``--stop-trigger``, the inspection starts immediately and
>> +stops when the trigger happens.  The example below shows all the supported
>> +options in this scenario.
>> +
>> +.. code-block:: console
>> +
>> +  $ sudo ./kernel_delay.py --stop-trigger upr:bridge_run \
>> +                           --sample-count 4 --sample-interval 1
>> +
>> +
>> +If you use both ``--start-trigger`` and ``--stop-trigger`` triggers, the
>> +statistics are captured between the two first occurrences of these events.
>> +The example below shows all the supported options in this scenario.
>> +
>> +.. code-block:: console
>> +
>> +  $ sudo ./kernel_delay.py --start-trigger up:bridge_run \
>> +                           --stop-trigger upr:bridge_run \
>> +                           --sample-count 4 --sample-interval 1 \
>> +                           --trigger-delta 50000
>> +
>> +
>> +Now that we know how these triggers can be used, let's investigate what
>> +triggers are supported. What we call triggers, BCC [#BCC]_, calls events;
>> +these are eBPF tracepoints you can attach to. For more details on the
>> +supported tracepoints, check out the BCC documentation [#BCC_EVENT]_.
>> +
>> +The list below shows the supported triggers and their argument format:
>> +
>> +**USDT probes:**
>> +  [u|usdt]:{provider}:{probe}
>> +**Kernel tracepoint:**
>> +  [t:trace]:{system}:{event}
>> +**kprobe:**
>> +  [k:kprobe]:{kernel_function}
>> +**kretprobe:**
>> +  [kr:kretprobe]:{kernel_function}
>> +**uprobe:**
>> +  [up:uprobe]:{function}
>> +**uretprobe:**
>> +  [upr:uretprobe]:{function}
>> +
>> +Here are a couple of trigger examples, more use-case-specific examples can 
>> be
>> +found in the *Examples* section.
>> +
>> +.. code-block:: console
>> +
>> +  --start|stop-trigger u:udpif_revalidator:start_dump
>> +  --start|stop-trigger t:openvswitch:ovs_dp_upcall
>> +  --start|stop-trigger k:ovs_dp_process_packet
>> +  --start|stop-trigger kr:ovs_dp_process_packet
>> +  --start|stop-trigger up:bridge_run
>> +  --start|stop-trigger upr:bridge_run
>> +
>> +
>> +Examples
>> +--------
>> +This section will give some examples of how to use this tool in real-world
>> +scenarios. Let's start with the issue where Open vSwitch reports
>> +``Unreasonably long XXXXms poll interval`` on your revalidator threads. Note
>> +that there is a blog available explaining how the revalidator process works
>> +in OVS [#REVAL_BLOG]_.
>> +
>> +First, let me explain this log message. It gets logged if the time delta
>> +between two ``poll_block()`` calls is more than 1 second. In other words,
>> +the process was spending a lot of time processing stuff that was made
>> +available by the return of the ``poll_block()`` function.
>> +
>> +Do a run with the tool using the existing USDT revalidator probes as a start
>> +and stop trigger (Note that I removed the resource-specific data from the 
>> none
>> +revalidator threads):
>> +
>> +.. code-block:: console
>> +
>> +  $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump 
>> --stop-trigger u:udpif_revalidator:sweep_done
>> +  # Start sampling (trigger@791777093512008) @2023-06-14T14:52:00.110303 
>> (12:52:00 UTC)
>> +  # Stop sampling (trigger@791778281498462) @2023-06-14T14:52:01.297975 
>> (12:52:01 UTC)
>> +  # Triggered sample dump, stop-start delta 1,187,986,454 ns 
>> @2023-06-14T14:52:01.298021 (12:52:01 UTC)
>> +  TID        THREAD           <RESOURCE SPECIFIC>
>> +  ---------- ---------------- 
>> ----------------------------------------------------------------------------
>> +     1457761 handler24        [SYSCALL STATISTICS]
>> +                              NAME                 NUMBER       COUNT       
>>    TOTAL ns            MAX ns
>> +                              sendmsg                  46        6110       
>> 123,274,761            41,776
>> +                              recvmsg                  47      136299       
>>  99,397,508            49,896
>> +                              futex                   202          51       
>>   7,655,832         7,536,776
>> +                              poll                      7        4068       
>>   1,202,883             2,907
>> +                              getrusage                98        2034       
>>     586,602             1,398
>> +                              sendto                   44           9       
>>     213,682            27,417
>> +                              TOTAL( - poll):                  144503       
>> 231,128,385
>> +
>> +                              [THREAD RUN STATISTICS]
>> +                              SCHED_CNT           TOTAL ns            MIN 
>> ns            MAX ns
>> +
>> +                              [THREAD READY STATISTICS]
>> +                              SCHED_CNT           TOTAL ns            MAX ns
>> +                                       1             1,438             1,438
>> +
>> +                              [SOFT IRQ STATISTICS]
>> +                              NAME                 VECT_NR       COUNT      
>>     TOTAL ns            MAX ns
>> +                              sched                      7          21      
>>       59,145             3,769
>> +                              rcu                        9          50      
>>       42,917             2,234
>> +                              TOTAL:                                71      
>>      102,062
>> +     1457733 ovs-vswitchd     [SYSCALL STATISTICS]
>> +     ...
>> +     1457792 revalidator55    [SYSCALL STATISTICS]
>> +                              NAME                 NUMBER       COUNT       
>>    TOTAL ns            MAX ns
>> +                              futex                   202          73       
>> 572,576,329        19,621,600
>> +                              recvmsg                  47         815       
>> 296,697,618           405,338
>> +                              sendto                   44           3       
>>      78,302            26,837
>> +                              sendmsg                  46           3       
>>      38,712            13,250
>> +                              write                     1           1       
>>       5,073             5,073
>> +                              TOTAL( - poll):                     895       
>> 869,396,034
>> +
>> +                              [THREAD RUN STATISTICS]
>> +                              SCHED_CNT           TOTAL ns            MIN 
>> ns            MAX ns
>> +                                      48       394,350,393             
>> 1,729       140,455,796
>> +
>> +                              [THREAD READY STATISTICS]
>> +                              SCHED_CNT           TOTAL ns            MAX ns
>> +                                      49            23,650             1,559
>> +
>> +                              [SOFT IRQ STATISTICS]
>> +                              NAME                 VECT_NR       COUNT      
>>     TOTAL ns            MAX ns
>> +                              sched                      7          14      
>>       26,889             3,041
>> +                              rcu                        9          28      
>>       23,024             1,600
>> +                              TOTAL:                                42      
>>       49,913
>> +
>> +
>> +You can see from the start of the output that the trigger took more than a
>> +second (1,187,986,454 ns), which we would already know,  by looking at the
>> +output of the ``ovs-vsctl upcall/show`` command.
>> +
>> +From the *revalidator55*'s ``SYSCALL STATISTICS`` statistics we can see it
>> +spent almost 870ms handling syscalls, and there were no poll() calls being
>> +executed. The ``THREAD RUN STATISTICS`` statistics here are a bit 
>> misleading,
>> +as it looks like we only spent 394ms on the CPU. But earlier, we learned 
>> that
>> +this time does not include the time being on the CPU at the start or stop of
>> +an event. What is exactly the case here, because we are using USDT probes.
>> +
>> +From the above data and maybe some ``top`` output, we can determine that the
>> +*revalidator55* thread is taking a lot of CPU time, probably because it has
>> +to do a lot of revalidator work by itself. The solution here is to increase
>> +the number of revalidator threads, so more work could be done in parallel.
>> +
>> +Let's do another run of the same command in another scenario:
>> +
>> +.. code-block:: console
>> +
>> +  $ sudo ./kernel_delay.py --start-trigger u:udpif_revalidator:start_dump 
>> --stop-trigger u:udpif_revalidator:sweep_done
>> +  # Start sampling (trigger@795160501758971) @2023-06-14T15:48:23.518512 
>> (13:48:23 UTC)
>> +  # Stop sampling (trigger@795160764940201) @2023-06-14T15:48:23.781381 
>> (13:48:23 UTC)
>> +  # Triggered sample dump, stop-start delta 263,181,230 ns 
>> @2023-06-14T15:48:23.781414 (13:48:23 UTC)
>> +  TID        THREAD           <RESOURCE SPECIFIC>
>> +  ---------- ---------------- 
>> ----------------------------------------------------------------------------
>> +     1457733 ovs-vswitchd     [SYSCALL STATISTICS]
>> +                              ...
>> +     1457792 revalidator55    [SYSCALL STATISTICS]
>> +                              NAME                 NUMBER       COUNT       
>>    TOTAL ns            MAX ns
>> +                              recvmsg                  47         284       
>> 193,422,110        46,248,418
>> +                              sendto                   44           2       
>>      46,685            23,665
>> +                              sendmsg                  46           2       
>>      24,916            12,703
>> +                              write                     1           1       
>>       6,534             6,534
>> +                              TOTAL( - poll):                     289       
>> 193,500,245
>> +
>> +                              [THREAD RUN STATISTICS]
>> +                              SCHED_CNT           TOTAL ns            MIN 
>> ns            MAX ns
>> +                                       2        47,333,558           
>> 331,516        47,002,042
>> +
>> +                              [THREAD READY STATISTICS]
>> +                              SCHED_CNT           TOTAL ns            MAX ns
>> +                                       3        87,000,403        45,999,712
>> +
>> +                              [SOFT IRQ STATISTICS]
>> +                              NAME                 VECT_NR       COUNT      
>>     TOTAL ns            MAX ns
>> +                              sched                      7           2      
>>        9,504             5,109
>> +                              TOTAL:                                 2      
>>        9,504
>> +
>> +
>> +Here you can see the revalidator run took about 263ms, which does not look
>> +odd, however, the ``THREAD READY STATISTICS`` information shows us we were
>> +waiting 87ms for a CPU to be run on. This means the revalidator process 
>> could
>> +have finished 87ms faster. Looking at the ``MAX ns`` value, we see a
>> +worst-case delay of almost 46ms, which hints at an overloaded system.
>> +
>> +One final example where we use a ``uprobe`` to get some statistics on
>> +a ``bridge_run()`` execution that takes more than 1ms.
>> +
>> +.. code-block:: console
>> +
>> +  $ sudo ./kernel_delay.py --start-trigger up:bridge_run --stop-trigger 
>> ur:bridge_run --trigger-delta 1000000
>> +  # Start sampling (trigger@2245245432101270) @2023-06-14T16:21:10.467919 
>> (14:21:10 UTC)
>> +  # Stop sampling (trigger@2245245432414656) @2023-06-14T16:21:10.468296 
>> (14:21:10 UTC)
>> +  # Sample dump skipped, delta 313,386 ns @2023-06-14T16:21:10.468419 
>> (14:21:10 UTC)
>> +  # Start sampling (trigger@2245245505301745) @2023-06-14T16:21:10.540970 
>> (14:21:10 UTC)
>> +  # Stop sampling (trigger@2245245506911119) @2023-06-14T16:21:10.542499 
>> (14:21:10 UTC)
>> +  # Triggered sample dump, stop-start delta 1,609,374 ns 
>> @2023-06-14T16:21:10.542565 (14:21:10 UTC)
>> +  TID        THREAD           <RESOURCE SPECIFIC>
>> +  ---------- ---------------- 
>> ----------------------------------------------------------------------------
>> +     3371035 <unknown:3366258/3371035> [SYSCALL STATISTICS]
>> +     ... <REMOVED 7 MORE unknown THREADS>
>> +     3371102 handler66        [SYSCALL STATISTICS]
>> +     ... <REMOVED 7 MORE HANDLER THREADS>
>> +     3366258 ovs-vswitchd     [SYSCALL STATISTICS]
>> +                              NAME                 NUMBER       COUNT       
>>    TOTAL ns            MAX ns
>> +                              futex                   202          43       
>>     403,469           199,312
>> +                              clone3                  435          13       
>>     174,394            30,731
>> +                              munmap                   11           8       
>>     115,774            21,861
>> +                              poll                      7           5       
>>      92,969            38,307
>> +                              unlink                   87           2       
>>      49,918            35,741
>> +                              mprotect                 10           8       
>>      47,618            13,201
>> +                              accept                   43          10       
>>      31,360             6,976
>> +                              mmap                      9           8       
>>      30,279             5,776
>> +                              write                     1           6       
>>      27,720            11,774
>> +                              rt_sigprocmask           14          28       
>>      12,281               970
>> +                              read                      0           6       
>>       9,478             2,318
>> +                              recvfrom                 45           3       
>>       7,024             4,024
>> +                              sendto                   44           1       
>>       4,684             4,684
>> +                              getrusage                98           5       
>>       4,594             1,342
>> +                              close                     3           2       
>>       2,918             1,627
>> +                              recvmsg                  47           1       
>>       2,722             2,722
>> +                              TOTAL( - poll):                     144       
>>     924,233
>> +
>> +                              [THREAD RUN STATISTICS]
>> +                              SCHED_CNT           TOTAL ns            MIN 
>> ns            MAX ns
>> +                                      13           817,605             
>> 5,433           524,376
>> +
>> +                              [THREAD READY STATISTICS]
>> +                              SCHED_CNT           TOTAL ns            MAX ns
>> +                                      14            28,646            11,566
>> +
>> +                              [SOFT IRQ STATISTICS]
>> +                              NAME                 VECT_NR       COUNT      
>>     TOTAL ns            MAX ns
>> +                              rcu                        9           1      
>>        2,838             2,838
>> +                              TOTAL:                                 1      
>>        2,838
>> +
>> +     3371110 revalidator74    [SYSCALL STATISTICS]
>> +     ... <REMOVED 7 MORE NEW revalidator THREADS>
>> +     3366311 urcu3            [SYSCALL STATISTICS]
>> +     ...
>> +
>> +
>> +We removed some of the threads and their resource-specific data, but based
>> +on the ``<unknown:3366258/3371035>`` thread name, you can see that some
>> +threads no longer exist. In the ``ovs-vswitchd`` thread, you can see some
>> +``clone3`` syscalls, indicating threads were created. In this example, it 
>> was
>> +due to the deletion of a bridge, which resulted in the recreation of the
>> +revalidator and handler threads.
>> +
>> +
>> +Use with Openshift
>> +------------------
>> +This section describes how you would use the tool on a node in an OpenShift
>> +cluster. It assumes you have console access to the node, either directly or
>> +through a debug container.
>> +
>> +We will use a base fedora38 container through podman, as this will allow us 
>> to
>> +install some additional tools and packages we need.
>> +
>> +The first thing we need to do is to start the container:
>> +
>> +.. code-block:: console
>> +
>> +  [core@sno-master ~]$ sudo podman run -it --rm \
>> +     -e PS1='[(DEBUG)\u@\h \W]\$ ' \
>> +     --privileged --network=host --pid=host \
>> +     -v /lib/modules:/lib/modules:ro \
>> +     -v /sys/kernel/debug:/sys/kernel/debug \
>> +     -v /proc:/proc \
>> +     -v /:/mnt/rootdir \
>> +     quay.io/fedora/fedora:38-x86_64
>> +
>> +  [(DEBUG)root@sno-master /]#
>> +
>> +
>> +Next add the ``linux_delay.py`` dependencies:
>> +
>> +.. code-block:: console
>> +
>> +  [(DEBUG)root@sno-master /]# dnf install -y bcc-tools perl-interpreter \
>> +       python3-pytz  python3-psutil
>> +
>> +
>> +You need to install the devel, debug and source RPMs for your OVS and kernel
>> +version:
>> +
>> +.. code-block:: console
>> +
>> +  [(DEBUG)root@sno-master home]# rpm -i \
>> +      openvswitch2.17-debuginfo-2.17.0-67.el8fdp.x86_64.rpm \
>> +      openvswitch2.17-debugsource-2.17.0-67.el8fdp.x86_64.rpm \
>> +      kernel-devel-4.18.0-372.41.1.el8_6.x86_64.rpm
>> +
>> +
>> +Now we can run the tool. Here we use the above ```bridge_run()`` example:
>> +
>> +.. code-block:: console
>> +
>> +  [(DEBUG)root@sno-master home]# ./kernel_delay.py --start-trigger 
>> up:bridge_run --stop-trigger ur:bridge_run
>> +  # Start sampling (trigger@75279117343513) @2023-06-15T11:44:07.628372 
>> (11:44:07 UTC)
>> +  # Stop sampling (trigger@75279117443980) @2023-06-15T11:44:07.628529 
>> (11:44:07 UTC)
>> +  # Triggered sample dump, stop-start delta 100,467 ns 
>> @2023-06-15T11:44:07.628569 (11:44:07 UTC)
>> +  TID        THREAD           <RESOURCE SPECIFIC>
>> +  ---------- ---------------- 
>> ----------------------------------------------------------------------------
>> +        1246 ovs-vswitchd     [SYSCALL STATISTICS]
>> +                              NAME                 NUMBER       COUNT       
>>    TOTAL ns            MAX ns
>> +                              getdents64              217           2       
>>       8,560             8,162
>> +                              openat                  257           1       
>>       6,951             6,951
>> +                              accept                   43           4       
>>       6,942             3,763
>> +                              recvfrom                 45           1       
>>       3,726             3,726
>> +                              recvmsg                  47           2       
>>       2,880             2,188
>> +                              stat                      4           2       
>>       1,946             1,384
>> +                              close                     3           1       
>>       1,393             1,393
>> +                              fstat                     5           1       
>>       1,324             1,324
>> +                              TOTAL( - poll):                      14       
>>      33,722
>> +
>> +                              [THREAD RUN STATISTICS]
>> +                              SCHED_CNT           TOTAL ns            MIN 
>> ns            MAX ns
>> +
>> +                              [THREAD READY STATISTICS]
>> +                              SCHED_CNT           TOTAL ns            MAX ns
>> +
>> +
>> +.. rubric:: Footnotes
>> +
>> +.. [#BCC] https://github.com/iovisor/bcc
>> +.. [#BCC_EVENT] 
>> https://github.com/iovisor/bcc/blob/master/docs/reference_guide.md#events--arguments
>> +.. [#REVAL_BLOG] 
>> https://developers.redhat.com/articles/2022/10/19/open-vswitch-revalidator-process-explained
>>
>> _______________________________________________
>> dev mailing list
>> [email protected]
>> https://mail.openvswitch.org/mailman/listinfo/ovs-dev

_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Re: [ovs-dev] [PATCH] utilities: Add kernel_delay.py script to debug a busy Linux kernel.

Reply via email to