Eelco Chaudron <echau...@redhat.com> writes:

> This addition uses the existing syscall probes to record statistics
> related to the OVS 'Unreasonably long ... ms poll interval' message.
> Basically, it records the min/max/average time between system poll
> calls. This can be used to determine if a long poll event has
> occurred during the capture.
>
> Signed-off-by: Eelco Chaudron <echau...@redhat.com>
> ---
> The long line warning can be ignored. I'm keeping the original
> output in the documentation.
>
> v2: - Calculate the average on display, not per sample.
>     - Enhanced the documentation with additional log details.
> ---
Just some nits below, and otherwise looks good. Thanks!

>  utilities/usdt-scripts/kernel_delay.py  | 70 ++++++++++++++++++++++++-
>  utilities/usdt-scripts/kernel_delay.rst | 17 +++++-
>  2 files changed, 83 insertions(+), 4 deletions(-)
>
> diff --git a/utilities/usdt-scripts/kernel_delay.py b/utilities/usdt-scripts/kernel_delay.py
> index 367d27e34..cbca7ef8b 100755
> --- a/utilities/usdt-scripts/kernel_delay.py
> +++ b/utilities/usdt-scripts/kernel_delay.py
> @@ -182,11 +182,27 @@ struct syscall_data_key_t {
>      u32 syscall;
>  };
>
> +struct long_poll_data_key_t {
> +    u32 pid;
> +    u32 tid;
> +};
> +
> +struct long_poll_data_t {
> +    u64 count;
> +    u64 total_ns;
> +    u64 min_ns;
> +    u64 max_ns;
> +};
> +
>  BPF_HASH(syscall_start, u64, u64);
>  BPF_HASH(syscall_data, struct syscall_data_key_t, struct syscall_data_t);
> +BPF_HASH(long_poll_start, u64, u64);
> +BPF_HASH(long_poll_data, struct long_poll_data_key_t, struct long_poll_data_t);
>
>  TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
>      u64 pid_tgid = bpf_get_current_pid_tgid();
> +    struct long_poll_data_t *val, init_val = {.min_ns = U64_MAX};
> +    struct long_poll_data_key_t key;
>
>      if (!capture_enabled(pid_tgid))
>          return 0;
> @@ -194,6 +210,29 @@ TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
>      u64 t = bpf_ktime_get_ns();
>      syscall_start.update(&pid_tgid, &t);
>
> +    /* Do long poll handling from here on. */
> +    if (args->id != <SYSCALL_POLL_ID>)
> +        return 0;
> +
> +    u64 *start_ns = long_poll_start.lookup(&pid_tgid);
> +
> +    if (!start_ns || *start_ns == 0)
> +        return 0;
> +
> +    key.pid = pid_tgid >> 32;
> +    key.tid = (u32)pid_tgid;
> +
> +    val = long_poll_data.lookup_or_try_init(&key, &init_val);
> +    if (val) {
> +        u64 delta = t - *start_ns;
> +        val->count++;
> +        val->total_ns += delta;
> +        if (delta > val->max_ns)
> +            val->max_ns = delta;
> +        else if (delta < val->min_ns)

Should this really be an else? It's possible that min_ns will still be
== U64_MAX here, so we might want to drop the 'else' and make sure it
gets populated the first time through.

That said, I can also get behind the idea that after the first couple
of samples it won't likely matter, because we'll have dropped into the
'min' case anyway.
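Something like this, perhaps (untested sketch, just to illustrate the
suggestion; not part of the patch):

    if (val) {
        u64 delta = t - *start_ns;

        val->count++;
        val->total_ns += delta;
        if (delta > val->max_ns)
            val->max_ns = delta;
        /* No 'else': on the first sample delta is both the new max and
         * the new min, so min_ns gets replaced instead of staying at
         * U64_MAX. */
        if (delta < val->min_ns)
            val->min_ns = delta;
    }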
> +            val->min_ns = delta;
> +    }
> +
>      return 0;
>  }
>
> @@ -206,6 +245,12 @@ TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
>      if (!capture_enabled(pid_tgid))
>          return 0;
>
> +    u64 t = bpf_ktime_get_ns();
> +
> +    if (args->id == <SYSCALL_POLL_ID>) {
> +        long_poll_start.update(&pid_tgid, &t);
> +    }
> +
>      key.pid = pid_tgid >> 32;
>      key.tid = (u32)pid_tgid;
>      key.syscall = args->id;
> @@ -217,7 +262,7 @@ TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
>
>      val = syscall_data.lookup_or_try_init(&key, &zero);
>      if (val) {
> -        u64 delta = bpf_ktime_get_ns() - *start_ns;
> +        u64 delta = t - *start_ns;
>          val->count++;
>          val->total_ns += delta;
>          if (delta > val->worst_ns)
> @@ -1039,6 +1084,9 @@ def process_results(syscall_events=None, trigger_delta=None):
>      threads_syscall = {k.tid for k, _ in bpf["syscall_data"].items()
>                         if k.syscall != 0xffffffff}
>
> +    threads_long_poll = {k.tid for k, _ in bpf["long_poll_data"].items()
> +                         if k.pid != 0xffffffff}
> +
>      threads_run = {k.tid for k, _ in bpf["run_data"].items()
>                     if k.pid != 0xffffffff}
>
> @@ -1055,7 +1103,8 @@ def process_results(syscall_events=None, trigger_delta=None):
>                           if k.pid != 0xffffffff}
>
>      threads = sorted(threads_syscall | threads_run | threads_ready |
> -                     threads_stopped | threads_hardirq | threads_softirq,
> +                     threads_stopped | threads_hardirq | threads_softirq |
> +                     threads_long_poll,
>                       key=lambda x: get_thread_name(options.pid, x))
>
>      #
> @@ -1099,6 +1148,21 @@ def process_results(syscall_events=None, trigger_delta=None):
>              print("{}{:20.20} {:6} {:10} {:16,}".format(
>                  indent, "TOTAL( - poll):", "", total_count, total_ns))
>
> +        #
> +        # LONG POLL STATISTICS
> +        #
> +        for k, v in filter(lambda t: t[0].tid == thread,
> +                           bpf["long_poll_data"].items()):
> +
> +            print("\n{:10} {:16} {}\n{}{:10} {:>16} {:>16} {:>16}".format(
> +                "", "", "[LONG POLL STATISTICS]", indent,
> +                "COUNT", "AVERAGE ns", "MIN ns", "MAX ns"))
> +
> +            print("{}{:10} {:16,} {:16,} {:16,}".format(
> +                indent, v.count, int(v.total_ns / v.count),
> +                v.min_ns, v.max_ns))

On the off chance that count == 0 (which should *NEVER* happen), should
we put some kind of try/except around this? It's theoretical only, I
think, because count should be populated as long as items() returns
entries... so probably not super high priority.
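If we did want to be defensive here, a tiny hypothetical sketch (not
something the patch needs to do) could skip the division instead of
wrapping it in a handler:

    # Hypothetical guard: only divide when at least one sample was
    # recorded; otherwise report an average of 0.
    avg_ns = int(v.total_ns / v.count) if v.count else 0
    print("{}{:10} {:16,} {:16,} {:16,}".format(
        indent, v.count, avg_ns, v.min_ns, v.max_ns))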
> +            break
> +
>          #
>          # THREAD RUN STATISTICS
>          #
> @@ -1429,6 +1493,8 @@ def main():
>      source = source.replace("<STACK_TRACE_ENABLED>", "true"
>                              if options.stack_trace_size > 0 else "false")
>
> +    source = source.replace("<SYSCALL_POLL_ID>", str(poll_id))
> +
>      source = source.replace("<INSTALL_OVS_DP_UPCALL_PROBE>", "0"
>                              if options.skip_upcall_stats else "1")
>
> diff --git a/utilities/usdt-scripts/kernel_delay.rst b/utilities/usdt-scripts/kernel_delay.rst
> index 41620d760..0a2a430a5 100644
> --- a/utilities/usdt-scripts/kernel_delay.rst
> +++ b/utilities/usdt-scripts/kernel_delay.rst
> @@ -67,13 +67,17 @@ with the ``--pid`` option.
>      read                     0          1            1,292             1,292
>      TOTAL( - poll):                   519      144,405,334
>
> +    [LONG POLL STATISTICS]
> +    COUNT          AVERAGE ns            MIN ns            MAX ns
> +       58              76,773             7,388           234,129
> +
>      [THREAD RUN STATISTICS]
>      SCHED_CNT            TOTAL ns            MIN ns            MAX ns
> -            6         136,764,071             1,480       115,146,424
> +            6         136,764,071             1,480       115,146,424
>
>      [THREAD READY STATISTICS]
>      SCHED_CNT            TOTAL ns            MAX ns
> -            7              11,334             6,636
> +            7              11,334             6,636
>
>      [THREAD STOPPED STATISTICS]
>      STOP_CNT             TOTAL ns            MAX ns
> @@ -104,6 +108,7 @@ For this, it displays the thread's id (``TID``) and name (``THREAD``),
>  followed by resource-specific data. Which are:
>
>  - ``SYSCALL STATISTICS``
> +- ``LONG POLL STATISTICS``
>  - ``THREAD RUN STATISTICS``
>  - ``THREAD READY STATISTICS``
>  - ``THREAD STOPPED STATISTICS``
> @@ -129,6 +134,14 @@ Note that it only counts calls that started and stopped during the
>  measurement interval!
>
>
> +``LONG POLL STATISTICS``
> +~~~~~~~~~~~~~~~~~~~~~~
> +``LONG POLL STATISTICS`` tell you how long the thread was running between two
> +poll system calls. This relates to the 'Unreasonably long ... ms poll interval'
> +message reported by ovs-vswitchd. More details about this message can be found
> +in the example section.
> +
> +
>  ``THREAD RUN STATISTICS``
>  ~~~~~~~~~~~~~~~~~~~~~~~~~
>  ``THREAD RUN STATISTICS`` tell you how long the thread was running on a CPU

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev