Re: [ovs-dev] [PATCH v2] utilities: Add long poll statistics to the kernel_delay.py script.

Eelco Chaudron via dev Mon, 24 Mar 2025 01:31:23 -0700

On 21 Mar 2025, at 15:08, Aaron Conole wrote:

> Eelco Chaudron <echau...@redhat.com> writes:
>
>> This addition uses the existing syscall probes to record statistics
>> related to the OVS 'Unreasonably long ... ms poll interval' message.
>> Basically, it records the min/max/average time between system poll
>> calls. This can be used to determine if a long poll event has
>> occurred during the capture.
>>
>> Signed-off-by: Eelco Chaudron <echau...@redhat.com>
>> ---
>> The long line warning can be ignored. I'm keeping the original
>> output in the documentation.
>>
>> v2: - Calculate the average on display, not per sample.
>>     - Enhanced the documentation with additional log details.
>> ---
>
> Just some nits below, and otherwise looks good.  Thanks!

Thanks! See the replies below. I also sent out a v3.

>
>>  utilities/usdt-scripts/kernel_delay.py  | 70 ++++++++++++++++++++++++-
>>  utilities/usdt-scripts/kernel_delay.rst | 17 +++++-
>>  2 files changed, 83 insertions(+), 4 deletions(-)
>>
>> diff --git a/utilities/usdt-scripts/kernel_delay.py 
>> b/utilities/usdt-scripts/kernel_delay.py
>> index 367d27e34..cbca7ef8b 100755
>> --- a/utilities/usdt-scripts/kernel_delay.py
>> +++ b/utilities/usdt-scripts/kernel_delay.py
>> @@ -182,11 +182,27 @@ struct syscall_data_key_t {
>>      u32 syscall;
>>  };
>>
>> +struct long_poll_data_key_t {
>> +    u32 pid;
>> +    u32 tid;
>> +};
>> +
>> +struct long_poll_data_t {
>> +    u64 count;
>> +    u64 total_ns;
>> +    u64 min_ns;
>> +    u64 max_ns;
>> +};
>> +
>>  BPF_HASH(syscall_start, u64, u64);
>>  BPF_HASH(syscall_data, struct syscall_data_key_t, struct syscall_data_t);
>> +BPF_HASH(long_poll_start, u64, u64);
>> +BPF_HASH(long_poll_data, struct long_poll_data_key_t, struct 
>> long_poll_data_t);
>>
>>  TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
>>      u64 pid_tgid = bpf_get_current_pid_tgid();
>> +    struct long_poll_data_t *val, init_val = {.min_ns = U64_MAX};
>> +    struct long_poll_data_key_t key;
>>
>>      if (!capture_enabled(pid_tgid))
>>         return 0;
>> @@ -194,6 +210,29 @@ TRACEPOINT_PROBE(raw_syscalls, sys_enter) {
>>      u64 t = bpf_ktime_get_ns();
>>      syscall_start.update(&pid_tgid, &t);
>>
>> +    /* Do long poll handling from here on. */
>> +    if (args->id != <SYSCALL_POLL_ID>)
>> +        return 0;
>> +
>> +    u64 *start_ns = long_poll_start.lookup(&pid_tgid);
>> +
>> +    if (!start_ns || *start_ns == 0)
>> +        return 0;
>> +
>> +    key.pid = pid_tgid >> 32;
>> +    key.tid = (u32)pid_tgid;
>> +
>> +    val = long_poll_data.lookup_or_try_init(&key, &init_val);
>> +    if (val) {
>> +        u64 delta = t - *start_ns;
>> +        val->count++;
>> +        val->total_ns += delta;
>> +        if (delta > val->max_ns)
>> +            val->max_ns = delta;
>> +        else if (delta < val->min_ns)
>
> Should this really be an else?  It's possible that min will == U64_MAX
> here so we might want to drop the 'else' and make sure it gets populated
> on the first time through.
>
> That said, I can also get behind the idea that after the first couple of
> samples it won't likely matter because we'll have dropped into the 'min'
> case anyway.

No this was a bug :) Fixed in v3.

>> +            val->min_ns = delta;
>> +    }
>> +
>>      return 0;
>>  }
>>
>> @@ -206,6 +245,12 @@ TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
>>      if (!capture_enabled(pid_tgid))
>>         return 0;
>>
>> +    u64 t = bpf_ktime_get_ns();
>> +
>> +    if (args->id == <SYSCALL_POLL_ID>) {
>> +        long_poll_start.update(&pid_tgid, &t);
>> +    }
>> +
>>      key.pid = pid_tgid >> 32;
>>      key.tid = (u32)pid_tgid;
>>      key.syscall = args->id;
>> @@ -217,7 +262,7 @@ TRACEPOINT_PROBE(raw_syscalls, sys_exit) {
>>
>>      val = syscall_data.lookup_or_try_init(&key, &zero);
>>      if (val) {
>> -        u64 delta = bpf_ktime_get_ns() - *start_ns;
>> +        u64 delta = t - *start_ns;
>>          val->count++;
>>          val->total_ns += delta;
>>          if (delta > val->worst_ns)
>> @@ -1039,6 +1084,9 @@ def process_results(syscall_events=None, 
>> trigger_delta=None):
>>      threads_syscall = {k.tid for k, _ in bpf["syscall_data"].items()
>>                         if k.syscall != 0xffffffff}
>>
>> +    threads_long_poll = {k.tid for k, _ in bpf["long_poll_data"].items()
>> +                         if k.pid != 0xffffffff}
>> +
>>      threads_run = {k.tid for k, _ in bpf["run_data"].items()
>>                     if k.pid != 0xffffffff}
>>
>> @@ -1055,7 +1103,8 @@ def process_results(syscall_events=None, 
>> trigger_delta=None):
>>                         if k.pid != 0xffffffff}
>>
>>      threads = sorted(threads_syscall | threads_run | threads_ready |
>> -                     threads_stopped | threads_hardirq | threads_softirq,
>> +                     threads_stopped | threads_hardirq | threads_softirq |
>> +                     threads_long_poll,
>>                       key=lambda x: get_thread_name(options.pid, x))
>>
>>      #
>> @@ -1099,6 +1148,21 @@ def process_results(syscall_events=None, 
>> trigger_delta=None):
>>              print("{}{:20.20} {:6}  {:10}  {:16,}".format(
>>                  indent, "TOTAL( - poll):", "", total_count, total_ns))
>>
>> +        #
>> +        # LONG POLL STATISTICS
>> +        #
>> +        for k, v in filter(lambda t: t[0].tid == thread,
>> +                           bpf["long_poll_data"].items()):
>> +
>> +            print("\n{:10} {:16} {}\n{}{:10}  {:>16}  {:>16}  
>> {:>16}".format(
>> +                "", "", "[LONG POLL STATISTICS]", indent,
>> +                "COUNT", "AVERAGE ns", "MIN ns", "MAX ns"))
>> +
>> +            print("{}{:10}  {:16,}  {:16,}  {:16,}".format(
>> +                indent, v.count, int(v.total_ns / v.count),
>> +                v.min_ns, v.max_ns))
>
> On the off chance that count == 0 (which should *NEVER* happen) should
> we put some kind of try{}?  It's theoretical only I think, because count
> should be populated as long as items() returns true... so probably not
> super high priority.

Yes, this is purely theoretical, it would not happen as there should be at 
least one entry before the data structure gets created.
But I fixes this with a max() statement to avoid the try{} indentation, etc.

             print("{}{:10}  {:16,}  {:16,}  {:16,}".format(
-                indent, v.count, int(v.total_ns / v.count),
+                indent, v.count, int(v.total_ns / max(v.count, 1)),
                 v.min_ns, v.max_ns))


>> +            break
>> +
>>          #
>>          # THREAD RUN STATISTICS
>>          #
>> @@ -1429,6 +1493,8 @@ def main():
>>      source = source.replace("<STACK_TRACE_ENABLED>", "true"
>>                              if options.stack_trace_size > 0 else "false")
>>
>> +    source = source.replace("<SYSCALL_POLL_ID>", str(poll_id))
>> +
>>      source = source.replace("<INSTALL_OVS_DP_UPCALL_PROBE>", "0"
>>                              if options.skip_upcall_stats else "1")
>>
>> diff --git a/utilities/usdt-scripts/kernel_delay.rst 
>> b/utilities/usdt-scripts/kernel_delay.rst
>> index 41620d760..0a2a430a5 100644
>> --- a/utilities/usdt-scripts/kernel_delay.rst
>> +++ b/utilities/usdt-scripts/kernel_delay.rst
>> @@ -67,13 +67,17 @@ with the ``--pid`` option.
>>                    read                      0           1             1,292 
>>             1,292
>>                    TOTAL( - poll):                     519       144,405,334
>>
>> +                  [LONG POLL STATISTICS]
>> +                  COUNT             AVERAGE ns            MIN ns            
>> MAX ns
>> +                          58            76,773             7,388           
>> 234,129
>> +
>>                    [THREAD RUN STATISTICS]
>>                    SCHED_CNT           TOTAL ns            MIN ns            
>> MAX ns
>> -                       6       136,764,071             1,480       
>> 115,146,424
>> +                           6       136,764,071             1,480       
>> 115,146,424
>>
>>                    [THREAD READY STATISTICS]
>>                    SCHED_CNT           TOTAL ns            MAX ns
>> -                       7            11,334             6,636
>> +                           7            11,334             6,636
>>
>>                    [THREAD STOPPED STATISTICS]
>>                    STOP_CNT            TOTAL ns            MAX ns
>> @@ -104,6 +108,7 @@ For this, it displays the thread's id (``TID``) and name 
>> (``THREAD``),
>>  followed by resource-specific data. Which are:
>>
>>  - ``SYSCALL STATISTICS``
>> +- ``LONG POLL STATISTICS``
>>  - ``THREAD RUN STATISTICS``
>>  - ``THREAD READY STATISTICS``
>>  - ``THREAD STOPPED STATISTICS``
>> @@ -129,6 +134,14 @@ Note that it only counts calls that started and stopped 
>> during the
>>  measurement interval!
>>
>>
>> +``LONG POLL STATISTICS``
>> +~~~~~~~~~~~~~~~~~~~~~~
>> +``LONG POLL STATISTICS`` tell you how long the thread was running between 
>> two
>> +poll system calls. This relates to the 'Unreasonably long ... ms poll 
>> interval'
>> +message reported by ovs-vswitchd. More details about this message can be 
>> found
>> +in the example section.
>> +
>> +
>>  ``THREAD RUN STATISTICS``
>>  ~~~~~~~~~~~~~~~~~~~~~~~~~
>>  ``THREAD RUN STATISTICS`` tell you how long the thread was running on a CPU

_______________________________________________
dev mailing list
d...@openvswitch.org
https://mail.openvswitch.org/mailman/listinfo/ovs-dev
Re: [ovs-dev] [PATCH v2] utilities: Add long poll statistics to the kernel_delay.py script.

Reply via email to