Package: munin-plugins-core
Version: 2.0.42-5
Severity: normal
Tags: patch upstream

For some reason, NVMe drives report near 100% utilization when the
utilization is calculated from the ticks data in /proc/diskstats.

$ cat /proc/diskstats | awk '{print $3, $7, $11, $7+$11, $13, $14}' | grep nvme
nvme0n1 22988 457568 480556 2440737408 2441207708
nvme0n1p1 22988 339624 362612 528884 978496

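Note that for the partition nvme0n1p1 the $7+$11 column (read ticks +
write ticks) isn't that far off of $13 (total ticks), whereas for the
whole device nvme0n1 $13 is roughly 5000 times larger than $7+$11.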

I have looked through the kernel sources and cannot figure out what is
going on.

See also

https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1790181

Patch that illustrates the issue:

--- diskstats-dist      2018-11-11 04:48:17.000000000 -0800
+++ diskstats   2018-11-12 10:06:05.990785111 -0800
@@ -234,6 +234,9 @@
     # a given second, the device is nearly 100% saturated.
     my $utilization = $tot_ticks / $interval;

+    # NVME drives tend to show nearly 100% util, so let's use r+w as an alternate
+    my $rw_util = ($rd_ticks + $wr_ticks) / $interval;
+
     # Average time an I/O takes on the block device
     my $servicetime_in_sec =
       $total_ios_per_sec ? $utilization / $total_ios_per_sec / 1000 : 0;
@@ -253,9 +256,11 @@
       : 0;

     my $util_print = $utilization / 10;
+    my $rw_util_print = $rw_util / 10;

     return {
         utilization              => $util_print,
+        rw_util                  => $rw_util_print,
         servicetime              => $servicetime_in_sec,
         average_wait             => $average_wait_in_sec,
         average_rd_wait          => $average_rd_wait_in_sec,
@@ -298,6 +303,8 @@

         print "${graph_id}_util.value "
           . $result->{$device}->{'utilization'} . "\n";
+        print "${graph_id}_rw_util.value "
+          . $result->{$device}->{'rw_util'} . "\n";
     }

     print "\nmultigraph ${plugin_name}_throughput\n";
@@ -342,6 +349,7 @@

 multigraph ${plugin_name}_utilization.$graph_id
 util.value $result->{'utilization'}
+rw_util.value $result->{'rw_util'}
 EOF

     }
@@ -784,6 +792,11 @@
 ${graph_id}_util.info Utilization of the device
 ${graph_id}_util.min 0
 ${graph_id}_util.draw LINE1
+${graph_id}_rw_util.label $cur_diskstats{$device}->{'pretty_device_name'} (r+w)
+${graph_id}_rw_util.type GAUGE
+${graph_id}_rw_util.info Read/write utilization of the device
+${graph_id}_rw_util.min 0
+${graph_id}_rw_util.draw LINE1
 EOF
     }

-- System Information:
Debian Release: buster/sid
  APT prefers unstable-debug
  APT policy: (500, 'unstable-debug'), (500, 'testing-debug'), (500, 'unstable'), (500, 'oldstable'), (1, 'experimental-debug')
Architecture: amd64 (x86_64)
Foreign Architectures: i386

Kernel: Linux 4.18.0-2-amd64 (SMP w/4 CPU cores)
Locale: LANG=en_US.UTF-8, LC_CTYPE=en_US.UTF-8 (charmap=UTF-8), LANGUAGE=en_US.UTF-8 (charmap=UTF-8)
Shell: /bin/sh linked to /bin/dash
Init: systemd (via /run/systemd/system)
LSM: AppArmor: enabled

Versions of packages munin-plugins-core depends on:
ii  munin-common  2.0.42-5
ii  perl          5.26.2-7+b1

Versions of packages munin-plugins-core recommends:
ii  libnet-snmp-perl  6.0.1-4

Versions of packages munin-plugins-core suggests:
pn  conntrack            <none>
pn  libcache-cache-perl  <none>
pn  libdbd-mysql-perl    <none>
ii  libhttp-date-perl    6.02-1
ii  libnet-dns-perl      1.17-1
ii  libnet-ip-perl       1.26-2
pn  libnet-ldap-perl     <none>
ii  libnet-netmask-perl  1.9022-1
pn  libnet-telnet-perl   <none>
ii  libxml-parser-perl   2.44-2+b3
ii  python3              3.6.6-1
ii  ruby                 1:2.5.1

-- no debconf information

Reply via email to