From: Ira Weiny <[email protected]>
Date: Wed, 27 Apr 2011 10:43:01 -0700
Subject: [PATCH 01/5] infiniband-diags: ibqueryerrors; add error thresholds

Signed-off-by: Ira Weiny <[email protected]>
---
 Makefile.am            |    3 +-
 configure.in           |   10 ++++-
 etc/error_thresholds   |   16 ++++++
 man/ibqueryerrors.8    |   99 ---------------------------------------
 man/ibqueryerrors.8.in |  120 ++++++++++++++++++++++++++++++++++++++++++++++++
 src/ibqueryerrors.c    |  106 +++++++++++++++++++++++++++++++++++-------
 6 files changed, 235 insertions(+), 119 deletions(-)
 create mode 100644 etc/error_thresholds
 delete mode 100644 man/ibqueryerrors.8
 create mode 100644 man/ibqueryerrors.8.in

diff --git a/Makefile.am b/Makefile.am
index f9286a9..4b08635 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -94,7 +94,7 @@ ibdiag_version:
        fi
 
 EXTRA_DIST = scripts include infiniband-diags.spec.in infiniband-diags.spec \
-       $(man_MANS) autogen.sh
+       $(man_MANS) autogen.sh etc/error_thresholds
 
 dist-hook:
        if [ -x $(top_srcdir)/gen_chlog.sh ] ; then \
@@ -104,3 +104,4 @@ dist-hook:
 # install this to a default location.
 install-data-hook:
        $(top_srcdir)/config/install-sh -c -m 444 $(top_srcdir)/scripts/IBswcountlimits.pm $(DESTDIR)/$(PERL_INSTALLDIR)/IBswcountlimits.pm
+       $(top_srcdir)/config/install-sh -c -m 444 $(top_srcdir)/etc/error_thresholds $(DESTDIR)/$(sysconfdir)/infiniband-diags/error_thresholds
diff --git a/configure.in b/configure.in
index 8461a37..350a11e 100644
--- a/configure.in
+++ b/configure.in
@@ -71,6 +71,13 @@ AC_CHECK_FUNCS([strchr strrchr strtol strtoul memset 
strtoull])
 dnl Checks for typedefs, structures, and compiler characteristics.
 AC_C_CONST
 
+dnl Define a configure directory
+IBDIAG_CONFIG_PATH_TMP1="`eval echo ${sysconfdir}`"
+IBDIAG_CONFIG_PATH_TMP2="`echo $IBDIAG_CONFIG_PATH_TMP1 | sed 
's/^NONE/$ac_default_prefix/'`"
+IBDIAG_CONFIG_PATH="`eval echo $IBDIAG_CONFIG_PATH_TMP2`/infiniband-diags"
+AC_SUBST(IBDIAG_CONFIG_PATH)
+AC_DEFINE_UNQUOTED([IBDIAG_CONFIG_PATH], "$IBDIAG_CONFIG_PATH", [Define the 
path to configurations])
+
 dnl Check if we should include test utilities
 AC_MSG_CHECKING(for --enable-test-utils)
 AC_ARG_ENABLE(test-utils,
@@ -176,6 +183,7 @@ AC_CONFIG_FILES([\
        scripts/ibrouters \
        scripts/iblinkinfo.pl \
        scripts/ibqueryerrors.pl \
-   libibnetdisc/Makefile
+       man/ibqueryerrors.8 \
+       libibnetdisc/Makefile \
 ])
 AC_OUTPUT
diff --git a/etc/error_thresholds b/etc/error_thresholds
new file mode 100644
index 0000000..28cd295
--- /dev/null
+++ b/etc/error_thresholds
@@ -0,0 +1,16 @@
+# Define error thresholds here
+
+#SymbolErrorCounter=10
+#LinkErrorRecoveryCounter=10
+#LinkDownedCounter=10
+#PortRcvErrors=10
+#PortRcvRemotePhysicalErrors=100
+#PortRcvSwitchRelayErrors=100
+#PortXmitDiscards=100
+#PortXmitConstraintErrors=100
+#PortRcvConstraintErrors=100
+#LocalLinkIntegrityErrors=10
+#ExcessiveBufferOverrunErrors=10
+#VL15Dropped=100
+#PortXmitWait=1000
+
diff --git a/man/ibqueryerrors.8 b/man/ibqueryerrors.8
deleted file mode 100644
index a7af20a..0000000
--- a/man/ibqueryerrors.8
+++ /dev/null
@@ -1,99 +0,0 @@
-.TH IBQUERYERRORS 8 "Dec 31, 2009" "OpenIB" "OpenIB Diagnostics"
-
-.SH NAME
-ibqueryerrors \- query and report non-zero IB port counters
-
-.SH SYNOPSIS
-.B ibqueryerrors
-[-s <err1,err2,...> -c -r -C <ca_name> -P <ca_port> -G <node_guid>
--D <direct_route> -d -k -K \-\-load\-cache <filename>]
-
-.SH DESCRIPTION
-.PP
-ibqueryerrors reports port counters.  This is similar to ibcheckerrors with
-the additional ability to filter out selected errors, include the optional
-transmit and receive data counters, and report full link information for the
-link reported.
-
-.SH OPTIONS
-
-.PP
-.TP
-\fB\-s <err1,err2,...>\fR
-Suppress the errors listed in the comma separated list provided.
-.TP
-\fB\-c\fR
-Suppress some of the common "side effect" counters.  These counters usually do
-not indicate an error condition and can be usually be safely ignored.
-.TP
-\fB\-G <node_guid>\fR
-Report results only for the node guid specified.
-.TP
-\fB\-S <node_guid>\fR
-\-S is provided only for backward compatibility and works the same as "-G"
-.TP
-\fB\-D <direct_route>\fR
-Report results only for the switch specified by the direct route path.
-.TP
-\fB\-r\fR
-Report the port information.  This includes LID, port, external port (if
-applicable), link speed setting, remote GUID, remote port, remote external port
-(if applicable), and remote node description information.
-.TP
-\fB\-\-data\fR
-Include the optional transmit and receive data counters.
-.TP
-\fB\-\-switch\fR  print data for switches only
-.TP
-\fB\-\-ca\fR  print data for CA's only
-.TP
-\fB\-\-router\fR  print data for routers only
-.TP
-\fB\-\-clear\-errors\fR \fB\-k\fR Clear error counters after read.
-\-k and \-K can be used together to clear both errors and counters.
-.TP
-\fB\-\-clear\-counts\fR \fB\-K\fR Clear data counters after read.
-\fBCAUTION\fR clearing data counters will occur regardless of if they are
-printed or not.  This is because data counters are only \fBprinted\fR on ports
-which have errors.  This means if a port has 0 errors and the \-K option is
-specified the data counters will be cleared without any printed output.
-.TP
-\fB\-\-details\fR include receive error and transmit discard details
-.TP
-\fB\-\-load\-cache\fR <filename>
-Load and use the cached ibnetdiscover data stored in the specified
-filename.  May be useful for outputting and learning about other
-fabrics or a previous state of a fabric.  Cannot be used if user
-specifies a directo route path.  See
-.B ibnetdiscover
-for information on caching ibnetdiscover output.
-.TP
-\fB\-R\fR  (This option is obsolete and does nothing)
-
-.SH COMMON OPTIONS
-.PP
-\-d      raise the IB debugging level.
-       May be used several times (-ddd or -d -d -d).
-.PP
-\-e      show send and receive errors (timeouts and others)
-.PP
-\-h      show the usage message
-.PP
-\-v      increase the application verbosity level.
-       May be used several times (-vv or -v -v -v)
-.PP
-\-V      show the version info.
-
-# Other common flags:
-.PP
-\-C <ca_name>    use the specified ca_name.
-.PP
-\-P <ca_port>    use the specified ca_port.
-.PP
-\-t <timeout_ms> override the default timeout for the solicited mads.
-
-
-.SH AUTHOR
-.TP
-Ira Weiny
-.RI < [email protected] >
diff --git a/man/ibqueryerrors.8.in b/man/ibqueryerrors.8.in
new file mode 100644
index 0000000..4e9c7a1
--- /dev/null
+++ b/man/ibqueryerrors.8.in
@@ -0,0 +1,120 @@
+.TH IBQUERYERRORS 8 "Dec 31, 2009" "OpenIB" "OpenIB Diagnostics"
+
+.SH NAME
+ibqueryerrors \- query and report non-zero IB port counters
+
+.SH SYNOPSIS
+.B ibqueryerrors [options]
+
+.SH DESCRIPTION
+.PP
+The default behavior is to report the port error counters which exceed a
+threshold for each port in the fabric.  The default threshold is zero (0).
+Error fields can also be suppressed entirely.
+
+In addition to reporting errors on every port, ibqueryerrors can report the
+port transmit and receive data as well as report full link information to the
+remote port if available.
+
+.SH OPTIONS
+
+.PP
+.TP
+\fB\-s <err1,err2,...>\fR
+Suppress the errors listed in the comma separated list provided.
+.TP
+\fB\-c\fR
+Suppress some of the common "side effect" counters.  These counters usually do
+not indicate an error condition and can usually be safely ignored.
+.TP
+\fB\-G <node_guid>\fR
+Report results only for the node guid specified.
+.TP
+\fB\-S <node_guid>\fR
+\-S is provided only for backward compatibility and works the same as "-G"
+.TP
+\fB\-D <direct_route>\fR
+Report results only for the switch specified by the direct route path.
+.TP
+\fB\-r\fR
+Report the port information.  This includes LID, port, external port (if
+applicable), link speed setting, remote GUID, remote port, remote external port
+(if applicable), and remote node description information.
+.TP
+\fB\-\-data\fR
+Include the optional transmit and receive data counters.
+.TP
+\fB\-\-threshold\-file\fR <filename>
+Specify an alternate threshold file.  The default is 
@IBDIAG_CONFIG_PATH@/error_thresholds
+.TP
+\fB\-\-switch\fR  print data for switches only
+.TP
+\fB\-\-ca\fR  print data for CA's only
+.TP
+\fB\-\-router\fR  print data for routers only
+.TP
+\fB\-\-clear\-errors\fR \fB\-k\fR Clear error counters after read.
+\-k and \-K can be used together to clear both errors and counters.
+.TP
+\fB\-\-clear\-counts\fR \fB\-K\fR Clear data counters after read.
+\fBCAUTION\fR clearing data counters will occur regardless of if they are
+printed or not.  This is because data counters are only \fBprinted\fR on ports
+which have errors.  This means if a port has 0 errors and the \-K option is
+specified the data counters will be cleared without any printed output.
+.TP
+\fB\-\-details\fR include receive error and transmit discard details
+.TP
+\fB\-\-load\-cache\fR <filename>
+Load and use the cached ibnetdiscover data stored in the specified
+filename.  May be useful for outputting and learning about other
+fabrics or a previous state of a fabric.  Cannot be used if user
+specifies a direct route path.  See
+.B ibnetdiscover
+for information on caching ibnetdiscover output.
+.TP
+\fB\-R\fR  (This option is obsolete and does nothing)
+
+.SH COMMON OPTIONS
+.PP
+\-d      raise the IB debugging level.
+       May be used several times (-ddd or -d -d -d).
+.PP
+\-e      show send and receive errors (timeouts and others)
+.PP
+\-h      show the usage message
+.PP
+\-v      increase the application verbosity level.
+       May be used several times (-vv or -v -v -v)
+.PP
+\-V      show the version info.
+
+# Other common flags:
+.PP
+\-C <ca_name>    use the specified ca_name.
+.PP
+\-P <ca_port>    use the specified ca_port.
+.PP
+\-t <timeout_ms> override the default timeout for the solicited mads.
+
+.SH FILES
+
+@IBDIAG_CONFIG_PATH@/error_thresholds
+
+Define threshold values for errors.  File format is simple "name=val".
+Comments begin with '#'
+
+Example:
+
+       # Define thresholds for error counters
+.br
+       SymbolErrorCounter=10
+.br
+       LinkErrorRecoveryCounter=10
+.br
+       VL15Dropped=100
+
+
+.SH AUTHOR
+.TP
+Ira Weiny
+.RI < [email protected] >
diff --git a/src/ibqueryerrors.c b/src/ibqueryerrors.c
index 295b398..68957fe 100644
--- a/src/ibqueryerrors.c
+++ b/src/ibqueryerrors.c
@@ -77,6 +77,69 @@ unsigned clear_errors = 0, clear_counts = 0, details = 0;
 #define PRINT_ROUTER 0x4
 #define PRINT_ALL 0xFF         /* all nodes default flag */
 
+struct { /* NOTE(review): not referenced elsewhere in this patch - presumably used later in the series */
+       int nodes_checked;
+       int bad_nodes;
+       int ports_checked;
+       int bad_ports;
+} summary = { 0 };
+
+#define DEF_THRES_FILE IBDIAG_CONFIG_PATH"/error_thresholds"
+static char *threshold_file = DEF_THRES_FILE; /* replaced by the --threshold-file argument when given */
+
+/* a "packet" of threshold values, written with mad_encode_field and read with mad_decode_field */
+uint8_t thresholds[1204] = { 0 }; /* NOTE(review): 1204 looks like a transposition of 1024 - confirm */
+
+/* Store val in thresholds for each field in IB_PC_FIRST_F..IB_PC_LAST_F named name. */
+static void set_thres(char *name, uint32_t val)
+{
+       int field;
+       for (field = IB_PC_FIRST_F; field <= IB_PC_LAST_F; field++)
+               if (!strcmp(mad_field_name(field), name)) {
+                       mad_encode_field(thresholds, field, &val);
+                       printf("[%s = %u]", name, val); /* echo what was set */
+               }
+}
+
+/* Read "name=val" lines from threshold_file and record each via set_thres(). */
+static void set_thresholds(char *threshold_file)
+{
+       char buf[1024];
+       FILE *thresf = fopen(threshold_file, "r");
+       char *p_prefix, *p_last, *name, *val_str;
+
+       if (!thresf)
+               return; /* file could not be opened: thresholds are left untouched */
+
+       printf("Thresholds: ");
+       while (fgets(buf, sizeof buf, thresf) != NULL) {
+               p_prefix = strtok_r(buf, "\n", &p_last);
+               if (!p_prefix)
+                       continue; /* ignore blank lines */
+
+               if (*p_prefix == '#')
+                       continue; /* ignore comment lines */
+
+               name = strtok_r(p_prefix, "=", &p_last);
+               val_str = strtok_r(NULL, "\n", &p_last);
+               if (!name || !val_str)
+                       continue; /* ignore malformed lines (no name or no value) */
+
+               /* base 0: accepts decimal, octal (0...) and hex (0x...) values */
+               set_thres(name, strtoul(val_str, NULL, 0));
+       }
+       printf("\n");
+
+       fclose(thresf);
+}
+
+static int exceeds_threshold(int field, unsigned val)
+{
+       uint32_t limit = 0; /* value of 0 is kept if the decode does not write it */
+       mad_decode_field(thresholds, field, &limit);
+       return (limit < val) ? 1 : 0; /* true only when strictly above the threshold */
+}
+
 static unsigned int get_max(unsigned int num)
 {
        unsigned r = 0;         // r will be lg(num)
@@ -264,32 +327,33 @@ static int print_results(ib_portid_t * portid, char 
*node_name,
                        continue;
 
                mad_decode_field(pc, i, (void *)&val);
-               if (val)
+               if (exceeds_threshold(i, val)) {
                        n += snprintf(str + n, 1024 - n, " [%s == %u]",
                                      mad_field_name(i), val);
 
-               /* If there are PortXmitDiscards, get details (if supported) */
-               if (i == IB_PC_XMT_DISCARDS_F && details && val) {
-                       n += query_and_dump(str + n, sizeof(buf) - n, portid,
-                                           node, node_name, portnum,
-                                           "PortXmitDiscardDetails",
-                                           IB_GSI_PORT_XMIT_DISCARD_DETAILS,
-                                           IB_PC_RCV_LOCAL_PHY_ERR_F,
-                                           IB_PC_RCV_ERR_LAST_F);
-                       /* If there are PortRcvErrors, get details (if 
supported) */
-               } else if (i == IB_PC_ERR_RCV_F && details && val) {
-                       n += query_and_dump(str + n, sizeof(buf) - n, portid,
-                                           node, node_name, portnum,
-                                           "PortRcvErrorDetails",
-                                           IB_GSI_PORT_RCV_ERROR_DETAILS,
-                                           IB_PC_XMT_INACT_DISC_F,
-                                           IB_PC_XMT_DISC_LAST_F);
+                       /* If there are PortXmitDiscards, get details (if 
supported) */
+                       if (i == IB_PC_XMT_DISCARDS_F && details) {
+                               n += query_and_dump(str + n, sizeof(buf) - n, 
portid,
+                                                   node, node_name, portnum,
+                                                   "PortXmitDiscardDetails",
+                                                   
IB_GSI_PORT_XMIT_DISCARD_DETAILS,
+                                                   IB_PC_RCV_LOCAL_PHY_ERR_F,
+                                                   IB_PC_RCV_ERR_LAST_F);
+                               /* If there are PortRcvErrors, get details (if 
supported) */
+                       } else if (i == IB_PC_ERR_RCV_F && details) {
+                               n += query_and_dump(str + n, sizeof(buf) - n, 
portid,
+                                                   node, node_name, portnum,
+                                                   "PortRcvErrorDetails",
+                                                   
IB_GSI_PORT_RCV_ERROR_DETAILS,
+                                                   IB_PC_XMT_INACT_DISC_F,
+                                                   IB_PC_XMT_DISC_LAST_F);
+                       }
                }
        }
 
        if (!suppress(IB_PC_XMT_WAIT_F)) {
                mad_decode_field(pc, IB_PC_XMT_WAIT_F, (void *)&val);
-               if (val)
+               if (exceeds_threshold(IB_PC_XMT_WAIT_F, val))
                        n += snprintf(str + n, 1024 - n, " [%s == %u]",
                                      mad_field_name(IB_PC_XMT_WAIT_F), val);
        }
@@ -536,6 +600,9 @@ static int process_opt(void *context, int ch, char *optarg)
        case 7:
                load_cache_file = strdup(optarg);
                break;
+       case 8:
+               threshold_file = strdup(optarg);
+               break;
        case 'G':
        case 'S':
                node_guid_str = optarg;
@@ -590,6 +657,8 @@ int main(int argc, char **argv)
                 "query only switch specified by <dr_path>"},
                {"report-port", 'r', 0, NULL,
                 "report port configuration information"},
+               {"threshold-file", 8, 1, NULL,
+                "specify an alternate thresold file, default: " 
DEF_THRES_FILE},
                {"GNDN", 'R', 0, NULL,
                 "(This option is obsolete and does nothing)"},
                {"data", 2, 0, NULL, "include the data counters in the output"},
@@ -678,6 +747,7 @@ int main(int argc, char **argv)
        }
 
        report_suppressed();
+       set_thresholds(threshold_file);
 
        if (node_guid_str) {
                ibnd_node_t *node = ibnd_find_node_guid(fabric, node_guid);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to