User-space tool for querying GPU health-monitoring RAS events through the
Generic Netlink socket interface exposed by the kernel's DRM netlink subsystem.
The available commands are:
        - List Nodes
        - Get Error Counters
        - Query Error Counter

Signed-off-by: Ravi Kishore Koppuravuri <[email protected]>
Co-authored-by: Iddamsetty Aravind <[email protected]>
Cc: Tauro Riana <[email protected]>
Cc: Gupta Anshuman <[email protected]>
Cc: Vivi Rodrigo <[email protected]>

---
V2 -> V3:
        - Created handle_err() function to remove redundant code
        - Handled more error scenarios while passing command line arguments
        - Resolved formatting issues (Rodrigo)

V1 -> V2:
        - Removed device_id from the input parameters
        - Updated help() function
        - Incorporated error handling logic
---
---
 include/drm-uapi/drm_ras.h |  79 +++++++
 meson.build                |   5 +-
 tools/drm_ras.c            | 425 +++++++++++++++++++++++++++++++++++++
 tools/meson.build          |   5 +
 4 files changed, 513 insertions(+), 1 deletion(-)
 create mode 100644 include/drm-uapi/drm_ras.h
 create mode 100644 tools/drm_ras.c

diff --git a/include/drm-uapi/drm_ras.h b/include/drm-uapi/drm_ras.h
new file mode 100644
index 000000000..af893aa36
--- /dev/null
+++ b/include/drm-uapi/drm_ras.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/*     Documentation/netlink/specs/drm_ras.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _LINUX_DRM_RAS_H
+#define _LINUX_DRM_RAS_H
+
+#define DRM_RAS_GENL_NAME "drm-ras"
+#define DRM_RAS_FAMILY_VERSION 1
+
+/*
+ * Type of the node. Currently, only error-counter nodes are supported, which
+ * expose reliability counters for a hardware/software component.
+ */
+enum drm_ras_node_type {
+       DRM_RAS_NODE_TYPE_ERROR_COUNTER = 1, /* the only node type defined so far */
+};
+
+enum {
+       /* Unique identifier for the node */
+       DRM_RAS_NODE_ATTR_NODE_ID = 1,
+
+       /* Device name chosen by the driver at the time of registration */
+       DRM_RAS_NODE_ATTR_DEVICE_NAME,
+
+       /* Node name chosen by the driver at registration to identify the RAS node inside the device */
+       DRM_RAS_NODE_ATTR_NODE_NAME,
+
+       /* Type of the node, identifying its function */
+       DRM_RAS_NODE_ATTR_NODE_TYPE,
+
+       __DRM_RAS_NODE_ATTR_MAX,
+       DRM_RAS_NODE_ATTR_MAX = (__DRM_RAS_NODE_ATTR_MAX - 1)
+};
+
+enum {
+       /* Node ID targeted by this error counter operation */
+       DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID = 1,
+
+       /* Unique identifier for a specific error counter within a node */
+       DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID,
+
+       /* Name of the requested error counter */
+       DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME,
+
+       /* Current value of the requested error counter */
+       DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE,
+
+       __DRM_RAS_ERROR_COUNTER_ATTR_MAX,
+       DRM_RAS_ERROR_COUNTER_ATTR_MAX = (__DRM_RAS_ERROR_COUNTER_ATTR_MAX - 1)
+};
+
+enum drm_genl_error_cmds {
+       /**
+        * @DRM_RAS_CMD_LIST_NODES: Retrieve the full list of currently registered
+        * DRM RAS nodes. Each node includes its dynamically assigned ID, name, and type.
+        * Obtain the node IDs by calling this command and use them in the subsequent operations
+        * on the nodes.
+        */
+       DRM_RAS_CMD_LIST_NODES = 1,
+
+       /**
+        * @DRM_RAS_CMD_GET_ERROR_COUNTERS: Retrieve the full list of error counters for a given
+        * node. The response includes the id, name, and current value of each counter.
+        */
+       DRM_RAS_CMD_GET_ERROR_COUNTERS,
+
+       /**
+        * @DRM_RAS_CMD_QUERY_ERROR_COUNTER: Query the information of a specific error counter
+        * for a given node. The response contains the id, name, and current value of the counter.
+        */
+       DRM_RAS_CMD_QUERY_ERROR_COUNTER,
+
+       __DRM_RAS_CMD_MAX,
+       DRM_RAS_CMD_MAX = (__DRM_RAS_CMD_MAX - 1)
+};
+
+#endif /* _LINUX_DRM_RAS_H */
diff --git a/meson.build b/meson.build
index db6e09a94..f7807660e 100644
--- a/meson.build
+++ b/meson.build
@@ -165,10 +165,13 @@ cairo = dependency('cairo', version : '>1.12.0', required : true)
 libudev = dependency('libudev', required : true)
 glib = dependency('glib-2.0', required : true)
 
+# libnl libraries used by tools/drm_ras (generic netlink client)
+libnl = dependency('libnl-3.0', required : false)
+libnl_genl = dependency('libnl-genl-3.0', required : false)
+libnl_cli = dependency('libnl-cli-3.0', required : false)
+
 xmlrpc = dependency('xmlrpc', required : false)
 xmlrpc_util = dependency('xmlrpc_util', required : false)
 xmlrpc_client = dependency('xmlrpc_client', required : false)
-
 xmlrpc_cmd = find_program('xmlrpc-c-config', required : false)
 if not xmlrpc.found() and xmlrpc_cmd.found()
        libs_cmd = run_command(xmlrpc_cmd, 'client', '--libs', check: false)
diff --git a/tools/drm_ras.c b/tools/drm_ras.c
new file mode 100644
index 000000000..9bb58bc5e
--- /dev/null
+++ b/tools/drm_ras.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <getopt.h>
+#include <linux/genetlink.h>
+#include <netlink/netlink.h>
+#include <netlink/cache.h>
+#include <netlink/genl/genl.h>
+#include <netlink/genl/ctrl.h>
+#include <netlink/cli/utils.h>
+#include <netlink/cli/link.h>
+#include "../include/drm-uapi/drm_ras.h"
+#include "igt_device_scan.h"
+
+/* Number of elements in a true array (not valid for pointers). */
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
+
+/*
+ * Values returned by getopt_long() for the options parsed in main().
+ * OPT_NODEID follows OPT_END (-1), so the long-option values are
+ * OPT_NODEID = 0, OPT_ERRORID = 1 and OPT_HELP = 2.
+ */
+enum opt_val {
+       OPT_UNKNOWN = '?',
+       OPT_END = -1,
+       OPT_NODEID,
+       OPT_ERRORID,
+       OPT_HELP,
+};
+
+/*
+ * Zero-based index of each command inside cmd_names[]. get_cmd() returns
+ * this index plus one, which is the numbering used by DRM_RAS_CMD_*.
+ */
+enum cmd_ids {
+       INVALID_CMD = -1,
+       LIST_NODES = 0,
+       GET_ERROR_COUNTERS,
+       QUERY_ERROR_COUNTER,
+
+       __MAX_CMDS,
+};
+
+/* Command names accepted as argv[1], indexed by enum cmd_ids. */
+static const char * const cmd_names[] = {
+       "list_nodes",
+       "get_error_counters",
+       "query_error_counter",
+};
+
+/* State shared between main(), send_cmd() and the netlink receive callback. */
+struct app_context {
+       /* Command selected from argv[1]; compared against the reply's genl cmd */
+       enum drm_genl_error_cmds command;
+       /* Generic netlink socket allocated and connected in main() */
+       struct nl_sock *sock;
+       /* Only zero-initialised; not read or written by the code in this file */
+       struct nl_cb *cb;
+       /* Value of --node-id; main() presets it to -1 to mean "not given" */
+       uint32_t node_id;
+       /* Value of --error-id; main() presets it to -1 to mean "not given" */
+       uint32_t error_id;
+       /* Only zero-initialised; not read or written by the code in this file */
+       int error_id_set;
+       /* Only zero-initialised; not read or written by the code in this file */
+       int node_id_set;
+       /* Set to -EOPNOTSUPP by drm_genl_handle_msg() for an unknown command */
+       int error;
+       /* Family id returned by genl_ctrl_resolve() for DRM_RAS_GENL_NAME */
+       int family_id;
+};
+
+/*
+ * Print the usage banner followed by one example invocation per command.
+ *
+ * @argv: program argument vector; only argv[0] is used, as the program name.
+ *
+ * The three lines are printed in enum cmd_ids order. They are written out
+ * directly instead of looping over the table, because every command has its
+ * own option list and the former catch-all branch could never be reached.
+ */
+static void help(char **argv)
+{
+       const char *prog = argv[0];
+
+       printf("Usage: %s command [<command options>]\n", prog);
+       printf("commands:\n");
+
+       printf("%s %s\n", prog, cmd_names[LIST_NODES]);
+       printf("%s %s --node-id=<node-id>\n", prog, cmd_names[GET_ERROR_COUNTERS]);
+       printf("%s %s --node-id=<node-id> --error-id=<error-id>\n",
+              prog, cmd_names[QUERY_ERROR_COUNTER]);
+}
+
+/*
+ * Print the attributes of one DRM_RAS_CMD_LIST_NODES reply as table cells.
+ *
+ * Walks every attribute located GENL_HDRLEN bytes into the message payload.
+ * The node-type cell ends the row with a newline; the other cells end with a
+ * tab. An attribute type above DRM_RAS_NODE_ATTR_MAX stops the walk.
+ *
+ * @msg: received netlink message
+ * @arg: unused
+ *
+ * Returns NL_OK, or NL_SKIP when an out-of-range attribute type is seen.
+ */
+static int list_nodes_handler(struct nl_msg *msg, void *arg)
+{
+       struct nlmsghdr *hdr = nlmsg_hdr(msg);
+       struct nlattr *attr;
+       int rem;
+
+       nlmsg_for_each_attr(attr, hdr, GENL_HDRLEN, rem) {
+               const int type = nla_type(attr);
+
+               if (type > DRM_RAS_NODE_ATTR_MAX) {
+                       printf("Unknown Node attribute type: %d\n", type);
+                       return NL_SKIP;
+               }
+
+               if (type == DRM_RAS_NODE_ATTR_NODE_ID)
+                       printf("%-18u\t", nla_get_u32(attr));
+               else if (type == DRM_RAS_NODE_ATTR_DEVICE_NAME ||
+                        type == DRM_RAS_NODE_ATTR_NODE_NAME)
+                       printf("%-30s\t", nla_get_string(attr));
+               else if (type == DRM_RAS_NODE_ATTR_NODE_TYPE)
+                       printf("%-18u\n", nla_get_u32(attr));
+               else
+                       printf("Unknown attribute type: %d\n", type);
+       }
+
+       return NL_OK;
+}
+
+/*
+ * Handle the reply to DRM_RAS_CMD_QUERY_ERROR_COUNTER: parse the attributes
+ * of @msg and print the counter value.
+ *
+ * @msg: received netlink message
+ * @arg: unused
+ *
+ * Returns NL_OK after printing the value, or NL_SKIP when the attributes
+ * cannot be parsed. A reply that lacks
+ * DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE is reported through nl_cli_fatal().
+ */
+static int query_error_counter(struct nl_msg *msg, void *arg)
+{
+       struct nlmsghdr *nlh = nlmsg_hdr(msg);
+       /*
+        * The parser indexes the table by attribute type from 0 up to and
+        * including maxtype, so it needs maxtype + 1 slots. Passing 256 with
+        * a 256-entry table wrote one pointer past the end of the array.
+        */
+       struct nlattr *attrs[DRM_RAS_ERROR_COUNTER_ATTR_MAX + 1];
+       struct nlattr *value;
+       int ret;
+
+       ret = genlmsg_parse(nlh, 0, attrs, DRM_RAS_ERROR_COUNTER_ATTR_MAX, NULL);
+       if (ret < 0) {
+               fprintf(stderr, "Failed to parse attributes: %s\n", nl_geterror(ret));
+               return NL_SKIP;
+       }
+
+       value = attrs[DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE];
+       if (!value) {
+               nl_cli_fatal(NLE_FAILURE,
+                            "DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE attribute is missing");
+               return NL_SKIP;
+       }
+
+       printf("counter value %u\n", nla_get_u32(value));
+
+       return NL_OK;
+}
+
+/*
+ * Print the attributes of one DRM_RAS_CMD_GET_ERROR_COUNTERS reply as table
+ * cells: error id and error name end with a tab, error value ends the row.
+ *
+ * Any other type up to DRM_RAS_ERROR_COUNTER_ATTR_MAX (this includes
+ * DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID) is reported as unknown and the walk
+ * continues; a type above the maximum stops the walk.
+ *
+ * @msg: received netlink message
+ * @arg: unused
+ *
+ * Returns NL_OK, or NL_SKIP when an out-of-range attribute type is seen.
+ */
+static int get_error_counters(struct nl_msg *msg, void *arg)
+{
+       struct nlmsghdr *hdr = nlmsg_hdr(msg);
+       struct nlattr *attr;
+       int rem;
+
+       nlmsg_for_each_attr(attr, hdr, GENL_HDRLEN, rem) {
+               const int type = nla_type(attr);
+
+               if (type > DRM_RAS_ERROR_COUNTER_ATTR_MAX) {
+                       printf("Unknown error counter attribute type: %d\n", type);
+                       return NL_SKIP;
+               }
+
+               if (type == DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID)
+                       printf("%-18u\t", nla_get_u32(attr));
+               else if (type == DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME)
+                       printf("%-30s\t", nla_get_string(attr));
+               else if (type == DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE)
+                       printf("%-18u\n", nla_get_u32(attr));
+               else
+                       printf("Unknown attribute type: %d\n", type);
+       }
+
+       return NL_OK;
+}
+
+/*
+ * Receive callback: checks that the generic netlink command of @msg matches
+ * the command that was requested and forwards the message to the matching
+ * printer.
+ *
+ * @msg: received netlink message
+ * @arg: pointer to the struct app_context of this run
+ *
+ * Returns the printer's result, or NL_SKIP for a mismatching or unknown
+ * command (the latter also stores -EOPNOTSUPP in ctx->error).
+ */
+static int drm_genl_handle_msg(struct nl_msg *msg, void *arg)
+{
+       struct app_context *ctx = arg;
+       struct genlmsghdr *ghdr = genlmsg_hdr(nlmsg_hdr(msg));
+
+       if (ghdr->cmd != ctx->command) {
+               fprintf(stderr,
+                       "Unexpected command response: got %d, expected %d\n",
+                       ghdr->cmd,
+                       ctx->command);
+               return NL_SKIP;
+       }
+
+       if (ctx->command == DRM_RAS_CMD_LIST_NODES)
+               return list_nodes_handler(msg, arg);
+       if (ctx->command == DRM_RAS_CMD_GET_ERROR_COUNTERS)
+               return get_error_counters(msg, arg);
+       if (ctx->command == DRM_RAS_CMD_QUERY_ERROR_COUNTER)
+               return query_error_counter(msg, arg);
+
+       fprintf(stderr, "Unknown command: %d\n", ctx->command);
+       ctx->error = -EOPNOTSUPP;
+       return NL_SKIP;
+}
+
+/*
+ * Close and free @sock, then report @err_msg through nl_cli_fatal().
+ *
+ * @sock:    netlink socket to release
+ * @ret:     error code handed to nl_cli_fatal()
+ * @err_msg: message text; printed verbatim
+ */
+static void handle_err(struct nl_sock *sock, int ret, const char *err_msg)
+{
+       nl_close(sock);
+       nl_socket_free(sock);
+       /* Never use a caller-supplied string as the printf-style format. */
+       nl_cli_fatal(ret, "%s", err_msg);
+}
+
+/*
+ * Build the generic netlink request for @cmd, send it on ctx->sock and run
+ * the receive callbacks on the reply.
+ *
+ * @cmd: one of the DRM_RAS_CMD_* values
+ * @arg: pointer to the struct app_context holding the socket, the resolved
+ *       family id and the ids given on the command line
+ *
+ * The table header for the two listing commands is printed before the
+ * request is sent. Missing ids terminate the program with EXIT_FAILURE;
+ * netlink failures are reported through handle_err() / nl_cli_fatal().
+ */
+static void send_cmd(int cmd, void *arg)
+{
+       struct app_context *ctx = arg;
+       /* main() stores -1 in the unsigned id fields to mean "not given". */
+       const uint32_t unset = (uint32_t)-1;
+       int flags = NLM_F_REQUEST | NLM_F_ACK;
+       struct nl_msg *msg;
+       int ret = 0;
+
+       /* Validate first so the early exits have no message to release. */
+       switch (cmd) {
+       case DRM_RAS_CMD_LIST_NODES:
+               flags |= NLM_F_ROOT | NLM_F_MATCH;
+               break;
+       case DRM_RAS_CMD_GET_ERROR_COUNTERS:
+               if (ctx->node_id == unset) {
+                       fprintf(stderr, "Error: --node-id is required for %s command\n",
+                               cmd_names[cmd - 1]);
+                       exit(EXIT_FAILURE);
+               }
+               flags |= NLM_F_ROOT | NLM_F_MATCH;
+               break;
+       case DRM_RAS_CMD_QUERY_ERROR_COUNTER:
+               if (ctx->node_id == unset || ctx->error_id == unset) {
+                       fprintf(stderr,
+                               "Error: --node-id and --error-id are required "
+                               "for %s command\n",
+                               cmd_names[cmd - 1]);
+                       exit(EXIT_FAILURE);
+               }
+               break;
+       default:
+               /* Previously an empty message without a genl header was sent. */
+               handle_err(ctx->sock, NLE_INVAL, "Unsupported command\n");
+               return;
+       }
+
+       msg = nlmsg_alloc();
+       if (!msg) {
+               handle_err(ctx->sock, NLE_NOMEM, "nlmsg_alloc failed\n");
+               return;
+       }
+
+       if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, ctx->family_id, 0,
+                        flags, cmd, DRM_RAS_FAMILY_VERSION)) {
+               nlmsg_free(msg);
+               handle_err(ctx->sock, NLE_NOMEM, "genlmsg_put failed\n");
+               return;
+       }
+
+       /* Both counter commands carry the node id; only the query adds an error id. */
+       if (cmd != DRM_RAS_CMD_LIST_NODES)
+               ret = nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID, ctx->node_id);
+       if (ret == 0 && cmd == DRM_RAS_CMD_QUERY_ERROR_COUNTER)
+               ret = nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID, ctx->error_id);
+       if (ret < 0) {
+               nlmsg_free(msg);
+               handle_err(ctx->sock, ret, "Unable to add request attributes\n");
+               return;
+       }
+
+       if (cmd == DRM_RAS_CMD_LIST_NODES)
+               printf("%-18s\t%-30s\t%-30s\t%-18s\n",
+                      "node-id", "device-name", "node-name", "node-type");
+       else if (cmd == DRM_RAS_CMD_GET_ERROR_COUNTERS)
+               printf("%-18s\t%-30s\t%-18s\n",
+                      "error-id", "error-name", "error-value");
+
+       ret = nl_send_auto(ctx->sock, msg);
+       /* The request is not needed after the send attempt, whatever its outcome. */
+       nlmsg_free(msg);
+       if (ret < 0)
+               nl_cli_fatal(ret, "Unable to send message: %s", nl_geterror(ret));
+
+       ret = nl_recvmsgs_default(ctx->sock);
+       if (ret < 0)
+               nl_cli_fatal(ret, "Unable to receive message: %s", nl_geterror(ret));
+}
+
+/*
+ * Map a command name from the command line to its DRM_RAS_CMD_* value.
+ *
+ * @cmd_name: name to look up; compared without regard to case
+ *
+ * Returns the index of the name in cmd_names[] plus one, or -1 when
+ * @cmd_name is NULL or matches no entry.
+ */
+static int get_cmd(char *cmd_name)
+{
+       size_t i;
+
+       if (!cmd_name)
+               return -1;
+
+       /*
+        * Bound the walk by the table itself. __DRM_RAS_CMD_MAX is one more
+        * than the number of names, so using it read past the end of
+        * cmd_names[] whenever the name did not match.
+        */
+       for (i = 0; i < ARRAY_SIZE(cmd_names); i++) {
+               if (strcasecmp(cmd_name, cmd_names[i]) == 0)
+                       return (int)i + 1;
+       }
+
+       return -1;
+}
+
+/*
+ * Return 1 when any argument after argv[0] is exactly "--help" or "-h",
+ * otherwise 0.
+ */
+static int check_for_help(int argc, char **argv)
+{
+       int idx = 1;
+
+       while (idx < argc) {
+               const char *word = argv[idx++];
+
+               if (!strcmp(word, "--help") || !strcmp(word, "-h"))
+                       return 1;
+       }
+
+       return 0;
+}
+
+/*
+ * Parse @text as a decimal id given to --node-id / --error-id.
+ *
+ * Accepts only digits. Rejects an empty string, a sign or leading blank,
+ * trailing characters, 0, and any value that does not fit below the
+ * (uint32_t)-1 "not given" marker, so the stored id is never a silently
+ * truncated number.
+ * NOTE(review): 0 is rejected because the original parser rejected it;
+ * confirm with the kernel side that 0 is never a valid id.
+ *
+ * Returns 0 and stores the id in @out on success, -1 otherwise.
+ */
+static int parse_id(const char *text, uint32_t *out)
+{
+       unsigned long val;
+       char *end;
+
+       /* strtoul() itself would skip blanks and accept '+' or '-'. */
+       if (!isdigit((unsigned char)text[0]))
+               return -1;
+
+       val = strtoul(text, &end, 10);
+       if (*end != '\0')
+               return -1;
+
+       /* On overflow strtoul() yields ULONG_MAX, which this range check rejects. */
+       if (val == 0 || val >= (uint32_t)-1)
+               return -1;
+
+       *out = (uint32_t)val;
+       return 0;
+}
+
+/*
+ * Entry point: argv[1] selects the command, the remaining arguments are
+ * parsed as --node-id / --error-id / --help, then the request is sent over
+ * a generic netlink socket and the reply is printed by the callbacks.
+ *
+ * Returns EXIT_SUCCESS, or EXIT_FAILURE when the receive callback recorded
+ * an error. Usage errors exit with EXIT_FAILURE.
+ */
+int main(int argc, char **argv)
+{
+       static const struct option options[] = {
+               {"error-id", required_argument, NULL, OPT_ERRORID},
+               {"node-id",  required_argument, NULL, OPT_NODEID},
+               {"help",     no_argument,       NULL, OPT_HELP},
+               {0, 0, 0, 0}
+       };
+       struct app_context ctx = {0};
+       int cmd, opt, ret;
+
+       /* (uint32_t)-1 marks an id that was not given; send_cmd() checks it. */
+       ctx.error_id = (uint32_t)-1;
+       ctx.node_id = (uint32_t)-1;
+
+       if (argc < 2) {
+               fprintf(stderr, "\nNo Arguments were passed.\n\n"
+                       "Use --help to see the correct usage.\n\n");
+               exit(EXIT_FAILURE);
+       }
+       if (check_for_help(argc, argv)) {
+               help(argv);
+               exit(EXIT_SUCCESS);
+       }
+
+       /*
+        * Test the plain int: ctx.command is an enum that the compiler may
+        * make unsigned, in which case "ctx.command < 0" is never true.
+        */
+       cmd = get_cmd(argv[1]);
+       if (cmd < 0) {
+               fprintf(stderr, "invalid command\n");
+               help(argv);
+               exit(EXIT_FAILURE);
+       }
+       ctx.command = cmd;
+
+       /* argv[1] is the command; option parsing starts after it. */
+       optind = 2;
+       /* The leading ':' makes getopt_long() return ':' for a missing value. */
+       while ((opt = getopt_long(argc, argv, ":h", options, NULL)) != -1) {
+               switch (opt) {
+               case OPT_ERRORID:
+                       if (parse_id(optarg, &ctx.error_id) < 0) {
+                               fprintf(stderr,
+                                       "\ninvalid error-id %s\n\n"
+                                       "Enter a valid error-id received "
+                                       "from get_error_counters command\n\n",
+                                       optarg);
+                               exit(EXIT_FAILURE);
+                       }
+                       break;
+               case OPT_NODEID:
+                       if (parse_id(optarg, &ctx.node_id) < 0) {
+                               fprintf(stderr,
+                                       "\ninvalid node id %s\n\n"
+                                       "Enter a valid node-id received "
+                                       "from list_nodes command\n\n",
+                                       optarg);
+                               exit(EXIT_FAILURE);
+                       }
+                       break;
+               case OPT_HELP:
+               case 'h':
+                       help(argv);
+                       exit(EXIT_SUCCESS);
+                       break;
+               case ':':
+                       fprintf(stderr,
+                               "%s requires a value. Check --help for correct usage\n",
+                               argv[optind - 1]);
+                       exit(EXIT_FAILURE);
+                       break;
+               case '?':
+                       fprintf(stderr,
+                               "Unknown argument passed: %s\n"
+                               "Check --help for the correct usage\n\n",
+                               argv[optind - 1]);
+                       exit(EXIT_FAILURE);
+                       break;
+               default:
+                       fprintf(stderr, "Unexpected option: %c\n", opt);
+                       exit(EXIT_FAILURE);
+                       break;
+               }
+       }
+
+       ctx.sock = nl_cli_alloc_socket();
+       if (!ctx.sock)
+               nl_cli_fatal(NLE_NOMEM, "Cannot allocate nl_sock");
+
+       ret = nl_cli_connect(ctx.sock, NETLINK_GENERIC);
+       if (ret < 0)
+               handle_err(ctx.sock, ret, "Cannot connect handle\n");
+
+       ctx.family_id = genl_ctrl_resolve(ctx.sock, DRM_RAS_GENL_NAME);
+       if (ctx.family_id < 0)
+               handle_err(ctx.sock, ctx.family_id, "Resolving of family name failed\n");
+
+       ret = nl_socket_modify_cb(ctx.sock, NL_CB_VALID, NL_CB_CUSTOM,
+                                 drm_genl_handle_msg, &ctx);
+       if (ret < 0)
+               handle_err(ctx.sock, ret, "Unable to modify valid message callback\n");
+
+       send_cmd(ctx.command, &ctx);
+
+       nl_close(ctx.sock);
+       nl_socket_free(ctx.sock);
+
+       return ctx.error ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/tools/meson.build b/tools/meson.build
index 8185ba160..74ff97713 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -70,6 +70,11 @@ if libudev.found()
                   install : true)
 endif
 
+# libnl is looked up with "required : false", so only build the tool when
+# all three libraries were actually found.
+if libnl.found() and libnl_genl.found() and libnl_cli.found()
+       executable('drm_ras', 'drm_ras.c',
+                  dependencies : [tool_deps, libnl, libnl_cli, libnl_genl],
+                  install_rpath : bindir_rpathdir,
+                  install : true)
+endif
+
 executable('gputop', 'gputop.c',
            install : true,
            install_rpath : bindir_rpathdir,
-- 
2.34.1

Reply via email to