Add a user-space tool for querying GPU health-monitoring RAS events through
the Generic Netlink socket interface exposed by the kernel's DRM netlink
subsystem.
The available commands are:
- List Nodes
- Get Error Counters
- Query Error Counter
Signed-off-by: Ravi Kishore Koppuravuri <[email protected]>
Co-authored-by: Iddamsetty Aravind <[email protected]>
Cc: Tauro Riana <[email protected]>
Cc: Gupta Anshuman <[email protected]>
Cc: Vivi Rodrigo <[email protected]>
---
V2 -> V3:
- Created handle_err() function to remove redundant code
- Handled more error scenarios while passing command line arguments
- Resolved formatting issues (Rodrigo)
V1 -> V2:
- Removed device_id from the input parameters
- Updated help() function
- Incorporated error handling logic
---
---
include/drm-uapi/drm_ras.h | 79 +++++++
meson.build | 5 +-
tools/drm_ras.c | 425 +++++++++++++++++++++++++++++++++++++
tools/meson.build | 5 +
4 files changed, 513 insertions(+), 1 deletion(-)
create mode 100644 include/drm-uapi/drm_ras.h
create mode 100644 tools/drm_ras.c
diff --git a/include/drm-uapi/drm_ras.h b/include/drm-uapi/drm_ras.h
new file mode 100644
index 000000000..af893aa36
--- /dev/null
+++ b/include/drm-uapi/drm_ras.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR
BSD-3-Clause) */
+/* Do not edit directly, auto-generated from: */
+/* Documentation/netlink/specs/drm_ras.yaml */
+/* YNL-GEN uapi header */
+
+#ifndef _LINUX_DRM_RAS_H
+#define _LINUX_DRM_RAS_H
+
+#define DRM_RAS_GENL_NAME "drm-ras"
+#define DRM_RAS_FAMILY_VERSION 1
+
+/*
+ * Type of the node. Currently, only error-counter nodes are supported, which
+ * expose reliability counters for a hardware/software component.
+ */
+enum drm_ras_node_type {
+ /* Node that exposes error counters */
+ DRM_RAS_NODE_TYPE_ERROR_COUNTER = 1,
+};
+
+enum {
+ /* Unique identifier for the node */
+ DRM_RAS_NODE_ATTR_NODE_ID = 1,
+
+ /* Device name chosen by the driver at the time of registration */
+ DRM_RAS_NODE_ATTR_DEVICE_NAME,
+
+ /*
+  * Node name chosen by the driver at registration to identify the RAS
+  * node inside the device
+  */
+ DRM_RAS_NODE_ATTR_NODE_NAME,
+
+ /* Type of the node, identifying its function */
+ DRM_RAS_NODE_ATTR_NODE_TYPE,
+
+ __DRM_RAS_NODE_ATTR_MAX,
+ DRM_RAS_NODE_ATTR_MAX = (__DRM_RAS_NODE_ATTR_MAX - 1)
+};
+
+enum {
+ /* Node ID targeted by this error counter operation */
+ DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID = 1,
+
+ /* Unique identifier for a specific error counter within a node */
+ DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID,
+
+ /* Name of the requested error counter */
+ DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME,
+
+ /* Current value of the requested error counter */
+ DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE,
+
+ __DRM_RAS_ERROR_COUNTER_ATTR_MAX,
+ DRM_RAS_ERROR_COUNTER_ATTR_MAX = (__DRM_RAS_ERROR_COUNTER_ATTR_MAX - 1)
+};
+
+enum drm_genl_error_cmds {
+ /**
+ * @DRM_RAS_CMD_LIST_NODES: Command to retrieve the full list of
+ * currently registered DRM RAS nodes. Each node includes its
+ * dynamically assigned ID, name, and type. Obtain the node IDs by
+ * calling this command and use them in the subsequent operations
+ * on the nodes.
+ */
+ DRM_RAS_CMD_LIST_NODES = 1,
+
+ /**
+ * @DRM_RAS_CMD_GET_ERROR_COUNTERS: Retrieve the full list of error
+ * counters for a given node. The response includes the id, name, and
+ * current value of each counter.
+ */
+ DRM_RAS_CMD_GET_ERROR_COUNTERS,
+
+ /**
+ * @DRM_RAS_CMD_QUERY_ERROR_COUNTER: Query the information of a
+ * specific error counter for a given node. The response contains the
+ * id, name, and current value of the counter.
+ */
+ DRM_RAS_CMD_QUERY_ERROR_COUNTER,
+
+ __DRM_RAS_CMD_MAX,
+ DRM_RAS_CMD_MAX = (__DRM_RAS_CMD_MAX - 1)
+};
+
+#endif /* _LINUX_DRM_RAS_H */
diff --git a/meson.build b/meson.build
index db6e09a94..f7807660e 100644
--- a/meson.build
+++ b/meson.build
@@ -165,10 +165,13 @@ cairo = dependency('cairo', version : '>1.12.0', required
: true)
libudev = dependency('libudev', required : true)
glib = dependency('glib-2.0', required : true)
+libnl = dependency('libnl-3.0', required: false)
+libnl_genl = dependency('libnl-genl-3.0', required: false)
+libnl_cli = dependency('libnl-cli-3.0', required:false)
+
xmlrpc = dependency('xmlrpc', required : false)
xmlrpc_util = dependency('xmlrpc_util', required : false)
xmlrpc_client = dependency('xmlrpc_client', required : false)
-
xmlrpc_cmd = find_program('xmlrpc-c-config', required : false)
if not xmlrpc.found() and xmlrpc_cmd.found()
libs_cmd = run_command(xmlrpc_cmd, 'client', '--libs', check: false)
diff --git a/tools/drm_ras.c b/tools/drm_ras.c
new file mode 100644
index 000000000..9bb58bc5e
--- /dev/null
+++ b/tools/drm_ras.c
@@ -0,0 +1,425 @@
+// SPDX-License-Identifier: MIT
+/*
+ * Copyright © 2025 Intel Corporation
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <getopt.h>
+#include <linux/genetlink.h>
+#include <netlink/netlink.h>
+#include <netlink/cache.h>
+#include <netlink/genl/genl.h>
+#include <netlink/genl/ctrl.h>
+#include <netlink/cli/utils.h>
+#include <netlink/cli/link.h>
+#include "../include/drm-uapi/drm_ras.h"
+#include "igt_device_scan.h"
+
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
+
+struct nl_sock *mcsock;
+
+/*
+ * Values returned by getopt_long() while parsing the command options.
+ * OPT_NODEID, OPT_ERRORID and OPT_HELP are used as the 'val' member of the
+ * long option table in main(); because they follow OPT_END (-1) they are
+ * numbered 0, 1 and 2.
+ */
+enum opt_val {
+ OPT_UNKNOWN = '?',
+ OPT_END = -1,
+ OPT_NODEID,
+ OPT_ERRORID,
+ OPT_HELP,
+};
+
+/*
+ * Zero-based indices into cmd_names[]. Note that these are one less than the
+ * matching DRM_RAS_CMD_* values, which start at 1.
+ */
+enum cmd_ids {
+ INVALID_CMD = -1,
+ LIST_NODES = 0,
+ GET_ERROR_COUNTERS,
+ QUERY_ERROR_COUNTER,
+
+ /* Number of commands; keep last */
+ __MAX_CMDS,
+};
+
+/*
+ * Command names accepted on the command line, indexed by enum cmd_ids.
+ * get_cmd() returns "index + 1" as the netlink command, so the order here
+ * must follow the DRM_RAS_CMD_* numbering.
+ */
+static const char * const cmd_names[] = {
+ "list_nodes",
+ "get_error_counters",
+ "query_error_counter",
+};
+
+/* State shared between main(), send_cmd() and the netlink receive callback. */
+struct app_context {
+ /* Netlink command selected on the command line */
+ enum drm_genl_error_cmds command;
+ /* Generic netlink socket used for the request and its replies */
+ struct nl_sock *sock;
+ /* NOTE(review): not referenced anywhere in this file */
+ struct nl_cb *cb;
+ /* Value of --node-id; main() initialises it to -1 meaning "not given" */
+ uint32_t node_id;
+ /* Value of --error-id; main() initialises it to -1 meaning "not given" */
+ uint32_t error_id;
+ /* NOTE(review): the two *_set flags are not referenced in this file */
+ int error_id_set;
+ int node_id_set;
+ /* Set to a negative errno by the receive callback on an unknown command */
+ int error;
+ /* Generic netlink family id resolved from DRM_RAS_GENL_NAME */
+ int family_id;
+};
+
+/*
+ * Print the usage text: a generic usage line followed by one example
+ * invocation per supported command, each prefixed with the program name
+ * taken from argv[0].
+ */
+static void help(char **argv)
+{
+	const char *prog = argv[0];
+
+	printf("Usage: %s command [<command options>]\n", prog);
+	printf("commands:\n");
+
+	/* One line per entry of cmd_names[], in enum cmd_ids order. */
+	printf("%s %s\n", prog, cmd_names[LIST_NODES]);
+	printf("%s %s --node-id=<node-id>\n",
+	       prog, cmd_names[GET_ERROR_COUNTERS]);
+	printf("%s %s --node-id=<node-id> --error-id=<error-id>\n",
+	       prog, cmd_names[QUERY_ERROR_COUNTER]);
+}
+
+/*
+ * Receive handler for DRM_RAS_CMD_LIST_NODES replies.
+ *
+ * Walks the attributes that follow the generic netlink header and prints
+ * each one as a column of the node table; the node type column ends the
+ * row. An attribute type above DRM_RAS_NODE_ATTR_MAX stops processing of
+ * the message with NL_SKIP, otherwise NL_OK is returned. @arg is unused.
+ */
+static int list_nodes_handler(struct nl_msg *msg, void *arg)
+{
+	struct nlmsghdr *hdr = nlmsg_hdr(msg);
+	struct nlattr *attr;
+	int rem;
+
+	nlmsg_for_each_attr(attr, hdr, GENL_HDRLEN, rem) {
+		const int type = nla_type(attr);
+
+		if (type > DRM_RAS_NODE_ATTR_MAX) {
+			printf("Unknown Node attribute type: %d\n", type);
+			return NL_SKIP;
+		}
+
+		if (type == DRM_RAS_NODE_ATTR_NODE_ID)
+			printf("%-18u\t", nla_get_u32(attr));
+		else if (type == DRM_RAS_NODE_ATTR_DEVICE_NAME ||
+			 type == DRM_RAS_NODE_ATTR_NODE_NAME)
+			printf("%-30s\t", nla_get_string(attr));
+		else if (type == DRM_RAS_NODE_ATTR_NODE_TYPE)
+			printf("%-18u\n", nla_get_u32(attr));
+		else
+			printf("Unknown attribute type: %d\n", type);
+	}
+
+	return NL_OK;
+}
+
+/*
+ * Receive handler for DRM_RAS_CMD_QUERY_ERROR_COUNTER replies.
+ *
+ * Parses the reply attributes and prints the counter value. Returns NL_OK
+ * on success and NL_SKIP when the attributes cannot be parsed; a reply
+ * without the value attribute terminates the program through
+ * nl_cli_fatal(). @arg is unused.
+ */
+static int query_error_counter(struct nl_msg *msg, void *arg)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(msg);
+	/*
+	 * The parser fills slots 0..maxtype, so the table needs maxtype + 1
+	 * entries. Attributes with a type above maxtype are ignored by it.
+	 */
+	struct nlattr *attrs[DRM_RAS_ERROR_COUNTER_ATTR_MAX + 1];
+	int ret;
+
+	ret = genlmsg_parse(nlh, 0, attrs, DRM_RAS_ERROR_COUNTER_ATTR_MAX, NULL);
+	if (ret < 0) {
+		fprintf(stderr, "Failed to parse attributes: %s\n",
+			nl_geterror(ret));
+		return NL_SKIP;
+	}
+
+	if (!attrs[DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE]) {
+		nl_cli_fatal(NLE_FAILURE,
+			     "DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE attribute is missing");
+		/* Not reached: nl_cli_fatal() exits. */
+		return NL_SKIP;
+	}
+
+	printf("counter value %u\n",
+	       nla_get_u32(attrs[DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE]));
+
+	return NL_OK;
+}
+
+/*
+ * Receive handler for DRM_RAS_CMD_GET_ERROR_COUNTERS replies.
+ *
+ * Walks the attributes that follow the generic netlink header and prints
+ * error id, error name and error value as table columns; the value column
+ * ends the row. An attribute type above DRM_RAS_ERROR_COUNTER_ATTR_MAX
+ * stops processing of the message with NL_SKIP, otherwise NL_OK is
+ * returned. @arg is unused.
+ */
+static int get_error_counters(struct nl_msg *msg, void *arg)
+{
+	struct nlmsghdr *hdr = nlmsg_hdr(msg);
+	struct nlattr *attr;
+	int rem;
+
+	nlmsg_for_each_attr(attr, hdr, GENL_HDRLEN, rem) {
+		const int type = nla_type(attr);
+
+		if (type > DRM_RAS_ERROR_COUNTER_ATTR_MAX) {
+			printf("Unknown error counter attribute type: %d\n", type);
+			return NL_SKIP;
+		}
+
+		if (type == DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID)
+			printf("%-18u\t", nla_get_u32(attr));
+		else if (type == DRM_RAS_ERROR_COUNTER_ATTR_ERROR_NAME)
+			printf("%-30s\t", nla_get_string(attr));
+		else if (type == DRM_RAS_ERROR_COUNTER_ATTR_ERROR_VALUE)
+			printf("%-18u\n", nla_get_u32(attr));
+		else
+			printf("Unknown attribute type: %d\n", type);
+	}
+
+	return NL_OK;
+}
+
+/*
+ * Callback installed for valid netlink messages.
+ *
+ * Checks that the reply carries the command that was requested and hands
+ * the message to the matching per-command handler. @arg is the
+ * struct app_context. A reply for a different command is skipped; an
+ * unknown requested command additionally records -EOPNOTSUPP in the
+ * context.
+ */
+static int drm_genl_handle_msg(struct nl_msg *msg, void *arg)
+{
+	struct app_context *ctx = arg;
+	struct genlmsghdr *ghdr = genlmsg_hdr(nlmsg_hdr(msg));
+
+	if (ghdr->cmd != ctx->command) {
+		fprintf(stderr,
+			"Unexpected command response: got %d, expected %d\n",
+			ghdr->cmd,
+			ctx->command);
+		return NL_SKIP;
+	}
+
+	if (ctx->command == DRM_RAS_CMD_LIST_NODES)
+		return list_nodes_handler(msg, arg);
+
+	if (ctx->command == DRM_RAS_CMD_GET_ERROR_COUNTERS)
+		return get_error_counters(msg, arg);
+
+	if (ctx->command == DRM_RAS_CMD_QUERY_ERROR_COUNTER)
+		return query_error_counter(msg, arg);
+
+	fprintf(stderr, "Unknown command: %d\n", ctx->command);
+	ctx->error = -EOPNOTSUPP;
+	return NL_SKIP;
+}
+
+/*
+ * Close and free @sock, then terminate the program through nl_cli_fatal()
+ * with error code @ret and message @err_msg.
+ */
+static void handle_err(struct nl_sock *sock, int ret, const char *err_msg)
+{
+	nl_close(sock);
+	nl_socket_free(sock);
+	/* Never use the caller's text as the format string itself. */
+	nl_cli_fatal(ret, "%s", err_msg);
+}
+
+/*
+ * Build the generic netlink request for @cmd, send it on the context's
+ * socket and process the replies with the installed callbacks.
+ *
+ * @cmd: one of the DRM_RAS_CMD_* values
+ * @arg: the struct app_context holding socket, family id and option values
+ *
+ * Any failure terminates the program. The table header for the list
+ * commands is printed before the request is sent.
+ */
+static void send_cmd(int cmd, void *arg)
+{
+	struct app_context *ctx = (struct app_context *)arg;
+	int flags = NLM_F_REQUEST | NLM_F_ACK;
+	struct nl_msg *msg;
+	int ret = 0;
+
+	/* Validate the request before anything is allocated. */
+	switch (cmd) {
+	case DRM_RAS_CMD_LIST_NODES:
+		flags |= NLM_F_ROOT | NLM_F_MATCH;
+		break;
+	case DRM_RAS_CMD_GET_ERROR_COUNTERS:
+		if (ctx->node_id == (uint32_t)-1) {
+			fprintf(stderr,
+				"Error: --node-id is required for %s command\n",
+				cmd_names[cmd - 1]);
+			exit(EXIT_FAILURE);
+		}
+		flags |= NLM_F_ROOT | NLM_F_MATCH;
+		break;
+	case DRM_RAS_CMD_QUERY_ERROR_COUNTER:
+		if (ctx->node_id == (uint32_t)-1 ||
+		    ctx->error_id == (uint32_t)-1) {
+			fprintf(stderr,
+				"Error: --node-id and --error-id are required "
+				"for %s command\n",
+				cmd_names[cmd - 1]);
+			exit(EXIT_FAILURE);
+		}
+		break;
+	default:
+		/* Never send a message that has no generic netlink header. */
+		handle_err(ctx->sock, NLE_INVAL, "Unknown command\n");
+		return;
+	}
+
+	msg = nlmsg_alloc();
+	if (!msg) {
+		handle_err(ctx->sock, NLE_NOMEM, "nlmsg_alloc failed\n");
+		return;
+	}
+
+	if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, ctx->family_id, 0,
+			 flags, cmd, DRM_RAS_FAMILY_VERSION)) {
+		nlmsg_free(msg);
+		handle_err(ctx->sock, NLE_NOMEM, "genlmsg_put failed\n");
+		return;
+	}
+
+	/* Every command except LIST_NODES addresses a specific node. */
+	if (cmd != DRM_RAS_CMD_LIST_NODES)
+		ret = nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_NODE_ID,
+				  ctx->node_id);
+	if (!ret && cmd == DRM_RAS_CMD_QUERY_ERROR_COUNTER)
+		ret = nla_put_u32(msg, DRM_RAS_ERROR_COUNTER_ATTR_ERROR_ID,
+				  ctx->error_id);
+	if (ret < 0) {
+		nlmsg_free(msg);
+		handle_err(ctx->sock, ret,
+			   "Unable to add attributes to the request\n");
+		return;
+	}
+
+	if (cmd == DRM_RAS_CMD_LIST_NODES)
+		printf("%-18s\t%-30s\t%-30s\t%-18s\n",
+		       "node-id", "device-name", "node-name", "node-type");
+	else if (cmd == DRM_RAS_CMD_GET_ERROR_COUNTERS)
+		printf("%-18s\t%-30s\t%-18s\n",
+		       "error-id", "error-name", "error-value");
+
+	ret = nl_send_auto(ctx->sock, msg);
+	/* The message is not needed once it has been handed to the socket. */
+	nlmsg_free(msg);
+	if (ret < 0)
+		nl_cli_fatal(ret, "Unable to send message: %s",
+			     nl_geterror(ret));
+
+	ret = nl_recvmsgs_default(ctx->sock);
+	if (ret < 0)
+		nl_cli_fatal(ret, "Unable to receive message: %s",
+			     nl_geterror(ret));
+}
+
+/*
+ * Translate a command name given on the command line into its
+ * DRM_RAS_CMD_* value.
+ *
+ * The comparison is case-insensitive. Returns the index of the name in
+ * cmd_names[] plus one, or -1 when @cmd_name is NULL or not a known
+ * command.
+ */
+static int get_cmd(char *cmd_name)
+{
+	size_t i;
+
+	if (!cmd_name)
+		return -1;
+
+	/* Bound the search by the table itself, not by the uapi enum. */
+	for (i = 0; i < ARRAY_SIZE(cmd_names); i++) {
+		if (strcasecmp(cmd_name, cmd_names[i]) == 0)
+			return (int)i + 1;
+	}
+	return -1;
+}
+
+/*
+ * Return 1 when any argument after the program name is exactly "--help"
+ * or "-h", and 0 otherwise.
+ */
+static int check_for_help(int argc, char **argv)
+{
+	int idx = 1;
+
+	while (idx < argc) {
+		const char *word = argv[idx++];
+
+		if (!strcmp(word, "--help") || !strcmp(word, "-h"))
+			return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Parse @str as a plain decimal ID and store it in @out.
+ *
+ * Rejects empty strings, signs, leading whitespace, trailing characters
+ * and out-of-range values. 0 is rejected as before, and UINT32_MAX is
+ * rejected because it is the "option not given" marker.
+ * NOTE(review): confirm with the kernel side that 0 is never a valid id.
+ *
+ * Returns 0 on success and -1 on invalid input (@out is left untouched).
+ */
+static int parse_id(const char *str, uint32_t *out)
+{
+	unsigned long val;
+	char *endptr;
+
+	if (!isdigit((unsigned char)str[0]))
+		return -1;
+
+	errno = 0;
+	val = strtoul(str, &endptr, 10);
+	if (errno || *endptr != '\0' || val == 0 || val >= UINT32_MAX)
+		return -1;
+
+	*out = (uint32_t)val;
+	return 0;
+}
+
+/*
+ * Entry point: "<prog> <command> [--node-id=<id>] [--error-id=<id>]".
+ *
+ * Resolves the command name, parses the options, connects a generic
+ * netlink socket to the DRM RAS family and issues the request. Returns
+ * EXIT_SUCCESS, or EXIT_FAILURE when the receive callback recorded an
+ * error; every other failure exits directly.
+ */
+int main(int argc, char **argv)
+{
+	static const struct option options[] = {
+		{"error-id", optional_argument, NULL, OPT_ERRORID},
+		{"node-id", optional_argument, NULL, OPT_NODEID},
+		{"help", no_argument, NULL, OPT_HELP},
+		{0, 0, 0, 0}
+	};
+	int ret, opt, cmd, option_index = 0;
+	struct app_context ctx = {0};
+
+	/* -1 (all bits set) marks an option that was not given. */
+	ctx.error_id = -1;
+	ctx.node_id = -1;
+
+	if (argc < 2) {
+		fprintf(stderr, "\nNo Arguments were passed.\n\n"
+			"Use --help to see the correct usage.\n\n");
+		exit(EXIT_FAILURE);
+	}
+	if (check_for_help(argc, argv)) {
+		help(argv);
+		exit(EXIT_SUCCESS);
+	}
+
+	/*
+	 * Test the result as an int: the command enum has no negative
+	 * enumerators, so it may be unsigned and "< 0" would never be true.
+	 */
+	cmd = get_cmd(argv[1]);
+	if (cmd < 0) {
+		fprintf(stderr, "invalid command\n");
+		help(argv);
+		exit(EXIT_FAILURE);
+	}
+	ctx.command = cmd;
+
+	/* argv[1] is the command name; options start after it. */
+	optind = 2;
+	while ((opt = getopt_long(argc, argv, "h", options, &option_index)) != -1) {
+		switch (opt) {
+		case OPT_ERRORID:
+			if (!optarg) {
+				fprintf(stderr,
+					"error-id not specified. check --help for correct usage\n");
+				exit(EXIT_FAILURE);
+			}
+			if (parse_id(optarg, &ctx.error_id) < 0) {
+				fprintf(stderr,
+					"\ninvalid error-id %s\n\n"
+					"Enter a valid error-id received "
+					"from get_error_counters command\n\n",
+					optarg);
+				exit(EXIT_FAILURE);
+			}
+			break;
+		case OPT_NODEID:
+			if (!optarg) {
+				fprintf(stderr,
+					"node-id not specified. Check --help for correct usage\n");
+				exit(EXIT_FAILURE);
+			}
+			if (parse_id(optarg, &ctx.node_id) < 0) {
+				fprintf(stderr,
+					"\ninvalid node id %s\n\n"
+					"Enter a valid node-id received "
+					"from list_nodes command\n\n",
+					optarg);
+				exit(EXIT_FAILURE);
+			}
+			break;
+		case OPT_HELP:
+		case 'h':
+			help(argv);
+			exit(EXIT_SUCCESS);
+			break;
+		case '?':
+			fprintf(stderr,
+				"Unknown argument passed\n"
+				"Check --help for the correct usage\n\n");
+			exit(EXIT_FAILURE);
+			break;
+		default:
+			fprintf(stderr, "Unexpected option: %c\n", opt);
+			exit(EXIT_FAILURE);
+			break;
+		}
+	}
+
+	ctx.sock = nl_cli_alloc_socket();
+	if (!ctx.sock)
+		nl_cli_fatal(NLE_NOMEM, "Cannot allocate nl_sock");
+
+	ret = nl_cli_connect(ctx.sock, NETLINK_GENERIC);
+	if (ret < 0)
+		handle_err(ctx.sock, ret, "Cannot connect handle\n");
+
+	ctx.family_id = genl_ctrl_resolve(ctx.sock, DRM_RAS_GENL_NAME);
+	if (ctx.family_id < 0)
+		handle_err(ctx.sock, NLE_INVAL,
+			   "Resolving of family name failed\n");
+
+	ret = nl_socket_modify_cb(ctx.sock, NL_CB_VALID, NL_CB_CUSTOM,
+				  drm_genl_handle_msg, &ctx);
+	if (ret < 0)
+		handle_err(ctx.sock, ret,
+			   "Unable to modify valid message callback\n");
+
+	send_cmd(ctx.command, &ctx);
+
+	nl_close(ctx.sock);
+	nl_socket_free(ctx.sock);
+
+	/* Report an error recorded by the receive callback. */
+	return ctx.error ? EXIT_FAILURE : EXIT_SUCCESS;
+}
diff --git a/tools/meson.build b/tools/meson.build
index 8185ba160..74ff97713 100644
--- a/tools/meson.build
+++ b/tools/meson.build
@@ -70,6 +70,11 @@ if libudev.found()
install : true)
endif
+# The libnl dependencies are optional (required : false in the top-level
+# meson.build), so only build the tool when all of them were found.
+if libnl.found() and libnl_genl.found() and libnl_cli.found()
+	executable('drm_ras', 'drm_ras.c',
+		   dependencies : [tool_deps, libnl, libnl_cli, libnl_genl],
+		   install_rpath : bindir_rpathdir,
+		   install : true)
+endif
+
executable('gputop', 'gputop.c',
install : true,
install_rpath : bindir_rpathdir,
--
2.34.1