Add a new command, 'set-alert-config', which configures a device's warning alerts.
Signed-off-by: Jehoon Park <[email protected]> --- Documentation/cxl/cxl-set-alert-config.txt | 96 +++++++++ Documentation/cxl/meson.build | 1 + cxl/builtin.h | 1 + cxl/cxl.c | 1 + cxl/memdev.c | 220 ++++++++++++++++++++- 5 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 Documentation/cxl/cxl-set-alert-config.txt diff --git a/Documentation/cxl/cxl-set-alert-config.txt b/Documentation/cxl/cxl-set-alert-config.txt new file mode 100644 index 0000000..c905f7c --- /dev/null +++ b/Documentation/cxl/cxl-set-alert-config.txt @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 + +cxl-set-alert-config(1) +======================= + +NAME +---- +cxl-set-alert-config - set the warning alert threshold on a CXL memdev + +SYNOPSIS +-------- +[verse] +'cxl set-alert-config <mem0> [<mem1>..<memN>] [<options>]' + +DESCRIPTION +----------- +A CXL device raises an alert when its health status changes. Critical alerts +shall be configured automatically by the device after a device reset. +If supported, programmable warning thresholds are also initialized to vendor +recommended defaults, and can then be configured by the user. + +Use this command to configure the warning alert thresholds of a device. +Once this command is issued, the newly requested warning thresholds +override the previously programmed warning thresholds. + +To enable a warning alert, set both 'threshold=value' and 'alert=on'. To disable +a warning alert, set only 'alert=off'. Any other combination is an error. + +Use "cxl list -m <memdev> -A" to examine the programmable warning threshold +capabilities of a device. + +EXAMPLES +-------- +Set the warning threshold to 30 and enable the alert for life used. +[verse] +cxl set-alert-config mem0 -L 30 --life-used-alert=on + +Disable the warning alert for device over temperature.
+[verse] +cxl set-alert-config mem0 --over-temperature-alert=off + +OPTIONS +------- +<memory device(s)>:: +include::memdev-option.txt[] + +-v:: +--verbose=:: + Turn on verbose debug messages in the library (if libcxl was built with + logging and debug enabled). + +-L:: +--life-used-threshold=:: + Set <value> for the life used warning alert threshold. + +--life-used-alert=:: + Enable or disable the life used warning alert. + Options are 'on' or 'off'. + +-O:: +--over-temperature-threshold=:: + Set <value> for the device over temperature warning alert threshold. + +--over-temperature-alert=:: + Enable or disable the device over temperature warning alert. + Options are 'on' or 'off'. + +-U:: +--under-temperature-threshold=:: + Set <value> for the device under temperature warning alert threshold. + +--under-temperature-alert=:: + Enable or disable the device under temperature warning alert. + Options are 'on' or 'off'. + +-V:: +--volatile-mem-err-threshold=:: + Set <value> for the corrected volatile memory error warning alert + threshold. + +--volatile-mem-err-alert=:: + Enable or disable the corrected volatile memory error warning alert. + Options are 'on' or 'off'. + +-P:: +--pmem-err-threshold=:: + Set <value> for the corrected persistent memory error warning alert + threshold. + +--pmem-err-alert=:: + Enable or disable the corrected persistent memory error warning alert. + Options are 'on' or 'off'. 
+ +SEE ALSO +-------- +CXL-3.0 8.2.9.8.3.3 diff --git a/Documentation/cxl/meson.build b/Documentation/cxl/meson.build index c553357..865aad5 100644 --- a/Documentation/cxl/meson.build +++ b/Documentation/cxl/meson.build @@ -47,6 +47,7 @@ cxl_manpages = [ 'cxl-destroy-region.txt', 'cxl-monitor.txt', 'cxl-update-firmware.txt', + 'cxl-set-alert-config.txt', ] foreach man : cxl_manpages diff --git a/cxl/builtin.h b/cxl/builtin.h index 3ec6c6c..2c46a82 100644 --- a/cxl/builtin.h +++ b/cxl/builtin.h @@ -15,6 +15,7 @@ int cmd_enable_memdev(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_reserve_dpa(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_free_dpa(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_update_fw(int argc, const char **argv, struct cxl_ctx *ctx); +int cmd_set_alert_config(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_disable_port(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_enable_port(int argc, const char **argv, struct cxl_ctx *ctx); int cmd_set_partition(int argc, const char **argv, struct cxl_ctx *ctx); diff --git a/cxl/cxl.c b/cxl/cxl.c index e1524b8..bf4822f 100644 --- a/cxl/cxl.c +++ b/cxl/cxl.c @@ -69,6 +69,7 @@ static struct cmd_struct commands[] = { { "reserve-dpa", .c_fn = cmd_reserve_dpa }, { "free-dpa", .c_fn = cmd_free_dpa }, { "update-firmware", .c_fn = cmd_update_fw }, + { "set-alert-config", .c_fn = cmd_set_alert_config }, { "disable-port", .c_fn = cmd_disable_port }, { "enable-port", .c_fn = cmd_enable_port }, { "set-partition", .c_fn = cmd_set_partition }, diff --git a/cxl/memdev.c b/cxl/memdev.c index f6a2d3f..2dd2e7f 100644 --- a/cxl/memdev.c +++ b/cxl/memdev.c @@ -38,10 +38,38 @@ static struct parameters { const char *type; const char *size; const char *decoder_filter; + const char *life_used_threshold; + const char *dev_over_temperature_threshold; + const char *dev_under_temperature_threshold; + const char *corrected_volatile_mem_err_threshold; + const char 
*corrected_pmem_err_threshold; + const char *life_used_alert; + const char *dev_over_temperature_alert; + const char *dev_under_temperature_alert; + const char *corrected_volatile_mem_err_alert; + const char *corrected_pmem_err_alert; } param; static struct log_ctx ml; +struct alert_context { + int valid_alert_actions; + int enable_alert_actions; + int life_used_threshold; + int dev_over_temperature_threshold; + int dev_under_temperature_threshold; + int corrected_volatile_mem_err_threshold; + int corrected_pmem_err_threshold; +}; + +enum cxl_setalert_event { + CXL_SETALERT_LIFE_USED, + CXL_SETALERT_OVER_TEMP, + CXL_SETALERT_UNDER_TEMP, + CXL_SETALERT_VOLATILE_MEM_ERROR, + CXL_SETALERT_PMEM_ERROR, +}; + enum cxl_setpart_type { CXL_SETPART_PMEM, CXL_SETPART_VOLATILE, @@ -99,6 +127,36 @@ OPT_BOOLEAN('c', "cancel", &param.cancel, \ OPT_BOOLEAN('w', "wait", &param.wait, \ "wait for firmware update to complete before returning") +#define SET_ALERT_OPTIONS() \ +OPT_STRING('L', "life-used-threshold", &param.life_used_threshold, \ + "threshold", "threshold value for life used warning alert"), \ +OPT_STRING('\0', "life-used-alert", &param.life_used_alert, \ + "'on' or 'off'", "enable or disable life used warning alert"), \ +OPT_STRING('O', "over-temperature-threshold", \ + &param.dev_over_temperature_threshold, "threshold", \ + "threshold value for device over temperature warning alert"), \ +OPT_STRING('\0', "over-temperature-alert", \ + &param.dev_over_temperature_alert, "'on' or 'off'", \ + "enable or disable device over temperature warning alert"), \ +OPT_STRING('U', "under-temperature-threshold", \ + &param.dev_under_temperature_threshold, "threshold", \ + "threshold value for device under temperature warning alert"), \ +OPT_STRING('\0', "under-temperature-alert", \ + &param.dev_under_temperature_alert, "'on' or 'off'", \ + "enable or disable device under temperature warning alert"), \ +OPT_STRING('V', "volatile-mem-err-threshold", \ + &param.corrected_volatile_mem_err_threshold, "threshold", \ +
"threshold value for corrected volatile mem error warning alert"), \ +OPT_STRING('\0', "volatile-mem-err-alert", \ + &param.corrected_volatile_mem_err_alert, "'on' or 'off'", \ + "enable or disable corrected volatile mem error warning alert"), \ +OPT_STRING('P', "pmem-err-threshold", \ + &param.corrected_pmem_err_threshold, "threshold", \ + "threshold value for corrected pmem error warning alert"), \ +OPT_STRING('\0', "pmem-err-alert", \ + &param.corrected_pmem_err_alert, "'on' or 'off'", \ + "enable or disable corrected pmem error warning alert") + static const struct option read_options[] = { BASE_OPTIONS(), LABEL_OPTIONS(), @@ -155,6 +213,12 @@ static const struct option update_fw_options[] = { OPT_END(), }; +static const struct option set_alert_options[] = { + BASE_OPTIONS(), + SET_ALERT_OPTIONS(), + OPT_END(), +}; + enum reserve_dpa_mode { DPA_ALLOC, DPA_FREE, @@ -706,6 +770,148 @@ static int action_update_fw(struct cxl_memdev *memdev, return rc; } +static int validate_alert_threshold(enum cxl_setalert_event event, + int threshold) +{ + if (event == CXL_SETALERT_LIFE_USED) { + if (threshold < 0 || threshold > 100) { + log_err(&ml, "Invalid life used threshold: %d\n", + threshold); + return -EINVAL; + } + } else if (event == CXL_SETALERT_OVER_TEMP || + event == CXL_SETALERT_UNDER_TEMP) { + if (threshold < SHRT_MIN || threshold > SHRT_MAX) { + log_err(&ml, + "Invalid device temperature threshold: %d\n", + threshold); + return -EINVAL; + } + } else { + if (threshold < 0 || threshold > USHRT_MAX) { + log_err(&ml, + "Invalid corrected mem error threshold: %d\n", + threshold); + return -EINVAL; + } + } + return 0; +} + +#define alert_param_set_threshold(arg, alert_event) \ +{ \ + if (!param.arg##_alert) { \ + if (param.arg##_threshold) { \ + log_err(&ml, "Action not specified\n"); \ + return -EINVAL; \ + } \ + } else if (strcmp(param.arg##_alert, "on") == 0) { \ + if (param.arg##_threshold) { \ + char *endptr; \ + alertctx.arg##_threshold = \ + strtol(param.arg##_threshold,
&endptr, 10); \ + if (endptr[0] != '\0') { \ + log_err(&ml, "Invalid threshold: %s\n", \ + param.arg##_threshold); \ + return -EINVAL; \ + } \ + rc = validate_alert_threshold( \ + alert_event, alertctx.arg##_threshold); \ + if (rc != 0) \ + return rc; \ + alertctx.valid_alert_actions |= 1 << alert_event; \ + alertctx.enable_alert_actions |= 1 << alert_event; \ + } else { \ + log_err(&ml, "Threshold not specified\n"); \ + return -EINVAL; \ + } \ + } else if (strcmp(param.arg##_alert, "off") == 0) { \ + if (!param.arg##_threshold) { \ + alertctx.valid_alert_actions |= 1 << alert_event; \ + alertctx.enable_alert_actions &= ~(1 << alert_event); \ + } else { \ + log_err(&ml, "Disable not require threshold\n"); \ + return -EINVAL; \ + } \ + } else { \ + log_err(&ml, "Invalid action: %s\n", param.arg##_alert); \ + return -EINVAL; \ + } \ +} + +#define setup_threshold_field(arg) \ +{ \ + if (param.arg##_threshold) \ + cxl_cmd_alert_config_set_##arg##_prog_warn_threshold( \ + cmd, alertctx.arg##_threshold); \ +} + +static int action_set_alert_config(struct cxl_memdev *memdev, + struct action_context *actx) +{ + const char *devname = cxl_memdev_get_devname(memdev); + struct cxl_cmd *cmd; + struct alert_context alertctx = { 0 }; + struct json_object *jmemdev; + unsigned long flags; + int rc = 0; + + alert_param_set_threshold(life_used, CXL_SETALERT_LIFE_USED) + alert_param_set_threshold(dev_over_temperature, CXL_SETALERT_OVER_TEMP) + alert_param_set_threshold(dev_under_temperature, + CXL_SETALERT_UNDER_TEMP) + alert_param_set_threshold(corrected_volatile_mem_err, + CXL_SETALERT_VOLATILE_MEM_ERROR) + alert_param_set_threshold(corrected_pmem_err, CXL_SETALERT_PMEM_ERROR) + if (alertctx.valid_alert_actions == 0) { + log_err(&ml, "No action specified\n"); + return -EINVAL; + } + + cmd = cxl_cmd_new_set_alert_config(memdev); + if (!cmd) { + rc = -ENXIO; + goto out_err; + } + + setup_threshold_field(life_used) + setup_threshold_field(dev_over_temperature) + 
setup_threshold_field(dev_under_temperature) + setup_threshold_field(corrected_volatile_mem_err) + setup_threshold_field(corrected_pmem_err) + cxl_cmd_alert_config_set_valid_alert_actions( + cmd, alertctx.valid_alert_actions); + cxl_cmd_alert_config_set_enable_alert_actions( + cmd, alertctx.enable_alert_actions); + + rc = cxl_cmd_submit(cmd); + if (rc < 0) { + log_err(&ml, "cmd submission failed: %s\n", strerror(-rc)); + goto out_cmd; + } + + rc = cxl_cmd_get_mbox_status(cmd); + if (rc != 0) { + log_err(&ml, "%s: mbox status: %d\n", __func__, rc); + rc = -ENXIO; + } + +out_cmd: + cxl_cmd_unref(cmd); +out_err: + if (rc) + log_err(&ml, "%s error: %s\n", devname, strerror(-rc)); + + flags = UTIL_JSON_ALERT_CONFIG; + if (actx->f_out == stdout && isatty(1)) + flags |= UTIL_JSON_HUMAN; + jmemdev = util_cxl_memdev_to_json(memdev, flags); + if (actx->jdevs && jmemdev) + json_object_array_add(actx->jdevs, jmemdev); + + return rc; +} + static int memdev_action(int argc, const char **argv, struct cxl_ctx *ctx, int (*action)(struct cxl_memdev *memdev, struct action_context *actx), @@ -749,7 +955,8 @@ static int memdev_action(int argc, const char **argv, struct cxl_ctx *ctx, } if (action == action_setpartition || action == action_reserve_dpa || - action == action_free_dpa || action == action_update_fw) + action == action_free_dpa || action == action_update_fw || + action == action_set_alert_config) actx.jdevs = json_object_new_array(); if (err == argc) { @@ -968,3 +1175,14 @@ int cmd_update_fw(int argc, const char **argv, struct cxl_ctx *ctx) return count >= 0 ? 0 : EXIT_FAILURE; } + +int cmd_set_alert_config(int argc, const char **argv, struct cxl_ctx *ctx) +{ + int count = memdev_action( + argc, argv, ctx, action_set_alert_config, set_alert_options, + "cxl set-alert-config <mem0> [<mem1>..<memN>] [<options>]"); + log_info(&ml, "set alert configuration for %d mem%s\n", + count >= 0 ? count : 0, count > 1 ? "s" : ""); + + return count >= 0 ? 0 : EXIT_FAILURE; +} -- 2.17.1
