On Wed, 2019-02-20 at 05:11 +0000, Dexuan Cui wrote:
> Currently "ndctl monitor" fails for NVDIMM_FAMILY_HYPERV due to
> "no smart support".
> 
> NVDIMM_FAMILY_HYPERV doesn't use ND_CMD_SMART to get the health info.
> Instead, it uses ND_CMD_CALL, so the checking here can't apply,and it
> doesn't support threshold alarms -- actually it only supports one event:
> ND_EVENT_HEALTH_STATE.
> 
> See http://www.uefi.org/RFIC_LIST ("Virtual NVDIMM 0x1901").
> 
> Let's skip the unnecessary checking for NVDIMM_FAMILY_HYPERV, and make
> sure we only monitor the "dimm-health-state" event and ignore the others.
> 
> With the patch, when an error happens, we log it with such a message:
> 
> {"timestamp":"1550547497.431731497","pid":1571,"event":
> {"dimm-health-state":true},"dimm":{"dev":"nmem1",
> "id":"04d5-01-1701-01000000","handle":1,"phys_id":0,
> "health":{"health_state":"fatal","shutdown_count":8}}}
> 
> Here the meaningful info is:
> "health":{"health_state":"fatal","shutdown_count":8}
> 
> Signed-off-by: Dexuan Cui <de...@microsoft.com>
> ---
>  ndctl/monitor.c | 42 +++++++++++++++++++++++++++++++++++-------
>  1 file changed, 35 insertions(+), 7 deletions(-)
> 
> diff --git a/ndctl/monitor.c b/ndctl/monitor.c
> index 43b2abe..43beb06 100644
> --- a/ndctl/monitor.c
> +++ b/ndctl/monitor.c
> @@ -265,31 +265,59 @@ static bool filter_region(struct ndctl_region *region,
>       return true;
>  }
>  
> -static void filter_dimm(struct ndctl_dimm *dimm, struct util_filter_ctx 
> *fctx)
> +static bool ndctl_dimm_test_and_enable_notification(struct ndctl_dimm *dimm)
>  {
> -     struct monitor_dimm *mdimm;
> -     struct monitor_filter_arg *mfa = fctx->monitor;
>       const char *name = ndctl_dimm_get_devname(dimm);
>  
> +     /*
> +      * Hyper-V Virtual NVDIMM doesn't use ND_CMD_SMART to get the health
> +      * info. Instead, it uses ND_CMD_CALL, so the checking here can't
> +      * apply, and it doesn't support threshold alarms -- actually it only
> +      * supports one event: ND_EVENT_HEALTH_STATE.
> +      */
> +     if (ndctl_dimm_get_cmd_family(dimm) == NVDIMM_FAMILY_HYPERV) {
> +             if (monitor.event_flags != ND_EVENT_HEALTH_STATE) {
> +                     monitor.event_flags = ND_EVENT_HEALTH_STATE;
> +
> +                     notice(&monitor,
> +                             "%s: only dimm-health-state can be monitored\n",
> +                             name);
> +             }
> +             return true;
> +     }
> +

I'm not sure about the family-specific special casing -- it leaks family
specific details into the common implementation, something that's been
avoidable thus far.

Is there any reason why the ndctl_dimm_is_cmd_supported() can't be made
to fail appropriately for anything that is not supported, by adding the
necessary functions to dimm-ops?

That way if the user enters any of the unsupported options, they will
just fail normally, and the user will be expected to provide the right
options for the environment they know they're running in.

Indeed, I think that patch 3 of this series should never be required :)

>       if (!ndctl_dimm_is_cmd_supported(dimm, ND_CMD_SMART)) {
>               err(&monitor, "%s: no smart support\n", name);
> -             return;
> +             return false;
>       }
>       if (!ndctl_dimm_is_cmd_supported(dimm, ND_CMD_SMART_THRESHOLD)) {
>               err(&monitor, "%s: no smart threshold support\n", name);
> -             return;
> +             return false;
>       }
>  
>       if (!ndctl_dimm_is_flag_supported(dimm, ND_SMART_ALARM_VALID)) {
>               err(&monitor, "%s: smart alarm invalid\n", name);
> -             return;
> +             return false;
>       }
>  
>       if (enable_dimm_supported_threshold_alarms(dimm)) {
>               err(&monitor, "%s: enable supported threshold alarms failed\n", 
> name);
> -             return;
> +             return false;
>       }
>  
> +     return true;
> +}
> +
> +static void filter_dimm(struct ndctl_dimm *dimm, struct util_filter_ctx 
> *fctx)
> +{
> +     struct monitor_dimm *mdimm;
> +     struct monitor_filter_arg *mfa = fctx->monitor;
> +     const char *name = ndctl_dimm_get_devname(dimm);
> +
> +
> +     if (!ndctl_dimm_test_and_enable_notification(dimm))
> +             return;
> +
>       mdimm = calloc(1, sizeof(struct monitor_dimm));
>       if (!mdimm) {
>               err(&monitor, "%s: calloc for monitor dimm failed\n", name);

_______________________________________________
Linux-nvdimm mailing list
Linux-nvdimm@lists.01.org
https://lists.01.org/mailman/listinfo/linux-nvdimm

Reply via email to