RE: [PATCH RFC 1/4] ACPI: APEI: Add support to notify the vendor specific HW errors

2019-08-22 Thread Shiju Jose
Hi James,

>-Original Message-
>From: James Morse [mailto:james.mo...@arm.com]
>Sent: 21 August 2019 18:24
>To: Shiju Jose 
>Cc: linux-a...@vger.kernel.org; linux-e...@vger.kernel.org; linux-
>ker...@vger.kernel.org; r...@rjwysocki.net; l...@kernel.org;
>tony.l...@intel.com; b...@alien8.de; bai...@os.amperecomputing.com;
>Linuxarm ; Jonathan Cameron
>; tanxiaofei 
>Subject: Re: [PATCH RFC 1/4] ACPI: APEI: Add support to notify the vendor
>specific HW errors
>
>Hi,
>
>On 12/08/2019 11:11, Shiju Jose wrote:
>> Presently the vendor specific HW errors, in the non-standard format,
>> are not reported to the vendor drivers for the recovery.
>>
>> This patch adds support to notify the vendor specific HW errors to the
>> registered kernel drivers.
>
>> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index
>> a66e00f..374d197 100644
>> --- a/drivers/acpi/apei/ghes.c
>> +++ b/drivers/acpi/apei/ghes.c
>> @@ -477,6 +477,77 @@ static void ghes_handle_aer(struct
>> acpi_hest_generic_data *gdata)  #endif  }
>>
>> +struct ghes_error_notify {
>> +struct list_head list;> +   struct rcu_head rcu_head;
>> +guid_t sec_type; /* guid of the error record */
>
>> +error_handle handle; /* error handler function */
>
>ghes_error_handler_t error_handler; ?
Sure.

>
>
>> +void *data; /* handler driver's private data if any */ };
>> +
>> +/* List to store the registered error handling functions */ static
>> +DEFINE_MUTEX(ghes_error_notify_mutex);
>> +static LIST_HEAD(ghes_error_notify_list);
>
>> +static refcount_t ghes_ref_count;
>
>I don't think this refcount is needed.
refcount was added to register standard error handlers with this notification 
method one time when
multiple ghes platform devices are probed.
 
>
>
>> +/**
>> + * ghes_error_notify_register - register an error handling function
>> + * for the hw errors.
>> + * @sec_type: sec_type of the corresponding CPER to be notified.
>> + * @handle: pointer to the error handling function.
>> + * @data: handler driver's private data.
>> + *
>> + * return 0 : SUCCESS, non-zero : FAIL  */ int
>> +ghes_error_notify_register(guid_t sec_type, error_handle handle, void
>> +*data) {
>> +struct ghes_error_notify *err_notify;
>> +
>> +mutex_lock(_error_notify_mutex);
>> +err_notify = kzalloc(sizeof(*err_notify), GFP_KERNEL);
>> +if (!err_notify)
>> +return -ENOMEM;
>
>Leaving the mutex locked.
>You may as well allocate the memory before taking the lock.
Good spot. I will fix.

>
>
>> +
>> +err_notify->handle = handle;
>> +guid_copy(_notify->sec_type, _type);
>> +err_notify->data = data;
>> +list_add_rcu(_notify->list, _error_notify_list);
>> +mutex_unlock(_error_notify_mutex);
>> +
>> +return 0;
>> +}
>> +EXPORT_SYMBOL_GPL(ghes_error_notify_register);
>
>Could we leave exporting this to modules until there is a user?
>
>
>> +/**
>> + * ghes_error_notify_unregister - unregister an error handling function.
>> + * @sec_type: sec_type of the corresponding CPER.
>> + * @handle: pointer to the error handling function.
>> + *
>> + * return none.
>> + */
>> +void ghes_error_notify_unregister(guid_t sec_type, error_handle
>> +handle)
>
>Why do we need the handle(r) a second time? Surely there can only be one
>callback for a given guid.
There is a possibility of sharing the guid between drivers if the non-standard 
error section format is common
for more than one devices if the error data to be reported is in the same 
format.
 
>
>
>> +{
>> +struct ghes_error_notify *err_notify;
>> +bool found = 0;
>> +
>> +mutex_lock(_error_notify_mutex);
>> +rcu_read_lock();
>> +list_for_each_entry_rcu(err_notify, _error_notify_list, list) {
>> +if (guid_equal(_notify->sec_type, _type) &&
>> +err_notify->handle == handle) {
>> +list_del_rcu(_notify->list);
>> +found = 1;
>> +break;
>> +}
>> +}
>> +rcu_read_unlock();
>
>> +synchronize_rcu();
>
>Is this for the kfree()? Please keep them together so its obvious what its for.
>Putting it outside the mutex will also save any contended waiter some time.
Yes. I will move synchronize_rcu () just before kfree. 
 
>
>
>> +mutex_unlock(_error_notify_mutex);
>> +if (found)
>> +kfree(e

Re: [PATCH RFC 1/4] ACPI: APEI: Add support to notify the vendor specific HW errors

2019-08-21 Thread James Morse
Hi,

On 12/08/2019 11:11, Shiju Jose wrote:
> Presently the vendor specific HW errors, in the non-standard format,
> are not reported to the vendor drivers for the recovery.
> 
> This patch adds support to notify the vendor specific HW errors to the
> registered kernel drivers.

> diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
> index a66e00f..374d197 100644
> --- a/drivers/acpi/apei/ghes.c
> +++ b/drivers/acpi/apei/ghes.c
> @@ -477,6 +477,77 @@ static void ghes_handle_aer(struct 
> acpi_hest_generic_data *gdata)
>  #endif
>  }
>  
> +struct ghes_error_notify {
> + struct list_head list;> +   struct rcu_head rcu_head;
> + guid_t sec_type; /* guid of the error record */

> + error_handle handle; /* error handler function */

ghes_error_handler_t error_handler; ?


> + void *data; /* handler driver's private data if any */
> +};
> +
> +/* List to store the registered error handling functions */
> +static DEFINE_MUTEX(ghes_error_notify_mutex);
> +static LIST_HEAD(ghes_error_notify_list);

> +static refcount_t ghes_ref_count;

I don't think this refcount is needed.


> +/**
> + * ghes_error_notify_register - register an error handling function
> + * for the hw errors.
> + * @sec_type: sec_type of the corresponding CPER to be notified.
> + * @handle: pointer to the error handling function.
> + * @data: handler driver's private data.
> + *
> + * return 0 : SUCCESS, non-zero : FAIL
> + */
> +int ghes_error_notify_register(guid_t sec_type, error_handle handle, void 
> *data)
> +{
> + struct ghes_error_notify *err_notify;
> +
> + mutex_lock(_error_notify_mutex);
> + err_notify = kzalloc(sizeof(*err_notify), GFP_KERNEL);
> + if (!err_notify)
> + return -ENOMEM;

Leaving the mutex locked.
You may as well allocate the memory before taking the lock.


> +
> + err_notify->handle = handle;
> + guid_copy(_notify->sec_type, _type);
> + err_notify->data = data;
> + list_add_rcu(_notify->list, _error_notify_list);
> + mutex_unlock(_error_notify_mutex);
> +
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(ghes_error_notify_register);

Could we leave exporting this to modules until there is a user?


> +/**
> + * ghes_error_notify_unregister - unregister an error handling function.
> + * @sec_type: sec_type of the corresponding CPER.
> + * @handle: pointer to the error handling function.
> + *
> + * return none.
> + */
> +void ghes_error_notify_unregister(guid_t sec_type, error_handle handle)

Why do we need the handle(r) a second time? Surely there can only be one 
callback for a
given guid.


> +{
> + struct ghes_error_notify *err_notify;
> + bool found = 0;
> +
> + mutex_lock(_error_notify_mutex);
> + rcu_read_lock();
> + list_for_each_entry_rcu(err_notify, _error_notify_list, list) {
> + if (guid_equal(_notify->sec_type, _type) &&
> + err_notify->handle == handle) {
> + list_del_rcu(_notify->list);
> + found = 1;
> + break;
> + }
> + }
> + rcu_read_unlock();

> + synchronize_rcu();

Is this for the kfree()? Please keep them together so its obvious what its for.
Putting it outside the mutex will also save any contended waiter some time.


> + mutex_unlock(_error_notify_mutex);
> + if (found)
> + kfree(err_notify);
> +}
> +EXPORT_SYMBOL_GPL(ghes_error_notify_unregister);
> +

>  static void ghes_do_proc(struct ghes *ghes,
>const struct acpi_hest_generic_status *estatus)
>  {> @@ -512,11 +585,29 @@ static void ghes_do_proc(struct ghes *ghes,
>  
>   log_arm_hw_error(err);
>   } else {
> - void *err = acpi_hest_get_payload(gdata);
> -
> - log_non_standard_event(sec_type, fru_id, fru_text,
> -sec_sev, err,
> -gdata->error_data_length);

> + rcu_read_lock();
> + list_for_each_entry_rcu(err_notify,
> + _error_notify_list, list) {
> + if (guid_equal(_notify->sec_type,
> +sec_type)) {

> + /* The notification is called in the
> +  * interrupt context, thus the handler
> +  * functions should be take care of it.
> +  */

I read this as "the handler will be called", which doesn't seem to be a useful 
comment.


> + err_notify->handle(gdata, sev,
> +err_notify->data);
> + is_notify = 1;

break;

> + }
> + }
> +