RE: [PATCH 2/5] drm/amdgpu: validate bad page threshold in ras

2020-07-22 Thread Chen, Guchun
[AMD Public Use]

Thanks for review, Stanley.

Re: [Yang, Stanley] : It's better to compare con->bad_page_cnt_threshold with 
max_length, the value of bad_page_cnt_threshold should not exceed max_length.

Correct, one guard is necessary. It will be patch v2.

Regards,
Guchun

-Original Message-
From: Yang, Stanley  
Sent: Wednesday, July 22, 2020 3:52 PM
To: Chen, Guchun ; amd-gfx@lists.freedesktop.org; Deucher, 
Alexander ; Zhang, Hawking ; 
Li, Dennis ; Zhou1, Tao ; Clements, John 

Subject: RE: [PATCH 2/5] drm/amdgpu: validate bad page threshold in ras

[AMD Official Use Only - Internal Distribution Only]

Hi Guchun,

Please see my comment inline.

> -Original Message-
> From: Chen, Guchun 
> Sent: Wednesday, July 22, 2020 11:14 AM
> To: amd-gfx@lists.freedesktop.org; Deucher, Alexander 
> ; Zhang, Hawking ; 
> Li, Dennis ; Yang, Stanley ; 
> Zhou1, Tao ; Clements, John 
> Cc: Chen, Guchun 
> Subject: [PATCH 2/5] drm/amdgpu: validate bad page threshold in ras
> 
> Bad page threshold value should be valid in the range between
> -1 and max records length of eeprom. It could determine when the GPU 
> should be retired.
> 
> Signed-off-by: Guchun Chen 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 43
> +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  3 ++
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c|  5 +++
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h|  2 +
>  4 files changed, 53 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6f06e1214622..e3d67d85c55f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -69,6 +69,9 @@ const char *ras_block_string[] = {
>  /* inject address is 52 bits */
>  #define  RAS_UMC_INJECT_ADDR_LIMIT   (0x1ULL << 52)
> 
> +/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
> +#define RAS_BAD_PAGE_RATE(100 * 1024 * 1024ULL)
> +
>  enum amdgpu_ras_retire_page_reservation {
>   AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>   AMDGPU_RAS_RETIRE_PAGE_PENDING,
> @@ -1700,6 +1703,42 @@ static bool amdgpu_ras_check_bad_page(struct 
> amdgpu_device *adev,
>   return ret;
>  }
> 
> +static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
> + uint32_t max_length)
> +{
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + int tmp_threshold = amdgpu_bad_page_threshold;
> + u64 val;
> +
> + /*
> +  * Justification of value bad_page_cnt_threshold in ras structure
> +  *
> +  * 1. -1 <= amdgpu_bad_page_threshold <= max record length in
> eeprom
> +  * 2. if amdgpu_bad_page_threshold = -1,
> +  *bad_page_cnt_threshold = typical value by formula.
> +  * 3. if amdgpu_bad_page_threshold = 0,
> +  *bad_page_cnt_threshold = 0x,
> +  *and disable RMA feature accordingly.
> +  * 4. use the value specified from user when
> (amdgpu_bad_page_threshold
> +  *> 0 && < max record length in eeprom).
> +  */
> +
> + if (tmp_threshold < -1)
> + tmp_threshold = -1;
> + else if (tmp_threshold > max_length)
> + tmp_threshold = max_length;
> +
> + if (tmp_threshold == -1) {
> + val = adev->gmc.mc_vram_size;
> + do_div(val, RAS_BAD_PAGE_RATE);
> + con->bad_page_cnt_threshold = lower_32_bits(val);
[Yang, Stanley] : It's better to compare con->bad_page_cnt_threshold with 
max_length, the value of bad_page_cnt_threshold should not exceed max_length.

> + } else if (tmp_threshold == 0) {
> + con->bad_page_cnt_threshold = 0x;
> + } else {
> + con->bad_page_cnt_threshold = tmp_threshold;
> + }
> +}
> +
>  /* called in gpu recovery/init */
>  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)  { @@ -
> 1777,6 +1816,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)  {
>   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>   struct ras_err_handler_data **data;
> + uint32_t max_eeprom_records_len = 0;
>   int ret;
> 
>   if (con)
> @@ -1795,6 +1835,9 @@ int amdgpu_ras_recovery_init(struct 
> amdgpu_device *adev)
>   atomic_set(>in_recovery, 0);
>   con->adev = adev;
> 
> + max_eeprom_records_len =
> amdgpu_ras_eeprom_get_record_max_length();
> + amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
> +
>   ret = amdgpu_ras_eeprom_init(>eeprom_control);
>   if (ret)
>   goto free;
> diff --git a/drivers/

RE: [PATCH 2/5] drm/amdgpu: validate bad page threshold in ras

2020-07-22 Thread Yang, Stanley
[AMD Official Use Only - Internal Distribution Only]

Hi Guchun,

Please see my comment inline.

> -Original Message-
> From: Chen, Guchun 
> Sent: Wednesday, July 22, 2020 11:14 AM
> To: amd-gfx@lists.freedesktop.org; Deucher, Alexander
> ; Zhang, Hawking
> ; Li, Dennis ; Yang,
> Stanley ; Zhou1, Tao ;
> Clements, John 
> Cc: Chen, Guchun 
> Subject: [PATCH 2/5] drm/amdgpu: validate bad page threshold in ras
> 
> Bad page threshold value should be valid in the range between
> -1 and max records length of eeprom. It could determine when the GPU
> should be retired.
> 
> Signed-off-by: Guchun Chen 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 43
> +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  3 ++
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c|  5 +++
>  .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h|  2 +
>  4 files changed, 53 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6f06e1214622..e3d67d85c55f 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -69,6 +69,9 @@ const char *ras_block_string[] = {
>  /* inject address is 52 bits */
>  #define  RAS_UMC_INJECT_ADDR_LIMIT   (0x1ULL << 52)
> 
> +/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
> +#define RAS_BAD_PAGE_RATE(100 * 1024 * 1024ULL)
> +
>  enum amdgpu_ras_retire_page_reservation {
>   AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>   AMDGPU_RAS_RETIRE_PAGE_PENDING,
> @@ -1700,6 +1703,42 @@ static bool amdgpu_ras_check_bad_page(struct
> amdgpu_device *adev,
>   return ret;
>  }
> 
> +static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
> + uint32_t max_length)
> +{
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + int tmp_threshold = amdgpu_bad_page_threshold;
> + u64 val;
> +
> + /*
> +  * Justification of value bad_page_cnt_threshold in ras structure
> +  *
> +  * 1. -1 <= amdgpu_bad_page_threshold <= max record length in
> eeprom
> +  * 2. if amdgpu_bad_page_threshold = -1,
> +  *bad_page_cnt_threshold = typical value by formula.
> +  * 3. if amdgpu_bad_page_threshold = 0,
> +  *bad_page_cnt_threshold = 0x,
> +  *and disable RMA feature accordingly.
> +  * 4. use the value specified from user when
> (amdgpu_bad_page_threshold
> +  *> 0 && < max record length in eeprom).
> +  */
> +
> + if (tmp_threshold < -1)
> + tmp_threshold = -1;
> + else if (tmp_threshold > max_length)
> + tmp_threshold = max_length;
> +
> + if (tmp_threshold == -1) {
> + val = adev->gmc.mc_vram_size;
> + do_div(val, RAS_BAD_PAGE_RATE);
> + con->bad_page_cnt_threshold = lower_32_bits(val);
[Yang, Stanley] : It's better to compare con->bad_page_cnt_threshold with 
max_length,
the value of bad_page_cnt_threshold should not exceed max_length.

> + } else if (tmp_threshold == 0) {
> + con->bad_page_cnt_threshold = 0x;
> + } else {
> + con->bad_page_cnt_threshold = tmp_threshold;
> + }
> +}
> +
>  /* called in gpu recovery/init */
>  int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)  { @@ -
> 1777,6 +1816,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device
> *adev)  {
>   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
>   struct ras_err_handler_data **data;
> + uint32_t max_eeprom_records_len = 0;
>   int ret;
> 
>   if (con)
> @@ -1795,6 +1835,9 @@ int amdgpu_ras_recovery_init(struct
> amdgpu_device *adev)
>   atomic_set(>in_recovery, 0);
>   con->adev = adev;
> 
> + max_eeprom_records_len =
> amdgpu_ras_eeprom_get_record_max_length();
> + amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
> +
>   ret = amdgpu_ras_eeprom_init(>eeprom_control);
>   if (ret)
>   goto free;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index b2667342cf67..4672649a9293 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -336,6 +336,9 @@ struct amdgpu_ras {
>   struct amdgpu_ras_eeprom_control eeprom_control;
> 
>   bool error_query_ready;
> +
> + /* bad page count threshold */
> + uint32_t bad_page_cnt_threshold;
>  };
> 
>  struct ras_fs_data {
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
&

[PATCH 2/5] drm/amdgpu: validate bad page threshold in ras

2020-07-21 Thread Guchun Chen
Bad page threshold value should be valid in the range between
-1 and max records length of eeprom. It could determine when
the GPU should be retired.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c   | 43 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h   |  3 ++
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c|  5 +++
 .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h|  2 +
 4 files changed, 53 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6f06e1214622..e3d67d85c55f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -69,6 +69,9 @@ const char *ras_block_string[] = {
 /* inject address is 52 bits */
 #defineRAS_UMC_INJECT_ADDR_LIMIT   (0x1ULL << 52)
 
+/* typical ECC bad page rate(1 bad page per 100MB VRAM) */
+#define RAS_BAD_PAGE_RATE  (100 * 1024 * 1024ULL)
+
 enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -1700,6 +1703,42 @@ static bool amdgpu_ras_check_bad_page(struct 
amdgpu_device *adev,
return ret;
 }
 
+static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
+   uint32_t max_length)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   int tmp_threshold = amdgpu_bad_page_threshold;
+   u64 val;
+
+   /*
+* Justification of value bad_page_cnt_threshold in ras structure
+*
+* 1. -1 <= amdgpu_bad_page_threshold <= max record length in eeprom
+* 2. if amdgpu_bad_page_threshold = -1,
+*bad_page_cnt_threshold = typical value by formula.
+* 3. if amdgpu_bad_page_threshold = 0,
+*bad_page_cnt_threshold = 0x,
+*and disable RMA feature accordingly.
+* 4. use the value specified from user when (amdgpu_bad_page_threshold
+*> 0 && < max record length in eeprom).
+*/
+
+   if (tmp_threshold < -1)
+   tmp_threshold = -1;
+   else if (tmp_threshold > max_length)
+   tmp_threshold = max_length;
+
+   if (tmp_threshold == -1) {
+   val = adev->gmc.mc_vram_size;
+   do_div(val, RAS_BAD_PAGE_RATE);
+   con->bad_page_cnt_threshold = lower_32_bits(val);
+   } else if (tmp_threshold == 0) {
+   con->bad_page_cnt_threshold = 0x;
+   } else {
+   con->bad_page_cnt_threshold = tmp_threshold;
+   }
+}
+
 /* called in gpu recovery/init */
 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
 {
@@ -1777,6 +1816,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
+   uint32_t max_eeprom_records_len = 0;
int ret;
 
if (con)
@@ -1795,6 +1835,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
atomic_set(>in_recovery, 0);
con->adev = adev;
 
+   max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+   amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+
ret = amdgpu_ras_eeprom_init(>eeprom_control);
if (ret)
goto free;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index b2667342cf67..4672649a9293 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -336,6 +336,9 @@ struct amdgpu_ras {
struct amdgpu_ras_eeprom_control eeprom_control;
 
bool error_query_ready;
+
+   /* bad page count threshold */
+   uint32_t bad_page_cnt_threshold;
 };
 
 struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
index c0096097bbcf..a2c982b1eac6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c
@@ -499,6 +499,11 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
return ret == num ? 0 : -EIO;
 }
 
+inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void)
+{
+   return EEPROM_MAX_RECORD_NUM;
+}
+
 /* Used for testing if bugs encountered */
 #if 0
 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
index 7e8647a05df7..b272840cb069 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h
@@ -85,6 +85,8 @@ int amdgpu_ras_eeprom_process_recods(struct 
amdgpu_ras_eeprom_control *control,
bool write,
int num);
 
+inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void);
+