RE: [PATCH 2/5] drm/amdgpu: validate bad page threshold in ras
[AMD Public Use] Thanks for review, Stanley. Re: [Yang, Stanley] : It's better to compare con->bad_page_cnt_threshold with max_length, the value of bad_page_cnt_threshold should not exceed max_length. Correct, one guard is necessary. It will be patch v2. Regards, Guchun -Original Message- From: Yang, Stanley Sent: Wednesday, July 22, 2020 3:52 PM To: Chen, Guchun ; amd-gfx@lists.freedesktop.org; Deucher, Alexander ; Zhang, Hawking ; Li, Dennis ; Zhou1, Tao ; Clements, John Subject: RE: [PATCH 2/5] drm/amdgpu: validate bad page threshold in ras [AMD Official Use Only - Internal Distribution Only] Hi Guchun, Please see my comment inline. > -Original Message- > From: Chen, Guchun > Sent: Wednesday, July 22, 2020 11:14 AM > To: amd-gfx@lists.freedesktop.org; Deucher, Alexander > ; Zhang, Hawking ; > Li, Dennis ; Yang, Stanley ; > Zhou1, Tao ; Clements, John > Cc: Chen, Guchun > Subject: [PATCH 2/5] drm/amdgpu: validate bad page threshold in ras > > Bad page threshold value should be valid in the range between > -1 and max records length of eeprom. It could determine when the GPU > should be retired. > > Signed-off-by: Guchun Chen > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 > +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++ > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 5 +++ > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h| 2 + > 4 files changed, 53 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 6f06e1214622..e3d67d85c55f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -69,6 +69,9 @@ const char *ras_block_string[] = { > /* inject address is 52 bits */ > #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) > > +/* typical ECC bad page rate(1 bad page per 100MB VRAM) */ > +#define RAS_BAD_PAGE_RATE(100 * 1024 * 1024ULL) > + > enum amdgpu_ras_retire_page_reservation { > AMDGPU_RAS_RETIRE_PAGE_RESERVED, > AMDGPU_RAS_RETIRE_PAGE_PENDING, > @@ -1700,6 +1703,42 @@ static bool amdgpu_ras_check_bad_page(struct > amdgpu_device *adev, > return ret; > } > > +static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, > + uint32_t max_length) > +{ > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + int tmp_threshold = amdgpu_bad_page_threshold; > + u64 val; > + > + /* > + * Justification of value bad_page_cnt_threshold in ras structure > + * > + * 1. -1 <= amdgpu_bad_page_threshold <= max record length in > eeprom > + * 2. if amdgpu_bad_page_threshold = -1, > + *bad_page_cnt_threshold = typical value by formula. > + * 3. if amdgpu_bad_page_threshold = 0, > + *bad_page_cnt_threshold = 0x, > + *and disable RMA feature accordingly. > + * 4. use the value specified from user when > (amdgpu_bad_page_threshold > + *> 0 && < max record length in eeprom). > + */ > + > + if (tmp_threshold < -1) > + tmp_threshold = -1; > + else if (tmp_threshold > max_length) > + tmp_threshold = max_length; > + > + if (tmp_threshold == -1) { > + val = adev->gmc.mc_vram_size; > + do_div(val, RAS_BAD_PAGE_RATE); > + con->bad_page_cnt_threshold = lower_32_bits(val); [Yang, Stanley] : It's better to compare con->bad_page_cnt_threshold with max_length, the value of bad_page_cnt_threshold should not exceed max_length. > + } else if (tmp_threshold == 0) { > + con->bad_page_cnt_threshold = 0x; > + } else { > + con->bad_page_cnt_threshold = tmp_threshold; > + } > +} > + > /* called in gpu recovery/init */ > int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { @@ - > 1777,6 +1816,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device > *adev) { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct ras_err_handler_data **data; > + uint32_t max_eeprom_records_len = 0; > int ret; > > if (con) > @@ -1795,6 +1835,9 @@ int amdgpu_ras_recovery_init(struct > amdgpu_device *adev) > atomic_set(>in_recovery, 0); > con->adev = adev; > > + max_eeprom_records_len = > amdgpu_ras_eeprom_get_record_max_length(); > + amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); > + > ret = amdgpu_ras_eeprom_init(>eeprom_control); > if (ret) > goto free; > diff --git a/drivers/
RE: [PATCH 2/5] drm/amdgpu: validate bad page threshold in ras
[AMD Official Use Only - Internal Distribution Only] Hi Guchun, Please see my comment inline. > -Original Message- > From: Chen, Guchun > Sent: Wednesday, July 22, 2020 11:14 AM > To: amd-gfx@lists.freedesktop.org; Deucher, Alexander > ; Zhang, Hawking > ; Li, Dennis ; Yang, > Stanley ; Zhou1, Tao ; > Clements, John > Cc: Chen, Guchun > Subject: [PATCH 2/5] drm/amdgpu: validate bad page threshold in ras > > Bad page threshold value should be valid in the range between > -1 and max records length of eeprom. It could determine when the GPU > should be retired. > > Signed-off-by: Guchun Chen > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 > +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++ > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 5 +++ > .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h| 2 + > 4 files changed, 53 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > index 6f06e1214622..e3d67d85c55f 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c > @@ -69,6 +69,9 @@ const char *ras_block_string[] = { > /* inject address is 52 bits */ > #define RAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) > > +/* typical ECC bad page rate(1 bad page per 100MB VRAM) */ > +#define RAS_BAD_PAGE_RATE(100 * 1024 * 1024ULL) > + > enum amdgpu_ras_retire_page_reservation { > AMDGPU_RAS_RETIRE_PAGE_RESERVED, > AMDGPU_RAS_RETIRE_PAGE_PENDING, > @@ -1700,6 +1703,42 @@ static bool amdgpu_ras_check_bad_page(struct > amdgpu_device *adev, > return ret; > } > > +static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, > + uint32_t max_length) > +{ > + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > + int tmp_threshold = amdgpu_bad_page_threshold; > + u64 val; > + > + /* > + * Justification of value bad_page_cnt_threshold in ras structure > + * > + * 1. -1 <= amdgpu_bad_page_threshold <= max record length in > eeprom > + * 2. if amdgpu_bad_page_threshold = -1, > + *bad_page_cnt_threshold = typical value by formula. > + * 3. if amdgpu_bad_page_threshold = 0, > + *bad_page_cnt_threshold = 0x, > + *and disable RMA feature accordingly. > + * 4. use the value specified from user when > (amdgpu_bad_page_threshold > + *> 0 && < max record length in eeprom). > + */ > + > + if (tmp_threshold < -1) > + tmp_threshold = -1; > + else if (tmp_threshold > max_length) > + tmp_threshold = max_length; > + > + if (tmp_threshold == -1) { > + val = adev->gmc.mc_vram_size; > + do_div(val, RAS_BAD_PAGE_RATE); > + con->bad_page_cnt_threshold = lower_32_bits(val); [Yang, Stanley] : It's better to compare con->bad_page_cnt_threshold with max_length, the value of bad_page_cnt_threshold should not exceed max_length. > + } else if (tmp_threshold == 0) { > + con->bad_page_cnt_threshold = 0x; > + } else { > + con->bad_page_cnt_threshold = tmp_threshold; > + } > +} > + > /* called in gpu recovery/init */ > int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { @@ - > 1777,6 +1816,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device > *adev) { > struct amdgpu_ras *con = amdgpu_ras_get_context(adev); > struct ras_err_handler_data **data; > + uint32_t max_eeprom_records_len = 0; > int ret; > > if (con) > @@ -1795,6 +1835,9 @@ int amdgpu_ras_recovery_init(struct > amdgpu_device *adev) > atomic_set(>in_recovery, 0); > con->adev = adev; > > + max_eeprom_records_len = > amdgpu_ras_eeprom_get_record_max_length(); > + amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); > + > ret = amdgpu_ras_eeprom_init(>eeprom_control); > if (ret) > goto free; > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > index b2667342cf67..4672649a9293 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h > @@ -336,6 +336,9 @@ struct amdgpu_ras { > struct amdgpu_ras_eeprom_control eeprom_control; > > bool error_query_ready; > + > + /* bad page count threshold */ > + uint32_t bad_page_cnt_threshold; > }; > > struct ras_fs_data { > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c &
[PATCH 2/5] drm/amdgpu: validate bad page threshold in ras
Bad page threshold value should be valid in the range between -1 and max records length of eeprom. It could determine when the GPU should be retired. Signed-off-by: Guchun Chen --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 ++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c| 5 +++ .../gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h| 2 + 4 files changed, 53 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 6f06e1214622..e3d67d85c55f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -69,6 +69,9 @@ const char *ras_block_string[] = { /* inject address is 52 bits */ #defineRAS_UMC_INJECT_ADDR_LIMIT (0x1ULL << 52) +/* typical ECC bad page rate(1 bad page per 100MB VRAM) */ +#define RAS_BAD_PAGE_RATE (100 * 1024 * 1024ULL) + enum amdgpu_ras_retire_page_reservation { AMDGPU_RAS_RETIRE_PAGE_RESERVED, AMDGPU_RAS_RETIRE_PAGE_PENDING, @@ -1700,6 +1703,42 @@ static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev, return ret; } +static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev, + uint32_t max_length) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + int tmp_threshold = amdgpu_bad_page_threshold; + u64 val; + + /* +* Justification of value bad_page_cnt_threshold in ras structure +* +* 1. -1 <= amdgpu_bad_page_threshold <= max record length in eeprom +* 2. if amdgpu_bad_page_threshold = -1, +*bad_page_cnt_threshold = typical value by formula. +* 3. if amdgpu_bad_page_threshold = 0, +*bad_page_cnt_threshold = 0x, +*and disable RMA feature accordingly. +* 4. use the value specified from user when (amdgpu_bad_page_threshold +*> 0 && < max record length in eeprom). +*/ + + if (tmp_threshold < -1) + tmp_threshold = -1; + else if (tmp_threshold > max_length) + tmp_threshold = max_length; + + if (tmp_threshold == -1) { + val = adev->gmc.mc_vram_size; + do_div(val, RAS_BAD_PAGE_RATE); + con->bad_page_cnt_threshold = lower_32_bits(val); + } else if (tmp_threshold == 0) { + con->bad_page_cnt_threshold = 0x; + } else { + con->bad_page_cnt_threshold = tmp_threshold; + } +} + /* called in gpu recovery/init */ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) { @@ -1777,6 +1816,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_err_handler_data **data; + uint32_t max_eeprom_records_len = 0; int ret; if (con) @@ -1795,6 +1835,9 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev) atomic_set(>in_recovery, 0); con->adev = adev; + max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length(); + amdgpu_ras_validate_threshold(adev, max_eeprom_records_len); + ret = amdgpu_ras_eeprom_init(>eeprom_control); if (ret) goto free; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index b2667342cf67..4672649a9293 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -336,6 +336,9 @@ struct amdgpu_ras { struct amdgpu_ras_eeprom_control eeprom_control; bool error_query_ready; + + /* bad page count threshold */ + uint32_t bad_page_cnt_threshold; }; struct ras_fs_data { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c index c0096097bbcf..a2c982b1eac6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.c @@ -499,6 +499,11 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, return ret == num ? 0 : -EIO; } +inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void) +{ + return EEPROM_MAX_RECORD_NUM; +} + /* Used for testing if bugs encountered */ #if 0 void amdgpu_ras_eeprom_test(struct amdgpu_ras_eeprom_control *control) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h index 7e8647a05df7..b272840cb069 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h @@ -85,6 +85,8 @@ int amdgpu_ras_eeprom_process_recods(struct amdgpu_ras_eeprom_control *control, bool write, int num); +inline uint32_t amdgpu_ras_eeprom_get_record_max_length(void); +