RE: [PATCH V2 1/4] drm/amdgpu: add variable to record the deferred error number read by driver

2024-06-21 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

prev_de_queried_count and de_queried_count are used to accurately count the
number of DEs (deferred errors) lost after the driver receives a large number of
poison creation interrupts.

Since amdgpu_ras_query_error_status can be called from the page_retirement_thread,
the xxx_err_count sysfs nodes and gpu recovery, using a local variable to save the
old de_queried_count before calling amdgpu_ras_query_error_status in the
page_retirement_thread would be inaccurate.
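As a rough standalone illustration of that counting scheme (the struct and function names here are simplified stand-ins, not the driver's), the retirement path only consumes the difference between the running counter and the snapshot kept next to it in the shared log, so concurrent queries from sysfs or gpu recovery cannot make it lose newly detected DEs:

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for ras_ecc_log_info: the snapshot lives in the
 * shared log itself instead of a local variable in one caller. */
struct ecc_log {
	uint64_t de_queried_count;      /* bumped by every query path */
	uint64_t prev_de_queried_count; /* last value consumed for retirement */
};

/* How many new deferred errors appeared since the last consumption. */
static uint64_t consume_new_des(struct ecc_log *log)
{
	uint64_t new_count = 0;

	if (log->de_queried_count > log->prev_de_queried_count) {
		new_count = log->de_queried_count - log->prev_de_queried_count;
		log->prev_de_queried_count = log->de_queried_count;
	}
	return new_count;
}

int main(void)
{
	struct ecc_log log = { 0 };

	log.de_queried_count += 3;   /* e.g. a sysfs query logged 3 new DEs */
	printf("new DEs: %llu\n", (unsigned long long)consume_new_des(&log)); /* 3 */
	printf("new DEs: %llu\n", (unsigned long long)consume_new_des(&log)); /* 0 */
	return 0;
}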


-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Friday, June 21, 2024 2:37 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH V2 1/4] drm/amdgpu: add variable to record the deferred 
error number read by driver

[AMD Official Use Only - AMD Internal Distribution Only]

Shall we make prev_de_queried_count a local variable? The others look good to me.

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Thursday, June 20, 2024 13:40
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley ; Chai, Thomas 
Subject: [PATCH V2 1/4] drm/amdgpu: add variable to record the deferred error 
number read by driver

Add variable to record the deferred error number read by driver.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 62 ++---  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 +-  
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  |  4 +-
 3 files changed, 48 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 86cb97d2155b..f674e34037b6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -120,7 +120,7 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)

-#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  300  //ms

 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms

@@ -2804,7 +2804,8 @@ static void amdgpu_ras_ecc_log_init(struct 
ras_ecc_log_info *ecc_log)
memset(&ecc_log->ecc_key, 0xad, sizeof(ecc_log->ecc_key));

INIT_RADIX_TREE(&ecc_log->de_page_tree, GFP_KERNEL);
-   ecc_log->de_updated = false;
+   ecc_log->de_queried_count = 0;
+   ecc_log->prev_de_queried_count = 0;
 }

 static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
@@ -2823,7 +2824,8 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
mutex_unlock(&ecc_log->lock);

mutex_destroy(&ecc_log->lock);
-   ecc_log->de_updated = false;
+   ecc_log->de_queried_count = 0;
+   ecc_log->prev_de_queried_count = 0;
 }
 #endif

@@ -2856,40 +2858,64 @@ static void amdgpu_ras_do_page_retirement(struct 
work_struct *work)
mutex_unlock(&con->umc_ecc_log.lock);
 }

-static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
-   uint32_t timeout_ms)
+static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+   uint32_t poison_creation_count)
 {
int ret = 0;
struct ras_ecc_log_info *ecc_log;
struct ras_query_if info;
-   uint32_t timeout = timeout_ms;
+   uint32_t timeout = 0;
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+   uint64_t de_queried_count;
+   uint32_t new_detect_count, total_detect_count;
+   uint32_t need_query_count = poison_creation_count;
+   bool query_data_timeout = false;

memset(&info, 0, sizeof(info));
info.head.block = AMDGPU_RAS_BLOCK__UMC;

ecc_log = &ras->umc_ecc_log;
-   ecc_log->de_updated = false;
+   total_detect_count = 0;
do {
ret = amdgpu_ras_query_error_status(adev, &info);
-   if (ret) {
-   dev_err(adev->dev, "Failed to query ras error! 
ret:%d\n", ret);
-   return;
+   if (ret)
+   return ret;
+
+   de_queried_count = ecc_log->de_queried_count;
+   if (de_queried_count > ecc_log->prev_de_queried_count) {
+   new_detect_count = de_queried_count - 
ecc_log->prev_de_queried_count;
+   ecc_log->prev_de_queried_count = de_queried_count;
+   timeout = 0;
+   } else {
+   new_detect_count = 0;
}

-   if (timeout && !ecc_log->de_updated) {
-   msleep(1);
-   timeout--;
+   if (new_detect_count) {
+   total_detect_count += new_detect_count;

RE: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to complete

2024-06-18 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

-
Best Regards,
Thomas

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 18, 2024 8:00 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley 
Subject: Re: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to 
complete



On 6/18/2024 4:51 PM, Chai, Thomas wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> -
> Best Regards,
> Thomas
>
> -Original Message-
> From: Chai, Thomas
> Sent: Tuesday, June 18, 2024 7:09 PM
> To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Zhou1, Tao
> ; Li, Candice ; Wang,
> Yang(Kevin) ; Yang, Stanley
> 
> Subject: RE: [PATCH 4/5] drm/amdgpu: add completion to wait for ras
> reset to complete
>
>
>
>
> -
> Best Regards,
> Thomas
>
> -Original Message-----
> From: Lazar, Lijo 
> Sent: Tuesday, June 18, 2024 6:09 PM
> To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Zhou1, Tao
> ; Li, Candice ; Wang,
> Yang(Kevin) ; Yang, Stanley
> 
> Subject: Re: [PATCH 4/5] drm/amdgpu: add completion to wait for ras
> reset to complete
>
>
>
> On 6/18/2024 12:03 PM, YiPeng Chai wrote:
>> Add completion to wait for ras reset to complete.
>>
>> Signed-off-by: YiPeng Chai 
>> ---
>>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++
>> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>>  2 files changed, 12 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index 898889600771..7f8e6ca07957 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct
>> ras_common_if
>> *ras_block)
>>
>>  #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
>>
>> +#define MAX_RAS_RECOVERY_COMPLETION_TIME  12 //ms
>> +
>>  enum amdgpu_ras_retire_page_reservation {
>>   AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>>   AMDGPU_RAS_RETIRE_PAGE_PENDING,
>> @@ -2518,6 +2520,8 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
>>   atomic_set(&hive->ras_recovery, 0);
>>   amdgpu_put_xgmi_hive(hive);
>>   }
>> +
>> + complete_all(&con->ras_recovery_completion);
>>  }
>>
>>  /* alloc/realloc bps array */
>> @@ -2911,10 +2915,16 @@ static int
>> amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
>>
>>   flush_delayed_work(&con->page_retirement_dwork);
>>
>> + reinit_completion(&con->ras_recovery_completion);
>> +
>>   con->gpu_reset_flags |= reset;
>>   amdgpu_ras_reset_gpu(adev);
>>
>>   *gpu_reset = reset;
>> + if (!wait_for_completion_timeout(&con->ras_recovery_completion,
>> + 
>> msecs_to_jiffies(MAX_RAS_RECOVERY_COMPLETION_TIME)))
>> + dev_err(adev->dev, "Waiting for GPU to complete ras 
>> reset timeout! reset:0x%x\n",
>> + reset);
>
>> If a mode-1 reset gets to execute first due to job timeout/hws detect cases 
>> in poison timeout, then the ras handler will never get executed.
>> Why is this wait required?
>
>> Thanks,
>> Lijo
>
> [Thomas] "[PATCH 5/5] drm/amdgpu: add gpu reset check and exception
> handling" adds the check before the ras gpu reset.
> A poison ras reset is different from a reset triggered by other
> fatal errors, and all poison RAS resets are triggered from here;
> in order to distinguish them from other gpu resets and to simplify
> the subsequent code, the wait for the gpu ras reset is added here.
>

> Reset mechanism resets the GPU state - whether it's triggered due to poison 
> or fatal errors. As soon as the device is reset successfully, GPU operations 
> can continue.

>So why does there need to be a special wait for a poison-triggered reset alone?
[Thomas] Different applications may randomly trigger poison errors before the gpu
reset.
 Since the poison gpu reset is triggered asynchronously, new poison
consumption interrupts may occur in the window after the gpu reset request is sent
and before the GPU reset is actually performed.
  In order to avoid performing another poison gpu reset right after
completing the current poison gpu reset, the handler needs to stay here to wait for
the gpu to complete the reset and then clear the
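A minimal kernel-style sketch of the hand-shake being discussed (illustrative only; the context struct and helper names are made up, and this is not the full patch): the consumption handler re-arms the completion before requesting the asynchronous reset and then blocks on it, while the recovery worker signals it once the reset work has finished, so interrupts that raced with the reset do not schedule a second poison reset.

/* Illustrative fragment; 'my_ctx' and the helpers are placeholders. */
#include <linux/completion.h>
#include <linux/jiffies.h>

struct my_ctx {
	struct completion recovery_done;
};

static void my_request_reset_and_wait(struct my_ctx *ctx, unsigned int timeout_ms)
{
	reinit_completion(&ctx->recovery_done);   /* arm before triggering the reset */
	/* ... queue the asynchronous recovery work here ... */
	if (!wait_for_completion_timeout(&ctx->recovery_done,
					 msecs_to_jiffies(timeout_ms)))
		;   /* timed out: e.g. a mode-1 reset won the race and the work never ran */
}

static void my_recovery_work_finished(struct my_ctx *ctx)
{
	complete_all(&ctx->recovery_done);        /* wake every pending waiter */
}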

RE: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to complete

2024-06-18 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

-
Best Regards,
Thomas

-Original Message-
From: Chai, Thomas
Sent: Tuesday, June 18, 2024 7:09 PM
To: Lazar, Lijo ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley 
Subject: RE: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to 
complete




-
Best Regards,
Thomas

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 18, 2024 6:09 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley 
Subject: Re: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to 
complete



On 6/18/2024 12:03 PM, YiPeng Chai wrote:
> Add completion to wait for ras reset to complete.
>
> Signed-off-by: YiPeng Chai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>  2 files changed, 12 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 898889600771..7f8e6ca07957 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if
> *ras_block)
>
>  #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
>
> +#define MAX_RAS_RECOVERY_COMPLETION_TIME  12 //ms
> +
>  enum amdgpu_ras_retire_page_reservation {
>   AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>   AMDGPU_RAS_RETIRE_PAGE_PENDING,
> @@ -2518,6 +2520,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
> *work)
>   atomic_set(&hive->ras_recovery, 0);
>   amdgpu_put_xgmi_hive(hive);
>   }
> +
> + complete_all(&con->ras_recovery_completion);
>  }
>
>  /* alloc/realloc bps array */
> @@ -2911,10 +2915,16 @@ static int
> amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
>
>   flush_delayed_work(&con->page_retirement_dwork);
>
> + reinit_completion(&con->ras_recovery_completion);
> +
>   con->gpu_reset_flags |= reset;
>   amdgpu_ras_reset_gpu(adev);
>
>   *gpu_reset = reset;
> + if (!wait_for_completion_timeout(&con->ras_recovery_completion,
> + 
> msecs_to_jiffies(MAX_RAS_RECOVERY_COMPLETION_TIME)))
> + dev_err(adev->dev, "Waiting for GPU to complete ras 
> reset timeout! reset:0x%x\n",
> + reset);

> If a mode-1 reset gets to execute first due to job timeout/hws detect cases 
> in poison timeout, then the ras handler will never get executed.
> Why is this wait required?

>Thanks,
>Lijo

[Thomas] "[PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling"
adds the check before the ras gpu reset.
A poison ras reset is different from a reset triggered by other
fatal errors, and all poison RAS resets are triggered from here;
 in order to distinguish them from other gpu resets and to simplify
 the subsequent code, the wait for the gpu ras reset is added here.

>   }
>
>   return 0;
> @@ -3041,6 +3051,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>   }
>   }
>
> + init_completion(&con->ras_recovery_completion);
>   mutex_init(&con->page_rsv_lock);
>   INIT_KFIFO(con->poison_fifo);
>   mutex_init(&con->page_retirement_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 91daf48be03a..b47f03edac87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -537,6 +537,7 @@ struct amdgpu_ras {
>   DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
>   struct ras_ecc_log_info  umc_ecc_log;
>   struct delayed_work page_retirement_dwork;
> + struct completion ras_recovery_completion;
>
>   /* Fatal error detected flag */
>   atomic_t fed;


RE: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to complete

2024-06-18 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

-
Best Regards,
Thomas

-Original Message-
From: Lazar, Lijo 
Sent: Tuesday, June 18, 2024 6:09 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley 
Subject: Re: [PATCH 4/5] drm/amdgpu: add completion to wait for ras reset to 
complete



On 6/18/2024 12:03 PM, YiPeng Chai wrote:
> Add completion to wait for ras reset to complete.
>
> Signed-off-by: YiPeng Chai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 11 +++
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  1 +
>  2 files changed, 12 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 898889600771..7f8e6ca07957 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if
> *ras_block)
>
>  #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
>
> +#define MAX_RAS_RECOVERY_COMPLETION_TIME  12 //ms
> +
>  enum amdgpu_ras_retire_page_reservation {
>   AMDGPU_RAS_RETIRE_PAGE_RESERVED,
>   AMDGPU_RAS_RETIRE_PAGE_PENDING,
> @@ -2518,6 +2520,8 @@ static void amdgpu_ras_do_recovery(struct work_struct 
> *work)
>   atomic_set(&hive->ras_recovery, 0);
>   amdgpu_put_xgmi_hive(hive);
>   }
> +
> + complete_all(&con->ras_recovery_completion);
>  }
>
>  /* alloc/realloc bps array */
> @@ -2911,10 +2915,16 @@ static int
> amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
>
>   flush_delayed_work(&con->page_retirement_dwork);
>
> + reinit_completion(&con->ras_recovery_completion);
> +
>   con->gpu_reset_flags |= reset;
>   amdgpu_ras_reset_gpu(adev);
>
>   *gpu_reset = reset;
> + if (!wait_for_completion_timeout(&con->ras_recovery_completion,
> + 
> msecs_to_jiffies(MAX_RAS_RECOVERY_COMPLETION_TIME)))
> + dev_err(adev->dev, "Waiting for GPU to complete ras 
> reset timeout! reset:0x%x\n",
> + reset);

> If a mode-1 reset gets to execute first due to job timeout/hws detect cases 
> in poison timeout, then the ras handler will never get executed.
> Why is this wait required?

[Thomas] "[PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling"
adds the check before the ras gpu reset.


Thanks,
Lijo

>   }
>
>   return 0;
> @@ -3041,6 +3051,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
>   }
>   }
>
> + init_completion(&con->ras_recovery_completion);
>   mutex_init(&con->page_rsv_lock);
>   INIT_KFIFO(con->poison_fifo);
>   mutex_init(&con->page_retirement_lock);
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index 91daf48be03a..b47f03edac87 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -537,6 +537,7 @@ struct amdgpu_ras {
>   DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
>   struct ras_ecc_log_info  umc_ecc_log;
>   struct delayed_work page_retirement_dwork;
> + struct completion ras_recovery_completion;
>
>   /* Fatal error detected flag */
>   atomic_t fed;


RE: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling

2024-06-18 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

-
Best Regards,
Thomas

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Tuesday, June 18, 2024 3:19 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Yang, Stanley 
Subject: RE: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling



-Original Message-
From: Chai, Thomas 
Sent: 2024年6月18日 14:34
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley ; Chai, Thomas 
Subject: [PATCH 5/5] drm/amdgpu: add gpu reset check and exception handling

Add gpu reset check and exception handling for page retirement.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 43 +
 1 file changed, 43 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 7f8e6ca07957..635dc86dbfd8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1386,10 +1386,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device 
*adev, struct ras_query_i
memset(&qctx, 0, sizeof(qctx));
qctx.event_id = amdgpu_ras_acquire_event_id(adev, amdgpu_ras_intr_triggered() ?
   RAS_EVENT_TYPE_ISR : RAS_EVENT_TYPE_INVALID);
+
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return -EIO;
+
ret = amdgpu_ras_query_error_status_helper(adev, info,
   &err_data,
   &qctx,
   error_query_mode);
+   up_read(&adev->reset_domain->sem);
if (ret)
goto out_fini_err_data;

@@ -2884,6 +2889,14 @@ static int amdgpu_ras_poison_creation_handler(struct 
amdgpu_device *adev,
return 0;
 }

+static void amdgpu_ras_clear_poison_fifo(struct amdgpu_device *adev)
+{
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_poison_msg msg;
+
+   while (kfifo_get(&con->poison_fifo, &msg));
+}
+
 static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
uint32_t msg_count, uint32_t *gpu_reset)  { @@ -2913,6 
+2926,11 @@ static int amdgpu_ras_poison_consumption_handler(struct 
amdgpu_device *adev,
else
reset = reset_flags;

+   /* Check if gpu is in reset state */
+   if (!down_read_trylock(&adev->reset_domain->sem))
+   return -EIO;
+   up_read(&adev->reset_domain->sem);

> [Kevin]:
> I'm confused why we are not using the 'amdgpu_in_reset()' helper function to check
> the reset state?

>Best Regards,
> Kevin

[Thomas] This function is called in the page retirement thread.
 According to Christian König's previous email suggestion,
"It's illegal to call amdgpu_in_reset() from outside of the hw specific
backends."

+
flush_delayed_work(&con->page_retirement_dwork);

reinit_completion(&con->ras_recovery_completion);
@@ -2977,6 +2995,31 @@ static int amdgpu_ras_page_retirement_thread(void *param)
}
}

+   if ((ret == -EIO) || (gpu_reset == 
AMDGPU_RAS_GPU_RESET_MODE1_RESET)) {
+   /* gpu is in mode-1 reset state */
+   /* Clear poison creation request */
+   while (atomic_read(&con->poison_creation_count))
+   atomic_dec(&con->poison_creation_count);
[Kevin]:

Aha! It is better to use atomic_set() instead.

Best Regards,
Kevin
+
+   /* Clear poison consumption fifo */
+   amdgpu_ras_clear_poison_fifo(adev);
+
+   while (atomic_read(&con->page_retirement_req_cnt))
+   atomic_dec(&con->page_retirement_req_cnt);
+
+   if (ret == -EIO) {
+   /* Wait for mode-1 reset to complete */
+   down_read(&adev->reset_domain->sem);
+   up_read(&adev->reset_domain->sem);
+   }
+
+   /* Wake up work queue to save bad pages to eeprom */
+   schedule_delayed_work(&con->page_retirement_dwork, 0);
+   } else if (gpu_reset) {
+   /* gpu is in mode-2 reset or other reset state */
+   /* Wake up work queue to save bad pages to eeprom */
+   schedule_delayed_work(&con->page_retirement_dwork, 0);
+   }
 #else
 dev_info(adev->dev, "Start processing page retirement. request:%d\n",
 atomic_read(>page_retirement_req_cnt));
--
2.34.1
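To illustrate the cleanup sequence Kevin comments on above (a kernel-style sketch; the context struct and field names are placeholders, not the final patch), the pending-request counters can be reset in one shot with atomic_set() and the queued poison messages drained before the thread waits out the mode-1 reset on the reset-domain semaphore:

/* Illustrative fragment; 'my_ras' and its fields are placeholders. */
#include <linux/atomic.h>
#include <linux/kfifo.h>
#include <linux/rwsem.h>
#include <linux/types.h>

struct my_msg { u32 pasid; };

struct my_ras {
	atomic_t poison_creation_count;
	atomic_t page_retirement_req_cnt;
	DECLARE_KFIFO(poison_fifo, struct my_msg, 128);
	struct rw_semaphore *reset_sem;   /* i.e. adev->reset_domain->sem */
};

static void my_flush_on_mode1_reset(struct my_ras *con)
{
	struct my_msg msg;

	/* Drop all queued requests at once instead of atomic_dec() loops. */
	atomic_set(&con->poison_creation_count, 0);
	atomic_set(&con->page_retirement_req_cnt, 0);

	/* Discard stale poison consumption messages. */
	while (kfifo_get(&con->poison_fifo, &msg))
		;

	/* Taking the reset-domain semaphore for read blocks until the reset ends. */
	down_read(con->reset_sem);
	up_read(con->reset_sem);
}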



RE: [PATCH 2/2] Revert "drm/amdgpu: change aca bank error lock type to spinlock"

2024-06-18 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

Series is
Reviewed-by: YiPeng Chai 


-
Best Regards,
Thomas

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Tuesday, June 18, 2024 3:49 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; 
Chai, Thomas 
Subject: [PATCH 2/2] Revert "drm/amdgpu: change aca bank error lock type to 
spinlock"

This reverts commit 354436e7905d166011f2aa26dccd9fa04b20940e.

Revert this patch to change the lock type back to 'mutex' and avoid a kernel
call-trace issue.

[  602.668806] Workqueue: amdgpu-reset-dev amdgpu_ras_do_recovery [amdgpu] [  
602.668939] Call Trace:
[  602.668940]  
[  602.668941]  dump_stack_lvl+0x4c/0x70 [  602.668945]  dump_stack+0x14/0x20 [ 
 602.668946]  __schedule_bug+0x5a/0x70 [  602.668950]  __schedule+0x940/0xb30 [ 
 602.668952]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.668955]  ? hrtimer_reprogram+0x77/0xb0 [  602.668957]  ? 
srso_alias_return_thunk+0x5/0xfbef5
[  602.668959]  ? hrtimer_start_range_ns+0x126/0x370
[  602.668961]  schedule+0x39/0xe0
[  602.668962]  schedule_hrtimeout_range_clock+0xb1/0x140
[  602.668964]  ? __pfx_hrtimer_wakeup+0x10/0x10 [  602.668966]  
schedule_hrtimeout_range+0x17/0x20
[  602.668967]  usleep_range_state+0x69/0x90 [  602.668970]  
psp_cmd_submit_buf+0x132/0x570 [amdgpu] [  602.669066]  
psp_ras_invoke+0x75/0x1a0 [amdgpu] [  602.669156]  
psp_ras_query_address+0x9c/0x120 [amdgpu] [  602.669245]  
umc_v12_0_update_ecc_status+0x16d/0x520 [amdgpu] [  602.669337]  ? 
srso_alias_return_thunk+0x5/0xfbef5
[  602.669339]  ? stack_depot_save+0x12/0x20 [  602.669342]  ? 
srso_alias_return_thunk+0x5/0xfbef5
[  602.669343]  ? set_track_prepare+0x52/0x70 [  602.669346]  ? 
kmemleak_alloc+0x4f/0x90 [  602.669348]  ? __kmalloc_node+0x34b/0x450 [  
602.669352]  amdgpu_umc_update_ecc_status+0x23/0x40 [amdgpu] [  602.669438]  
mca_umc_mca_get_err_count+0x85/0xc0 [amdgpu] [  602.669554]  
mca_smu_parse_mca_error_count+0x120/0x1d0 [amdgpu] [  602.669655]  
amdgpu_mca_dispatch_mca_set.part.0+0x141/0x250 [amdgpu] [  602.669743]  ? 
kmemleak_free+0x36/0x60 [  602.669745]  ? kvfree+0x32/0x40 [  602.669747]  ? 
srso_alias_return_thunk+0x5/0xfbef5
[  602.669749]  ? kfree+0x15d/0x2a0
[  602.669752]  amdgpu_mca_smu_log_ras_error+0x1f6/0x210 [amdgpu] [  
602.669839]  amdgpu_ras_query_error_status_helper+0x2ad/0x390 [amdgpu] [  
602.669924]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.669925]  ? __call_rcu_common.constprop.0+0xa6/0x2b0
[  602.669929]  amdgpu_ras_query_error_status+0xf3/0x620 [amdgpu] [  
602.670014]  ? srso_alias_return_thunk+0x5/0xfbef5
[  602.670017]  amdgpu_ras_log_on_err_counter+0xe1/0x170 [amdgpu] [  
602.670103]  amdgpu_ras_do_recovery+0xd2/0x2c0 [amdgpu] [  602.670187]  ? 
srso_alias_return_thunk+0x5/0

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c | 19 ++-  
drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h |  3 +--
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
index 04515c1c7241..7945173321a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -222,9 +222,9 @@ static struct aca_bank_error *new_bank_error(struct 
aca_error *aerr, struct aca_
INIT_LIST_HEAD(&bank_error->node);
memcpy(&bank_error->info, info, sizeof(*info));

-   spin_lock(&aerr->lock);
+   mutex_lock(&aerr->lock);
list_add_tail(&bank_error->node, &aerr->list);
-   spin_unlock(&aerr->lock);
+   mutex_unlock(&aerr->lock);

return bank_error;
 }
@@ -235,7 +235,7 @@ static struct aca_bank_error *find_bank_error(struct 
aca_error *aerr, struct aca
struct aca_bank_info *tmp_info;
bool found = false;

-   spin_lock(&aerr->lock);
+   mutex_lock(&aerr->lock);
list_for_each_entry(bank_error, &aerr->list, node) {
tmp_info = &bank_error->info;
if (tmp_info->socket_id == info->socket_id && @@ -246,7 +246,7 
@@ static struct aca_bank_error *find_bank_error(struct aca_error *aerr, struct 
aca
}

 out_unlock:
-   spin_unlock(&aerr->lock);
+   mutex_unlock(&aerr->lock);

return found ? bank_error : NULL;
 }
@@ -474,7 +474,7 @@ static int aca_log_aca_error(struct aca_handle *handle, 
enum aca_error_type type
struct aca_error *aerr = &error_cache->errors[type];
struct aca_bank_error *bank_error, *tmp;

-   spin_lock(&aerr->lock);
+   mutex_lock(&aerr->lock);

if (list_empty(&aerr->list))
goto out_unlock;
@@ -485,7 +485,7 @@ static int aca_log_aca_error(struct aca_handle *handle, 
enum aca_error_type type
}

 out_unlock:
-   spin_unlock(&aerr->lock);
+   mutex_unlock(&aerr->lock);

return 0;
 }
@@ -542,7 +542,7 @@ int amdgpu_aca_get_error_data(struct amdgpu_device *adev, 
struct aca_handle *han

 static void aca_error_init(struct aca_error *aerr, 

RE: [PATCH 5/5] drm/amdgpu: Remove dead code in amdgpu_ras_add_mca_err_addr

2024-05-13 Thread Chai, Thomas
[AMD Official Use Only - AMD Internal Distribution Only]

Reviewed-by: YiPeng Chai 

-
Best Regards,
Thomas

-Original Message-
From: amd-gfx  On Behalf Of Ma Jun
Sent: Monday, May 13, 2024 4:56 PM
To: amd-gfx@lists.freedesktop.org
Cc: Feng, Kenneth ; Deucher, Alexander 
; Wang, Yang(Kevin) ; 
Koenig, Christian ; Ma, Jun 
Subject: [PATCH 5/5] drm/amdgpu: Remove dead code in amdgpu_ras_add_mca_err_addr

Remove dead code in amdgpu_ras_add_mca_err_addr

Signed-off-by: Ma Jun 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 13 -
 1 file changed, 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6da02a209890..0cf67923c0fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -4292,21 +4292,8 @@ static struct ras_err_info 
*amdgpu_ras_error_get_info(struct ras_err_data *err_d

 void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct 
ras_err_addr *err_addr)  {
-   struct ras_err_addr *mca_err_addr;
-
/* This function will be retired. */
return;
-   mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
-   if (!mca_err_addr)
-   return;
-
-   INIT_LIST_HEAD(&mca_err_addr->node);
-
-   mca_err_addr->err_status = err_addr->err_status;
-   mca_err_addr->err_ipid = err_addr->err_ipid;
-   mca_err_addr->err_addr = err_addr->err_addr;
-
-   list_add_tail(&mca_err_addr->node, &err_info->err_addr_list);
 }

 void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct 
ras_err_addr *mca_err_addr)
--
2.34.1



RE: [PATCH] drm/amdgpu: add mutex to protect ras shared memory

2024-04-28 Thread Chai, Thomas
[AMD Official Use Only - General]

OK


-
Best Regards,
Thomas

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Sunday, April 28, 2024 3:48 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Yang, Stanley 
Subject: RE: [PATCH] drm/amdgpu: add mutex to protect ras shared memory

[AMD Official Use Only - General]

-Original Message-
From: Chai, Thomas 
Sent: Sunday, April 28, 2024 3:08 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley ; Chai, Thomas 
Subject: [PATCH] drm/amdgpu: add mutex to protect ras shared memory

Add mutex to protect ras shared memory.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 121 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp_ta.c |   2 +
 3 files changed, 84 insertions(+), 40 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 5583e2d1b12f..fa4fea00f6b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1564,6 +1564,66 @@ static void psp_ras_ta_check_status(struct psp_context 
*psp)
}
 }

+static int psp_ras_send_cmd(struct psp_context *psp,
+   enum ras_command cmd_id, void *in, void *out) {
+   struct ta_ras_shared_memory *ras_cmd;
+   uint32_t cmd = cmd_id;
+   int ret = 0;
+
+   mutex_lock(&psp->ras_context.mutex);
+   ras_cmd = (struct ta_ras_shared_memory 
*)psp->ras_context.context.mem_context.shared_buf;
+   memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
+
+   switch (cmd) {
+   case TA_RAS_COMMAND__ENABLE_FEATURES:
+   case TA_RAS_COMMAND__DISABLE_FEATURES:
+   memcpy(&ras_cmd->ras_in_message,
+   in, sizeof(ras_cmd->ras_in_message));
+   break;
+   case TA_RAS_COMMAND__TRIGGER_ERROR:
+   memcpy(&ras_cmd->ras_in_message.trigger_error,
+   in, sizeof(ras_cmd->ras_in_message.trigger_error));
+   break;
+   case TA_RAS_COMMAND__QUERY_ADDRESS:
+   memcpy(&ras_cmd->ras_in_message.address,
+   in, sizeof(ras_cmd->ras_in_message.address));
+   break;
+   default:
+   dev_err(psp->adev->dev, "Invalid ras cmd id: %u\n", cmd);
+   ret = -EINVAL;
+   goto err_out;
+   }
+
+   ras_cmd->cmd_id = cmd;
+   ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+
+   switch (cmd) {
+   case TA_RAS_COMMAND__TRIGGER_ERROR:
+   if (out) {
+   uint32_t *ras_status = (uint32_t *)out;
[Kevin]:
It's better to check the 'ret' value first before using this 'out' data.

Best Regards,
Kevin
+
+   *ras_status = ras_cmd->ras_status;
+   }
+   break;
+   case TA_RAS_COMMAND__QUERY_ADDRESS:
+   if (ret || ras_cmd->ras_status || psp->cmd_buf_mem->resp.status)
+   ret = -EINVAL;
+   else if (out)
+   memcpy(out,
+   &ras_cmd->ras_out_message.address,
+   sizeof(ras_cmd->ras_out_message.address));
+   break;
+   default:
+   break;
+   }
+
+err_out:
+   mutex_unlock(&psp->ras_context.mutex);
+
+   return ret;
+}
+
 int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
 {
	struct ta_ras_shared_memory *ras_cmd;
@@ -1605,23 +1665,15 @@ int psp_ras_invoke(struct psp_context *psp, uint32_t ta_cmd_id)
 int psp_ras_enable_features(struct psp_context *psp,
	union ta_ras_cmd_input *info, bool enable)
 {
-   struct ta_ras_shared_memory *ras_cmd;
+   enum ras_command cmd_id;
int ret;

-   if (!psp->ras_context.context.initialized)
+   if (!psp->ras_context.context.initialized || !info)
return -EINVAL;

-   ras_cmd = (struct ta_ras_shared_memory 
*)psp->ras_context.context.mem_context.shared_buf;
-   memset(ras_cmd, 0, sizeof(struct ta_ras_shared_memory));
-
-   if (enable)
-   ras_cmd->cmd_id = TA_RAS_COMMAND__ENABLE_FEATURES;
-   else
-   ras_cmd->cmd_id = TA_RAS_COMMAND__DISABLE_FEATURES;
-
-   ras_cmd->ras_in_message = *info;
-
-   ret = psp_ras_invoke(psp, ras_cmd->cmd_id);
+   cmd_id = enable ?
+   TA_RAS_COMMAND__ENABLE_FEATURES : 
TA_RAS_COMMAND__DISABLE_FEATURES;
+   ret = psp_ras_send_cmd(psp, cmd_id, info, NULL);
if (ret)
return -EINVAL;

@@ -1645,6 +1697,8 @@ int psp_ras_terminate(struct psp_context *psp)

psp->ras_context.context.initialized = false;

+   mutex_destroy(&psp->ras_context.mutex);
+
return ret;
 }

@@ 
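A small standalone sketch of the ordering Kevin asks for above (the types and names here are invented for illustration): the caller's output buffer is only written after both the submit result and the TA status have been checked.

#include <errno.h>
#include <stdint.h>
#include <string.h>

struct fake_ras_cmd {
	uint32_t ras_status;   /* status reported back by the TA */
	uint64_t out_address;  /* payload to hand to the caller */
};

/* Copy the TA output only when both the submit result and the status are good. */
static int copy_cmd_output(int submit_ret, const struct fake_ras_cmd *cmd,
			   uint64_t *out)
{
	if (submit_ret || cmd->ras_status)
		return -EINVAL;            /* never expose stale or garbage output */

	if (out)
		memcpy(out, &cmd->out_address, sizeof(*out));
	return 0;
}

int main(void)
{
	struct fake_ras_cmd cmd = { .ras_status = 0, .out_address = 0x1000 };
	uint64_t out = 0;

	return copy_cmd_output(0, &cmd, &out);   /* returns 0: output is valid */
}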

RE: [PATCH 04/15] drm/amdgpu: add poison creation handler

2024-04-24 Thread Chai, Thomas
[AMD Official Use Only - General]

OK, I will do this.


-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Thursday, April 25, 2024 10:33 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhou1, Tao ; Li, 
Candice ; Wang, Yang(Kevin) ; Yang, 
Stanley ; Chai, Thomas 
Subject: RE: [PATCH 04/15] drm/amdgpu: add poison creation handler

[AMD Official Use Only - General]

Is it okay to drop the static function below and just implement the logic in the
poison creation handler, leveraging the ras query api amdgpu_ras_query_error_status?

It seems to me the static function may not be reusable by other IP
blocks.

Regards,
Hawking

+ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+   enum amdgpu_ras_block ras_block, uint32_t timeout_ms) {
+   int ret = 0;
+   struct ras_ecc_log_info *ecc_log;
+   struct ras_query_if info;
+   uint32_t timeout = timeout_ms;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   memset(&info, 0, sizeof(info));
+   info.head.block = ras_block;
+
+   ecc_log = &ras->umc_ecc_log;
+   ecc_log->de_updated = false;
+   do {
+   ret = amdgpu_ras_query_error_status(adev, &info);
+   if (ret) {
+   dev_err(adev->dev, "Failed to query ras error! 
ret:%d\n", ret);
+   return ret;
+   }
+
+   if (timeout && !ecc_log->de_updated) {
+   msleep(1);
+   timeout--;
+   }
+   } while (timeout && !ecc_log->de_updated);
+
+   if (timeout_ms && !timeout) {
+   dev_warn(adev->dev, "Can't find deferred error\n");
+   return -ETIMEDOUT;
+   }
+
+   return 0;
+}
+
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+   uint32_t timeout)
+{
+   amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
+}
+

-Original Message-
From: amd-gfx  On Behalf Of YiPeng Chai
Sent: Thursday, April 18, 2024 10:58
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 04/15] drm/amdgpu: add poison creation handler

Add poison creation handler.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 74 +++--
 1 file changed, 69 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 64e6e20c6de7..126616eaeec1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2080,6 +2080,17 @@ static void 
amdgpu_ras_interrupt_poison_creation_handler(struct ras_manager *obj  {
dev_info(obj->adev->dev,
"Poison is created\n");
+
+   if (amdgpu_ip_version(obj->adev, UMC_HWIP, 0) >= IP_VERSION(12, 0, 0)) {
+   struct amdgpu_ras *con =
+ amdgpu_ras_get_context(obj->adev);
+
+   amdgpu_ras_put_poison_req(obj->adev,
+   AMDGPU_RAS_BLOCK__UMC, 0, NULL, NULL, false);
+
+   atomic_inc(&con->page_retirement_req_cnt);
+
+   wake_up(&con->page_retirement_wq);
+   }
 }

 static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
@@ -2754,10 +2765,54 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
mutex_destroy(&ecc_log->lock);
ecc_log->de_updated = false;
 }
+
+static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
+   enum amdgpu_ras_block ras_block, uint32_t timeout_ms) {
+   int ret = 0;
+   struct ras_ecc_log_info *ecc_log;
+   struct ras_query_if info;
+   uint32_t timeout = timeout_ms;
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   memset(&info, 0, sizeof(info));
+   info.head.block = ras_block;
+
+   ecc_log = &ras->umc_ecc_log;
+   ecc_log->de_updated = false;
+   do {
+   ret = amdgpu_ras_query_error_status(adev, &info);
+   if (ret) {
+   dev_err(adev->dev, "Failed to query ras error! 
ret:%d\n", ret);
+   return ret;
+   }
+
+   if (timeout && !ecc_log->de_updated) {
+   msleep(1);
+   timeout--;
+   }
+   } while (timeout && !ecc_log->de_updated);
+
+   if (timeout_ms && !timeout) {
+   dev_warn(adev->dev, "Can't find deferred error\n");
+   return -ETIMEDOUT;
+   }
+
+   return 0;
+}
+
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+   uint32_t tim

RE: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address

2024-04-24 Thread Chai, Thomas
[AMD Official Use Only - General]

amdgpu_umc_fill_error_record is called directly in umc_v12_0_convert_error_address
to prepare for page retirement.
The new path needs to check whether these converted pages already exist before
filling the error page, so umc_v12_0_convert_error_address is not suitable for the
new requirement and I created a new interface.
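A rough standalone sketch of that split (the helper names here are invented): the conversion step only produces candidate retired-page PFNs, and the caller filters out pages it has already logged before filling new error records.

#include <stdbool.h>
#include <stdint.h>

#define MAX_PFNS 16

/* Stand-in for the "already retired?" lookup (e.g. a radix-tree check). */
static bool page_already_logged(uint64_t pfn) { return (pfn & 1) != 0; }

/* Stand-in for amdgpu_umc_fill_error_record()-style bookkeeping. */
static int fill_error_record(uint64_t pfn) { (void)pfn; return 0; }

/* Convert-then-filter: only previously unseen PFNs become error records. */
static int retire_new_pages(const uint64_t *pfns, int count)
{
	int i, ret = 0;

	for (i = 0; i < count; i++) {
		if (page_already_logged(pfns[i]))
			continue;          /* skip duplicates from the same row */
		ret = fill_error_record(pfns[i]);
		if (ret)
			break;
	}
	return ret;
}

int main(void)
{
	uint64_t pfns[MAX_PFNS] = { 2, 3, 4, 5 };

	return retire_new_pages(pfns, 4);
}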

-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Thursday, April 25, 2024 11:03 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address

[AMD Official Use Only - General]

I might lose some context here. Can you please elaborate why we don't leverage 
the existing umc_v12_0_convert_error_address implementation?

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Thursday, April 18, 2024 10:58
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 06/15] drm/amdgpu: umc v12_0 converts error address

Umc v12_0 converts error address.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 94 +-  
drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 12 
 2 files changed, 105 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 81435533c4a7..085dcfe16b5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -222,6 +222,66 @@ static void umc_v12_0_convert_error_address(struct 
amdgpu_device *adev,
}
 }

+static int umc_v12_0_convert_err_addr(struct amdgpu_device *adev,
+   struct ta_ras_query_address_input *addr_in,
+   uint64_t *pfns, int len) {
+   uint32_t col, row, row_xor, bank, channel_index;
+   uint64_t soc_pa, retired_page, column, err_addr;
+   struct ta_ras_query_address_output addr_out;
+   uint32_t pos = 0;
+
+   err_addr = addr_in->ma.err_addr;
+   addr_in->addr_type = TA_RAS_MCA_TO_PA;
+   if (psp_ras_query_address(&adev->psp, addr_in, &addr_out)) {
+   dev_warn(adev->dev, "Failed to query RAS physical address for 
0x%llx",
+   err_addr);
+   return 0;
+   }
+
+   soc_pa = addr_out.pa.pa;
+   bank = addr_out.pa.bank;
+   channel_index = addr_out.pa.channel_idx;
+
+   col = (err_addr >> 1) & 0x1fULL;
+   row = (err_addr >> 10) & 0x3fffULL;
+   row_xor = row ^ (0x1ULL << 13);
+   /* clear [C3 C2] in soc physical address */
+   soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
+   /* clear [C4] in soc physical address */
+   soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+
+   /* loop for all possibilities of [C4 C3 C2] */
+   for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+   retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
+   retired_page |= (((column & 0x4) >> 2) <<
+ UMC_V12_0_PA_C4_BIT);
+
+   if (pos >= len)
+   return 0;
+   pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+   /* include column bit 0 and 1 */
+   col &= 0x3;
+   col |= (column << 2);
+   dev_info(adev->dev,
+   "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
+   retired_page, row, col, bank, channel_index);
+
+   /* shift R13 bit */
+   retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
+
+   if (pos >= len)
+   return 0;
+   pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
+
+   dev_info(adev->dev,
+   "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x 
Bank:0x%x Channel:0x%x\n",
+   retired_page, row_xor, col, bank, channel_index);
+   }
+
+   return pos;
+}
+
 static int umc_v12_0_query_error_address(struct amdgpu_device *adev,
uint32_t node_inst, uint32_t umc_inst,
uint32_t ch_inst, void *data) @@ -482,8 
+542,12 @@ static int umc_v12_0_ras_late_init(struct amdgpu_device *adev, 
struct ras_common  static int umc_v12_0_update_ecc_status(struct amdgpu_device 
*adev,
uint64_t status, uint64_t ipid, uint64_t addr)  {
-   uint16_t hwid, mcatype;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   uint16_t hwid, mcatype;
+   struct ta_ras_query_address_input addr_in;
+   uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PE

RE: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption

2024-04-24 Thread Chai, Thomas
[AMD Official Use Only - General]

-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Thursday, April 25, 2024 11:01 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison 
consumption

[AMD Official Use Only - General]

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device 
*adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t
+reset);

> So we ultimately switch to above poison consumption handler for all the 
> existing v9 adapters, right? If so, we shall be able to make this function 
> backwards compatible. I'm wondering if we can just change the existing 
> amdgpu_amdkfd_ras_poison_consumption_handler.

> Pasid_poison_consumption_handler is a little bit confusing.

[Thomas] No. Only when UMC_HWIP is greater than or equal to IP_VERSION(12, 0, 0)
does it work on the new path. The IP check is in the amdgpu_umc_pasid_poison_handler
function.



Regards,
Hawking

-Original Message-----
From: Chai, Thomas 
Sent: Thursday, April 18, 2024 10:59
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 11/15] drm/amdgpu: prepare to handle pasid poison consumption

Prepare to handle pasid poison consumption.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c|  9 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c   | 20 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h   |  3 +++
 .../gpu/drm/amd/amdkfd/kfd_int_process_v9.c   |  3 ++-
 5 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 66753940bb4d..287ce431901c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -759,10 +759,17 @@ bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev)
return amdgpu_ras_get_fed_status(adev);  }

+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device 
*adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t 
reset) {
+   amdgpu_umc_pasid_poison_handler(adev, block, pasid, pasid_fn,
+data, reset); }
+
 void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t reset)  {
-   amdgpu_umc_poison_handler(adev, block, reset);
+   amdgpu_umc_pasid_poison_handler(adev, block, 0, NULL, NULL,
+ reset);
 }

 int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev, diff 
--git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index ad50c7bbc326..54e15994d02b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -401,6 +401,11 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device 
*adev,
struct tile_config *config);  void 
amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t reset);
+
+void amdgpu_amdkfd_ras_pasid_poison_consumption_handler(struct amdgpu_device 
*adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t
+reset);
+
 bool amdgpu_amdkfd_is_fed(struct amdgpu_device *adev);  bool 
amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem 
*mem);  void amdgpu_amdkfd_block_mmu_notifications(void *p); diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index dcda3d24bee3..8ebbca9e2e22 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -252,8 +252,9 @@ int amdgpu_umc_bad_page_polling_timeout(struct 
amdgpu_device *adev,
return 0;
 }

-int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
-   enum amdgpu_ras_block block, uint32_t reset)
+int amdgpu_umc_pasid_poison_handler(struct amdgpu_device *adev,
+   enum amdgpu_ras_block block, uint16_t pasid,
+   pasid_notify pasid_fn, void *data, uint32_t
+reset)
 {
int ret = AMDGPU_RAS_SUCCESS;

@@ -291,16 +292,14 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,

amdgpu_ras_error_data_fini(_data);
} else {
-   if (reset) {
-   amdgpu_umc_bad_page_polling_timeout(adev,
- 

RE: [PATCH] drm/amdgpu: Fix ras mode2 reset failure in ras aca mode

2024-04-23 Thread Chai, Thomas
[AMD Official Use Only - General]

OK


-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Tuesday, April 23, 2024 11:27 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH] drm/amdgpu: Fix ras mode2 reset failure in ras aca mode

[AMD Official Use Only - General]

Shall we move the check to the aca helper function?

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Tuesday, April 23, 2024 11:14
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH] drm/amdgpu: Fix ras mode2 reset failure in ras aca mode

Fix ras mode2 reset failure in ras aca mode for sdma v4_4_2 and gfx v9_4_3.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c  | 4   
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 4 
 2 files changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 0e429b7ed036..c8bc34aafdd7 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -4324,6 +4324,10 @@ static int gfx_v9_4_3_ras_late_init(struct amdgpu_device 
*adev, struct ras_commo
if (r)
return r;

+   /* in resume phase, no need to create aca fs node */
+   if (adev->in_suspend || amdgpu_in_reset(adev))
+   return 0;
+
r = amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__GFX,
&gfx_v9_4_3_aca_info,
NULL);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 7ea209b68154..77ae943745fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -2249,6 +2249,10 @@ static int sdma_v4_4_2_ras_late_init(struct 
amdgpu_device *adev, struct ras_comm
if (r)
return r;

+   /* in resume phase, no need to create aca fs node */
+   if (adev->in_suspend || amdgpu_in_reset(adev))
+   return 0;
+
return amdgpu_ras_bind_aca(adev, AMDGPU_RAS_BLOCK__SDMA,
   &sdma_v4_4_2_aca_info, NULL);
 }
--
2.34.1




RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

2024-04-22 Thread Chai, Thomas
[AMD Official Use Only - General]

update

-
Best Regards,
Thomas

-Original Message-
From: amd-gfx  On Behalf Of Chai, Thomas
Sent: Monday, April 22, 2024 5:21 PM
To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Li, Candice ; 
Wang, Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

[AMD Official Use Only - General]

[AMD Official Use Only - General]

-
Best Regards,
Thomas

-Original Message-
From: Zhou1, Tao 
Sent: Monday, April 22, 2024 4:14 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Li, Candice ; 
Wang, Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

[AMD Official Use Only - General]

> -Original Message-
> From: Chai, Thomas 
> Sent: Thursday, April 18, 2024 10:59 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking
> ; Zhou1, Tao ; Li, Candice
> ; Wang, Yang(Kevin) ;
> Yang, Stanley ; Chai, Thomas
> 
> Subject: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
>
> Retire bad pages for umc v12_0.
>
> Signed-off-by: YiPeng Chai 
> ---
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57
> +-
>  1 file changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 6c2b61ef5b57..bd917eb6ea24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -28,6 +28,8 @@
>  #include "umc/umc_12_0_0_sh_mask.h"
>  #include "mp/mp_13_0_6_sh_mask.h"
>
> +#define MAX_ECC_NUM_PER_RETIREMENT  16

> [Tao] we already have UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL for this
> purpose

[Thomas] This is defined from the eeprom point of view: the maximum number of eeprom
table entries written at a time.
The actual number of data items that need to be written to eeprom may be less than
this value (e.g. later, writing the data items before address conversion), or more
than this value.
If it is less, the actual number is written; if it is more, the entries are written
in multiple batches.
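As a plain standalone illustration of that batching rule (the constants and helper names are made up), entries are flushed in chunks no larger than the per-retirement limit, so a burst bigger than the limit simply takes several writes:

#include <stdint.h>
#include <stdio.h>

#define MAX_ENTRIES_PER_WRITE 16   /* plays the role of MAX_ECC_NUM_PER_RETIREMENT */

/* Pretend eeprom append: just report how many entries this call handles. */
static int eeprom_append(const uint64_t *entries, int count)
{
	(void)entries;
	printf("writing %d entries\n", count);
	return 0;
}

/* Write 'total' entries in chunks bounded by the per-write limit. */
static int save_bad_pages(const uint64_t *entries, int total)
{
	int done = 0, ret = 0;

	while (done < total && !ret) {
		int chunk = total - done;

		if (chunk > MAX_ENTRIES_PER_WRITE)
			chunk = MAX_ENTRIES_PER_WRITE;
		ret = eeprom_append(entries + done, chunk);
		done += chunk;
	}
	return ret;
}

int main(void)
{
	uint64_t pages[40] = { 0 };

	return save_bad_pages(pages, 40);   /* 16 + 16 + 8 */
}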


> +
>  static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
>   uint32_t node_inst,
>   uint32_t umc_inst, @@ -633,6
> +635,58 @@ static int umc_v12_0_update_ecc_status(struct
> amdgpu_device *adev,
>   return 0;
>  }
>
> +static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
> + struct ras_ecc_err *ecc_err, void
> *ras_error_status) {
> + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> + uint32_t i = 0;
> + int ret = 0;
> +
> + if (!err_data || !ecc_err)
> + return -EINVAL;
> +
> + for (i = 0; i < ecc_err->err_pages.count; i++) {
> + ret = amdgpu_umc_fill_error_record(err_data,
> + ecc_err->addr,
> + ecc_err->err_pages.pfn[i] <<
> AMDGPU_GPU_PAGE_SHIFT,
> + MCA_IPID_2_UMC_CH(ecc_err->ipid),
> + MCA_IPID_2_UMC_INST(ecc_err->ipid));
> + if (ret)
> + break;
> + }
> +
> + err_data->de_count++;
> +
> + return ret;
> +}
> +
> +static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
> + void *ras_error_status) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
> + struct radix_tree_root *ecc_tree;
> + int new_detected, ret, i;
> +
> + ecc_tree = &con->umc_ecc_log.de_page_tree;
> +
> + mutex_lock(&con->umc_ecc_log.lock);
> + new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
> + 0, ARRAY_SIZE(entries),
> UMC_ECC_NEW_DETECTED_TAG);
> + for (i = 0; i < new_detected; i++) {
> + if (!entries[i])
> + continue;
> +
> + ret = umc_v12_0_fill_error_record(adev, entries[i],
> ras_error_status);
> + if (ret) {
> + dev_err(adev->dev, "Fail to fill umc error
> + record,
> ret:%d\n", ret);
> + break;
> + }
> + radix_tree_tag_clear(ecc_tree, entries[i]->hash_index,
> UMC_ECC_NEW_DETECTED_TAG);
> + }
> + mutex_unlock(&con->umc_ecc_log.lock);
> +}
> +
>  struct amdgpu_umc_ras umc_v12_0_ras = {
>   .r

RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

2024-04-22 Thread Chai, Thomas
[AMD Official Use Only - General]

-
Best Regards,
Thomas

-Original Message-
From: Zhou1, Tao 
Sent: Monday, April 22, 2024 4:14 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Li, Candice ; 
Wang, Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0

[AMD Official Use Only - General]

> -Original Message-
> From: Chai, Thomas 
> Sent: Thursday, April 18, 2024 10:59 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking
> ; Zhou1, Tao ; Li, Candice
> ; Wang, Yang(Kevin) ;
> Yang, Stanley ; Chai, Thomas
> 
> Subject: [PATCH 10/15] drm/amdgpu: retire bad pages for umc v12_0
>
> Retire bad pages for umc v12_0.
>
> Signed-off-by: YiPeng Chai 
> ---
>  drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57
> +-
>  1 file changed, 55 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> index 6c2b61ef5b57..bd917eb6ea24 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
> @@ -28,6 +28,8 @@
>  #include "umc/umc_12_0_0_sh_mask.h"
>  #include "mp/mp_13_0_6_sh_mask.h"
>
> +#define MAX_ECC_NUM_PER_RETIREMENT  16

> [Tao] we already have UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL for this purpose

[Thomas] This is defined from the eeprom point of view: the maximum number of eeprom
table entries written at a time.
The actual number of data items that need to be written to eeprom may be less than
this value, or more than this value (e.g. later, writing the data items before
address conversion).
If it is less, the actual number is written; if it is more, the entries are written
in multiple batches.


> +
>  static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
>   uint32_t node_inst,
>   uint32_t umc_inst, @@ -633,6
> +635,58 @@ static int umc_v12_0_update_ecc_status(struct
> amdgpu_device *adev,
>   return 0;
>  }
>
> +static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
> + struct ras_ecc_err *ecc_err, void
> *ras_error_status) {
> + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
> + uint32_t i = 0;
> + int ret = 0;
> +
> + if (!err_data || !ecc_err)
> + return -EINVAL;
> +
> + for (i = 0; i < ecc_err->err_pages.count; i++) {
> + ret = amdgpu_umc_fill_error_record(err_data,
> + ecc_err->addr,
> + ecc_err->err_pages.pfn[i] <<
> AMDGPU_GPU_PAGE_SHIFT,
> + MCA_IPID_2_UMC_CH(ecc_err->ipid),
> + MCA_IPID_2_UMC_INST(ecc_err->ipid));
> + if (ret)
> + break;
> + }
> +
> + err_data->de_count++;
> +
> + return ret;
> +}
> +
> +static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
> + void *ras_error_status) {
> + struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
> + struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
> + struct radix_tree_root *ecc_tree;
> + int new_detected, ret, i;
> +
> + ecc_tree = &con->umc_ecc_log.de_page_tree;
> +
> + mutex_lock(&con->umc_ecc_log.lock);
> + new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
> + 0, ARRAY_SIZE(entries),
> UMC_ECC_NEW_DETECTED_TAG);
> + for (i = 0; i < new_detected; i++) {
> + if (!entries[i])
> + continue;
> +
> + ret = umc_v12_0_fill_error_record(adev, entries[i],
> ras_error_status);
> + if (ret) {
> + dev_err(adev->dev, "Fail to fill umc error
> + record,
> ret:%d\n", ret);
> + break;
> + }
> + radix_tree_tag_clear(ecc_tree, entries[i]->hash_index,
> UMC_ECC_NEW_DETECTED_TAG);
> + }
> + mutex_unlock(&con->umc_ecc_log.lock);
> +}
> +
>  struct amdgpu_umc_ras umc_v12_0_ras = {
>   .ras_block = {
>   .hw_ops = _v12_0_ras_hw_ops, @@ -640,8 +694,7 @@
> struct amdgpu_umc_ras umc_v12_0_ras = {
>   },
>   .err_cnt_init = umc_v12_0_err_cnt_init,
>   .query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
> - .ecc_info_query_ras_error_count =
> umc_v12_0_ecc_info_query_ras_error_count,
> - .ecc_info_query_ras_error_address =
> umc_v12_0_ecc_info_query_ras_error_address,
> + .ecc_info_query_ras_error_address =
> umc_v12_0_query_ras_ecc_err_addr,
>   .check_ecc_err_status = umc_v12_0_check_ecc_err_status,
>   .update_ecc_status = umc_v12_0_update_ecc_status,  };
> --
> 2.34.1




RE: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page

2024-04-21 Thread Chai, Thomas
[AMD Official Use Only - General]

Ping 


-
Best Regards,
Thomas

-Original Message-
From: Chai, Thomas 
Sent: Thursday, April 18, 2024 10:59 AM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page

Use new interface to reserve bad page.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d1a2ab944b7d..dee66db10fa2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2548,9 +2548,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
goto out;
}

-   amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
-   bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
-   AMDGPU_GPU_PAGE_SIZE);
+   amdgpu_ras_reserve_page(adev, bps[i].retired_page);

memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
data->count++;
--
2.34.1



RE: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page

2024-04-18 Thread Chai, Thomas
[AMD Official Use Only - General]

-
Best Regards,
Thomas

-Original Message-
From: Christian König 
Sent: Thursday, April 18, 2024 5:01 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley 
Subject: Re: [PATCH 15/15] drm/amdgpu: Use new interface to reserve bad page

Am 18.04.24 um 04:58 schrieb YiPeng Chai:
> Use new interface to reserve bad page.
>
> Signed-off-by: YiPeng Chai 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 +---
>   1 file changed, 1 insertion(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index d1a2ab944b7d..dee66db10fa2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2548,9 +2548,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
>   goto out;
>   }
>
> - amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
> - bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
> - AMDGPU_GPU_PAGE_SIZE);

> Where is the call to reserve the VRAM range moved to now?

[Thomas] It is called in amdgpu_ras_reserve_page; for amdgpu_ras_reserve_page, refer
to "[PATCH 01/15] drm/amdgpu: Add interface to reserve bad page".

Regards,
Christian.

> + amdgpu_ras_reserve_page(adev, bps[i].retired_page);
>
> + memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
>   data->count++;



RE: [PATCH V2] drm/amdgpu: Fix incorrect return value

2024-04-14 Thread Chai, Thomas
[AMD Official Use Only - General]

Hi Christian:
   If an ECC error occurs at an address, HW generates an interrupt so that SW 
retires all pages located in the same physical row as the error address, based 
on the physical characteristics of the memory device.
   Therefore, if other pages on that same physical row later hit ECC errors as 
well, HW generates further interrupts asking SW to retire those same pages 
again, so amdgpu_vram_mgr_reserve_range ends up being called multiple times to 
reserve the same pages.

I think it's more appropriate to do the status check inside the function.
If the check is not done at the function entry, people who are not familiar 
with this part of the code can easily make mistakes when calling the function.
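For illustration, a minimal caller-side sketch of the behavior being discussed 
(error handling trimmed; addr stands for an arbitrary retired-page address):

	/* with the status check inside the function, a repeated call with
	 * the same address is harmless instead of queueing a duplicate node
	 */
	amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr, addr,
				      AMDGPU_GPU_PAGE_SIZE);
	amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr, addr,
				      AMDGPU_GPU_PAGE_SIZE);

	/* 0 once reserved, -EBUSY while still pending, -ENOENT if unknown */
	ret = amdgpu_vram_mgr_query_page_status(&adev->mman.vram_mgr, addr);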


-
Best Regards,
Thomas

-Original Message-
From: Christian König 
Sent: Friday, April 12, 2024 5:24 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley 
Subject: Re: [PATCH V2] drm/amdgpu: Fix incorrect return value

Am 12.04.24 um 10:55 schrieb YiPeng Chai:
> [Why]
>After calling amdgpu_vram_mgr_reserve_range multiple times with the
> same address, calling amdgpu_vram_mgr_query_page_status will always
> return -EBUSY.
>From the second call to amdgpu_vram_mgr_reserve_range, the same
> address will be added to the reservations_pending list again and is
> never moved to the reserved_pages list because the address had been
> reserved.

Well just to make it clear that approach is a NAK until my concerns are solved.

Regards,
Christian.

>
> [How]
>First add the address status check before calling
> amdgpu_vram_mgr_do_reserve, if the address is already reserved, do
> nothing; If the address is already in the reservations_pending list,
> directly reserve memory; only add new nodes for the addresses that are
> not in the reserved_pages list and reservations_pending list.
>
> V2:
>   Avoid repeated locking/unlocking.
>
> Signed-off-by: YiPeng Chai 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 25 +---
>   1 file changed, 16 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 1e36c428d254..a636d3f650b1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct
> ttm_resource_manager *man)
>
>   dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
>   rsv->start, rsv->size);
> -
>   vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
>   atomic64_add(vis_usage, &mgr->vis_usage);
>   spin_lock(&man->bdev->lru_lock);
> @@ -340,19 +339,27 @@ int amdgpu_vram_mgr_reserve_range(struct 
> amdgpu_vram_mgr *mgr,
> uint64_t start, uint64_t size)
>   {
>   struct amdgpu_vram_reservation *rsv;
> + int ret = 0;
>
> - rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> - if (!rsv)
> - return -ENOMEM;
> + ret = amdgpu_vram_mgr_query_page_status(mgr, start);
> + if (!ret)
> + return 0;
>
> - INIT_LIST_HEAD(&rsv->allocated);
> - INIT_LIST_HEAD(&rsv->blocks);
> + if (ret == -ENOENT) {
> + rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> + if (!rsv)
> + return -ENOMEM;
>
> - rsv->start = start;
> - rsv->size = size;
> + INIT_LIST_HEAD(&rsv->allocated);
> + INIT_LIST_HEAD(&rsv->blocks);
> +
> + rsv->start = start;
> + rsv->size = size;
> + }
>
>   mutex_lock(&mgr->lock);
> - list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> + if (ret == -ENOENT)
> + list_add_tail(&rsv->blocks, &mgr->reservations_pending);
>   amdgpu_vram_mgr_do_reserve(&mgr->manager);
>   mutex_unlock(&mgr->lock);
>



RE: [PATCH] drm/amdgpu: Fix incorrect return value

2024-04-09 Thread Chai, Thomas
[AMD Official Use Only - General]

OK


-
Best Regards,
Thomas

-Original Message-
From: Zhou1, Tao 
Sent: Tuesday, April 9, 2024 10:52 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Li, Candice ; 
Wang, Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value

[AMD Official Use Only - General]

> -Original Message-
> From: Chai, Thomas 
> Sent: Wednesday, April 3, 2024 3:07 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking
> ; Zhou1, Tao ; Li, Candice
> ; Wang, Yang(Kevin) ;
> Yang, Stanley ; Chai, Thomas
> 
> Subject: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [Why]
>   After calling amdgpu_vram_mgr_reserve_range multiple times with the
> same address, calling amdgpu_vram_mgr_query_page_status will always
> return - EBUSY.
>   From the second call to amdgpu_vram_mgr_reserve_range, the same
> address will be added to the reservations_pending list again and is
> never moved to the reserved_pages list because the address had been reserved.
>
> [How]
>   First add the address status check before calling
> amdgpu_vram_mgr_do_reserve, if the address is already reserved, do
> nothing; If the address is already in the reservations_pending list,
> directly reserve memory; only add new nodes for the addresses that are
> not in the reserved_pages list and reservations_pending list.
>
> Signed-off-by: YiPeng Chai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 28
> +---
>  1 file changed, 19 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 1e36c428d254..0bf3f4092900 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct
> ttm_resource_manager *man)
>
>   dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
>   rsv->start, rsv->size);
> -
>   vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
>   atomic64_add(vis_usage, &mgr->vis_usage);
>   spin_lock(&man->bdev->lru_lock); @@ -340,19 +339,30 @@
> int amdgpu_vram_mgr_reserve_range(struct
> amdgpu_vram_mgr *mgr,
> uint64_t start, uint64_t size)  {
>   struct amdgpu_vram_reservation *rsv;
> + int ret = 0;
>
> - rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> - if (!rsv)
> - return -ENOMEM;
> + ret = amdgpu_vram_mgr_query_page_status(mgr, start);
> + if (!ret)
> + return 0;
> +
> + if (ret == -ENOENT) {
> + rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> + if (!rsv)
> + return -ENOMEM;
>
> - INIT_LIST_HEAD(&rsv->allocated);
> - INIT_LIST_HEAD(&rsv->blocks);
> + INIT_LIST_HEAD(&rsv->allocated);
> + INIT_LIST_HEAD(&rsv->blocks);
>
> - rsv->start = start;
> - rsv->size = size;
> + rsv->start = start;
> + rsv->size = size;
> +
> + mutex_lock(&mgr->lock);
> + list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> + mutex_unlock(&mgr->lock);

[Tao] we can drop the mutex_unlock and add if (ret != -ENOENT) for the second 
mutex_lock to avoid unlocking/locking repeatedly.
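/* One way to avoid the repeated unlock/lock being discussed here -- take
 * mgr->lock once and only queue the new node when the address was previously
 * unknown (roughly what the V2 patch in this thread settles on):
 *
 *	mutex_lock(&mgr->lock);
 *	if (ret == -ENOENT)
 *		list_add_tail(&rsv->blocks, &mgr->reservations_pending);
 *	amdgpu_vram_mgr_do_reserve(&mgr->manager);
 *	mutex_unlock(&mgr->lock);
 */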

> +
> + }
>
>   mutex_lock(&mgr->lock);
> - list_add_tail(&rsv->blocks, &mgr->reservations_pending);
>   amdgpu_vram_mgr_do_reserve(&mgr->manager);
>   mutex_unlock(&mgr->lock);
>
> --
> 2.34.1




RE: [PATCH] drm/amdgpu: Fix incorrect return value

2024-04-08 Thread Chai, Thomas
[AMD Official Use Only - General]

-
Best Regards,
Thomas

-Original Message-
From: Zhou1, Tao 
Sent: Monday, April 8, 2024 4:41 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Li, Candice ; 
Wang, Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value

[AMD Official Use Only - General]

> -Original Message-
> From: Chai, Thomas 
> Sent: Sunday, April 7, 2024 10:21 AM
> To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Li, Candice
> ; Wang, Yang(Kevin) ;
> Yang, Stanley 
> Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [AMD Official Use Only - General]
>
> -
> Best Regards,
> Thomas
>
> -Original Message-
> From: Zhou1, Tao 
> Sent: Wednesday, April 3, 2024 6:36 PM
> To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Li, Candice
> ; Wang, Yang(Kevin) ;
> Yang, Stanley 
> Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [AMD Official Use Only - General]
>
> > -Original Message-
> > From: Chai, Thomas 
> > Sent: Wednesday, April 3, 2024 3:07 PM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Chai, Thomas ; Zhang, Hawking
> > ; Zhou1, Tao ; Li, Candice
> > ; Wang, Yang(Kevin) ;
> > Yang, Stanley ; Chai, Thomas
> > 
> > Subject: [PATCH] drm/amdgpu: Fix incorrect return value
> >
> > [Why]
> >   After calling amdgpu_vram_mgr_reserve_range multiple times with
> > the same address, calling amdgpu_vram_mgr_query_page_status will
> > always return - EBUSY.
>
> >[Tao] could you explain why we call amdgpu_vram_mgr_reserve_range
> >multiple
> times with the same  address? IIRC, we skip duplicate address before
> reserve memory.
>
> [Thomas]
>When a poison creation interrupt is received, some of the poisoned
> addresses may already have been allocated to processes, so reserving that
> memory will fail.
> That memory is reserved again after killing the poisoned process in the
> subsequent poison consumption interrupt handler, so
> amdgpu_vram_mgr_reserve_range needs to be called multiple times with the
> same address.
>
> >   From the second call to amdgpu_vram_mgr_reserve_range, the same
> > address will be added to the reservations_pending list again and is
> > never moved to the reserved_pages list because the address had been
> reserved.

>[Tao] but if a page is added to the reservations_pending list, it should also 
>be put in the data->bps array, and when we call amdgpu_ras_add_bad_pages 
>again, amdgpu_ras_check_bad_page_unlock could ignore this page.
So, other than amdgpu_ras_add_bad_pages, do you want to call 
amdgpu_vram_mgr_reserve_range somewhere else?

[Thomas] Yes. After amdgpu_ras_add_bad_pages is called, the bad pages are saved 
to eeprom, and when a large number of bad pages need to be reserved this delays 
the subsequent memory reservations.
I want to call amdgpu_vram_mgr_reserve_range to reserve the memory immediately 
when the driver receives a poison creation interrupt; that reduces the 
probability of the bad pages being allocated again, and saving the bad pages to 
eeprom can then be done slowly.
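For illustration, a rough sketch of the intended split (the deferred worker 
hook named below is hypothetical, not part of this series):

	/* poison creation interrupt path: reserve the bad page right away
	 * so it cannot be handed out again ...
	 */
	amdgpu_vram_mgr_reserve_range(&adev->mman.vram_mgr,
			retired_pfn << AMDGPU_GPU_PAGE_SHIFT,
			AMDGPU_GPU_PAGE_SIZE);

	/* ... and leave the slow eeprom update (amdgpu_ras_add_bad_pages plus
	 * the eeprom save) to a deferred worker; this hook is hypothetical.
	 */
	schedule_work(&con->page_retirement_work);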

> >
> > [How]
> >   First add the address status check before calling
> > amdgpu_vram_mgr_do_reserve, if the address is already reserved, do
> > nothing; If the address is already in the reservations_pending list,
> > directly reserve memory; only add new nodes for the addresses that
> > are not in the reserved_pages list and reservations_pending list.
> >
> > Signed-off-by: YiPeng Chai 
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 28
> > +---
> >  1 file changed, 19 insertions(+), 9 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > index 1e36c428d254..0bf3f4092900 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> > @@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct
> > ttm_resource_manager *man)
> >
> >   dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
> >   rsv->start, rsv->size);
> > -
> >   vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
> >   atomic64_add(vis_usage, &mgr->vis_usage);
> >   spin_lock(&man->bdev->lru_lock); @@ -340,19 +339,30 @@
> > int amdgpu_vram_mgr_reserve_range(struct
> > amdgpu_vram_mgr *mgr,
> > uint64_t start, uint64_

RE: [PATCH] drm/amdgpu: Fix incorrect return value

2024-04-06 Thread Chai, Thomas
[AMD Official Use Only - General]

-
Best Regards,
Thomas

-Original Message-
From: Zhou1, Tao 
Sent: Wednesday, April 3, 2024 6:36 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Li, Candice ; 
Wang, Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH] drm/amdgpu: Fix incorrect return value

[AMD Official Use Only - General]

> -Original Message-
> From: Chai, Thomas 
> Sent: Wednesday, April 3, 2024 3:07 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking
> ; Zhou1, Tao ; Li, Candice
> ; Wang, Yang(Kevin) ;
> Yang, Stanley ; Chai, Thomas
> 
> Subject: [PATCH] drm/amdgpu: Fix incorrect return value
>
> [Why]
>   After calling amdgpu_vram_mgr_reserve_range multiple times with the
> same address, calling amdgpu_vram_mgr_query_page_status will always
> return - EBUSY.

>[Tao] could you explain why we call amdgpu_vram_mgr_reserve_range multiple 
>times with the same  address? IIRC, we skip duplicate address before reserve 
>memory.

[Thomas]
   When a poison creation interrupt is received, some of the poisoned addresses 
may already have been allocated to processes, so reserving that memory will fail.
That memory is reserved again after killing the poisoned process in the 
subsequent poison consumption interrupt handler, so 
amdgpu_vram_mgr_reserve_range needs to be called multiple times with the 
same address.

>   From the second call to amdgpu_vram_mgr_reserve_range, the same
> address will be added to the reservations_pending list again and is
> never moved to the reserved_pages list because the address had been reserved.
>
> [How]
>   First add the address status check before calling
> amdgpu_vram_mgr_do_reserve, if the address is already reserved, do
> nothing; If the address is already in the reservations_pending list,
> directly reserve memory; only add new nodes for the addresses that are
> not in the reserved_pages list and reservations_pending list.
>
> Signed-off-by: YiPeng Chai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 28
> +---
>  1 file changed, 19 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 1e36c428d254..0bf3f4092900 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -317,7 +317,6 @@ static void amdgpu_vram_mgr_do_reserve(struct
> ttm_resource_manager *man)
>
>   dev_dbg(adev->dev, "Reservation 0x%llx - %lld, Succeeded\n",
>   rsv->start, rsv->size);
> -
>   vis_usage = amdgpu_vram_mgr_vis_size(adev, block);
>   atomic64_add(vis_usage, >vis_usage);
>   spin_lock(>bdev->lru_lock); @@ -340,19 +339,30 @@
> int amdgpu_vram_mgr_reserve_range(struct
> amdgpu_vram_mgr *mgr,
> uint64_t start, uint64_t size)  {
>   struct amdgpu_vram_reservation *rsv;
> + int ret = 0;
>
> - rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> - if (!rsv)
> - return -ENOMEM;
> + ret = amdgpu_vram_mgr_query_page_status(mgr, start);
> + if (!ret)
> + return 0;
> +
> + if (ret == -ENOENT) {
> + rsv = kzalloc(sizeof(*rsv), GFP_KERNEL);
> + if (!rsv)
> + return -ENOMEM;
>
> - INIT_LIST_HEAD(&rsv->allocated);
> - INIT_LIST_HEAD(&rsv->blocks);
> + INIT_LIST_HEAD(&rsv->allocated);
> + INIT_LIST_HEAD(&rsv->blocks);
>
> - rsv->start = start;
> - rsv->size = size;
> + rsv->start = start;
> + rsv->size = size;
> +
> + mutex_lock(&mgr->lock);
> + list_add_tail(&rsv->blocks, &mgr->reservations_pending);
> + mutex_unlock(&mgr->lock);
> +
> + }
>
>   mutex_lock(&mgr->lock);
> - list_add_tail(&rsv->blocks, &mgr->reservations_pending);
>   amdgpu_vram_mgr_do_reserve(&mgr->manager);
>   mutex_unlock(&mgr->lock);
>
> --
> 2.34.1




RE: [PATCH V2 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-18 Thread Chai, Thomas
[AMD Official Use Only - General]

OK, I will remove it.


-
Best Regards,
Thomas

From: Wang, Yang(Kevin) 
Sent: Thursday, January 18, 2024 9:15 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Yang, Stanley 

Subject: Re: [PATCH V2 1/5] drm/amdgpu: Add log info for umc_v12_0 and 
smu_v13_0_6


[AMD Official Use Only - General]

The title and description don't seem right.

Remove smu?

Best Regards,
Kevin

From: Chai, Thomas 
Sent: Thursday, January 18, 2024 14:43
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; Chai, Thomas 
Subject: [PATCH V2 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

v2:
 Delete redundant logs.

Signed-off-by: YiPeng Chai mailto:yipeng.c...@amd.com>>
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)
 {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
 return (amdgpu_ras_is_poison_mode_supported(adev) &&
 (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Val) == 1) &&
 (REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
 uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
 int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
 return; /* Presumably process exited. */
+   }

 user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
 if (unlikely(user_gpu_id == -EINVAL)) {
@@ -1334,6 +1336,8 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
 }
 }

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
 rcu_read_unlock();

 /* user application will handle SIGBUS signal */
--
2.34.1


RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]

OK


-
Best Regards,
Thomas

-Original Message-
From: Wang, Yang(Kevin) 
Sent: Thursday, January 18, 2024 11:00 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; Li, 
Candice ; Yang, Stanley 
Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

[AMD Official Use Only - General]

-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 4:21 PM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
@@ -2406,10 +2406,23 @@ static int smu_v13_0_6_get_valid_mca_count(struct 
smu_context *smu, enum amdgpu_

ret = smu_cmn_send_smc_msg(smu, msg, count);
if (ret) {
+   dev_err(smu->adev->dev, "%s(%d) failed to query %s MCA count, 
ret:%d\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   ret);
*count = 0;
return ret;
}

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);


[Kevin]:
Please make the following function public, then use this helper function to 
get the msg name string.
- smu_get_message_name()

Best Regards,
Kevin
+
return 0;
 }

--
2.34.1




RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]


-
Best Regards,
Thomas


_
From: Zhou1, Tao 
Sent: Thursday, January 18, 2024 11:24 AM
To: Chai, Thomas ; Zhang, Hawking ; 
amd-gfx@lists.freedesktop.org
Cc: Li, Candice ; Wang, Yang(Kevin) 
; Yang, Stanley 
Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


[AMD Official Use Only - General]





  _
  From: Chai, Thomas 
  Sent: Thursday, January 18, 2024 11:06 AM
  To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org
  Cc: Zhou1, Tao ; Li, Candice ; Wang, Yang(Kevin) ; Yang, Stanley 
  Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


  [AMD Official Use Only - General]






  -
  Best Regards,
  Thomas


  _
  From: Zhang, Hawking 
  Sent: Wednesday, January 17, 2024 7:54 PM
  To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
  Cc: Zhou1, Tao ; Li, Candice ; Wang, Yang(Kevin) ; Yang, Stanley 
  Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


  [AMD Official Use Only - General]



  Please check my comments inline

      Regards,
  Hawking

  -Original Message-
  From: Chai, Thomas 
  Sent: Tuesday, January 16, 2024 16:21
  To: amd-gfx@lists.freedesktop.org
  Cc: Chai, Thomas ; Zhang, Hawking ; Zhou1, Tao ; Li, Candice ; 
Wang, Yang(Kevin) ; Yang, Stanley ; Chai, Thomas 
  Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning

  Use asynchronous polling to handle umc_v12_0 poisoning.

  Signed-off-by: YiPeng Chai 
  ---
   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++-
   drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   3 +
   3 files changed, 120 insertions(+), 31 deletions(-)

  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  index 856206e95842..44929281840e 100644
  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
  @@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
   /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
   #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)

  +#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
  +
   enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
  @@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void 
*param)
atomic_read(&con->page_retirement_req_cnt));

atomic_dec(&con->page_retirement_req_cnt);
  +
  + amdgpu_umc_poison_retire_page_polling_timeout(adev,
  + false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}

return 0;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  index 9d1cf41cf483..2dde29cb807d 100644
  --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
  @@ -23,6 +23,7 @@

   #include "amdgpu.h"
   #include "umc_v6_7.h"
  +#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms

   static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t 
err_addr, @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct 
amdgpu_device *adev,
return ret;
   }

  -static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
  - void *ras_error_status,
  - struct amdgpu_iv_entry *entry,
  - bool reset)
  +static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev

RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 poisoning

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]


-
Best Regards,
Thomas


_
From: Zhang, Hawking 
Sent: Wednesday, January 17, 2024 7:54 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley 
Subject: RE: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle 
umc_v12_0 poisoning


[AMD Official Use Only - General]



Please check my comments inline

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; Chai, Thomas 
Subject: [PATCH 3/5] drm/amdgpu: Use asynchronous polling to handle umc_v12_0 
poisoning

Use asynchronous polling to handle umc_v12_0 poisoning.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |   5 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 143 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   3 +
 3 files changed, 120 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 856206e95842..44929281840e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -118,6 +118,8 @@ const char *get_ras_block_str(struct ras_common_if 
*ras_block)
 /* typical ECC bad page rate is 1 bad page per 100MB VRAM */
 #define RAS_BAD_PAGE_COVER  (100 * 1024 * 1024ULL)

+#define MAX_UMC_POISON_POLLING_TIME_ASYNC  100  //ms
+
 enum amdgpu_ras_retire_page_reservation {
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2670,6 +2672,9 @@ static int amdgpu_ras_page_retirement_thread(void *param)
atomic_read(&con->page_retirement_req_cnt));

atomic_dec(&con->page_retirement_req_cnt);
+
+   amdgpu_umc_poison_retire_page_polling_timeout(adev,
+   false, MAX_UMC_POISON_POLLING_TIME_ASYNC);
}

return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 9d1cf41cf483..2dde29cb807d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -23,6 +23,7 @@

 #include "amdgpu.h"
 #include "umc_v6_7.h"
+#define MAX_UMC_POISON_POLLING_TIME_SYNC   20  //ms

 static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data, uint64_t 
err_addr, @@ -85,17 +86,14 @@ int amdgpu_umc_page_retirement_mca(struct 
amdgpu_device *adev,
return ret;
 }

-static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
-   void *ras_error_status,
-   struct amdgpu_iv_entry *entry,
-   bool reset)
+static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
+   void *ras_error_status)
 {
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
int ret = 0;
unsigned long err_count;
-
-   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   mutex_lock(&con->page_retirement_lock);
ret = amdgpu_dpm_get_ecc_info(adev, (void *)&(con->umc_ecc));
if (ret == -EOPNOTSUPP) {
if (adev->umc.ras && adev->umc.ras->ras_block.hw_ops && @@ 
-163,19 +161,86 @@ static int amdgpu_umc_do_page_retirement(struct 
amdgpu_device *adev,
con->update_channel_flag = false;
}
}
-
-   if (reset) {
-   /* use mode-2 reset for poison consumption */
-   if (!entry)
-   con->gpu_reset_flags |= 
AMDGPU_RAS_GPU_RESET_MODE2_RESET;
-   amdgpu_ras_reset_gpu(adev);
-   }
}

kfree(err_data->err_addr);
+
+   mutex_unlock(&con->page_retirement_lock);
+}
+
+static int amdgpu_umc_do_page_retirement(struct amdgpu_device *adev,
+   void *ras_error_status,
+   struct amdgpu_iv_entry *entry,
+   bool reset)
+{
+   struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
+   amdgpu_umc_handle_bad_pages(adev, ras_error_sta

RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

2024-01-17 Thread Chai, Thomas
[AMD Official Use Only - General]

OK


-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Wednesday, January 17, 2024 7:40 PM
To: Zhang, Hawking ; Chai, Thomas ; 
amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Li, Candice 
Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

[AMD Official Use Only - General]

Please ignore my first comment. It isn't necessarily associated with the socket 
id in the UMC MCA status log at this stage.

Regards,
Hawking

-Original Message-
From: amd-gfx  On Behalf Of Zhang, 
Hawking
Sent: Wednesday, January 17, 2024 19:12
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Li, Candice 
Subject: RE: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

[AMD Official Use Only - General]

[AMD Official Use Only - General]

+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,

Please also print out socket id for UMC MCA status.

+   dev_info(smu->adev->dev, "MSG %s(%d) query %s MCA count result:%u\n",
+   (msg == SMU_MSG_QueryValidMcaCeCount) ?
+   "SMU_MSG_QueryValidMcaCeCount" : 
"SMU_MSG_QueryValidMcaCount",
+   msg,
+   (msg == SMU_MSG_QueryValidMcaCeCount) ? "CE" : "UE",
+   *count);
+

This seems redundant, or it was added for debugging purposes. We can drop this 
print since there is already a log covering the failure case.

Regards,
Hawking


-Original Message-
From: Chai, Thomas 
Sent: Tuesday, January 16, 2024 16:21
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Wang, 
Yang(Kevin) ; Yang, Stanley ; 
Chai, Thomas 
Subject: [PATCH 1/5] drm/amdgpu: Add log info for umc_v12_0 and smu_v13_0_6

Add log info for umc_v12_0 and smu_v13_0_6.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/umc_v12_0.c  | 11 +++
 drivers/gpu/drm/amd/amdkfd/kfd_events.c |  6 +-
 .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c| 13 +
 3 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c 
b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 6423dca5b777..fa2168f1d3bf 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -91,6 +91,17 @@ static void umc_v12_0_reset_error_count(struct amdgpu_device 
*adev)

 bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t 
mc_umc_status)  {
+   dev_info(adev->dev,
+   "MCA_UMC_STATUS(0x%llx): Val:%llu, Poison:%llu, Deferred:%llu, 
PCC:%llu, UC:%llu, TCC:%llu\n",
+   mc_umc_status,
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Poison),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, PCC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, UC),
+   REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, TCC)
+   );
+
return (amdgpu_ras_is_poison_mode_supported(adev) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, Val) 
== 1) &&
(REG_GET_FIELD(mc_umc_status, MCA_UMC_UMC0_MCUMC_STATUST0, 
Deferred) == 1)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 11923964ce9a..51bb98db5d7a 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -1297,8 +1297,10 @@ void kfd_signal_poison_consumed_event(struct kfd_node 
*dev, u32 pasid)
uint32_t id = KFD_FIRST_NONSIGNAL_EVENT_ID;
int user_gpu_id;

-   if (!p)
+   if (!p) {
+   dev_warn(dev->adev->dev, "Not find process with pasid:%d\n", 
pasid);
return; /* Presumably process exited. */
+   }

user_gpu_id = kfd_process_get_user_gpu_id(p, dev->id);
if (unlikely(user_gpu_id == -EINVAL)) { @@ -1334,6 +1336,8 @@ void 
kfd_signal_poison_consumed_event(struct kfd_node *dev, u32 pasid)
}
}

+   dev_warn(dev->adev->dev, "Send SIGBUS to process %s(pasid:%d)\n",
+   p->lead_thread->comm, pasid);
rcu_read_unlock();

/* user application will handle SIGBUS signal */ diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c
index 952a983da49a..cee8ee5afcb6 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6

RE: [PATCH] drm/amdgpu: mode1 reset needs to recover mp1 for mp0 v13_0_10

2023-08-08 Thread Chai, Thomas
[AMD Official Use Only - General]

OK, will do


-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Tuesday, August 8, 2023 5:50 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Chai, Thomas ; Yang, 
Stanley ; Chai, Thomas ; Li, Candice 

Subject: RE: [PATCH] drm/amdgpu: mode1 reset needs to recover mp1 for mp0 
v13_0_10

[AMD Official Use Only - General]

Like the other psp callback helpers defined in amdgpu_psp.h, let's define a 
macro called psp_fatal_error_recovery_quirk to wrap the psp function 
(psp_v13_0_fatal_error_recovery_quirk).
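A minimal sketch of such a macro, following the existing optional-callback 
wrapper style in amdgpu_psp.h (the exact name and callback field here follow 
the review comment, not this patch):

#define psp_fatal_error_recovery_quirk(psp) \
		((psp)->funcs->fatal_error_recovery_quirk ? \
		(psp)->funcs->fatal_error_recovery_quirk((psp)) : 0)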

Regards,
Hawking

-Original Message-
From: amd-gfx  On Behalf Of YiPeng Chai
Sent: Tuesday, August 8, 2023 16:02
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Chai, Thomas ; Yang, 
Stanley ; Chai, Thomas ; Li, Candice 
; Zhang, Hawking 
Subject: [PATCH] drm/amdgpu: mode1 reset needs to recover mp1 for mp0 v13_0_10

Mode1 reset needs to recover mp1 in fatal error case for mp0 v13_0_10.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h |  1 +  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c |  3 +++  
drivers/gpu/drm/amd/amdgpu/psp_v13_0.c  | 24 +++-
 3 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index e8cbfacb5ac1..763242d702c2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -127,6 +127,7 @@ struct psp_funcs
int (*ring_destroy)(struct psp_context *psp,
enum psp_ring_type ring_type);
bool (*smu_reload_quirk)(struct psp_context *psp);
+   int (*pre_mode1_reset)(struct psp_context *psp);
int (*mode1_reset)(struct psp_context *psp);
int (*mem_training)(struct psp_context *psp, uint32_t ops);
uint32_t (*ring_get_wptr)(struct psp_context *psp); diff --git 
a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 50c38f75769c..f59f0cc2ab5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2066,6 +2066,9 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
if (ras->gpu_reset_flags & 
AMDGPU_RAS_GPU_RESET_MODE1_RESET) {
ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE1_RESET;
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+
+   if (adev->psp.funcs && adev->psp.funcs->pre_mode1_reset)
+   adev->psp.funcs->pre_mode1_reset(&adev->psp);
}
}

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 58db1ee631b3..65c44c7d2b12 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -691,6 +691,27 @@ static int psp_v13_0_vbflash_status(struct psp_context 
*psp)
return RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_115);  }

+static int psp_v13_0_pre_mode1_reset(struct psp_context *psp) {
+   struct amdgpu_device *adev = psp->adev;
+
+   if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(13, 0, 10)) {
+   uint32_t  reg_data;
+   /* MP1 fatal error: trigger PSP dram read to unhalt PSP
+* during MP1 triggered sync flood.
+*/
+   reg_data = RREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_67);
+   WREG32_SOC15(MP0, 0, regMP0_SMN_C2PMSG_67, reg_data + 0x10);
+
+   /* delay 1000ms for the mode1 reset for fatal error
+* to be recovered back.
+*/
+   msleep(1000);
+   }
+
+   return 0;
+}
+
 static const struct psp_funcs psp_v13_0_funcs = {
.init_microcode = psp_v13_0_init_microcode,
.bootloader_load_kdb = psp_v13_0_bootloader_load_kdb, @@ -710,7 +731,8 
@@ static const struct psp_funcs psp_v13_0_funcs = {
.load_usbc_pd_fw = psp_v13_0_load_usbc_pd_fw,
.read_usbc_pd_fw = psp_v13_0_read_usbc_pd_fw,
.update_spirom = psp_v13_0_update_spirom,
-   .vbflash_stat = psp_v13_0_vbflash_status
+   .vbflash_stat = psp_v13_0_vbflash_status,
+   .pre_mode1_reset = psp_v13_0_pre_mode1_reset,
 };

 void psp_v13_0_set_psp_funcs(struct psp_context *psp)
--
2.34.1




RE: [PATCH] drm/amdgpu: correct vmid_src -> vmhub_index mapping

2023-07-26 Thread Chai, Thomas
[AMD Official Use Only - General]

Yes, the patch title is "drm/amdgpu: fix incorrect vmhub index".

Hi Lang:
   You can update this patch based on the review results of the patch above.


-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Wednesday, July 26, 2023 8:03 PM
To: Yu, Lang ; amd-gfx@lists.freedesktop.org; Chai, Thomas 

Cc: Deucher, Alexander ; Zhang, Yifan 

Subject: RE: [PATCH] drm/amdgpu: correct vmid_src -> vmhub_index mapping

[AMD Official Use Only - General]

@Chai, Thomas sent the same fix for review if I remember correctly. You might 
check with him to see when he will push the fix.

Regards,
Hawking

-Original Message-
From: Yu, Lang 
Sent: Wednesday, July 26, 2023 19:25
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Zhang, Hawking 
; Zhang, Yifan ; Yu, Lang 

Subject: [PATCH] drm/amdgpu: correct vmid_src -> vmhub_index mapping

Align with new vmhub definition.
vmid_src 0 -> AMDGPU_GFXHUB(0).
vmid_src 1 -> AMDGPU_MMHUB0(0).

Signed-off-by: Lang Yu 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 3 ++-  
drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 6b430e10d38e..9c4e084da99a 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -102,9 +102,10 @@ static int gmc_v10_0_process_interrupt(struct 
amdgpu_device *adev,
   struct amdgpu_irq_src *source,
   struct amdgpu_iv_entry *entry)  {
+   struct amdgpu_vmhub *hub =
+   &adev->vmhub[entry->vmid_src ? AMDGPU_MMHUB0(0) : 
AMDGPU_GFXHUB(0)];
bool retry_fault = !!(entry->src_data[1] & 0x80);
bool write_fault = !!(entry->src_data[1] & 0x20);
-   struct amdgpu_vmhub *hub = &adev->vmhub[entry->vmid_src];
struct amdgpu_task_info task_info;
uint32_t status = 0;
u64 addr;
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 604522f70d03..47f5ced12ba2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -99,7 +99,8 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device 
*adev,
   struct amdgpu_irq_src *source,
   struct amdgpu_iv_entry *entry)  {
-   struct amdgpu_vmhub *hub = &adev->vmhub[entry->vmid_src];
+   struct amdgpu_vmhub *hub =
+   &adev->vmhub[entry->vmid_src ? AMDGPU_MMHUB0(0) : 
AMDGPU_GFXHUB(0)];
uint32_t status = 0;
u64 addr;

--
2.25.1




RE: [PATCH 2/2] drm/amdgpu: fix incorrect vmhub index

2023-07-19 Thread Chai, Thomas
[AMD Official Use Only - General]

OK, will do.


-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Thursday, July 20, 2023 1:44 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Yang, 
Stanley 
Subject: RE: [PATCH 2/2] drm/amdgpu: fix incorrect vmhub index

[AMD Official Use Only - General]

Please apply the same change to gmc_v10_0_process_interrupt.

Might be better to check the client_id == VMC to decide vmhub.
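For illustration, a sketch of that client-id based selection for gmc_v11 
(assuming MM-hub VM faults arrive with the VMC client id on this IH; gmc_v10 
would use the SOC15 equivalent):

	hub = (entry->client_id == SOC21_IH_CLIENTID_VMC) ?
		&adev->vmhub[AMDGPU_MMHUB0(0)] :
		&adev->vmhub[AMDGPU_GFXHUB(0)];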

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Thursday, July 20, 2023 13:42
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Yang, Stanley 
; Chai, Thomas 
Subject: [PATCH 2/2] drm/amdgpu: fix incorrect vmhub index

Fix incorrect vmhub index.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index d04fc0f19a29..c0b588e5d6aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -97,10 +97,13 @@ static int gmc_v11_0_process_interrupt(struct amdgpu_device 
*adev,
   struct amdgpu_irq_src *source,
   struct amdgpu_iv_entry *entry)  {
-   struct amdgpu_vmhub *hub = &adev->vmhub[entry->vmid_src];
+   struct amdgpu_vmhub *hub;
uint32_t status = 0;
u64 addr;

+   hub = entry->vmid_src ?
+ &adev->vmhub[AMDGPU_MMHUB0(0)] : &adev->vmhub[AMDGPU_GFXHUB(0)];
+
addr = (u64)entry->src_data[0] << 12;
addr |= ((u64)entry->src_data[1] & 0xf) << 44;

--
2.34.1




RE: [PATCH] drm/amdgpu: change reserved vram info print

2023-06-01 Thread Chai, Thomas
[AMD Official Use Only - General]

OK, thanks!


-
Best Regards,
Thomas

From: Christian König 
Sent: Thursday, June 1, 2023 2:25 PM
To: Paneer Selvam, Arunpravin ; Chai, Thomas 
; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Yang, Stanley ; Chai, 
Thomas ; Li, Candice ; Koenig, 
Christian ; Zhang, Hawking 
Subject: Re: [PATCH] drm/amdgpu: change reserved vram info print

If you haven't already pushed it, Reviewed-by: Christian König  as well.

You might want to add a CC: stable... tag so that it gets backported.

Regards,
Christian.
On 01.06.23 at 08:13, Arunpravin Paneer Selvam wrote:

Reviewed-by: Arunpravin Paneer Selvam 

On 5/25/2023 2:20 PM, YiPeng Chai wrote:

The link object of mgr->reserved_pages is the blocks
variable in struct amdgpu_vram_reservation, not the
link variable in struct drm_buddy_block.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 89d35d194f2c..c7085a747b03 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -839,7 +839,7 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
 {
   struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
   struct drm_buddy *mm = &mgr->mm;
-  struct drm_buddy_block *block;
+  struct amdgpu_vram_reservation *rsv;

   drm_printf(printer, "  vis usage:%llu\n",
  amdgpu_vram_mgr_vis_usage(mgr));
@@ -851,8 +851,9 @@ static void amdgpu_vram_mgr_debug(struct ttm_resource_manager *man,
   drm_buddy_print(mm, printer);

   drm_printf(printer, "reserved:\n");
-  list_for_each_entry(block, &mgr->reserved_pages, link)
- drm_buddy_block_print(mm, block, printer);
+  list_for_each_entry(rsv, &mgr->reserved_pages, blocks)
+  drm_printf(printer, "%#018llx-%#018llx: %llu\n",
+  rsv->start, rsv->start + rsv->size, rsv->size);
   mutex_unlock(&mgr->lock);
 }






RE: [PATCH] drm/amdgpu: change reserved vram info print

2023-05-28 Thread Chai, Thomas
[AMD Official Use Only - General]

Ping 


-
Best Regards,
Thomas

-Original Message-
From: Chai, Thomas 
Sent: Thursday, May 25, 2023 4:50 PM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Yang, Stanley 
; Chai, Thomas 
Subject: [PATCH] drm/amdgpu: change reserved vram info print

The link object of mgr->reserved_pages is the blocks variable in struct 
amdgpu_vram_reservation, not the link variable in struct drm_buddy_block.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
index 89d35d194f2c..c7085a747b03 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
@@ -839,7 +839,7 @@ static void amdgpu_vram_mgr_debug(struct 
ttm_resource_manager *man,  {
struct amdgpu_vram_mgr *mgr = to_vram_mgr(man);
struct drm_buddy *mm = &mgr->mm;
-   struct drm_buddy_block *block;
+   struct amdgpu_vram_reservation *rsv;

drm_printf(printer, "  vis usage:%llu\n",
   amdgpu_vram_mgr_vis_usage(mgr));
@@ -851,8 +851,9 @@ static void amdgpu_vram_mgr_debug(struct 
ttm_resource_manager *man,
drm_buddy_print(mm, printer);

drm_printf(printer, "reserved:\n");
-   list_for_each_entry(block, &mgr->reserved_pages, link)
-   drm_buddy_block_print(mm, block, printer);
+   list_for_each_entry(rsv, &mgr->reserved_pages, blocks)
+   drm_printf(printer, "%#018llx-%#018llx: %llu\n",
+   rsv->start, rsv->start + rsv->size, rsv->size);
mutex_unlock(&mgr->lock);
 }

--
2.34.1



RE: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx v11_0_3

2023-05-17 Thread Chai, Thomas
[AMD Official Use Only - General]

reset_context is a local variable in amdgpu_ras_do_recovery. If gpu_reset_flags 
is not used, reading the regRLC_RLCS_FED_STATUS_0 register and checking the 
SDMA FED error field would have to move into amdgpu_ras_do_recovery, which 
would pollute the code structure of amdgpu_ras.c.

amdgpu_ras_do_recovery supports various reset modes, but the order of these 
resets is fixed and the caller cannot specify a reset type.

gpu_reset_flags acts like an input parameter of amdgpu_ras_do_recovery, which 
allows the caller to specify a special reset type.
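For illustration, the intended caller-side usage (mirroring what the 
gfx_v11_0_3 handler in the patch below does for the SDMA FED case):

	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);

	/* request a mode-2 reset for this poison consumption ... */
	ras->gpu_reset_flags |= AMDGPU_RAS_GPU_RESET_MODE2_RESET;
	/* ... then let the common recovery work pick the method up */
	amdgpu_ras_reset_gpu(adev);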

-
Best Regards,
Thomas

-Original Message-
From: Zhang, Hawking 
Sent: Wednesday, May 17, 2023 11:41 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Li, Candice ; Yang, 
Stanley 
Subject: RE: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx 
v11_0_3

[AMD Official Use Only - General]

Shall we just force the mode-2 reset if it is the non-fatal error mode? Is the 
gpu_reset_flag really necessary in such a case?

reset_context.method = AMD_RESET_METHOD_MODE2;

Ideally, the driver decides whether to perform a reset or some other error 
handling approach (i.e. unmapping the queue for gfx) in the IP-specific 
handler, while keeping amdgpu_ras_do_recovery as the unified entry for the 
various driver mode resets used for RAS error handling. Is that feasible?

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Wednesday, May 17, 2023 10:14
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Li, Candice ; Yang, Stanley 
; Chai, Thomas 
Subject: [PATCH] drm/amdgpu: perform mode2 reset for sdma fed error on gfx 
v11_0_3

perform mode2 reset for sdma fed error on gfx v11_0_3.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  8 +++-  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h  |  5 +  
drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c | 14 +-
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 6bb438642cc0..f2da69adcd9d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2053,9 +2053,15 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
/* Perform full reset in fatal error mode */
if (!amdgpu_ras_is_poison_mode_supported(ras->adev))
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
-   else
+   else {
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);

+   if (ras->gpu_reset_flags & 
AMDGPU_RAS_GPU_RESET_MODE2_RESET) {
+   ras->gpu_reset_flags &= 
~AMDGPU_RAS_GPU_RESET_MODE2_RESET;
+   reset_context.method = AMD_RESET_METHOD_MODE2;
+   }
+   }
+
amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
}
atomic_set(&ras->in_recovery, 0);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index bc43f7db17cc..46bf1889a9d7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -339,6 +339,8 @@ enum amdgpu_ras_ret {
 #define AMDGPU_RAS_ERR_STATUS_VALID(1 << 1)
 #define AMDGPU_RAS_ERR_ADDRESS_VALID   (1 << 2)

+#define AMDGPU_RAS_GPU_RESET_MODE2_RESET  (0x1 << 0)
+
 struct amdgpu_ras_err_status_reg_entry {
uint32_t hwip;
uint32_t ip_inst;
@@ -427,6 +429,9 @@ struct amdgpu_ras {

/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
+
+   /* Record special requirements of gpu reset caller */
+   uint32_t  gpu_reset_flags;
 };

 struct ras_fs_data {
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
index 068b9586a223..26d6286d86c9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0_3.c
@@ -84,8 +84,20 @@ static int gfx_v11_0_3_poison_consumption_handler(struct 
amdgpu_device *adev,
/* Workaround: when vmid and pasid are both zero, trigger gpu reset in 
KGD. */
if (entry && (entry->client_id == SOC21_IH_CLIENTID_GFX) &&
(entry->src_id == GFX_11_0_0__SRCID__RLC_GC_FED_INTERRUPT) &&
-!entry->vmid && !entry->pasid)
+!entry->vmid && !entry->pasid) {
+   uint32_t rlc_status0 = 0;
+
+   rlc_status0 = RREG32_SOC15(GC, 0, regRLC_RLCS_FED_STATUS_0);
+
+   if (REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, 
SDMA0_FED_ERR) ||
+   REG_GET_FIELD(rlc_status0, RLC_RLCS_FED_STATUS_0, 
SDMA1_FED_ERR)) {
+   struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
+
+   ras->gpu_reset_fla

RE: [PATCH 1/2] drm/amdgpu: optimize redundant code in umc_v8_10

2023-04-03 Thread Chai, Thomas
[AMD Official Use Only - General]

OK, will do.


-
Best Regards,
Thomas

-Original Message-
From: Zhou1, Tao  
Sent: Monday, April 3, 2023 3:21 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Li, Candice ; 
Yang, Stanley 
Subject: RE: [PATCH 1/2] drm/amdgpu: optimize redundant code in umc_v8_10

[AMD Official Use Only - General]



> -Original Message-
> From: Chai, Thomas 
> Sent: Monday, April 3, 2023 3:00 PM
> To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Li, Candice 
> ; Yang, Stanley 
> Subject: RE: [PATCH 1/2] drm/amdgpu: optimize redundant code in 
> umc_v8_10
> 
> [AMD Official Use Only - General]
> 
> 
> 
> 
> -
> Best Regards,
> Thomas
> 
> -Original Message-
> From: Zhou1, Tao 
> Sent: Monday, April 3, 2023 11:45 AM
> To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking ; Li, Candice 
> ; Yang, Stanley 
> Subject: RE: [PATCH 1/2] drm/amdgpu: optimize redundant code in 
> umc_v8_10
> 
> [AMD Official Use Only - General]
> 
> 
> 
> > -Original Message-
> > From: Chai, Thomas 
> > Sent: Monday, April 3, 2023 9:59 AM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Chai, Thomas ; Zhang, Hawking 
> > ; Zhou1, Tao ; Li,
> Candice
> > ; Yang, Stanley ; Chai, 
> > Thomas 
> > Subject: [PATCH 1/2] drm/amdgpu: optimize redundant code in 
> > umc_v8_10
> >
> > Optimize redundant code in umc_v8_10
> >
> > Signed-off-by: YiPeng Chai 
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  31 
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   7 +
> >  drivers/gpu/drm/amd/amdgpu/umc_v8_10.c  | 197
> > +---
> >  3 files changed, 115 insertions(+), 120 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > index 9e2e97207e53..734442315cf6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> > @@ -302,3 +302,34 @@ void amdgpu_umc_fill_error_record(struct
> > ras_err_data *err_data,
> >
> > err_data->err_addr_cnt++;
> >  }
> > +
> > +int amdgpu_umc_scan_all_umc_channels(struct amdgpu_device *adev,
> > +   umc_func func, void *data)
> > +{
> > +   uint32_t node_inst   = 0;
> > +   uint32_t umc_inst= 0;
> > +   uint32_t ch_inst = 0;
> > +   int ret = 0;
> > +
> > +   if (adev->umc.node_inst_num) {
> > +   LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst,
> > ch_inst) {
> > +   ret = func(adev, node_inst, umc_inst, ch_inst, data);
> > +   if (ret) {
> > +   dev_err(adev->dev, "Node %d umc %d ch %d
> > func returns %d\n",
> > +   node_inst, umc_inst, ch_inst, ret);
> > +   return ret;
> > +   }
> > +   }
> > +   } else {
> > +   LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {
> 
> >[Tao] for ASIC which doesn't support node, can we set its 
> >node_inst_num to 1
> and retire the macro LOOP_UMC_INST_AND_CH?
> 
> [Thomas] I am afraid not.
> 
>   " #define LOOP_UMC_NODE_INST(node_inst) \
>   for_each_set_bit((node_inst), &(adev->umc.active_mask),
> adev->umc.node_inst_num) "
> 
>   The node instance loop of LOOP_UMC_EACH_NODE_INST_AND_CH supports 
> node harvest, so node_inst_num is not the real node instance number.

[Tao] we can set both node_inst_num and active_mask to 1, but either way is 
fine for me.
BTW, I think amdgpu_umc_loop_channels is simpler than 
amdgpu_umc_scan_all_umc_channels; with this fixed, the series is:

Reviewed-by: Tao Zhou 

> 
> 
> > +   ret = func(adev, 0, umc_inst, ch_inst, data);
> > +   if (ret) {
> > +   dev_err(adev->dev, "Umc %d ch %d func
> > returns %d\n",
> > +   umc_inst, ch_inst, ret);
> > +   return ret;
> > +   }
> > +   }
> > +   }
> > +
> > +   return 0;
> > +}
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > index d7f1229ff11f..f279c8057f96 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> > @@ -47,6 +47,10 @@
> >  #define LOOP_UMC_EACH

RE: [PATCH 1/2] drm/amdgpu: optimize redundant code in umc_v8_10

2023-04-03 Thread Chai, Thomas
[AMD Official Use Only - General]




-
Best Regards,
Thomas

-Original Message-
From: Zhou1, Tao  
Sent: Monday, April 3, 2023 11:45 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Li, Candice ; 
Yang, Stanley 
Subject: RE: [PATCH 1/2] drm/amdgpu: optimize redundant code in umc_v8_10

[AMD Official Use Only - General]



> -Original Message-
> From: Chai, Thomas 
> Sent: Monday, April 3, 2023 9:59 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Li, Candice 
> ; Yang, Stanley ; Chai, 
> Thomas 
> Subject: [PATCH 1/2] drm/amdgpu: optimize redundant code in umc_v8_10
> 
> Optimize redundant code in umc_v8_10
> 
> Signed-off-by: YiPeng Chai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c |  31 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h |   7 +
>  drivers/gpu/drm/amd/amdgpu/umc_v8_10.c  | 197 
> +---
>  3 files changed, 115 insertions(+), 120 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> index 9e2e97207e53..734442315cf6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
> @@ -302,3 +302,34 @@ void amdgpu_umc_fill_error_record(struct
> ras_err_data *err_data,
> 
>   err_data->err_addr_cnt++;
>  }
> +
> +int amdgpu_umc_scan_all_umc_channels(struct amdgpu_device *adev,
> + umc_func func, void *data)
> +{
> + uint32_t node_inst   = 0;
> + uint32_t umc_inst= 0;
> + uint32_t ch_inst = 0;
> + int ret = 0;
> +
> + if (adev->umc.node_inst_num) {
> + LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst,
> ch_inst) {
> + ret = func(adev, node_inst, umc_inst, ch_inst, data);
> + if (ret) {
> + dev_err(adev->dev, "Node %d umc %d ch %d
> func returns %d\n",
> + node_inst, umc_inst, ch_inst, ret);
> + return ret;
> + }
> + }
> + } else {
> + LOOP_UMC_INST_AND_CH(umc_inst, ch_inst) {

>[Tao] for ASIC which doesn't support node, can we set its node_inst_num to 1 
>and retire the macro LOOP_UMC_INST_AND_CH?

[Thomas] I am afraid not.

" #define LOOP_UMC_NODE_INST(node_inst) \
for_each_set_bit((node_inst), &(adev->umc.active_mask), 
adev->umc.node_inst_num) "

The node instance loop of LOOP_UMC_EACH_NODE_INST_AND_CH supports node 
harvest, so node_inst_num is not the real number of active node instances 
(a small standalone illustration follows below).
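A small standalone illustration of this point (plain userspace C, not driver
code; the mask value below is made up): the loop visits only the bits set in
active_mask, so with harvested nodes the number of visited instances is
smaller than node_inst_num.

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's for_each_set_bit() based node loop:
 * walk bit positions [0, node_inst_num) and visit only the set bits.
 */
static void visit_active_nodes(uint32_t active_mask, uint32_t node_inst_num)
{
	uint32_t node_inst, visited = 0;

	for (node_inst = 0; node_inst < node_inst_num; node_inst++) {
		if (!(active_mask & (1u << node_inst)))
			continue;	/* this node instance is harvested */
		printf("visiting node instance %u\n", node_inst);
		visited++;
	}
	printf("node_inst_num=%u, but only %u instances were visited\n",
	       node_inst_num, visited);
}

int main(void)
{
	/* hypothetical configuration: 4 possible nodes, node 2 harvested */
	visit_active_nodes(0xB, 4);
	return 0;
}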


> + ret = func(adev, 0, umc_inst, ch_inst, data);
> + if (ret) {
> + dev_err(adev->dev, "Umc %d ch %d func
> returns %d\n",
> + umc_inst, ch_inst, ret);
> + return ret;
> + }
> + }
> + }
> +
> + return 0;
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> index d7f1229ff11f..f279c8057f96 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
> @@ -47,6 +47,10 @@
>  #define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) 
> \
>   LOOP_UMC_NODE_INST((node_inst))
> LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst))
> 
> +
> +typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,
> + uint32_t umc_inst, uint32_t ch_inst, void *data);
> +
>  struct amdgpu_umc_ras {
>   struct amdgpu_ras_block_object ras_block;
>   void (*err_cnt_init)(struct amdgpu_device *adev); @@ -104,4 +108,7 
> @@ int amdgpu_umc_process_ras_data_cb(struct amdgpu_device *adev,
>   struct amdgpu_iv_entry *entry);
>  int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
>   uint64_t err_addr, uint32_t ch_inst, uint32_t umc_inst);
> +
> +int amdgpu_umc_scan_all_umc_channels(struct amdgpu_device *adev,
> + umc_func func, void *data);
>  #endif
> diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> index fb55e8cb9967..6dff313ac04c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> +++ b/drivers/gpu/drm/amd/amdgpu/umc_v8_10.c
> @@ -76,10 +76,13 @@ static inline uint32_t 
> get_umc_v8_10_reg_offset(struct amdgpu_device *adev,
>   UMC_8_NODE_DIST * node_inst;
>  }
> 
> -static void umc_v8_10_clear_error_count_per_c

RE: [PATCH] drm/amdgpu: Fixed bug on error when uninstalling amdgpu

2022-12-16 Thread Chai, Thomas
[AMD Official Use Only - General]

OK, I will update subject line.  Thanks!


-
Best Regards,
Thomas

-Original Message-
From: Christian König  
Sent: Friday, December 16, 2022 4:50 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org; Paneer 
Selvam, Arunpravin 
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Chai, Thomas 
Subject: Re: [PATCH] drm/amdgpu: Fixed bug on error when uninstalling amdgpu

On 16.12.22 at 03:56, YiPeng Chai wrote:
> Fixed bug on error when uninstalling amdgpu.
> The error message is as follows:
> [  304.852489] kernel BUG at drivers/gpu/drm/drm_buddy.c:278!
> [  304.852503] invalid opcode:  [#1] PREEMPT SMP NOPTI
> [  304.852510] CPU: 2 PID: 4192 Comm: modprobe Tainted: GW IOE 
> 5.19.0-thomas #1
> [  304.852519] Hardware name: ASUS System Product Name/PRIME Z390-A, 
> BIOS 2004 11/02/2021 [  304.852526] RIP: 
> 0010:drm_buddy_free_block+0x26/0x30 [drm_buddy] [  304.852535] Code: 
> 00 00 00 90 0f 1f 44 00 00 48 8b 0e 89 c8 25 00 0c 00 00 3d 00 04 00 
> 00 75 10 48 8b 47 18 48 d3 e0 48 01 47 28 e9 fa fe ff ff <0f> 0b 0f 1f 
> 84 00 00 00 00 00 0f 1f 44 00 00 41 54 55 48 89 f5 53 [  304.852549] 
> RSP: 0018:9afac17bbcb8 EFLAGS: 00010287 [  304.852556] RAX: 
>  RBX: 8dacd37fd778 RCX:  [  
> 304.852563] RDX: 8dacd37fd7a0 RSI: 8dacd37fd3b8 RDI: 
> 8dac672a5f80 [  304.852570] RBP: 8dacd37fd3a0 R08: 
> 0001 R09:  [  304.852577] R10: 
> 8dac68185500 R11: 9afac17bbd00 R12: 8dac672a5f80 [  
> 304.852584] R13: 8dac672a5fe0 R14: 8dacd37fd380 R15: 
> 8dac672a5f80 [  304.852590] FS:  7f0fa9b30c40() 
> GS:8dadb648() knlGS: [  304.852598] CS:  0010 DS: 
>  ES:  CR0: 80050033 [  304.852604] CR2: 7f4bf1a1ba50 CR3: 
> 000108c58004 CR4: 003706e0 [  304.852611] DR0:  
> DR1:  DR2:  [  304.852618] DR3: 
>  DR6: fffe0ff0 DR7: 0400 [  304.852625] 
> Call Trace:
> [  304.852629]  
> [  304.852632]  drm_buddy_free_list+0x2a/0x60 [drm_buddy] [  
> 304.852639]  amdgpu_vram_mgr_fini+0xea/0x180 [amdgpu] [  304.852827]  
> amdgpu_ttm_fini+0x1f9/0x280 [amdgpu] [  304.852925]  
> amdgpu_bo_fini+0x22/0x90 [amdgpu] [  304.853022]  
> gmc_v11_0_sw_fini+0x26/0x30 [amdgpu] [  304.853132]  
> amdgpu_device_fini_sw+0xc5/0x3b0 [amdgpu] [  304.853229]  
> amdgpu_driver_release_kms+0x12/0x30 [amdgpu] [  304.853327]  
> drm_dev_release+0x20/0x40 [drm] [  304.853352]  
> release_nodes+0x35/0xb0 [  304.853359]  devres_release_all+0x8b/0xc0 [  
> 304.853364]  device_unbind_cleanup+0xe/0x70 [  304.853370]  
> device_release_driver_internal+0xee/0x160
> [  304.853377]  driver_detach+0x44/0x90 [  304.853382]  
> bus_remove_driver+0x55/0xe0 [  304.853387]  
> pci_unregister_driver+0x3b/0x90 [  304.853393]  amdgpu_exit+0x11/0x69 
> [amdgpu] [  304.853540]  __x64_sys_delete_module+0x142/0x260
> [  304.853548]  ? exit_to_user_mode_prepare+0x3e/0x190
> [  304.853555]  do_syscall_64+0x38/0x90 [  304.853562]  
> entry_SYSCALL_64_after_hwframe+0x63/0xcd
>
> Signed-off-by: YiPeng Chai 

The subject line should probably read "when unloading amdgpu", but apart from 
that good catch.

Reviewed-by: Christian König 

> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> index 0b598b510bd8..eb63324c30d2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c
> @@ -829,7 +829,7 @@ void amdgpu_vram_mgr_fini(struct amdgpu_device *adev)
>   kfree(rsv);
>   
>   list_for_each_entry_safe(rsv, temp, >reserved_pages, blocks) {
> - drm_buddy_free_list(>mm, >blocks);
> + drm_buddy_free_list(>mm, >allocated);
>   kfree(rsv);
>   }
>   drm_buddy_fini(>mm);


RE: [PATCH] drm/amdgpu: Fixed ras warning when uninstalling amdgpu

2022-09-21 Thread Chai, Thomas
[AMD Official Use Only - General]

Ping ...


-
Best Regards,
Thomas

-Original Message-
From: Chai, Thomas  
Sent: Tuesday, September 20, 2022 10:07 AM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Clements, John ; Yang, 
Stanley ; Chai, Thomas 
Subject: [PATCH] drm/amdgpu: Fixed ras warning when uninstalling amdgpu

  For the asic using smu v13_0_2, there is the following warning when 
uninstalling amdgpu:
  amdgpu: ras disable gfx failed poison:1 ret:-22.

[Why]:
  For the asic using smu v13_0_2, the psp .suspend and
  mode1 reset are called before amdgpu_ras_pre_fini during
  amdgpu uninstall; they have already disabled all ras
  features and reset the psp. Since the psp has been reset,
  calling amdgpu_ras_disable_all_features in
  amdgpu_ras_pre_fini to disable the ras features will fail.

[How]:
  If all ras features are already disabled, amdgpu_ras_pre_fini
  will not call amdgpu_ras_disable_all_features again.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index e55f106621ef..3deb716710e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2720,7 +2720,8 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
 
 
/* Need disable ras on all IPs here before ip [hw/sw]fini */
-   amdgpu_ras_disable_all_features(adev, 0);
+   if (con->features)
+   amdgpu_ras_disable_all_features(adev, 0);
amdgpu_ras_recovery_fini(adev);
return 0;
 }
--
2.25.1


RE: [PATCH V2] drm/amdgpu: Adjust removal control flow for smu v13_0_2

2022-09-06 Thread Chai, Thomas
[AMD Official Use Only - General]

Yes, I will add the sequence adjustment to the comment.


-
Best Regards,
Thomas

From: Zhang, Hawking 
Sent: Wednesday, September 7, 2022 11:42 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Wang, Yang(Kevin) 
Subject: Re: [PATCH V2] drm/amdgpu: Adjust removal control flow for smu v13_0_2

Thanks.

Can you please share more details to help me understand the sequence adjustment 
in suspend?

Regards,
Hawking

From: Chai, Thomas <yipeng.c...@amd.com>
Date: Wednesday, September 7, 2022 at 11:29
To: Zhang, Hawking <hawking.zh...@amd.com>, amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao <tao.zh...@amd.com>, Wang, Yang(Kevin) <kevinyang.w...@amd.com>
Subject: RE: [PATCH V2] drm/amdgpu: Adjust removal control flow for smu v13_0_2

[AMD Official Use Only - General]

OK, I will update patch.


-
Best Regards,
Thomas

From: Zhang, Hawking <hawking.zh...@amd.com>
Sent: Wednesday, September 7, 2022 10:40 AM
To: Chai, Thomas <yipeng.c...@amd.com>; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <yipeng.c...@amd.com>; Zhou1, Tao <tao.zh...@amd.com>; Wang, 
Yang(Kevin) <kevinyang.w...@amd.com>
Subject: Re: [PATCH V2] drm/amdgpu: Adjust removal control flow for smu v13_0_2


[AMD Official Use Only - General]

+static void amdgpu_device_gpu_reset(struct amdgpu_device *adev)
+{
+   struct amdgpu_reset_context reset_context;
+
+   memset(&reset_context, 0, sizeof(reset_context));
+   reset_context.method = AMD_RESET_METHOD_NONE;
+   reset_context.reset_req_dev = adev;
+   set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+   set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
+
+   amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}

This wrapper is kind of confusing. Let's keep amdgpu_device_gpu_recover as the 
only entry point for recovery handling. If possible, please drop this wrapper,  
initialize reset_context and call amdgpu_device_gpu_recover directly


+   /* If in_remove is true, psp_hw_fini should be executed after
+*  psp_suspend to free psp shared buffers.
+*/
+   if (adev->in_remove && (adev->ip_blocks[i].version->type == 
AMD_IP_BLOCK_TYPE_PSP))
+   continue;
Can you please share more details to help me understand the sequence adjustment 
here?

Regards,
Hawking

From: Chai, Thomas <yipeng.c...@amd.com>
Date: Tuesday, September 6, 2022 at 15:48
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <yipeng.c...@amd.com>, Zhang, Hawking <hawking.zh...@amd.com>, 
Zhou1, Tao <tao.zh...@amd.com>, Wang, Yang(Kevin) <kevinyang.w...@amd.com>, 
Chai, Thomas <yipeng.c...@amd.com>
Subject: [PATCH V2] drm/amdgpu: Adjust removal control flow for smu v13_0_2
Adjust removal control flow for smu v13_0_2:
   During amdgpu uninstallation, when removing the first
device, the kernel needs to first send a mode1reset message
to all gpu devices. Otherwise, smu initialization will fail
the next time amdgpu is installed.

V2:
1. Update commit comments.
2. Remove the global variable amdgpu_device_remove_cnt
   and add a variable to the structure amdgpu_hive_info.
3. Use hive to detect the first removed device instead of
   a global variable.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 35 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 16 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  1 +
 drivers/gpu/drm/amd/pm/amdgpu_pm.c |  6 +++-
 7 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 79bb6fd83094..465295318830 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -997,6 +997,9 @@ struct amdgpu_device {
 boolin_s4;
 boolin_s0ix;

+   /* uninstall */
+   boolin_remove;
+
 enum pp_mp1_state   mp1_state;
 struct amdgpu_doorbell_index doorbell_index;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62b26f0e37b0..1402717673f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_devi
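To illustrate point 3 of the V2 notes above (the quoted diff is cut off before
that hunk), the idea is roughly the following; the device_remove_count member
is a hypothetical name used only for illustration, the real patch adds its own
field to struct amdgpu_hive_info.

/* Sketch only: detect the first device being removed via the hive instead
 * of a driver-global counter.  "device_remove_count" is a made-up field
 * name, not the one used in the actual patch.
 */
static bool amdgpu_is_first_device_remove(struct amdgpu_device *adev)
{
	struct amdgpu_hive_info *hive;
	bool first;

	hive = amdgpu_get_xgmi_hive(adev);
	if (!hive)
		return true;	/* no hive: single device, always "first" */

	first = (atomic_inc_return(&hive->device_remove_count) == 1);
	amdgpu_put_xgmi_hive(hive);

	return first;
}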

RE: [PATCH V2] drm/amdgpu: Adjust removal control flow for smu v13_0_2

2022-09-06 Thread Chai, Thomas
[AMD Official Use Only - General]

OK, I will update patch.


-
Best Regards,
Thomas

From: Zhang, Hawking 
Sent: Wednesday, September 7, 2022 10:40 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhou1, Tao ; Wang, 
Yang(Kevin) 
Subject: Re: [PATCH V2] drm/amdgpu: Adjust removal control flow for smu v13_0_2


[AMD Official Use Only - General]

+static void amdgpu_device_gpu_reset(struct amdgpu_device *adev)
+{
+   struct amdgpu_reset_context reset_context;
+
+   memset(&reset_context, 0, sizeof(reset_context));
+   reset_context.method = AMD_RESET_METHOD_NONE;
+   reset_context.reset_req_dev = adev;
+   set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+   set_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context.flags);
+
+   amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}

This wrapper is kind of confusing. Let's keep amdgpu_device_gpu_recover as the 
only entry point for recovery handling. If possible, please drop this wrapper,  
initialize reset_context and call amdgpu_device_gpu_recover directly


+   /* If in_remove is true, psp_hw_fini should be executed after
+*  psp_suspend to free psp shared buffers.
+*/
+   if (adev->in_remove && (adev->ip_blocks[i].version->type == 
AMD_IP_BLOCK_TYPE_PSP))
+   continue;
Can you please share more details to help me understand the sequence adjustment 
here?

Regards,
Hawking

From: Chai, Thomas <yipeng.c...@amd.com>
Date: Tuesday, September 6, 2022 at 15:48
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <yipeng.c...@amd.com>, Zhang, Hawking <hawking.zh...@amd.com>, 
Zhou1, Tao <tao.zh...@amd.com>, Wang, Yang(Kevin) <kevinyang.w...@amd.com>, 
Chai, Thomas <yipeng.c...@amd.com>
Subject: [PATCH V2] drm/amdgpu: Adjust removal control flow for smu v13_0_2
Adjust removal control flow for smu v13_0_2:
   During amdgpu uninstallation, when removing the first
device, the kernel needs to first send a mode1reset message
to all gpu devices. Otherwise, smu initialization will fail
the next time amdgpu is installed.

V2:
1. Update commit comments.
2. Remove the global variable amdgpu_device_remove_cnt
   and add a variable to the structure amdgpu_hive_info.
3. Use hive to detect the first removed device instead of
   a global variable.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|  3 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 40 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 35 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 16 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h  |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h   |  1 +
 drivers/gpu/drm/amd/pm/amdgpu_pm.c |  6 +++-
 7 files changed, 99 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 79bb6fd83094..465295318830 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -997,6 +997,9 @@ struct amdgpu_device {
 boolin_s4;
 boolin_s0ix;

+   /* uninstall */
+   boolin_remove;
+
 enum pp_mp1_state   mp1_state;
 struct amdgpu_doorbell_index doorbell_index;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 62b26f0e37b0..1402717673f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2999,6 +2999,13 @@ static int amdgpu_device_ip_suspend_phase2(struct 
amdgpu_device *adev)
 DRM_ERROR("suspend of IP block <%s> failed %d\n",
   adev->ip_blocks[i].version->funcs->name, r);
 }
+
+   /* If in_remove is true, psp_hw_fini should be executed after
+*  psp_suspend to free psp shared buffers.
+*/
+   if (adev->in_remove && (adev->ip_blocks[i].version->type == 
AMD_IP_BLOCK_TYPE_PSP))
+   continue;
+
 adev->ip_blocks[i].status.hw = false;
 /* handle putting the SMC in the appropriate state */
 if(!amdgpu_sriov_vf(adev)){
@@ -4739,6 +4746,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
 struct amdgpu_device *tmp_adev = NULL;
 bool need_full_reset, skip_hw_reset, vram_lost = false;
 int r = 0;
+   bool gpu_reset_for_dev_remove = 0;

 /* Try reset handler method first */
 tmp_adev = list_first_entry(device_list_handle, struct amdgpu_devi

RE: [PATCH V2] drm/amdgpu: TA unload messages are not actually sent to psp when amdgpu is uninstalled

2022-09-05 Thread Chai, Thomas
[AMD Official Use Only - General]

Ping


-
Best Regards,
Thomas

-Original Message-
From: Chai, Thomas  
Sent: Thursday, September 1, 2022 4:40 PM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Chai, Thomas 
Subject: [PATCH V2] drm/amdgpu: TA unload messages are not actually sent to psp 
when amdgpu is uninstalled

V1:
  The psp_cmd_submit_buf function is called by psp_hw_fini to send TA unload 
messages to psp to terminate ras, asd and tmr. But when amdgpu is uninstalled, 
drm_dev_unplug is called earlier than psp_hw_fini in amdgpu_pci_remove, the 
calling order as follows:
static void amdgpu_pci_remove(struct pci_dev *pdev) {
drm_dev_unplug
..
amdgpu_driver_unload_kms->amdgpu_device_fini_hw->...
->.hw_fini->psp_hw_fini->...
->psp_ta_unload->psp_cmd_submit_buf
..
}
The program will return when calling drm_dev_enter in psp_cmd_submit_buf.

So the call to drm_dev_enter in psp_cmd_submit_buf should be removed, so that 
the TA unload messages can be sent to the psp when amdgpu is uninstalled.

V2:
1. Restore psp_cmd_submit_buf to its original code.
2. Move drm_dev_unplug call after amdgpu_driver_unload_kms in
   amdgpu_pci_remove.
3. Since amdgpu_device_fini_hw is called by amdgpu_driver_unload_kms,
   remove the unplug check to release device mmio resource in
   amdgpu_device_fini_hw before calling drm_dev_unplug.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 3 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index afaa1056e039..62b26f0e37b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3969,8 +3969,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 
amdgpu_gart_dummy_page_fini(adev);
 
-   if (drm_dev_is_unplugged(adev_to_drm(adev)))
-   amdgpu_device_unmap_mmio(adev);
+   amdgpu_device_unmap_mmio(adev);
 
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index de7144b06e93..728a0933ea6f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2181,8 +2181,6 @@ amdgpu_pci_remove(struct pci_dev *pdev)
struct drm_device *dev = pci_get_drvdata(pdev);
struct amdgpu_device *adev = drm_to_adev(dev);
 
-   drm_dev_unplug(dev);
-
if (adev->pm.rpm_mode != AMDGPU_RUNPM_NONE) {
pm_runtime_get_sync(dev->dev);
pm_runtime_forbid(dev->dev);
@@ -2190,6 +2188,8 @@ amdgpu_pci_remove(struct pci_dev *pdev)
 
amdgpu_driver_unload_kms(dev);
 
+   drm_dev_unplug(dev);
+
/*
 * Flush any in flight DMA operations from device.
 * Clear the Bus Master Enable bit and then wait on the PCIe Device
--
2.25.1


RE: [PATCH 1/2] drm/amdgpu: The call to amdgpu_xgmi_remove_device needs to be earlier than psp_hw_fini

2022-08-24 Thread Chai, Thomas
[AMD Official Use Only - General]

Ping on this series.

-Original Message-
From: Chai, Thomas  
Sent: Friday, August 12, 2022 5:13 PM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Wang, Yang(Kevin) ; Chai, Thomas 
Subject: [PATCH 1/2] drm/amdgpu: The call to amdgpu_xgmi_remove_device needs to 
be earlier than psp_hw_fini

The amdgpu_xgmi_remove_device function will send unload command to psp through 
psp ring to terminate xgmi, but psp ring has been destroyed in psp_hw_fini.

Signed-off-by: YiPeng Chai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c84fdef0ac45..2445255bbf01 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2787,6 +2787,9 @@ static int amdgpu_device_ip_fini_early(struct 
amdgpu_device *adev)
 
amdgpu_amdkfd_suspend(adev, false);
 
+   if (adev->gmc.xgmi.num_physical_nodes > 1)
+   amdgpu_xgmi_remove_device(adev);
+
/* Workaroud for ASICs need to disable SMC first */
amdgpu_device_smu_fini_early(adev);
 
@@ -2830,9 +2833,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device 
*adev)
if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
amdgpu_virt_release_ras_err_handler_data(adev);
 
-   if (adev->gmc.xgmi.num_physical_nodes > 1)
-   amdgpu_xgmi_remove_device(adev);
-
amdgpu_amdkfd_device_fini_sw(adev);
 
for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
--
2.25.1


RE: [PATCH] drm/amdgpu: TA unload messages are not actually sent to psp when amdgpu is uninstalled

2022-08-16 Thread Chai, Thomas
[AMD Official Use Only - General]

Hi Alex:
  When removing an amdgpu device, it may be difficult to change the point at
which psp_hw_fini is called.

1. The drm_dev_unplug call is at the beginning of the amdgpu_pci_remove
function, which makes the gpu device inaccessible to userspace operations.
If the call to psp_hw_fini were moved before drm_dev_unplug, userspace could
still access the gpu device while the psp is being torn down, which may lead
to unknown issues.

2. psp_hw_fini is called by the .hw_fini iterator in
amdgpu_device_ip_fini_early. Looking at the code path from amdgpu_pci_remove
down to where .hw_fini is called, there are many preparatory operations before
.hw_fini runs, which makes it very difficult to reorder psp_hw_fini or all of
the block .hw_fini callbacks.

   So can we do a workaround in psp_cmd_submit_buf when removing the amdgpu
device? One possible shape of such a workaround is sketched below.
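For reference, a rough sketch only of such a workaround (not the upstream fix;
it assumes the in_remove flag added by the removal-flow patch elsewhere in
this thread, and whether reusing it here is acceptable is exactly the open
question):

/* Sketch: keep the drm_dev_enter() guard for normal operation, but let
 * the command through while the device is intentionally being removed,
 * so the TA unload messages still reach the PSP.
 */
if (!psp->adev->in_remove) {
	if (!drm_dev_enter(adev_to_drm(psp->adev), &idx))
		return 0;
}

/* ... build and submit the command to the PSP ring as before ... */

if (!psp->adev->in_remove)
	drm_dev_exit(idx);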

-Original Message-
From: Alex Deucher  
Sent: Monday, August 15, 2022 10:22 PM
To: Chai, Thomas 
Cc: amd-gfx@lists.freedesktop.org; Zhang, Hawking ; 
Chen, Guchun ; Chai, Thomas 
Subject: Re: [PATCH] drm/amdgpu: TA unload messages are not actually sent to 
psp when amdgpu is uninstalled

On Mon, Aug 15, 2022 at 3:06 AM YiPeng Chai  wrote:
>
> The psp_cmd_submit_buf function is called by psp_hw_fini to send TA 
> unload messages to psp to terminate ras, asd and tmr.
> But when amdgpu is uninstalled, drm_dev_unplug is called earlier than 
> psp_hw_fini in amdgpu_pci_remove, the calling order as follows:
> static void amdgpu_pci_remove(struct pci_dev *pdev) {
> drm_dev_unplug
> ..
> amdgpu_driver_unload_kms->amdgpu_device_fini_hw->...
> ->.hw_fini->psp_hw_fini->...
> ->psp_ta_unload->psp_cmd_submit_buf
> ..
> }
> The program will return when calling drm_dev_enter in 
> psp_cmd_submit_buf.
>
> So the call to drm_dev_enter in psp_cmd_submit_buf should be removed, 
> so that the TA unload messages can be sent to the psp when amdgpu is 
> uninstalled.
>
> Signed-off-by: YiPeng Chai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 4 
>  1 file changed, 4 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index b067ce45d226..0578d8d094a7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -585,9 +585,6 @@ psp_cmd_submit_buf(struct psp_context *psp,
> if (psp->adev->no_hw_access)
> return 0;
>
> -   if (!drm_dev_enter(adev_to_drm(psp->adev), &idx))
> -   return 0;
> -

This check is to prevent the hardware from being accessed if the card is 
removed.  I think we need to fix the ordering elsewhere.

Alex

> memset(psp->cmd_buf_mem, 0, PSP_CMD_BUFFER_SIZE);
>
> memcpy(psp->cmd_buf_mem, cmd, sizeof(struct 
> psp_gfx_cmd_resp)); @@ -651,7 +648,6 @@ psp_cmd_submit_buf(struct psp_context 
> *psp,
> }
>
>  exit:
> -   drm_dev_exit(idx);
> return ret;
>  }
>
> --
> 2.25.1
>


RE: [PATCH 1/2] drm/amdgpu: The call to amdgpu_xgmi_remove_device needs to be earlier than psp_hw_fini

2022-08-15 Thread Chai, Thomas
[AMD Official Use Only - General]

OK, I will update the patch.

From: Zhang, Hawking 
Sent: Tuesday, August 16, 2022 11:51 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Wang, Yang(Kevin) 
Subject: Re: [PATCH 1/2] drm/amdgpu: The call to amdgpu_xgmi_remove_device 
needs to be earlier than psp_hw_fini

Fixed typo

Regards,
Hawking

From: Zhang, Hawking <hawking.zh...@amd.com>
Date: Tuesday, August 16, 2022 at 11:49
To: Chai, Thomas <yipeng.c...@amd.com>, amd-gfx@lists.freedesktop.org
Cc: Wang, Yang(Kevin) <kevinyang.w...@amd.com>
Subject: RE: [PATCH 1/2] drm/amdgpu: The call to amdgpu_xgmi_remove_device 
needs to be earlier than psp_hw_fini
[AMD Official Use Only - General]

Alternatively, it might be better to split the xgmi ta terminate out of 
xgmi_remove_device. In psp_hw_fini, check ta->fw and num_physical_nodes to 
terminate the xgmi ta, and make amdgpu_xgmi_remove_device only deal with 
software fini, like add_device. A rough sketch follows below.

Regards,
Hawking
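A rough sketch of what that suggestion could look like (the checks on
psp->ta_fw and num_physical_nodes are guesses based on the wording above, not
the final patch):

/* Sketch: terminate the XGMI TA from psp_hw_fini while the PSP ring is
 * still alive, and leave amdgpu_xgmi_remove_device() to do software
 * cleanup only.  Field names are assumptions for illustration.
 */
static int psp_hw_fini(void *handle)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)handle;
	struct psp_context *psp = &adev->psp;

	if (psp->ta_fw && adev->gmc.xgmi.num_physical_nodes > 1)
		psp_xgmi_terminate(psp);

	/* ... existing ras/asd/tmr teardown and ring destruction ... */

	return 0;
}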

-Original Message-
From: Chai, Thomas <yipeng.c...@amd.com>
Sent: Monday, August 15, 2022 15:03
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking <hawking.zh...@amd.com>; Wang, Yang(Kevin) <kevinyang.w...@amd.com>
Subject: RE: [PATCH 1/2] drm/amdgpu: The call to amdgpu_xgmi_remove_device 
needs to be earlier than psp_hw_fini

[AMD Official Use Only - General]

Ping on this series.

-Original Message-
From: Chai, Thomas <yipeng.c...@amd.com>
Sent: Friday, August 12, 2022 5:13 PM
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas <yipeng.c...@amd.com>; Zhang, Hawking <hawking.zh...@amd.com>; 
Wang, Yang(Kevin) <kevinyang.w...@amd.com>; Chai, Thomas <yipeng.c...@amd.com>
Subject: [PATCH 1/2] drm/amdgpu: The call to amdgpu_xgmi_remove_device needs to 
be earlier than psp_hw_fini

The amdgpu_xgmi_remove_device function will send unload command to psp through 
psp ring to terminate xgmi, but psp ring has been destroyed in psp_hw_fini.

Signed-off-by: YiPeng Chai <yipeng.c...@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c84fdef0ac45..2445255bbf01 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2787,6 +2787,9 @@ static int amdgpu_device_ip_fini_early(struct 
amdgpu_device *adev)

 amdgpu_amdkfd_suspend(adev, false);

+   if (adev->gmc.xgmi.num_physical_nodes > 1)
+   amdgpu_xgmi_remove_device(adev);
+
 /* Workaroud for ASICs need to disable SMC first */
 amdgpu_device_smu_fini_early(adev);

@@ -2830,9 +2833,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device 
*adev)
 if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
 amdgpu_virt_release_ras_err_handler_data(adev);

-   if (adev->gmc.xgmi.num_physical_nodes > 1)
-   amdgpu_xgmi_remove_device(adev);
-
 amdgpu_amdkfd_device_fini_sw(adev);

 for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
--
2.25.1


Recall: [PATCH] drm/amdgpu: add umc ras functions for navi31

2022-07-04 Thread Chai, Thomas
Chai, Thomas would like to recall the message, "[PATCH] drm/amdgpu: add umc ras 
functions for navi31".

RE: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

2022-03-29 Thread Chai, Thomas
[AMD Official Use Only]

Sorry for the confusing commit message.
This interface is only for a new function of the amdgpu ras tool. There is no 
impact on currently existing tools and scripts.

-Original Message-
From: Zhang, Hawking  
Sent: Tuesday, March 29, 2022 4:33 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Clements, John 
Subject: RE: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

[AMD Official Use Only]

I'm not sure I understand the fix correctly - It seems to me it is trying to 
stop user/test cases that initiate error injection request back-to-back? But 
anyway, we shouldn't make the change or leverage debugfs for that purpose, and 
there is no guarantee test scripts/applications will follow the rule as well. 

I guess we need to identify the root cause case by case and stop the invalid 
request in kernel driver.

Regards,
Hawking

-Original Message-
From: Chai, Thomas 
Sent: Tuesday, March 29, 2022 15:39
To: amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas ; Zhang, Hawking ; 
Zhou1, Tao ; Clements, John ; Chai, 
Thomas 
Subject: [PATCH] drm/amdgpu: Support AMDGPU RAS debugfs poll interface

Some AMDGPU RAS debugfs operations like UE injection can cause gpu reset. 
Before doing the next debugfs operation, the application should call poll to 
check if the gpu has finished recovering.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 38 -  
drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  6 
 2 files changed, 43 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 4bbed76b79c8..337e3e247a45 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -452,6 +452,12 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f,
 
/* data.inject.address is offset instead of absolute gpu 
address */
ret = amdgpu_ras_error_inject(adev, &data.inject);
+
+   if (!ret && (data.head.type == 
AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE)) {
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+   con->ras_ue_injected = 1;
+   }
break;
default:
ret = -EINVAL;
@@ -464,6 +470,30 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file 
*f,
return size;
 }
 
+/**
+ * DOC: Support AMDGPU RAS debugfs poll interface
+ *
+ * Some AMDGPU RAS debugfs operations like UE injection
+ * can cause gpu reset. Before doing the next debugfs
+ * operation, the application should call poll to check
+ * if gpu is in recovering status.
+ */
+static __poll_t amdgpu_ras_debugfs_ctrl_poll(struct file *f, struct 
+poll_table_struct *wait) {
+   struct amdgpu_device *adev = (struct amdgpu_device 
*)file_inode(f)->i_private;
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   __poll_t mask = 0;
+
+   /* For UE injection, wait for gpu to finish recovery */
+   if (con->ras_ue_injected)
+   poll_wait(f, &con->gpu_ready_wait_wq, wait);
+
+   if (!atomic_read(&con->in_recovery))
+   mask = EPOLLIN | EPOLLRDNORM;
+
+   return mask;
+}
+
 /**
  * DOC: AMDGPU RAS debugfs EEPROM table reset interface
  *
@@ -503,6 +533,7 @@ static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file 
*f,
 
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
.owner = THIS_MODULE,
+   .poll = amdgpu_ras_debugfs_ctrl_poll,
.read = NULL,
.write = amdgpu_ras_debugfs_ctrl_write,
.llseek = default_llseek
@@ -1837,6 +1868,11 @@ static void amdgpu_ras_do_recovery(struct work_struct 
*work)
if (amdgpu_device_should_recover_gpu(ras->adev))
amdgpu_device_gpu_recover(ras->adev, NULL);
atomic_set(&ras->in_recovery, 0);
+
+   if (ras->ras_ue_injected) {
+   ras->ras_ue_injected = 0;
+   wake_up_all(&ras->gpu_ready_wait_wq);
+   }
 }
 
 /* alloc/realloc bps array */
@@ -2279,7 +2315,7 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
INIT_DELAYED_WORK(&con->ras_counte_delay_work, amdgpu_ras_counte_dw);
atomic_set(&con->ras_ce_count, 0);
atomic_set(&con->ras_ue_count, 0);
-
+   init_waitqueue_head(&con->gpu_ready_wait_wq);
con->objs = (struct ras_manager *)(con + 1);
 
amdgpu_ras_set_context(adev, con);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index 606df8869b89..aea6bbb71501 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -379,6 +379,12 @@ struct amdgpu_ras {
 
/* Indicates smu whether need update bad channel info */
bool update_channel_flag;
+
+   /* UE injection flag */
+   uint32_t  ras_ue_injected;
+
+   /* Waiting for gpu ready work queue */
+   wait_q
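To make the intended userspace flow concrete, a minimal poll() usage sketch
for a ras tool is shown below; the debugfs path is the usual amdgpu location
and is an assumption for illustration, not something stated in the patch.

/* Minimal userspace sketch: after writing a UE inject command to the
 * ras_ctrl debugfs node, poll() the same node and wait until the driver
 * reports that gpu recovery has finished before issuing the next command.
 */
#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/debug/dri/0/ras/ras_ctrl"; /* assumed path */
	struct pollfd pfd;
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open ras_ctrl");
		return 1;
	}

	/* ... write the UE injection command string to fd here ... */

	pfd.fd = fd;
	pfd.events = POLLIN;
	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
		printf("gpu recovery finished, safe to continue\n");

	close(fd);
	return 0;
}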

RE: [PATCH V2] drm/amdgpu: Move common initialization operations of each ras block to one function

2022-03-07 Thread Chai, Thomas
[AMD Official Use Only]

OK

-Original Message-
From: Chen, Guchun  
Sent: Monday, March 7, 2022 5:11 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Clements, John ; Chai, Thomas ; 
Chai, Thomas 
Subject: RE: [PATCH V2] drm/amdgpu: Move common initialization operations of 
each ras block to one function

if (!adev->gmc.xgmi.connected_to_cpu) {
 adev->gmc.xgmi.ras = &xgmi_ras;
-   amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
-   adev->gmc.xgmi.ras_if = &adev->gmc.xgmi.ras->ras_block.ras_comm;
}

Coding style needs to be fixed as well. '{}' should be dropped as there is only 
one line after upper if.

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of yipechai
Sent: Monday, March 7, 2022 4:43 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Clements, John ; Chai, Thomas ; 
Chai, Thomas 
Subject: [PATCH V2] drm/amdgpu: Move common initialization operations of each 
ras block to one function

Define amdgpu_ras_sw_init function to initialize all ras blocks.

V2: Modify error debugging information.

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   6 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c|   2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 143 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|   1 +
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  |  21 ---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  16 ---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  28 
 drivers/gpu/drm/amd/amdgpu/mca_v3_0.c  |   6 -
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  17 ---
 9 files changed, 148 insertions(+), 92 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6113ddc765a7..0c83eb69dad5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2402,6 +2402,12 @@ static int amdgpu_device_ip_init(struct amdgpu_device 
*adev)
}
}
 
+   r = amdgpu_ras_sw_init(adev);
+   if (r) {
+   DRM_ERROR("amdgpu_ras_sw_init failed (%d).\n", r);
+   goto init_failed;
+   }
+
if (amdgpu_sriov_vf(adev))
amdgpu_virt_init_data_exchange(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index ab75e189bc0b..544241f357b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -440,8 +440,6 @@ int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev)  {
if (!adev->gmc.xgmi.connected_to_cpu) {
adev->gmc.xgmi.ras = &xgmi_ras;
-   amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
-   adev->gmc.xgmi.ras_if = &adev->gmc.xgmi.ras->ras_block.ras_comm;
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d3875618ebf5..89075ab9e82e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -2299,8 +2299,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
case CHIP_ALDEBARAN:
if (!adev->gmc.xgmi.connected_to_cpu) {
adev->nbio.ras = &nbio_v7_4_ras;
-   amdgpu_ras_register_ras_block(adev, 
&adev->nbio.ras->ras_block);
-   adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
}
break;
default:
@@ -2533,6 +2531,147 @@ void amdgpu_ras_suspend(struct amdgpu_device *adev)
amdgpu_ras_disable_all_features(adev, 1);  }
 
+int amdgpu_ras_sw_init(struct amdgpu_device *adev) {
+   int err = 0;
+
+   if (!amdgpu_ras_asic_supported(adev))
+   return 0;
+
+   if (adev->nbio.ras) {
+   err = amdgpu_ras_register_ras_block(adev, 
&adev->nbio.ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register nbio ras 
block!\n");
+   return err;
+   }
+   adev->nbio.ras_if = &adev->nbio.ras->ras_block.ras_comm;
+   }
+
+   if (adev->gmc.xgmi.ras) {
+   err = amdgpu_ras_register_ras_block(adev, 
&adev->gmc.xgmi.ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register xgmi ras 
block!\n");
+   return err;
+   }
+   adev->gmc.xgmi.ras_if = &adev->gmc.xgmi.ras->ras_block.ras_comm;
+   }
+
+   if (adev->gfx.ras) {
+   err = amdgpu_ras_register_ras_block(adev, 
&adev->gfx.ras->ras_block);
+   if (err) {
+   dev_err(adev->dev, "Failed to register gfx ras 
block!\n");

RE: [PATCH] drm/amdgpu: Move common initialization operations of each ras block to one function

2022-03-01 Thread Chai, Thomas
[AMD Official Use Only]

Hi Stanley,
Thanks for your suggestion. 
I add a comment after your comment.

-Original Message-
From: Yang, Stanley  
Sent: Tuesday, March 1, 2022 9:50 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Clements, John ; Chai, Thomas ; 
Chai, Thomas 
Subject: RE: [PATCH] drm/amdgpu: Move common initialization operations of each 
ras block to one function

[AMD Official Use Only]

Hi yipe,

One suggestion for this patch, please check my comment.

Regards,
Stanley
> -Original Message-
> From: amd-gfx  On Behalf Of yipechai
> Sent: Tuesday, March 1, 2022 5:46 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Zhou1, Tao ; Zhang, Hawking 
> ; Clements, John ; Chai, 
> Thomas ; Chai, Thomas 
> Subject: [PATCH] drm/amdgpu: Move common initialization operations of each 
> ras block to one function
>
> Define amdgpu_ras_sw_init function to initialize all ras blocks.
>
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   6 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c|   2 -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 143
> -
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|   1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  |  21 ---
>  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  16 ---
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  28 
>  drivers/gpu/drm/amd/amdgpu/mca_v3_0.c  |   6 -
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  17 ---
>  9 files changed, 148 insertions(+), 92 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 6113ddc765a7..72550e9f6058 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2402,6 +2402,12 @@ static int amdgpu_device_ip_init(struct 
> amdgpu_device *adev)
>   }
>   }
>
> + r = amdgpu_ras_sw_init(adev);
> + if (r) {
> + DRM_ERROR("amdgpu_ras_early_init failed (%d).\n", r);
> + goto init_failed;
> + }
> [Yang, Stanley]: This is the ras blocks' early init; I think it's more 
> reasonable to move amdgpu_ras_sw_init before the amdgpu_ras_init function.

  [Thomas] Sorry, I will fix this error message.
I also agree with you that all ras operations should be initialized
before amdgpu_ras_init, but the place where each ras instance is
initialized is different.
The ras block instances of mmhub, umc, sdma and hdp are initialized
in the IP .early_init function, while the gfx and mca ras block instances
are initialized in the IP .sw_init function.
So calling amdgpu_ras_sw_init after all of the IP .sw_init callbacks have
run may be the earliest place to initialize all ras blocks (see the sketch below).
   
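A condensed sketch of that ordering, with everything else in
amdgpu_device_ip_init elided (an illustration of the constraint, not the
literal patch):

static int amdgpu_device_ip_init_sketch(struct amdgpu_device *adev)
{
	int i, r;

	for (i = 0; i < adev->num_ip_blocks; i++) {
		/* gfx/mca assign their adev->*.ras pointers inside .sw_init;
		 * umc/mmhub/sdma/hdp already did so in .early_init */
		r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
		if (r)
			return r;
	}

	/* only now are all adev->*.ras pointers populated, so this is the
	 * earliest single place where every ras block can be registered */
	return amdgpu_ras_sw_init(adev);
}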

> +
>   if (amdgpu_sriov_vf(adev))
>   amdgpu_virt_init_data_exchange(adev);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index ab75e189bc0b..544241f357b2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -440,8 +440,6 @@ int amdgpu_gmc_ras_early_init(struct amdgpu_device 
> *adev)  {
>   if (!adev->gmc.xgmi.connected_to_cpu) {
>   adev->gmc.xgmi.ras = &xgmi_ras;
> - amdgpu_ras_register_ras_block(adev, 
> &adev->gmc.xgmi.ras->ras_block);
> - adev->gmc.xgmi.ras_if = &adev->gmc.xgmi.ras-
> >ras_block.ras_comm;
>   }
>
>   return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index d3875618ebf5..89075ab9e82e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2299,8 +2299,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
>   case CHIP_ALDEBARAN:
>   if (!adev->gmc.xgmi.connected_to_cpu) {
>   adev->nbio.ras = &nbio_v7_4_ras;
> - amdgpu_ras_register_ras_block(adev, 
> &adev->nbio.ras->ras_block);
> - adev->nbio.ras_if = &adev->nbio.ras-
> >ras_block.ras_comm;
>   }
>   break;
>   default:
> @@ -2533,6 +2531,147 @@ void amdgpu_ras_suspend(struct amdgpu_device 
> *adev)
>   amdgpu_ras_disable_all_features(adev, 1);  }
>
> +int amdgpu_ras_sw_init(struct amdgpu_device *adev) {
> + int err = 0;
> +
> + if (!amdgpu_ras_asic_supported(adev))
> + return 0;
> +
> + if (adev->nbio.ras) {
> + err = amdgpu_ras_register_ras_block(adev, 
> &adev->nbio.ras->ras_block);
> + if (err) {
> +

RE: [PATCH 7/7] drm/amdgpu: Remove redundant .ras_late_init initialization in some ras blocks

2022-02-15 Thread Chai, Thomas
[AMD Official Use Only]

OK

-Original Message-
From: Zhou1, Tao  
Sent: Wednesday, February 16, 2022 11:11 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 

Subject: RE: [PATCH 7/7] drm/amdgpu: Remove redundant .ras_late_init 
initialization in some ras blocks

[AMD Official Use Only]



> -Original Message-
> From: Chai, Thomas 
> Sent: Tuesday, February 15, 2022 3:41 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Clements, 
> John ; Chai, Thomas 
> Subject: [PATCH 7/7] drm/amdgpu: Remove redundant .ras_late_init 
> initialization in some ras blocks
> 
> 1. Define amdgpu_ras_block_late_init_default in amdgpu_ras.c as
>.ras_late_init common function, which is called when
>.ras_late_init of ras block doesn't initialize.
[Tao]: doesn't initialize -> "isn't initialized" or "is uninitialized" 

> 2. Remove the code of using amdgpu_ras_block_late_init to
>initialize .ras_late_init in ras blocks.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 4 
>  drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c   | 1 -
>  drivers/gpu/drm/amd/amdgpu/mca_v3_0.c   | 3 ---
>  4 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 6cb1e5d126d7..ad37df6e50ce 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2457,6 +2457,12 @@ int amdgpu_ras_block_late_init(struct 
> amdgpu_device *adev,
>   return r;
>  }
> 
> +int amdgpu_ras_block_late_init_default(struct amdgpu_device *adev,
> +  struct ras_common_if *ras_block)
> +{
> + return amdgpu_ras_block_late_init(adev, ras_block); }
> +
>  /* helper function to remove ras fs node and interrupt handler */  
> void amdgpu_ras_block_late_fini(struct amdgpu_device *adev,
> struct ras_common_if *ras_block) @@ -2535,6 +2541,8 
> @@ int 
> amdgpu_ras_late_init(struct amdgpu_device
> *adev)
>   obj = node->ras_obj;
>   if (obj->ras_late_init)
>   obj->ras_late_init(adev, &obj->ras_comm);
> + else
> + amdgpu_ras_block_late_init_default(adev, 
> &obj->ras_comm);
>   }
> 
>   return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index b719d2c3003b..412e44af1608 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1291,10 +1291,6 @@ static void gmc_v9_0_set_mmhub_ras_funcs(struct
> amdgpu_device *adev)
>   adev->mmhub.ras->ras_block.ras_comm.type = 
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
>   adev->mmhub.ras_if = &adev->mmhub.ras-
> >ras_block.ras_comm;
> 
> - /* If don't define special ras_late_init function, use default
> ras_late_init */
> - if (!adev->mmhub.ras->ras_block.ras_late_init)
> - adev->mmhub.ras->ras_block.ras_late_init =
> amdgpu_ras_block_late_init;
> -
>   /* If don't define special ras_fini function, use default 
> ras_fini */
>   if (!adev->mmhub.ras->ras_block.ras_fini)
>   adev->mmhub.ras->ras_block.ras_fini = 
> amdgpu_mmhub_ras_fini; diff 
> --git a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
> b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
> index c9e931f046f7..d7811e0327cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
> @@ -163,7 +163,6 @@ struct amdgpu_hdp_ras hdp_v4_0_ras = {
>   .type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
>   },
>   .hw_ops = _v4_0_ras_hw_ops,
> - .ras_late_init = amdgpu_ras_block_late_init,
>   .ras_fini = amdgpu_hdp_ras_fini,
>   },
>  };
> diff --git a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> index 12d09a58b644..b4b36899f5c6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/mca_v3_0.c
> @@ -71,7 +71,6 @@ struct amdgpu_mca_ras_block mca_v3_0_mp0_ras = {
>   },
>   .hw_ops = &mca_v3_0_mp0_hw_ops,
>   .ras_block_match = mca_v3_0_ras_block_match,
> - .ras_late_init = amdgpu_ras_block_late_init,
>   .ras_fini = mca_v3_0_mp0_ras_fini,
>   },
>  };
> @@ -104,7 +103,6 @@ struct amdgpu_mca_ras_block mca_v3_0_mp1_ras

RE: [PATCH 6/7] drm/amdgpu: define amdgpu_ras_late_init to call all ras blocks' .ras_late_init

2022-02-15 Thread Chai, Thomas
[AMD Official Use Only]



-Original Message-
From: Zhou1, Tao  
Sent: Wednesday, February 16, 2022 11:07 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 

Subject: RE: [PATCH 6/7] drm/amdgpu: define amdgpu_ras_late_init to call all 
ras blocks' .ras_late_init

[AMD Official Use Only]



> -Original Message-
> From: Chai, Thomas 
> Sent: Tuesday, February 15, 2022 3:41 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Clements, 
> John ; Chai, Thomas 
> Subject: [PATCH 6/7] drm/amdgpu: define amdgpu_ras_late_init to call 
> all ras blocks' .ras_late_init
> 
> Define amdgpu_ras_late_init to call all ras blocks' .ras_late_init.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c| 44 --
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c| 18 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|  1 +
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  |  6 ---
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c |  5 +--
>  drivers/gpu/drm/amd/amdgpu/soc15.c |  6 +--
>  7 files changed, 23 insertions(+), 59 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index a74a1b74a172..67ea23dbc618 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -2625,6 +2625,8 @@ static int amdgpu_device_ip_late_init(struct 
> amdgpu_device *adev)
>   adev->ip_blocks[i].status.late_initialized = true;
>   }
> 
> + amdgpu_ras_late_init(adev);
>[Tao]: do we need to pass return value here?

[Thomas] According to the previous code, the return value needs to be checked.
I will add it (a sketch follows below).
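A minimal sketch of that return-value handling, based on the per-device ras
block list used elsewhere in this series (the final upstream form may differ):

int amdgpu_ras_late_init(struct amdgpu_device *adev)
{
	struct amdgpu_ras_block_list *node, *tmp;
	struct amdgpu_ras_block_object *obj;
	int r;

	/* iterate all ras blocks registered on this device */
	list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
		obj = node->ras_obj;
		if (!obj)
			continue;

		if (obj->ras_late_init)
			r = obj->ras_late_init(adev, &obj->ras_comm);
		else
			r = amdgpu_ras_block_late_init_default(adev, &obj->ras_comm);
		if (r) {
			dev_err(adev->dev, "%s ras_late_init failed: %d\n",
				obj->ras_comm.name, r);
			return r;
		}
	}

	return 0;
}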

> +
>   amdgpu_ras_set_error_query_ready(adev, true);
> 
>   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE); diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index ebf4194b0699..49dd81c0db2d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -449,50 +449,6 @@ int amdgpu_gmc_ras_early_init(struct 
> amdgpu_device
> *adev)
> 
>  int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)  {
> - int r;
> -
> - if (adev->umc.ras && adev->umc.ras->ras_block.ras_late_init) {
> - r = adev->umc.ras->ras_block.ras_late_init(adev, adev-
> >umc.ras_if);
> - if (r)
> - return r;
> - }
> -
> - if (adev->mmhub.ras && adev->mmhub.ras->ras_block.ras_late_init) {
> - r = adev->mmhub.ras->ras_block.ras_late_init(adev, adev-
> >mmhub.ras_if);
> - if (r)
> - return r;
> - }
> -
> - if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init)
> {
> - r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, adev-
> >gmc.xgmi.ras_if);
> - if (r)
> - return r;
> - }
> -
> - if (adev->hdp.ras && adev->hdp.ras->ras_block.ras_late_init) {
> - r = adev->hdp.ras->ras_block.ras_late_init(adev, adev-
> >hdp.ras_if);
> - if (r)
> - return r;
> - }
> -
> - if (adev->mca.mp0.ras && adev->mca.mp0.ras->ras_block.ras_late_init)
> {
> - r = adev->mca.mp0.ras->ras_block.ras_late_init(adev, adev-
> >mca.mp0.ras_if);
> - if (r)
> - return r;
> - }
> -
> - if (adev->mca.mp1.ras && adev->mca.mp1.ras->ras_block.ras_late_init)
> {
> - r = adev->mca.mp1.ras->ras_block.ras_late_init(adev, adev-
> >mca.mp1.ras_if);
> - if (r)
> - return r;
> - }
> -
> - if (adev->mca.mpio.ras && adev->mca.mpio.ras-
> >ras_block.ras_late_init) {
> - r = adev->mca.mpio.ras->ras_block.ras_late_init(adev, adev-
> >mca.mpio.ras_if);
> - if (r)
> - return r;
> - }
> -
>   return 0;
>  }
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 1aff88fcea76..6cb1e5d126d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2522,6 +2522,24 @@ void amdgpu_ras_suspend(struct amdgpu_device
> *adev)
>   amdgpu_ras_disable_all_features(adev, 1);  }
>

RE: [PATCH 03/11] drm/amdgpu: Optimize amdgpu_hdp_ras_late_init/amdgpu_hdp_ras_fini function code

2022-02-09 Thread Chai, Thomas
[AMD Official Use Only]



-Original Message-
From: Zhou1, Tao  
Sent: Wednesday, February 9, 2022 4:54 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 

Subject: RE: [PATCH 03/11] drm/amdgpu: Optimize 
amdgpu_hdp_ras_late_init/amdgpu_hdp_ras_fini function code

[AMD Official Use Only]



> -Original Message-
> From: Chai, Thomas 
> Sent: Wednesday, February 9, 2022 1:57 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Clements, 
> John ; Chai, Thomas 
> Subject: [PATCH 03/11] drm/amdgpu: Optimize 
> amdgpu_hdp_ras_late_init/amdgpu_hdp_ras_fini function code
> 
> Optimize amdgpu_hdp_ras_late_init/amdgpu_hdp_ras_fini function code.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c | 37 ++---
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  1 +
>  drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c   |  1 +
>  3 files changed, 5 insertions(+), 34 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
> index 518966a26130..21a5f884dd2a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hdp.c
> @@ -26,43 +26,12 @@
> 
>  int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, void *ras_info)  {
> - int r;
> - struct ras_ih_if ih_info = {
> - .cb = NULL,
> - };
> - struct ras_fs_if fs_info = {
> - .sysfs_name = "hdp_err_count",
> - };
> -
> - if (!adev->hdp.ras_if) {
> - adev->hdp.ras_if = kmalloc(sizeof(struct ras_common_if),
> GFP_KERNEL);
> - if (!adev->hdp.ras_if)
> - return -ENOMEM;
> - adev->hdp.ras_if->block = AMDGPU_RAS_BLOCK__HDP;
> - adev->hdp.ras_if->type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE;
> - adev->hdp.ras_if->sub_block_index = 0;
> - }
> - ih_info.head = fs_info.head = *adev->hdp.ras_if;
> - r = amdgpu_ras_late_init(adev, adev->hdp.ras_if,
> -  &ih_info, &fs_info);
> - if (r || !amdgpu_ras_is_supported(adev, adev->hdp.ras_if->block)) {
> - kfree(adev->hdp.ras_if);
> - adev->hdp.ras_if = NULL;
> - }
> -
> - return r;
> + return amdgpu_ras_block_late_init(adev, adev->hdp.ras_if);
>  }
> 
>  void amdgpu_hdp_ras_fini(struct amdgpu_device *adev)  {
>   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__HDP) &&
> - adev->hdp.ras_if) {
> - struct ras_common_if *ras_if = adev->hdp.ras_if;
> - struct ras_ih_if ih_info = {
> - .cb = NULL,
> - };
> -
> - amdgpu_ras_late_fini(adev, ras_if, &ih_info);
> - kfree(ras_if);
> - }
> + adev->hdp.ras_if)
> + amdgpu_ras_block_late_fini(adev, adev->hdp.ras_if);
>  }
>[Tao]: Since hdp_ras_late_init/fini are simple wrapper, can we remove them and 
>call amdgpu_ras_block_late_init/fini directly?
>The same comment to other blocks.

[Thomas] Compared with amdgpu_ras_block_late_init/fini, hdp_ras_late_init/fini
have different function interface parameters (the two prototypes are shown
below). But we can do that as a new ticket later.
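For context, the interface mismatch referred to above is between these two
prototypes, both taken from the code quoted in this thread:

/* per-block wrapper: takes an opaque ras_info pointer */
int amdgpu_hdp_ras_late_init(struct amdgpu_device *adev, void *ras_info);

/* generic helper: takes the block's ras_common_if directly */
int amdgpu_ras_block_late_init(struct amdgpu_device *adev,
			       struct ras_common_if *ras_block);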

> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index af873c99d5e4..b12fe6703f02 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1302,6 +1302,7 @@ static void gmc_v9_0_set_hdp_ras_funcs(struct 
> amdgpu_device *adev)  {
>   adev->hdp.ras = &hdp_v4_0_ras;
>   amdgpu_ras_register_ras_block(adev, &adev->hdp.ras->ras_block);
> + adev->hdp.ras_if = &adev->hdp.ras->ras_block.ras_comm;
>  }
> 
>  static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev) diff 
> --git a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
> b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
> index 503c292b321e..a9ed4232cdeb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/hdp_v4_0.c
> @@ -160,6 +160,7 @@ struct amdgpu_hdp_ras hdp_v4_0_ras = {
>   .ras_comm = {
>   .name = "hdp",
>   .block = AMDGPU_RAS_BLOCK__HDP,
> + .type =
> AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE,
>   },
>   .hw_ops = &hdp_v4_0_ras_hw_ops,
>   .ras_late_init = amdgpu_hdp_ras_late_init,
> --
> 2.25.1


RE: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop

2022-02-06 Thread Chai, Thomas
OK

-Original Message-
From: Kuehling, Felix  
Sent: Tuesday, February 1, 2022 12:24 AM
To: Zhou1, Tao ; Chai, Thomas ; 
amd-gfx@lists.freedesktop.org
Cc: Clements, John ; Zhang, Hawking 

Subject: Re: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused by 
infinite loop


Am 2022-01-29 um 22:19 schrieb Zhou1, Tao:
> [AMD Official Use Only]
>
>
>
>> -Original Message-----
>> From: Chai, Thomas 
>> Sent: Saturday, January 29, 2022 8:34 PM
>> To: amd-gfx@lists.freedesktop.org
>> Cc: Chai, Thomas ; Zhang, Hawking 
>> ; Zhou1, Tao ; Clements, 
>> John ; Chai, Thomas 
>> Subject: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused 
>> by infinite loop
>>
>> 1. The infinite loop case only occurs on multiple cards support
>> ras functions.
>> 2. The explanation of root cause refer to 76641cbbf196523b5752c6cf68f86.
>> 3. Create new node to manage each unique ras instance to guarantee
>> each device .ras_list is completely independent.
>> 4. Fixes:7a6b8ab3231b511915cb94cac1debabf093.
>> 5. The soft locked logs are as follows:
>> [  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G   OE
>> 5.13.0-27-generic #29~20.04.1-Ubuntu
>> [  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, 
>> BIOS T20200717143848 07/17/2020 [  262.165698] Workqueue: events 
>> amdgpu_ras_do_recovery [amdgpu] [  262.165980] RIP:
>> 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu] [  262.166239] Code: 
>> 68
>> d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 
>> 89 e6 4c 89 ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 
>> 4d 89 f5 48 83 e8 28 48
>> 39 d3 74 25 49 89 c6 49 8b 45 [  262.166243] RSP: 
>> 0018:ac908fa87d80
>> EFLAGS: 0202 [  262.166247] RAX: c1394248 RBX: 
>> 91e4ab8d6e20
>> RCX: c1394248 [  262.166249] RDX: 91e4aa356e20 RSI:
>> 000e RDI: 91e4ab8c [  262.166252] RBP:
>> ac908fa87da8 R08: 0007 R09: 0001 [  
>> 262.166254] R10: 91e4930b64ec R11:  R12:
>> 000e [  262.166256] R13: 91e4aa356df8 R14: 
>> c1394320
>> R15: 0003 [  262.166258] FS:  ()
>> GS:92238fb4() knlGS: [  262.166261] CS:  
>> 0010
>> DS:  ES:  CR0: 80050033 [  262.166264] CR2:
>> 0001004865d0 CR3: 00406d796000 CR4: 00350ee0 [  
>> 262.166267] Call Trace:
>> [  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu] [  
>> 262.166529]  ? psi_task_switch+0xd2/0x250 [  262.166537]  ?
>> __switch_to+0x11d/0x460 [  262.166542]  ? __switch_to_asm+0x36/0x70 [  
>> 262.166549]  process_one_work+0x220/0x3c0 [  262.166556]
>> worker_thread+0x4d/0x3f0 [  262.166560]  ? 
>> process_one_work+0x3c0/0x3c0 [  262.166563]  kthread+0x12b/0x150 [  
>> 262.166568]  ?
>> set_kthread_struct+0x40/0x40 [  262.166571]  ret_from_fork+0x22/0x30
>>
>> Signed-off-by: yipechai 
>> ---
>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 
>> ++--
>> -  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 --
>>   2 files changed, 33 insertions(+), 7 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> index 9d7c778c1a2d..b0aa67308c31 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
>> @@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
>>  "mca_iohc",
>>   };
>>
>> +struct amdgpu_ras_block_list {
>> +/* ras block link */
>> +struct list_head node;
>> +
>> +struct amdgpu_ras_block_object *ras_obj; };
>> +
>>   const char *get_ras_block_str(struct ras_common_if *ras_block)  {
>>  if (!ras_block)
>> @@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object 
>> *amdgpu_ras_get_ras_block(struct amdgpu_de
>>  enum amdgpu_ras_block block,
>> uint32_t sub_block_index)  {
>>  int loop_cnt = 0;
>> -struct amdgpu_ras_block_object *obj, *tmp;
>> +struct amdgpu_ras_block_list *node, *tmp;
>> +struct amdgpu_ras_block_object *obj;
>>
>>  if (block >= AMDGPU_RAS_BLOCK__LAST)
>>  return NULL;
>> @@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object 
>> *amdgpu_ras_get_ras_block(struct amdgpu_de
>>  if (!amdgpu_ras_is_supported(adev, block))
>>  return NULL;
>>
>> - 

RE: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused by infinite loop

2022-01-29 Thread Chai, Thomas
OK

-Original Message-
From: Zhou1, Tao  
Sent: Sunday, January 30, 2022 11:20 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 

Subject: RE: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused by 
infinite loop

[AMD Official Use Only]



> -Original Message-
> From: Chai, Thomas 
> Sent: Saturday, January 29, 2022 8:34 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Clements, 
> John ; Chai, Thomas 
> Subject: [PATCH 1/2] drm/amdgpu: Fixed the defect of soft lock caused 
> by infinite loop
> 
> 1. The infinite loop case only occurs on multiple cards support
>ras functions.
> 2. The explanation of root cause refer to 76641cbbf196523b5752c6cf68f86.
> 3. Create new node to manage each unique ras instance to guarantee
>each device .ras_list is completely independent.
> 4. Fixes:7a6b8ab3231b511915cb94cac1debabf093.
> 5. The soft locked logs are as follows:
> [  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G   OE
> 5.13.0-27-generic #29~20.04.1-Ubuntu
> [  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, 
> BIOS T20200717143848 07/17/2020 [  262.165698] Workqueue: events 
> amdgpu_ras_do_recovery [amdgpu] [  262.165980] RIP:
> 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu] [  262.166239] Code: 
> 68
> d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 
> 89 e6 4c 89 ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 4d 
> 89 f5 48 83 e8 28 48
> 39 d3 74 25 49 89 c6 49 8b 45 [  262.166243] RSP: 
> 0018:ac908fa87d80
> EFLAGS: 0202 [  262.166247] RAX: c1394248 RBX: 
> 91e4ab8d6e20
> RCX: c1394248 [  262.166249] RDX: 91e4aa356e20 RSI:
> 000e RDI: 91e4ab8c [  262.166252] RBP:
> ac908fa87da8 R08: 0007 R09: 0001 [  
> 262.166254] R10: 91e4930b64ec R11:  R12:
> 000e [  262.166256] R13: 91e4aa356df8 R14: 
> c1394320
> R15: 0003 [  262.166258] FS:  ()
> GS:92238fb4() knlGS: [  262.166261] CS:  
> 0010
> DS:  ES:  CR0: 80050033 [  262.166264] CR2:
> 0001004865d0 CR3: 00406d796000 CR4: 00350ee0 [  
> 262.166267] Call Trace:
> [  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu] [  
> 262.166529]  ? psi_task_switch+0xd2/0x250 [  262.166537]  ?
> __switch_to+0x11d/0x460 [  262.166542]  ? __switch_to_asm+0x36/0x70 [  
> 262.166549]  process_one_work+0x220/0x3c0 [  262.166556]
> worker_thread+0x4d/0x3f0 [  262.166560]  ? 
> process_one_work+0x3c0/0x3c0 [  262.166563]  kthread+0x12b/0x150 [  
> 262.166568]  ?
> set_kthread_struct+0x40/0x40 [  262.166571]  ret_from_fork+0x22/0x30
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 37 ++--
> -  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |  3 --
>  2 files changed, 33 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 9d7c778c1a2d..b0aa67308c31 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -75,6 +75,13 @@ const char *ras_mca_block_string[] = {
>   "mca_iohc",
>  };
> 
> +struct amdgpu_ras_block_list {
> + /* ras block link */
> + struct list_head node;
> +
> + struct amdgpu_ras_block_object *ras_obj; };
> +
>  const char *get_ras_block_str(struct ras_common_if *ras_block)  {
>   if (!ras_block)
> @@ -880,7 +887,8 @@ static struct amdgpu_ras_block_object 
> *amdgpu_ras_get_ras_block(struct amdgpu_de
>   enum amdgpu_ras_block block,
> uint32_t sub_block_index)  {
>   int loop_cnt = 0;
> - struct amdgpu_ras_block_object *obj, *tmp;
> + struct amdgpu_ras_block_list *node, *tmp;
> + struct amdgpu_ras_block_object *obj;
> 
>   if (block >= AMDGPU_RAS_BLOCK__LAST)
>   return NULL;
> @@ -888,7 +896,13 @@ static struct amdgpu_ras_block_object 
> *amdgpu_ras_get_ras_block(struct amdgpu_de
>   if (!amdgpu_ras_is_supported(adev, block))
>   return NULL;
> 
> - list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> + list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
> + if (!node->ras_obj) {
> + DRM_ERROR("Warning: abnormal ras list node");
[Tao]: dev_warn is recommended.

> + continue;
> + }
> +
> + obj = node->ras_obj;
>   if (obj->ras_block_match) {
>   

RE: [PATCH] drm/amdgpu: Add judgement to avoid infinite loop

2022-01-29 Thread Chai, Thomas
I have a solution for this defect; I am debugging the modifications.

-Original Message-
From: Zhou1, Tao  
Sent: Saturday, January 29, 2022 3:54 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 

Subject: RE: [PATCH] drm/amdgpu: Add judgement to avoid infinite loop

[AMD Official Use Only]

As a quick workaround, I agree with the solution. But regarding the root cause, 
the list is still messed up.
Can we make ras_list a global variable shared across all cards, and add a 
list-empty check (or a flag to indicate the register status of the ras block) 
before the list add, to avoid redundant registration?

Regards,
Tao
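
(A minimal sketch of this proposal, assuming a single list shared by all cards
plus an emptiness check before the add; the global list name is illustrative
and locking is omitted:)

/* Sketch only: one ras_list for all cards, and each unique block object is
 * linked exactly once.  Relies on ras_block_obj->node having been
 * INIT_LIST_HEAD()-ed when the block object was set up, so an empty node
 * means "not registered yet".
 */
static LIST_HEAD(amdgpu_ras_global_list);	/* hypothetical shared list */

int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
		struct amdgpu_ras_block_object *ras_block_obj)
{
	if (!adev || !ras_block_obj)
		return -EINVAL;

	if (!list_empty(&ras_block_obj->node))
		return 0;	/* already registered by another card */

	list_add_tail(&ras_block_obj->node, &amdgpu_ras_global_list);
	return 0;
}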

> -Original Message-
> From: Chai, Thomas 
> Sent: Saturday, January 29, 2022 11:53 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Clements, 
> John ; Chai, Thomas 
> Subject: [PATCH] drm/amdgpu: Add judgement to avoid infinite loop
> 
> 1. The infinite loop causing soft lock occurs on multiple amdgpu cards
>supporting ras feature.
> 2. This a workaround patch. It is valid for multiple amdgpu cards of the
>same type.
> 3. The root cause is that each GPU card device has a separate .ras_list
>link header, but the instance and linked list node of each ras block
>are unique. When each device is initialized, each ras instance will
>repeatedly add link node to the device every time. In this way, only
>the .ras_list of the last initialized device is completely correct.
>the .ras_list->prev and .ras_list->next of the device initialzied
>before can still point to the correct ras instance, but the prev
>pointer and next pointer of the pointed ras instance both point to
>the last initialized device's .ras_ list instead of the beginning
>.ras_ list. When using list_for_each_entry_safe searches for
>non-existent Ras nodes on devices other than the last device, the
>last ras instance next pointer cannot always be equal to the
>beginning .ras_list, so that the loop cannot be terminated, the
>program enters a infinite loop.
>  BTW: Since the data and initialization process of each card are the same,
>   the link list between ras instances will not be destroyed every time
>   the device is initialized.
>  4. The soft locked logs are as follows:
> [  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G   OE
> 5.13.0-27-generic #29~20.04.1-Ubuntu
> [  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, 
> BIOS T20200717143848 07/17/2020 [  262.165698] Workqueue: events 
> amdgpu_ras_do_recovery [amdgpu] [  262.165980] RIP:
> 0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu] [  262.166239] Code: 
> 68
> d8 4c 8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 
> 89 e6 4c 89 ef e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 4d 
> 89 f5 48 83 e8 28 48
> 39 d3 74 25 49 89 c6 49 8b 45 [  262.166243] RSP: 
> 0018:ac908fa87d80
> EFLAGS: 0202 [  262.166247] RAX: c1394248 RBX: 
> 91e4ab8d6e20
> RCX: c1394248 [  262.166249] RDX: 91e4aa356e20 RSI:
> 000e RDI: 91e4ab8c [  262.166252] RBP:
> ac908fa87da8 R08: 0007 R09: 0001 [  
> 262.166254] R10: 91e4930b64ec R11:  R12:
> 000e [  262.166256] R13: 91e4aa356df8 R14: 
> c1394320
> R15: 0003 [  262.166258] FS:  ()
> GS:92238fb4() knlGS: [  262.166261] CS:  
> 0010
> DS:  ES:  CR0: 80050033 [  262.166264] CR2:
> 0001004865d0 CR3: 00406d796000 CR4: 00350ee0 [  
> 262.166267] Call Trace:
> [  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu] [  
> 262.166529]  ? psi_task_switch+0xd2/0x250 [  262.166537]  ?
> __switch_to+0x11d/0x460 [  262.166542]  ? __switch_to_asm+0x36/0x70 [  
> 262.166549]  process_one_work+0x220/0x3c0 [  262.166556]
> worker_thread+0x4d/0x3f0 [  262.166560]  ? 
> process_one_work+0x3c0/0x3c0 [  262.166563]  kthread+0x12b/0x150 [  
> 262.166568]  ?
> set_kthread_struct+0x40/0x40 [  262.166571]  ret_from_fork+0x22/0x30
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index d4e07d0acb66..3d533ef0783d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -884,6 +884,7 @@ static int amdgpu_ras_block_match_default(struct
> amdgpu_ras_block_object *block_  static struct amdgpu_ras_block_object 
> *amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
>

RE: [PATCH] drm/amdgpu: Add judgement to avoid infinite loop

2022-01-28 Thread Chai, Thomas
OK

-Original Message-
From: Chen, Guchun  
Sent: Saturday, January 29, 2022 12:02 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Clements, John ; Chai, Thomas ; 
Chai, Thomas 
Subject: RE: [PATCH] drm/amdgpu: Add judgement to avoid infinite loop

[Public]

Please add a Fixes tag, as it should fix a regression from former patch.

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of yipechai
Sent: Saturday, January 29, 2022 11:53 AM
To: amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Clements, John ; Chai, Thomas ; 
Chai, Thomas 
Subject: [PATCH] drm/amdgpu: Add judgement to avoid infinite loop

1. The infinite loop causing the soft lockup occurs on multiple amdgpu cards
   supporting the ras feature.
2. This is a workaround patch. It is valid for multiple amdgpu cards of the
   same type.
3. The root cause is that each GPU card device has a separate .ras_list
   list head, but the instance and linked list node of each ras block
   are unique. When each device is initialized, each ras instance
   repeatedly adds its link node to that device. In this way, only
   the .ras_list of the last initialized device is completely correct.
   The .ras_list->prev and .ras_list->next of a device initialized
   earlier can still point to the correct ras instance, but the prev
   pointer and next pointer of that ras instance both point to
   the last initialized device's .ras_list instead of the original
   .ras_list. When list_for_each_entry_safe searches for a
   non-existent ras node on a device other than the last one, the
   last ras instance's next pointer can never equal the original
   .ras_list, so the loop cannot terminate and the program enters
   an infinite loop (a standalone illustration follows after this patch).
 BTW: Since the data and initialization process of each card are the same,
  the linked list between ras instances will not be destroyed every time
  a device is initialized.
 4. The soft locked logs are as follows:
[  262.165690] CPU: 93 PID: 758 Comm: kworker/93:1 Tainted: G   OE 
5.13.0-27-generic #29~20.04.1-Ubuntu
[  262.165695] Hardware name: Supermicro AS -4124GS-TNR/H12DSG-O-CPU, BIOS 
T20200717143848 07/17/2020 [  262.165698] Workqueue: events 
amdgpu_ras_do_recovery [amdgpu] [  262.165980] RIP: 
0010:amdgpu_ras_get_ras_block+0x86/0xd0 [amdgpu] [  262.166239] Code: 68 d8 4c 
8d 71 d8 48 39 c3 74 54 49 8b 45 38 48 85 c0 74 32 44 89 fa 44 89 e6 4c 89 ef 
e8 82 e4 9b dc 85 c0 74 3c 49 8b 46 28 <49> 8d 56 28 4d 89 f5 48 83 e8 28 48 39 
d3 74 25 49 89 c6 49 8b 45 [  262.166243] RSP: 0018:ac908fa87d80 EFLAGS: 
0202 [  262.166247] RAX: c1394248 RBX: 91e4ab8d6e20 RCX: 
c1394248 [  262.166249] RDX: 91e4aa356e20 RSI: 000e 
RDI: 91e4ab8c [  262.166252] RBP: ac908fa87da8 R08: 
0007 R09: 0001 [  262.166254] R10: 91e4930b64ec 
R11:  R12: 000e [  262.166256] R13: 
91e4aa356df8 R14: c1394320 R15: 0003 [  262.166258] FS: 
 () GS:92238fb4() knlGS: [  
262.166261] CS:  0010 DS:  ES:  CR0: 80050033 [  262.166264] 
CR2: 0001004865d0 CR3: 00406d796000 CR4: 00350ee0 [  
262.166267] Call Trace:
[  262.166272]  amdgpu_ras_do_recovery+0x130/0x290 [amdgpu] [  262.166529]  ? 
psi_task_switch+0xd2/0x250 [  262.166537]  ? __switch_to+0x11d/0x460 [  
262.166542]  ? __switch_to_asm+0x36/0x70 [  262.166549]  
process_one_work+0x220/0x3c0 [  262.166556]  worker_thread+0x4d/0x3f0 [  
262.166560]  ? process_one_work+0x3c0/0x3c0 [  262.166563]  kthread+0x12b/0x150 
[  262.166568]  ? set_kthread_struct+0x40/0x40 [  262.166571]  
ret_from_fork+0x22/0x30

Signed-off-by: yipechai 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d4e07d0acb66..3d533ef0783d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -884,6 +884,7 @@ static int amdgpu_ras_block_match_default(struct 
amdgpu_ras_block_object *block_  static struct amdgpu_ras_block_object 
*amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
enum amdgpu_ras_block block, uint32_t 
sub_block_index)  {
+   int loop_cnt = 0;
struct amdgpu_ras_block_object *obj, *tmp;
 
if (block >= AMDGPU_RAS_BLOCK__LAST)
@@ -900,6 +901,9 @@ static struct amdgpu_ras_block_object 
*amdgpu_ras_get_ras_block(struct amdgpu_de
if (amdgpu_ras_block_match_default(obj, block) == 0)
return obj;
}
+
+   if (++loop_cnt >= AMDGPU_RAS_BLOCK__LAST)
+   break;
}
 
return NULL;
--
2.25.1
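
(To make the root cause in item 3 of the commit message above concrete, here is
a standalone userspace C sketch, not driver code: it shows how list_add-ing the
same node under a second list head strands iteration on the first head. All
names are illustrative.)

/* Minimal stand-in for the kernel's circular doubly-linked list. */
#include <stdio.h>

struct node { struct node *prev, *next; };

static void init_head(struct node *h) { h->prev = h->next = h; }

static void add_tail(struct node *n, struct node *head)
{
	/* same splice as list_add_tail(): insert n just before head */
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

int main(void)
{
	struct node dev0_list, dev1_list, shared_block;
	struct node *p;
	int steps = 0;

	init_head(&dev0_list);
	init_head(&dev1_list);

	add_tail(&shared_block, &dev0_list);	/* card 0 registers the block */
	add_tail(&shared_block, &dev1_list);	/* card 1 re-adds the SAME node */

	/* dev0_list.next still points at shared_block, but shared_block now
	 * points back to dev1_list, so this walk never returns to &dev0_list;
	 * the step limit here stands in for the soft lockup.
	 */
	for (p = dev0_list.next; p != &dev0_list && steps < 8; p = p->next)
		steps++;

	printf("gave up after %d steps without reaching dev0_list\n", steps);
	return 0;
}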


RE: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init

2022-01-20 Thread Chai, Thomas


-Original Message-
From: Lazar, Lijo  
Sent: Thursday, January 20, 2022 3:32 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Clements, John 
Subject: Re: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from 
.late_init to .early_init



On 1/20/2022 12:57 PM, Chai, Thomas wrote:
> 
> -Original Message-
> From: Lazar, Lijo 
> Sent: Thursday, January 20, 2022 1:49 PM
> To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
> Cc: Zhou1, Tao ; Zhang, Hawking 
> ; Clements, John ; Chai, 
> Thomas 
> Subject: Re: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization 
> from .late_init to .early_init
> 
> 
> 
> On 1/20/2022 8:48 AM, yipechai wrote:
>> Move xgmi ras initialization from .late_init to .early_init, which 
>> let xgmi ras can be initialized only once.
>>
>> Signed-off-by: yipechai 
>> ---
>>drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 15 ++-
>>drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
>>drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  5 +
>>drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +
>>4 files changed, 21 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> index 3483a82f5734..788c0257832d 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
>> @@ -436,6 +436,16 @@ void amdgpu_gmc_filter_faults_remove(struct 
>> amdgpu_device *adev, uint64_t addr,
>>  } while (fault->timestamp < tmp);
>>}
>>
>> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev) {
>> +if (!adev->gmc.xgmi.connected_to_cpu) {
>> +adev->gmc.xgmi.ras = &xgmi_ras;
>> +amdgpu_ras_register_ras_block(adev,
>> &adev->gmc.xgmi.ras->ras_block);
>> +}
>> +
>> +return 0;
>> +}
>> +
>>int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>{
>>  int r;
>> @@ -452,11 +462,6 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>>  return r;
>>  }
>>
>> -if (!adev->gmc.xgmi.connected_to_cpu) {
>> -adev->gmc.xgmi.ras = &xgmi_ras;
>> -amdgpu_ras_register_ras_block(adev,
>> &adev->gmc.xgmi.ras->ras_block);
>> -}
>> -
>>  if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init) {
>>  r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, NULL);
>>  if (r)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> index 0001631cfedb..ac4c0e50b45c 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
>> @@ -318,6 +318,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
>>uint16_t pasid, uint64_t timestamp);
>>void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t 
>> addr,
>>   uint16_t pasid);
>> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
>>int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>>void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>>int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev); 
>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> index 4f8d356f8432..7a6ad5d467b2 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>> @@ -719,6 +719,7 @@ static void gmc_v10_0_set_gfxhub_funcs(struct 
>> amdgpu_device *adev)
>>
>>static int gmc_v10_0_early_init(void *handle)
>>{
>> +int r;
>>  struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>>
>>  gmc_v10_0_set_mmhub_funcs(adev);
>> @@ -734,6 +735,10 @@ static int gmc_v10_0_early_init(void *handle)
>>  adev->gmc.private_aperture_end =
>>  adev->gmc.private_aperture_start + (4ULL << 30) - 1;
>>
>> +r = amdgpu_gmc_ras_early_init(adev);
>> +if (r)
>> +return r;
>> +
> 
>> At this point it's unknown if RAS is applicable for the SKU. I think this 
>> failure check shouldn't be there (here and below one).
> 
>> amdgpu_gmc_ras_early_init is return 0 always, that way also this check is 
>> not needed.
> 
> [Thomas]  Just like calling amdgpu_gmc_ras_late_init,  c

RE: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init

2022-01-19 Thread Chai, Thomas

-Original Message-
From: Lazar, Lijo  
Sent: Thursday, January 20, 2022 1:49 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Clements, John ; Chai, Thomas 
Subject: Re: [PATCH V2 1/2] drm/amdgpu: Move xgmi ras initialization from 
.late_init to .early_init



On 1/20/2022 8:48 AM, yipechai wrote:
> Move xgmi ras initialization from .late_init to .early_init, which let 
> xgmi ras can be initialized only once.
> 
> Signed-off-by: yipechai 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 15 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  |  5 +
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   |  5 +
>   4 files changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 3483a82f5734..788c0257832d 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -436,6 +436,16 @@ void amdgpu_gmc_filter_faults_remove(struct 
> amdgpu_device *adev, uint64_t addr,
>   } while (fault->timestamp < tmp);
>   }
>   
> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev) {
> + if (!adev->gmc.xgmi.connected_to_cpu) {
> + adev->gmc.xgmi.ras = &xgmi_ras;
> + amdgpu_ras_register_ras_block(adev,
> &adev->gmc.xgmi.ras->ras_block);
> + }
> +
> + return 0;
> +}
> +
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>   {
>   int r;
> @@ -452,11 +462,6 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev)
>   return r;
>   }
>   
> - if (!adev->gmc.xgmi.connected_to_cpu) {
> - adev->gmc.xgmi.ras = &xgmi_ras;
> - amdgpu_ras_register_ras_block(adev,
> &adev->gmc.xgmi.ras->ras_block);
> - }
> -
>   if (adev->gmc.xgmi.ras && adev->gmc.xgmi.ras->ras_block.ras_late_init) {
>   r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, NULL);
>   if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> index 0001631cfedb..ac4c0e50b45c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
> @@ -318,6 +318,7 @@ bool amdgpu_gmc_filter_faults(struct amdgpu_device *adev,
> uint16_t pasid, uint64_t timestamp);
>   void amdgpu_gmc_filter_faults_remove(struct amdgpu_device *adev, uint64_t 
> addr,
>uint16_t pasid);
> +int amdgpu_gmc_ras_early_init(struct amdgpu_device *adev);
>   int amdgpu_gmc_ras_late_init(struct amdgpu_device *adev);
>   void amdgpu_gmc_ras_fini(struct amdgpu_device *adev);
>   int amdgpu_gmc_allocate_vm_inv_eng(struct amdgpu_device *adev); diff 
> --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 4f8d356f8432..7a6ad5d467b2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -719,6 +719,7 @@ static void gmc_v10_0_set_gfxhub_funcs(struct 
> amdgpu_device *adev)
>   
>   static int gmc_v10_0_early_init(void *handle)
>   {
> + int r;
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   gmc_v10_0_set_mmhub_funcs(adev);
> @@ -734,6 +735,10 @@ static int gmc_v10_0_early_init(void *handle)
>   adev->gmc.private_aperture_end =
>   adev->gmc.private_aperture_start + (4ULL << 30) - 1;
>   
> + r = amdgpu_gmc_ras_early_init(adev);
> + if (r)
> + return r;
> +

>At this point it's unknown if RAS is applicable for the SKU. I think this 
>failure check shouldn't be there (here and in the one below).

>amdgpu_gmc_ras_early_init always returns 0, so this check is not 
>needed either.

[Thomas]  Just as with calling amdgpu_gmc_ras_late_init, checking the return 
status keeps the code extensible.
   In amdgpu_gmc_ras_early_init, the xgmi ras initialization currently 
always returns 0, but functions that need a return-status check may be added 
in the future.

Thanks,
Lijo

>   return 0;
>   }
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index c76ffd1a70cd..3cdd3d459d51 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1318,6 +1318,7 @@ static void gmc_v9_0_set_mca_funcs(struct 
> amdgpu_device *adev)
>   
>   static int gmc_v9_0_early_init(void *handle)
>   {
> + int r;
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle;
>   
>   /* ARCT and VEGA20 don't have XGMI defined in their IP discovery 
> tables */ @@ -1347,6 +1348,10 @@ static int gmc_v9_0_early_init(void *handle)
>   adev->gmc.private_aperture_end =
>   adev->gmc.private_aperture_start + (4ULL << 30) - 1;
>   
> + r = amdgpu_gmc_ras_early_init(adev);
> + if (r)
> + return r;
> +
>   return 0;
>   }
>   
> 


RE: [PATCH 2/3] drm/amdgpu: Move xgmi ras initialization from .late_init to .early_init

2022-01-18 Thread Chai, Thomas



-Original Message-
From: Zhou1, Tao  
Sent: Wednesday, January 19, 2022 2:11 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 

Subject: RE: [PATCH 2/3] drm/amdgpu: Move xgmi ras initialization from 
.late_init to .early_init

[AMD Official Use Only]



> -Original Message-
> From: Chai, Thomas 
> Sent: Wednesday, January 19, 2022 10:56 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Clements, 
> John ; Chai, Thomas 
> Subject: [PATCH 2/3] drm/amdgpu: Move xgmi ras initialization from 
> .late_init to .early_init
> 
> Move xgmi ras initialization from .late_init to .early_init, which let 
> xgmi ras be initialized only once.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 5 - 
> drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  | 1 -
>  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 9 +
>  3 files changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> index 3483a82f5734..d83eee1984c8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
> @@ -452,11 +452,6 @@ int amdgpu_gmc_ras_late_init(struct amdgpu_device
> *adev)
>   return r;
>   }
> 
> - if (!adev->gmc.xgmi.connected_to_cpu) {
> - adev->gmc.xgmi.ras = &xgmi_ras;
> - amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras-
> >ras_block);
> - }
> -
>   if (adev->gmc.xgmi.ras && 
> adev->gmc.xgmi.ras->ras_block.ras_late_init)
> {
>   r = adev->gmc.xgmi.ras->ras_block.ras_late_init(adev, NULL);
>   if (r)
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 4f8d356f8432..5f9f82091000 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -716,7 +716,6 @@ static void gmc_v10_0_set_gfxhub_funcs(struct 
> amdgpu_device *adev)
>   }
>  }
> 
> -
>[Tao]: Please don't introduce irrelevant change.

[Thomas] OK

>  static int gmc_v10_0_early_init(void *handle)  {
>   struct amdgpu_device *adev = (struct amdgpu_device *)handle; diff 
> --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index c76ffd1a70cd..8d1b11368a7b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -1303,6 +1303,14 @@ static void gmc_v9_0_set_hdp_ras_funcs(struct 
> amdgpu_device *adev)
>   amdgpu_ras_register_ras_block(adev, >hdp.ras->ras_block);  }
> 
> +static void gmc_v9_0_set_xgmi_ras_funcs(struct amdgpu_device *adev) {
> + if (!adev->gmc.xgmi.connected_to_cpu) {
> + adev->gmc.xgmi.ras = &xgmi_ras;
> + amdgpu_ras_register_ras_block(adev, &adev->gmc.xgmi.ras-
> >ras_block);
> + }
> +}
>[Tao]: Since the initialization of xgmi.ras is common for all versions of the IP, 
>I recommend creating a generic ras_early_init func for it.
>BTW, I think we can remove the check for block_obj's existence in 
>register_ras_block now.
 
[Thomas] I will update the patch.

> +
>  static void gmc_v9_0_set_mca_funcs(struct amdgpu_device *adev)  {
>   /* is UMC the right IP to check for MCA?  Maybe DF? */ @@ -1339,6
> +1347,7 @@ static int gmc_v9_0_early_init(void *handle)
>   gmc_v9_0_set_gfxhub_funcs(adev);
>   gmc_v9_0_set_hdp_ras_funcs(adev);
>   gmc_v9_0_set_mca_funcs(adev);
> + gmc_v9_0_set_xgmi_ras_funcs(adev);
> 
>   adev->gmc.shared_aperture_start = 0x2000ULL;
>   adev->gmc.shared_aperture_end =
> --
> 2.25.1


RE: [PATCH 1/5] drm/amdgpu: Fix the code style warnings in amdgpu_ras

2022-01-13 Thread Chai, Thomas
OK, I will update the patches to describe the warning types being fixed.

-Original Message-
From: Zhou1, Tao  
Sent: Friday, January 14, 2022 11:45 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 

Subject: RE: [PATCH 1/5] drm/amdgpu: Fix the code style warnings in amdgpu_ras

[AMD Official Use Only]



> -Original Message-
> From: Chai, Thomas 
> Sent: Friday, January 14, 2022 11:36 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Clements, 
> John ; Chai, Thomas 
> Subject: [PATCH 1/5] drm/amdgpu: Fix the code style warnings in 
> amdgpu_ras
> 
> Fix the code style warnings in amdgpu_ras.

[Tao] Could you add more description to explain the warnings you want to fix?

> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 41 
> +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 10 
> +++---
>  2 files changed, 30 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 0bb6b5354802..23502b2b0770 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -872,7 +872,7 @@ static int amdgpu_ras_enable_all_features(struct
> amdgpu_device *adev,  static int amdgpu_ras_block_match_default(struct
> amdgpu_ras_block_object *block_obj,
>   enum amdgpu_ras_block block)
>  {
> - if(!block_obj)
> + if (!block_obj)
>   return -EINVAL;
> 
>   if (block_obj->block == block)
> @@ -881,7 +881,7 @@ static int amdgpu_ras_block_match_default(struct
> amdgpu_ras_block_object *block_
>   return -EINVAL;
>  }
> 
> -static struct amdgpu_ras_block_object* 
> amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
> +static struct amdgpu_ras_block_object 
> +*amdgpu_ras_get_ras_block(struct amdgpu_device *adev,
>   enum amdgpu_ras_block block,
> uint32_t sub_block_index)  {
>   struct amdgpu_ras_block_object *obj, *tmp; @@ -941,7 +941,7 @@ 
> static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct 
> ras_err_d  int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
> struct ras_query_if *info)
>  {
> - struct amdgpu_ras_block_object* block_obj = NULL;
> + struct amdgpu_ras_block_object *block_obj = NULL;
>   struct ras_manager *obj = amdgpu_ras_find_obj(adev, >head);
>   struct ras_err_data err_data = {0, 0, 0, NULL};
> 
> @@ -953,7 +953,7 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,
>   } else {
>   block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
>   if (!block_obj || !block_obj->hw_ops)   {
> - dev_info(adev->dev, "%s doesn't config ras function \n",
> + dev_info(adev->dev, "%s doesn't config ras function.\n",
>   get_ras_block_str(>head));
>   return -EINVAL;
>   }
> @@ -1023,13 +1023,14 @@ int amdgpu_ras_query_error_status(struct
> amdgpu_device *adev,  int amdgpu_ras_reset_error_status(struct
> amdgpu_device *adev,
>   enum amdgpu_ras_block block)
>  {
> - struct amdgpu_ras_block_object* block_obj =
> amdgpu_ras_get_ras_block(adev, block, 0);
> + struct amdgpu_ras_block_object *block_obj = 
> +amdgpu_ras_get_ras_block(adev, block, 0);
> 
>   if (!amdgpu_ras_is_supported(adev, block))
>   return -EINVAL;
> 
>   if (!block_obj || !block_obj->hw_ops)   {
> - dev_info(adev->dev, "%s doesn't config ras function \n",
> ras_block_str(block));
> + dev_info(adev->dev, "%s doesn't config ras function.\n",
> + ras_block_str(block));
>   return -EINVAL;
>   }
> 
> @@ -1066,7 +1067,8 @@ int amdgpu_ras_error_inject(struct amdgpu_device 
> *adev,
>   return -EINVAL;
> 
>   if (!block_obj || !block_obj->hw_ops)   {
> - dev_info(adev->dev, "%s doesn't config ras function \n",
> get_ras_block_str(>head));
> + dev_info(adev->dev, "%s doesn't config ras function.\n",
> + get_ras_block_str(>head));
>   return -EINVAL;
>   }
> 
> @@ -1702,19 +1704,25 @@ static void 
> amdgpu_ras_log_on_err_counter(struct
> amdgpu_device *adev)  static void amdgpu_ras_error_status_query(struct
> amdgpu_device *adev,
> struct 

RE: [PATCH 2/2] drm/amdgpu: No longer insert ras blocks into ras_list if it already exists in ras_list

2022-01-12 Thread Chai, Thomas
Hi Felix:
 amdgpu_ras_register_ras_block is called by all IP ras blocks, and every 
IP also has different ras versions. Doing the common work in one place can 
reduce the chance of the ras function going wrong.

-Original Message-
From: Kuehling, Felix  
Sent: Thursday, January 13, 2022 12:39 AM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhou1, Tao ; Zhang, Hawking ; 
Clements, John ; Chai, Thomas 
Subject: Re: [PATCH 2/2] drm/amdgpu: No longer insert ras blocks into ras_list 
if it already exists in ras_list


Am 2022-01-12 um 2:48 a.m. schrieb yipechai:
> No longer insert ras blocks into ras_list if it already exists in ras_list.
>
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 
>  1 file changed, 8 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 62be0b4909b3..e6d3bb4b56e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2754,9 +2754,17 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device 
> *adev)  int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
>   struct amdgpu_ras_block_object* ras_block_obj)  {
> + struct amdgpu_ras_block_object *obj, *tmp;
>   if (!adev || !amdgpu_ras_asic_supported(adev) || !ras_block_obj)
>   return -EINVAL;
>  
> + /* If the ras object had been in ras_list, doesn't add it to ras_list 
> again */
> + list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> + if (obj == ras_block_obj) {
Instead of a loop, can't this be done more efficiently with "if 
(!list_empty(&ras_block_obj->node))"?

Of course this would require that you move the INIT_LIST_HEAD to some earlier 
stage so that list_empty is reliable.

Regards,
  Felix
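
(A minimal sketch of the suggestion, assuming INIT_LIST_HEAD() has been moved
to the stage where the block object is set up, so that list_empty() reliably
means "not registered yet"; this is illustrative, not the merged fix:)

int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
		struct amdgpu_ras_block_object *ras_block_obj)
{
	if (!adev || !amdgpu_ras_asic_supported(adev) || !ras_block_obj)
		return -EINVAL;

	/* node already linked into some ras_list: skip the redundant add */
	if (!list_empty(&ras_block_obj->node))
		return 0;

	list_add_tail(&ras_block_obj->node, &adev->ras_list);
	return 0;
}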


> + return 0;
> + }
> + }
> +
> + INIT_LIST_HEAD(&ras_block_obj->node);
> + list_add_tail(&ras_block_obj->node, &adev->ras_list);
>  


RE: [PATCH 2/2] drm/amdgpu: No longer insert ras blocks into ras_list if it already exists in ras_list

2022-01-12 Thread Chai, Thomas



-Original Message-
From: Zhou1, Tao  
Sent: Wednesday, January 12, 2022 4:37 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 

Subject: RE: [PATCH 2/2] drm/amdgpu: No longer insert ras blocks into ras_list 
if it already exists in ras_list

[AMD Official Use Only]



> -Original Message-
> From: Chai, Thomas 
> Sent: Wednesday, January 12, 2022 3:48 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Clements, 
> John ; Chai, Thomas 
> Subject: [PATCH 2/2] drm/amdgpu: No longer insert ras blocks into 
> ras_list if it already exists in ras_list
> 
> No longer insert ras blocks into ras_list if it already exists in ras_list.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 62be0b4909b3..e6d3bb4b56e4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2754,9 +2754,17 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device
> *adev)  int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
>   struct amdgpu_ras_block_object* ras_block_obj)  {
> + struct amdgpu_ras_block_object *obj, *tmp;
>   if (!adev || !amdgpu_ras_asic_supported(adev) || !ras_block_obj)
>   return -EINVAL;
> 
> + /* If the ras object had been in ras_list, doesn't add it to 
> +ras_list again */
>[Tao] How about "If the ras object is in ras_list, don't add it again"

[Thomas] OK

> + list_for_each_entry_safe(obj, tmp, &adev->ras_list, node) {
> + if (obj == ras_block_obj) {
> + return 0;
> + }
> + }

>[Tao] The patch is OK for me currently, but I think the root cause is that we 
>initialize adev->gmc.xgmi.ras in gmc_ras_late_init; the initialization should 
>only happen at modprobe stage, and we can create a general gmc_early_init 
>for it.

[Thomas] This can be done as a new task.

> +
> + INIT_LIST_HEAD(&ras_block_obj->node);
> + list_add_tail(&ras_block_obj->node, &adev->ras_list);
> 
> --
> 2.25.1


RE: [PATCH 1/2] drm/amdgpu: Add a filter condition to restrict the SW ras function to be registered only by asics whose hardware supports the ras function

2022-01-12 Thread Chai, Thomas



-Original Message-
From: Zhou1, Tao  
Sent: Wednesday, January 12, 2022 4:28 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Clements, John 

Subject: RE: [PATCH 1/2] drm/amdgpu: Add a filter condition to restrict the SW 
ras function to be registered only by asics whose hardware supports the ras 
function

[AMD Official Use Only]



> -Original Message-
> From: Chai, Thomas 
> Sent: Wednesday, January 12, 2022 3:48 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Clements, 
> John ; Chai, Thomas 
> Subject: [PATCH 1/2] drm/amdgpu: Add a filter condition to restrict 
> the SW ras function to be registered only by asics whose hardware 
> supports the ras function

>[Tao] The subject is too long, I think "add ras supported check for 
>register_ras_block" is enough.
[Thomas] Ok.

> 
> Add a filter condition to restrict the SW ras function to be 
> registered only by asics whose hardware supports the ras function.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index b1bedfd4febc..62be0b4909b3 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2754,7 +2754,7 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device
> *adev)  int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
>   struct amdgpu_ras_block_object* ras_block_obj)  {
> - if (!adev || !ras_block_obj)
> + if (!adev || !amdgpu_ras_asic_supported(adev) || !ras_block_obj)
>   return -EINVAL;

>[Tao] Can we return 0 if !amdgpu_ras_asic_supported(adev)? It's not an error.
[Thomas] OK.

> 
>   INIT_LIST_HEAD(&ras_block_obj->node);
> --
> 2.25.1


RE: [PATCH V2 11/11] drm/amdgpu: Move error inject function from amdgpu_ras.c to each block

2021-12-06 Thread Chai, Thomas
I can add a default error injection function in amdgpu_ras.c; if a block 
doesn't define its own .ras_error_inject function, it will use the default error 
injection in amdgpu_ras.c.
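
(A minimal sketch of that fallback, assuming a default helper that wraps
psp_ras_trigger_error; the helper name and the exact dispatch site are
illustrative, not from the tree:)

/* Sketch only: use the block-specific injector when one is provided,
 * otherwise fall back to the generic psp-based injection.
 */
static int amdgpu_ras_error_inject_default(struct amdgpu_device *adev,
		struct ta_ras_trigger_error_input *block_info)
{
	return psp_ras_trigger_error(&adev->psp, block_info);
}

	/* ... inside amdgpu_ras_error_inject(), after block_info is filled: */
	if (block_obj && block_obj->ops && block_obj->ops->ras_error_inject)
		ret = block_obj->ops->ras_error_inject(adev, info);
	else
		ret = amdgpu_ras_error_inject_default(adev, &block_info);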

-Original Message-
From: Zhou1, Tao  
Sent: Monday, December 6, 2021 3:34 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: RE: [PATCH V2 11/11] drm/amdgpu: Move error inject function from 
amdgpu_ras.c to each block

[AMD Official Use Only]

The error injection has no difference among RAS blocks except GFX and XGMI.
I agree to move the xgmi error injection to amdgpu_xgmi.c, but I don't think 
it's necessary to implement specific error injection functions for all other 
RAS blocks.

Regards,
Tao

> -Original Message-
> From: Chai, Thomas 
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Chai, Thomas 
> 
> Subject: [PATCH V2 11/11] drm/amdgpu: Move error inject function from 
> amdgpu_ras.c to each block
> 
> Move each block error inject function from amdgpu_ras.c to each block.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  | 62 
> +--- drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 28 
> +++
>  drivers/gpu/drm/amd/amdgpu/mca_v3_0.c| 18 +++
>  drivers/gpu/drm/amd/amdgpu/mmhub_v1_0.c  | 16 ++ 
> drivers/gpu/drm/amd/amdgpu/mmhub_v1_7.c  | 16 ++ 
> drivers/gpu/drm/amd/amdgpu/mmhub_v9_4.c  | 16 ++
>  drivers/gpu/drm/amd/amdgpu/nbio_v7_4.c   | 16 ++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 16 ++
>  drivers/gpu/drm/amd/amdgpu/sdma_v4_4.c   | 16 ++
>  drivers/gpu/drm/amd/amdgpu/umc_v6_1.c| 16 ++
>  drivers/gpu/drm/amd/amdgpu/umc_v6_7.c| 16 ++
>  drivers/gpu/drm/amd/amdgpu/umc_v8_7.c| 16 ++
>  12 files changed, 201 insertions(+), 51 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 2e38bd3d3d45..87b625d305c9 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -1032,31 +1032,7 @@ int amdgpu_ras_reset_error_status(struct
> amdgpu_device *adev,
>   return 0;
>  }
> 
> -/* Trigger XGMI/WAFL error */
> -static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
> -  struct ta_ras_trigger_error_input *block_info)
> -{
> - int ret;
> -
> - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
> - dev_warn(adev->dev, "Failed to disallow df cstate");
> 
> - if (amdgpu_dpm_allow_xgmi_power_down(adev, false))
> - dev_warn(adev->dev, "Failed to disallow XGMI power down");
> -
> - ret = psp_ras_trigger_error(&adev->psp, block_info);
> -
> - if (amdgpu_ras_intr_triggered())
> - return ret;
> -
> - if (amdgpu_dpm_allow_xgmi_power_down(adev, true))
> - dev_warn(adev->dev, "Failed to allow XGMI power down");
> -
> - if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_ALLOW))
> - dev_warn(adev->dev, "Failed to allow df cstate");
> -
> - return ret;
> -}
> 
>  /* wrapper of psp_ras_trigger_error */  int 
> amdgpu_ras_error_inject(struct amdgpu_device *adev, @@ -1076,41
> +1052,25 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev,
>   if (!obj)
>   return -EINVAL;
> 
> + if (!block_obj || !block_obj->ops)  {
> + dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> + return -EINVAL;
> + }
> +
>   /* Calculate XGMI relative offset */
>   if (adev->gmc.xgmi.num_physical_nodes > 1) {
> - block_info.address =
> - amdgpu_xgmi_get_relative_phy_addr(adev,
> -   block_info.address);
> + block_info.address =
> amdgpu_xgmi_get_relative_phy_addr(adev,
> +block_info.address);
>   }
> 
> - switch (info->head.block) {
> - case AMDGPU_RAS_BLOCK__GFX:
> - if (!block_obj || !block_obj->ops)  {
> - dev_info(adev->dev, "%s don't config ras function \n",
> get_ras_block_str(&info->head));
> - return -EINVAL;
> - }
> - if (block_obj->ops->ras_error_inject)
> + if (block_obj->ops->ras_error_inject) {
> + if(info->head.block == AMDGPU_RAS_BLOCK__GFX)
>   ret = block_obj->ops->ras_error_inject(adev, info);
> - break;
> - case AMDGPU_RAS_BLO

RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops

2021-12-06 Thread Chai, Thomas
Hi Tao:
 Thanks for your review. I added another two comments below yours; 
please review again.

-Original Message-
From: Zhou1, Tao  
Sent: Tuesday, December 7, 2021 12:07 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the 
unified ras block data and ops

[AMD Official Use Only]

Hi Thomas,

Please see my two comments.

Regards,
Tao

> -Original Message-
> From: Chai, Thomas 
> Sent: Tuesday, December 7, 2021 11:37 AM
> To: Zhou1, Tao ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking 
> Subject: RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for 
> the unified ras block data and ops
> 
> Hi tao:
>  I add my comments behind your comments. Please review.
> 
> -Original Message-
> From: Zhou1, Tao 
> Sent: Monday, December 6, 2021 2:58 PM
> To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
> Cc: Zhang, Hawking 
> Subject: RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for 
> the unified ras block data and ops
> 
> [AMD Official Use Only]
> 
> Please see my comments inline.
> 
> > -Original Message-
> > From: Chai, Thomas 
> > Sent: Wednesday, December 1, 2021 6:53 PM
> > To: amd-gfx@lists.freedesktop.org
> > Cc: Chai, Thomas ; Zhang, Hawking 
> > ; Zhou1, Tao ; Chai,
> Thomas
> > 
> > Subject: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for 
> > the unified ras block data and ops
> >
> > 1.Modify gfx block to fit for the unified ras block data and ops 
> > 2.Implement .ras_block_match function pointer for gfx block to identify 
> > itself.
> > 3.Change amdgpu_gfx_ras_funcs to amdgpu_gfx_ras, and the 
> > corresponding variable name remove _funcs suffix.
> > 4.Remove the const flag of gfx ras variable so that gfx ras block 
> > can be able to be insertted into amdgpu device ras block link list.
> > 5.Invoke amdgpu_ras_register_ras_block function to register gfx ras 
> > block into amdgpu device ras block link list.
> > 6.Remove the redundant code about gfx in amdgpu_ras.c after using 
> > the unified ras block.
> >
> > Signed-off-by: yipechai 
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  6 +- 
> > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 15 ++--- 
> > drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 80 
> > ++--
> -
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 73 +++---
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c   | 39 
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h   |  2 +-
> >  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 42 + 
> > drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h |  2 +-
> >  8 files changed, 178 insertions(+), 81 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > index 1795d448c700..da8691259ac1 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> > @@ -696,9 +696,9 @@ int amdgpu_gfx_process_ras_data_cb(struct
> > amdgpu_device *adev,
> >  */
> > if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
> > kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> > -   if (adev->gfx.ras_funcs &&
> > -   adev->gfx.ras_funcs->query_ras_error_count)
> > -   adev->gfx.ras_funcs->query_ras_error_count(adev,
> > err_data);
> > +   if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> > +   adev->gfx.ras->ras_block.ops->query_ras_error_count)
> > +   adev->gfx.ras->ras_block.ops-
> > >query_ras_error_count(adev, err_data);
> > amdgpu_ras_reset_gpu(adev);
> > }
> > return AMDGPU_RAS_SUCCESS;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > index 6b78b4a0e182..ff4a8428a84b 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> > @@ -31,6 +31,7 @@
> >  #include "amdgpu_ring.h"
> >  #include "amdgpu_rlc.h"
> >  #include "soc15.h"
> > +#include "amdgpu_ras.h"
> >
> >  /* GFX current status */
> >  #define AMDGPU_GFX_NORMAL_MODE 0xL
> > @@ -213,16 +214,8 @@ struct amdgpu_cu_info {
> > uint32_t bitmap[4][4];
> >  };
> >
> > -struct amdgpu_gfx_ras_funcs {
> > -   int (*ras_late_init)(struc

RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the unified ras block data and ops

2021-12-06 Thread Chai, Thomas
Hi Tao:
 I added my comments below yours. Please review.

-Original Message-
From: Zhou1, Tao  
Sent: Monday, December 6, 2021 2:58 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: RE: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the 
unified ras block data and ops

[AMD Official Use Only]

Please see my comments inline.

> -Original Message-
> From: Chai, Thomas 
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Chai, Thomas 
> 
> Subject: [PATCH V2 03/11] drm/amdgpu: Modify gfx block to fit for the 
> unified ras block data and ops
> 
> 1.Modify gfx block to fit for the unified ras block data and ops 
> 2.Implement .ras_block_match function pointer for gfx block to identify 
> itself.
> 3.Change amdgpu_gfx_ras_funcs to amdgpu_gfx_ras, and the corresponding 
> variable name remove _funcs suffix.
> 4.Remove the const flag of gfx ras variable so that gfx ras block can 
> be able to be insertted into amdgpu device ras block link list.
> 5.Invoke amdgpu_ras_register_ras_block function to register gfx ras 
> block into amdgpu device ras block link list.
> 6.Remove the redundant code about gfx in amdgpu_ras.c after using the 
> unified ras block.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c |  6 +- 
> drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 15 ++--- 
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 80 ++---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 73 +++---
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.c   | 39 
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4.h   |  2 +-
>  drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.c | 42 + 
> drivers/gpu/drm/amd/amdgpu/gfx_v9_4_2.h |  2 +-
>  8 files changed, 178 insertions(+), 81 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 1795d448c700..da8691259ac1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -696,9 +696,9 @@ int amdgpu_gfx_process_ras_data_cb(struct
> amdgpu_device *adev,
>*/
>   if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX)) {
>   kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
> - if (adev->gfx.ras_funcs &&
> - adev->gfx.ras_funcs->query_ras_error_count)
> - adev->gfx.ras_funcs->query_ras_error_count(adev,
> err_data);
> + if (adev->gfx.ras && adev->gfx.ras->ras_block.ops &&
> + adev->gfx.ras->ras_block.ops->query_ras_error_count)
> + adev->gfx.ras->ras_block.ops-
> >query_ras_error_count(adev, err_data);
>   amdgpu_ras_reset_gpu(adev);
>   }
>   return AMDGPU_RAS_SUCCESS;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> index 6b78b4a0e182..ff4a8428a84b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
> @@ -31,6 +31,7 @@
>  #include "amdgpu_ring.h"
>  #include "amdgpu_rlc.h"
>  #include "soc15.h"
> +#include "amdgpu_ras.h"
> 
>  /* GFX current status */
>  #define AMDGPU_GFX_NORMAL_MODE   0xL
> @@ -213,16 +214,8 @@ struct amdgpu_cu_info {
>   uint32_t bitmap[4][4];
>  };
> 
> -struct amdgpu_gfx_ras_funcs {
> - int (*ras_late_init)(struct amdgpu_device *adev);
> - void (*ras_fini)(struct amdgpu_device *adev);
> - int (*ras_error_inject)(struct amdgpu_device *adev,
> - void *inject_if);
> - int (*query_ras_error_count)(struct amdgpu_device *adev,
> -  void *ras_error_status);
> - void (*reset_ras_error_count)(struct amdgpu_device *adev);
> - void (*query_ras_error_status)(struct amdgpu_device *adev);
> - void (*reset_ras_error_status)(struct amdgpu_device *adev);
> +struct amdgpu_gfx_ras {
> + struct amdgpu_ras_block_object  ras_block;
>   void (*enable_watchdog_timer)(struct amdgpu_device *adev);  };

>[Tao] Can we add the "enable_watchdog_timer" function to the amdgpu_ras_block_ops 
>structure?
>And I think using ras_block directly is simpler than the amdgpu_gfx_ras 
>gfx_v9_0_ras structure.

[Thomas] The 'enable_watchdog_timer' function is not a common function. It is 
only defined by gfx_v9_4_2.c and called in gfx_v9_0.c.
   I think the function pointers in the amdgpu_ras_block_ops structure 
should be limited to the functions used by most blocks.
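
(A minimal sketch of that arrangement: the gfx-only callback sits next to the
embedded common ras_block, so generic RAS code never sees it while gfx code can
still reach it; the call site below is illustrative of gfx late-init code:)

	if (adev->gfx.ras && adev->gfx.ras->enable_watchdog_timer)
		adev->gfx.ras->enable_watchdog_timer(adev);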

RE: [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem when other ras blocks' .h include amdgpu_ras.h

2021-12-06 Thread Chai, Thomas



-Original Message-
From: Zhou1, Tao  
Sent: Monday, December 6, 2021 2:57 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: RE: [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed problem 
when other ras blocks' .h include amdgpu_ras.h

[AMD Official Use Only]



> -Original Message-
> From: Chai, Thomas 
> Sent: Wednesday, December 1, 2021 6:53 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Chai, Thomas ; Zhang, Hawking 
> ; Zhou1, Tao ; Chai, Thomas 
> 
> Subject: [PATCH V2 02/11] drm/amdgpu: Modify the compilation failed 
> problem when other ras blocks' .h include amdgpu_ras.h
> 
> Modify the compilation failed problem when other ras blocks' .h 
> include

>[Tao] 'Fix' is better than "Modify" here.
[Thomas] OK.

> amdgpu_ras.h.
> 
> Signed-off-by: yipechai 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 22 ++ 
> drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 23 ---
>  2 files changed, 26 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 8713575c7cf1..1cf1f6331db1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2739,6 +2739,28 @@ static void
> amdgpu_register_bad_pages_mca_notifier(void)
>  }
>  }
>  #endif
> +
> +/* check if ras is supported on block, say, sdma, gfx */ int 
> +amdgpu_ras_is_supported(struct amdgpu_device *adev,
> + unsigned int block)
> +{
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + if (block >= AMDGPU_RAS_BLOCK_COUNT)
> + return 0;
> + return ras && (adev->ras_enabled & (1 << block)); }
> +
> +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) {
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> + schedule_work(>recovery_work);
> + return 0;
> +}
> +
> +
>  /* Rigister each ip ras block into amdgpu ras */  int 
> amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
>   struct amdgpu_ras_block_object* ras_block_obj) diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index d6e5e3c862bd..41623a649fa1 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -514,16 +514,6 @@ struct amdgpu_ras_block_ops {
>  #define amdgpu_ras_get_context(adev) ((adev)->psp.ras_context.ras)
>  #define amdgpu_ras_set_context(adev, ras_con)((adev)-
> >psp.ras_context.ras = (ras_con))
> 
> -/* check if ras is supported on block, say, sdma, gfx */ -static 
> inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
> - unsigned int block)
> -{
> - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> - if (block >= AMDGPU_RAS_BLOCK_COUNT)
> - return 0;
> - return ras && (adev->ras_enabled & (1 << block));
> -}
> 
>  int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
> 
> @@ -540,15 +530,6 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device 
> *adev,
> 
>  int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
> 
> -static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) -{
> - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> - if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> - schedule_work(>recovery_work);
> - return 0;
> -}
> -
>  static inline enum ta_ras_block
>  amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
>   switch (block) {
> @@ -680,5 +661,9 @@ const char *get_ras_block_str(struct ras_common_if 
> *ras_block);
> 
>  bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
> 
> +int amdgpu_ras_is_supported(struct amdgpu_device *adev,  unsigned int
> block);
> +
> +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev);
> +
>  int amdgpu_ras_register_ras_block(struct amdgpu_device *adev, struct
> amdgpu_ras_block_object* ras_block_obj);  #endif
> --
> 2.25.1


RE: [PATCH 1/9] drm/amdgpu:Define the unified ras function pointers of each IP block

2021-11-25 Thread Chai, Thomas
Hi Lijo:
   I added my reply after your comment.

Thanks,
Thomas
-Original Message-
From: Lazar, Lijo  
Sent: Thursday, November 25, 2021 7:41 PM
To: Chai, Thomas ; amd-gfx@lists.freedesktop.org
Cc: Chai, Thomas 
Subject: Re: [PATCH 1/9] drm/amdgpu:Define the unified ras function pointers of 
each IP block



On 11/25/2021 4:26 PM, yipechai wrote:
> Define an unified ras function pointers for each ip block to adapt.
> 
> Signed-off-by: yipechai 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 36 -
>   2 files changed, 37 insertions(+), 19 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index 90f0db3b4f65..dc6c8130e2d7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -2739,3 +2739,23 @@ static void 
> amdgpu_register_bad_pages_mca_notifier(void)
>   }
>   }
>   #endif
> +
> +/* check if ras is supported on block, say, sdma, gfx */ int 
> +amdgpu_ras_is_supported(struct amdgpu_device *adev,
> + unsigned int block)
> +{
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + if (block >= AMDGPU_RAS_BLOCK_COUNT)
> + return 0;
> + return ras && (adev->ras_enabled & (1 << block)); }
> +
> +int amdgpu_ras_reset_gpu(struct amdgpu_device *adev) {
> + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> +
> + if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
> + schedule_work(>recovery_work);
> + return 0;
> +}

>These changes look unrelated. Maybe as another patch to move from .h file to 
>.c file.
   When amdgpu_ras.h is included in other IP blocks' .h files (such as amdgpu_gfx.h, 
amdgpu_xgmi.h, ...) so those blocks can use 'struct amdgpu_ras_block_ops', the 
compilation fails with an error:
"drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h:499:46: error: dereferencing 
pointer to incomplete type 'struct amdgpu_device'
 499 | #define amdgpu_ras_get_context(adev)  
((adev)->psp.ras_context.ras)"
   struct amdgpu_device is defined in amdgpu.h, and amdgpu.h is included 
from amdgpu_ras.h, so there appears to be a .h cross-include problem. Since 
amdgpu_ras_get_context(adev) is only used in the functions 
'amdgpu_ras_is_supported' and 'amdgpu_ras_reset_gpu', moving these two 
functions to the .c file makes the compilation succeed.
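
(A small self-contained illustration of the incomplete-type problem being
described, using toy names in place of amdgpu_ras.h and struct amdgpu_device:)

/* --- ras.h: only a forward declaration of the device struct is visible --- */
struct device_ctx;				/* incomplete type here        */
int ras_is_supported(struct device_ctx *ctx);	/* prototype: no dereference   */

/* --- ras.c: the full definition is visible, so dereferencing is legal ---- */
struct device_ctx {
	int ras_enabled;
};

int ras_is_supported(struct device_ctx *ctx)
{
	/* doing this dereference in ras.h (via a macro or static inline)
	 * would trigger "dereferencing pointer to incomplete type" in any
	 * includer that lacks the full struct definition
	 */
	return ctx && ctx->ras_enabled;
}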

> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> index cdd0010a5389..4b7da40dd837 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
> @@ -469,6 +469,19 @@ struct ras_debug_if {
>   };
>   int op;
>   };
> +
> +struct amdgpu_ras_block_ops {
> + int (*ras_late_init)(struct amdgpu_device *adev);
> + void (*ras_fini)(struct amdgpu_device *adev);
> + int (*ras_error_inject)(struct amdgpu_device *adev, void *inject_if);
> + void  (*query_ras_error_count)(struct amdgpu_device *adev,void 
> *ras_error_status);
> + void (*query_ras_error_status)(struct amdgpu_device *adev);
> + bool  (*query_ras_poison_mode)(struct amdgpu_device *adev);
> + void (*query_ras_error_address)(struct amdgpu_device *adev, void 
> *ras_error_status);
> + void (*reset_ras_error_count)(struct amdgpu_device *adev);
> + void (*reset_ras_error_status)(struct amdgpu_device *adev); };
> +

>Generic comment - Since all the operations are consolidated under _ops, it 
>makes sense to rename the _ras_funcs to _ras.

>Ex: amdgpu_gfx_ras_funcs => amdgpu_gfx_ras, amdgpu_xgmi_ras_funcs => 
>amdgpu_xgmi_ras and so forth.

>In future, these ras blocks may have data members to keep IP specific ras data.

OK, I will do it.

Thanks,
Lijo

>   /* work flow
>* vbios
>* 1: ras feature enable (enabled by default) @@ -486,16 +499,6 @@ 
> struct ras_debug_if {
>   #define amdgpu_ras_get_context(adev)
> ((adev)->psp.ras_context.ras)
>   #define amdgpu_ras_set_context(adev, ras_con)   
> ((adev)->psp.ras_context.ras = (ras_con))
>   
> -/* check if ras is supported on block, say, sdma, gfx */ -static 
> inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
> - unsigned int block)
> -{
> - struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
> -
> - if (block >= AMDGPU_RAS_BLOCK_COUNT)
> - return 0;
> - return ras && (adev->ras_enabled & (1 << block));
> -}
>   
>   int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
>   
>