Re: [PATCH 1/1] Refactor radeon driver to use drm_gem_create_map_offset() instead of its custom implementation for associating GEM object with a fake offset. Since, we already have a generic implement

2024-01-07 Thread Christian König

Am 06.01.24 um 15:14 schrieb Dipam Turkar:

Signed-off-by: Dipam Turkar 
---
  drivers/gpu/drm/radeon/radeon_drv.c |  2 +-
  drivers/gpu/drm/radeon/radeon_gem.c | 24 ++--
  2 files changed, 3 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_drv.c 
b/drivers/gpu/drm/radeon/radeon_drv.c
index fa531493b111..f590ed65ffba 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -578,7 +578,7 @@ static const struct drm_driver kms_driver = {
.ioctls = radeon_ioctls_kms,
.num_ioctls = ARRAY_SIZE(radeon_ioctls_kms),
.dumb_create = radeon_mode_dumb_create,
-   .dumb_map_offset = radeon_mode_dumb_mmap,
+   .dumb_map_offset = drm_gem_dumb_map_offset,
.fops = &radeon_driver_kms_fops,
  
  	.gem_prime_import_sg_table = radeon_gem_prime_import_sg_table,

diff --git a/drivers/gpu/drm/radeon/radeon_gem.c 
b/drivers/gpu/drm/radeon/radeon_gem.c
index 358d19242f4b..99794c550d2c 100644
--- a/drivers/gpu/drm/radeon/radeon_gem.c
+++ b/drivers/gpu/drm/radeon/radeon_gem.c
@@ -31,6 +31,7 @@
  
  #include 

  #include 
+#include <drm/dem_gem.h>
  #include 
  #include 
  
@@ -480,33 +481,12 @@ int radeon_gem_set_domain_ioctl(struct drm_device *dev, void *data,

return r;
  }
  
-int radeon_mode_dumb_mmap(struct drm_file *filp,

- struct drm_device *dev,
- uint32_t handle, uint64_t *offset_p)
-{
-   struct drm_gem_object *gobj;
-   struct radeon_bo *robj;
-
-   gobj = drm_gem_object_lookup(filp, handle);
-   if (gobj == NULL) {
-   return -ENOENT;
-   }
-   robj = gem_to_radeon_bo(gobj);
-   if (radeon_ttm_tt_has_userptr(robj->rdev, robj->tbo.ttm)) {


Well, apart from the obvious typos, the missing commit message and the
mangled subject line, this also removes the important check for userptrs
here.


This in turn is a complete no-go since it can lead to kernel crashes.

Regards,
Christian.


-   drm_gem_object_put(gobj);
-   return -EPERM;
-   }
-   *offset_p = radeon_bo_mmap_offset(robj);
-   drm_gem_object_put(gobj);
-   return 0;
-}
-
  int radeon_gem_mmap_ioctl(struct drm_device *dev, void *data,
  struct drm_file *filp)
  {
struct drm_radeon_gem_mmap *args = data;
  
-	return radeon_mode_dumb_mmap(filp, dev, args->handle, &args->addr_ptr);

+   return drm_gem_dumb_map_offset(filp, dev, args->handle, 
&args->addr_ptr);
  }
  
  int radeon_gem_busy_ioctl(struct drm_device *dev, void *data,




Re: [PATCH v2] drm/amdkfd: Fix sparse __rcu annotation warnings

2024-01-07 Thread Christian König

Am 06.01.24 um 00:00 schrieb Felix Kuehling:

Properly mark kfd_process->ef as __rcu and consistently use the right
accessor functions.

Reported-by: kernel test robot 
Closes: 
https://lore.kernel.org/oe-kbuild-all/202312052245.yfpbsgnh-...@intel.com/
Signed-off-by: Felix Kuehling 


Reviewed-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h   | 2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++--
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 7 +--
  4 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index cf6ed5fce291..f262b9d89541 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -311,7 +311,7 @@ void amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct 
kgd_mem *mem);
  int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, struct 
amdgpu_bo *bo);
  
  int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,

-   struct dma_fence **ef);
+   struct dma_fence __rcu **ef);
  int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
  struct kfd_vm_fault_info *info);
  int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device *adev, int fd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 48697b789342..5f445d856769 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2802,7 +2802,7 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct 
work_struct *work)
put_task_struct(usertask);
  }
  
-static void replace_eviction_fence(struct dma_fence **ef,

+static void replace_eviction_fence(struct dma_fence __rcu **ef,
   struct dma_fence *new_ef)
  {
struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
@@ -2837,7 +2837,7 @@ static void replace_eviction_fence(struct dma_fence **ef,
   * 7.  Add fence to all PD and PT BOs.
   * 8.  Unreserve all BOs
   */
-int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef)
+int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence __rcu 
**ef)
  {
struct amdkfd_process_info *process_info = info;
struct amdgpu_vm *peer_vm;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index 745024b31340..17fbedbf3651 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -917,7 +917,7 @@ struct kfd_process {
 * fence will be triggered during eviction and new one will be created
 * during restore
 */
-   struct dma_fence *ef;
+   struct dma_fence __rcu *ef;
  
  	/* Work items for evicting and restoring BOs */

struct delayed_work eviction_work;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index 71df51fcc1b0..717a60d7a4ea 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1110,6 +1110,7 @@ static void kfd_process_wq_release(struct work_struct 
*work)
  {
struct kfd_process *p = container_of(work, struct kfd_process,
 release_work);
+   struct dma_fence *ef;
  
  	kfd_process_dequeue_from_all_devices(p);

pqm_uninit(&p->pqm);
@@ -1118,7 +1119,9 @@ static void kfd_process_wq_release(struct work_struct 
*work)
 * destroyed. This allows any BOs to be freed without
 * triggering pointless evictions or waiting for fences.
 */
-   dma_fence_signal(p->ef);
+   synchronize_rcu();
+   ef = rcu_access_pointer(p->ef);
+   dma_fence_signal(ef);
  
  	kfd_process_remove_sysfs(p);
  
@@ -1127,7 +1130,7 @@ static void kfd_process_wq_release(struct work_struct *work)

svm_range_list_fini(p);
  
  	kfd_process_destroy_pdds(p);

-   dma_fence_put(p->ef);
+   dma_fence_put(ef);
  
  	kfd_event_free_process(p);
  




RE: [PATCH v2 v2 2/5] drm/amdgpu: Init pcie_index/data address as fallback (v2)

2024-01-07 Thread Wang, Yang(Kevin)
[AMD Official Use Only - General]

-Original Message-
From: Hawking Zhang 
Sent: Sunday, January 7, 2024 11:40 PM
To: amd-gfx@lists.freedesktop.org; Zhou1, Tao ; Yang, 
Stanley ; Wang, Yang(Kevin) ; 
Chai, Thomas ; Li, Candice 
Cc: Zhang, Hawking ; Deucher, Alexander 
; Lazar, Lijo ; Ma, Le 

Subject: [PATCH v2 v2 2/5] drm/amdgpu: Init pcie_index/data address as fallback 
(v2)

Allow this helper to be used for indirect access when nbio funcs are not
available, for instance in the IP discovery phase.

v2: define macro for pcie_index/data/index_hi fallback.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 +-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index abad5773714c..05d7cdcf28b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -96,6 +96,9 @@ MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
 #define AMDGPU_RESUME_MS   2000
 #define AMDGPU_MAX_RETRY_LIMIT 2
 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) 
== -EINVAL)
+#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
+#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
+#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)

 static const struct drm_driver amdgpu_kms_driver;

@@ -781,12 +784,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device 
*adev,
void __iomem *pcie_index_hi_offset;
void __iomem *pcie_data_offset;

-   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
-   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
-   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
-   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
-   else
+   if (unlikely(!adev->nbio.funcs)) {
+   pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
+   pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
+   } else {
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+   }
+
+   if (reg_addr >> 32) {
[kevin]:
Gentle reminder that the macro 'upper_32_bits()' can help with this.

Series is.
Reviewed-by: Yang Wang 

Best Regards,
Kevin
+   if (unlikely(!adev->nbio.funcs))
+   pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
+   else
+   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+   } else {
pcie_index_hi = 0;
+   }

spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
--
2.17.1



Re: [bug report] drm/amdgpu/vpe: enable vpe dpm

2024-01-07 Thread SHANMUGAM, SRINIVASAN
[Public]

+ Srinath, Tim

Get Outlook for Android

From: SHANMUGAM, SRINIVASAN
Sent: Saturday, January 6, 2024 10:06:33 AM
To: Dan Carpenter ; Lee, Peyton 
Cc: dri-de...@lists.freedesktop.org ; 
amd-gfx@lists.freedesktop.org 
Subject: RE: [bug report] drm/amdgpu/vpe: enable vpe dpm

Hi Dan Carpenter,

This was fixed in 
https://patchwork.freedesktop.org/patch/573477/?series=128249&rev=1

Thank you!

Regards,
Srini

-Original Message-
From: amd-gfx  On Behalf Of Dan Carpenter
Sent: Friday, January 5, 2024 7:04 PM
To: Lee, Peyton 
Cc: dri-de...@lists.freedesktop.org; amd-gfx@lists.freedesktop.org
Subject: [bug report] drm/amdgpu/vpe: enable vpe dpm

Hello Peyton Lee,

The patch 5f82a0c90cca: "drm/amdgpu/vpe: enable vpe dpm" from Dec 12,
2023 (linux-next), leads to the following Smatch static checker
warning:

drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c:62 vpe_u1_8_from_fraction() warn: 
unsigned 'numerator' is never less than zero.
drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c:63 vpe_u1_8_from_fraction() warn: 
unsigned 'denominator' is never less than zero.

drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
60 static uint16_t vpe_u1_8_from_fraction(uint16_t numerator, uint16_t 
denominator)
61 {
--> 62 bool arg1_negative = numerator < 0;
63 bool arg2_negative = denominator < 0;

uint16_t can't be negative.

64
65 uint16_t arg1_value = (uint16_t)(arg1_negative ? -numerator : 
numerator);
66 uint16_t arg2_value = (uint16_t)(arg2_negative ? -denominator : 
denominator);
67
68 uint16_t remainder;
69

regards,
dan carpenter


RE: [PATCH V2] drm/amdgpu: correct the cu count for gfx v11

2024-01-07 Thread Zhang, Hawking
[AMD Official Use Only - General]

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Gao, Likun 
Sent: Monday, January 8, 2024 11:31
To: amd-gfx list 
Cc: Zhang, Hawking 
Subject: [PATCH V2] drm/amdgpu: correct the cu count for gfx v11

[AMD Official Use Only - General]

From: Likun Gao 

Correct the algorithm of active CU to skip disabled sa for gfx v11.

Signed-off-by: Likun Gao 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 2fbcd9765980..c7242877d5d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -6383,6 +6383,9 @@ static int gfx_v11_0_get_cu_info(struct amdgpu_device 
*adev,
mutex_lock(&adev->grbm_idx_mutex);
for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
+   bitmap = i * adev->gfx.config.max_sh_per_se + j;
+   if (!((gfx_v11_0_get_sa_active_bitmap(adev) >> bitmap) 
& 1))
+   continue;
mask = 1;
counter = 0;
gfx_v11_0_select_se_sh(adev, i, j, 0xffffffff, 0);
--
2.34.1




Re: [PATCH] drm/amdgpu: Update irq disable flow during unload

2024-01-07 Thread Lazar, Lijo

On 1/5/2024 8:51 PM, Asad Kamal wrote:

In certain special cases, e.g. device reset before module
unload, the irq gets disabled as part of the reset sequence and
is not enabled again. Add a special check to cover such scenarios.

Signed-off-by: Asad Kamal 
Suggested-by: Lijo Lazar 


Please also add the tag

Fixes: f5c7e7797060 ("drm/amdgpu: Adjust removal control flow for smu 
v13_0_2")


Reviewed-by: Lijo Lazar 

Thanks,
Lijo


---
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 12 ++--
  drivers/gpu/drm/amd/amdgpu/soc15.c| 13 +++--
  2 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 372de9f1ce59..a4e1b9a58679 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -2361,6 +2361,7 @@ static void gmc_v9_0_gart_disable(struct amdgpu_device 
*adev)
  static int gmc_v9_0_hw_fini(void *handle)
  {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+   bool irq_release = true;
  
  	gmc_v9_0_gart_disable(adev);
  
@@ -2378,9 +2379,16 @@ static int gmc_v9_0_hw_fini(void *handle)

if (adev->mmhub.funcs->update_power_gating)
adev->mmhub.funcs->update_power_gating(adev, false);
  
-	amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);

+   if (adev->shutdown)
+   irq_release = amdgpu_irq_enabled(adev, &adev->gmc.vm_fault, 0);
  
-	if (adev->gmc.ecc_irq.funcs &&

+   if (irq_release)
+   amdgpu_irq_put(adev, &adev->gmc.vm_fault, 0);
+
+   if (adev->shutdown)
+   irq_release = amdgpu_irq_enabled(adev, &adev->gmc.ecc_irq, 0);
+
+   if (adev->gmc.ecc_irq.funcs && irq_release &&
amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__UMC))
amdgpu_irq_put(adev, &adev->gmc.ecc_irq, 0);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c

index 15033efec2ba..7ee835049d57 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15.c
+++ b/drivers/gpu/drm/amd/amdgpu/soc15.c
@@ -1266,6 +1266,7 @@ static int soc15_common_hw_init(void *handle)
  static int soc15_common_hw_fini(void *handle)
  {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+   bool irq_release = true;
  
  	/* Disable the doorbell aperture and selfring doorbell aperture

 * separately in hw_fini because soc15_enable_doorbell_aperture
@@ -1280,10 +1281,18 @@ static int soc15_common_hw_fini(void *handle)
  
  	if (adev->nbio.ras_if &&

amdgpu_ras_is_supported(adev, adev->nbio.ras_if->block)) {
-   if (adev->nbio.ras &&
+   if (adev->shutdown)
+   irq_release = amdgpu_irq_enabled(adev, 
&adev->nbio.ras_controller_irq, 0);
+
+   if (adev->nbio.ras && irq_release &&
adev->nbio.ras->init_ras_controller_interrupt)
amdgpu_irq_put(adev, &adev->nbio.ras_controller_irq, 0);
-   if (adev->nbio.ras &&
+
+   if (adev->shutdown)
+   irq_release = amdgpu_irq_enabled(adev,
+   &adev->nbio.ras_err_event_athub_irq, 0);
+
+   if (adev->nbio.ras && irq_release &&
adev->nbio.ras->init_ras_err_event_athub_interrupt)
amdgpu_irq_put(adev, 
&adev->nbio.ras_err_event_athub_irq, 0);
}




[PATCH V2] drm/amdgpu: correct the cu count for gfx v11

2024-01-07 Thread Gao, Likun
[AMD Official Use Only - General]

From: Likun Gao 

Correct the algorithm of active CU to skip disabled sa for gfx v11.

Signed-off-by: Likun Gao 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 2fbcd9765980..c7242877d5d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -6383,6 +6383,9 @@ static int gfx_v11_0_get_cu_info(struct amdgpu_device 
*adev,
mutex_lock(&adev->grbm_idx_mutex);
for (i = 0; i < adev->gfx.config.max_shader_engines; i++) {
for (j = 0; j < adev->gfx.config.max_sh_per_se; j++) {
+   bitmap = i * adev->gfx.config.max_sh_per_se + j;
+   if (!((gfx_v11_0_get_sa_active_bitmap(adev) >> bitmap) 
& 1))
+   continue;
mask = 1;
counter = 0;
gfx_v11_0_select_se_sh(adev, i, j, 0xffffffff, 0);
--
2.34.1



RE: [PATCH] drm/amd/display: Fix assignment of integer to fixed point pbn_div

2024-01-07 Thread Lin, Wayne
[Public]

Thanks, Imre Deak!

Reviewed-by: Wayne Lin 

> -Original Message-
> From: Imre Deak 
> Sent: Friday, January 5, 2024 5:56 PM
> To: dri-de...@lists.freedesktop.org
> Cc: Lin, Wayne ; amd-gfx@lists.freedesktop.org; Dave
> Airlie 
> Subject: [PATCH] drm/amd/display: Fix assignment of integer to fixed point
> pbn_div
>
> Fix the merge conflict resolution in
>
> commit 13feae00ee99e0fc8b6f1748fc4c70281a7d6939
> Merge: eb284f4b37817 3c064aea46d07
> Author: Dave Airlie 
> Date:   Fri Jan 5 13:19:40 2024 +1000
>
> Merge remote-tracking branch 'drm/drm-next' into drm-tip
>
> # Conflicts:
> #   drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> #   drivers/gpu/drm/i915/display/intel_dmc.c
>
> @@@ -6916,7 -7007,8 +7009,7 @@@ static int
> dm_encoder_helper_atomic_che
> if (IS_ERR(mst_state))
> return PTR_ERR(mst_state);
>
>  -  if (!mst_state->pbn_div.full)
>  -  mst_state->pbn_div.full =
> dfixed_const(dm_mst_get_pbn_divider(aconnector->mst_root->dc_link));
>  +  mst_state->pbn_div = dm_mst_get_pbn_divider(aconnector->mst_root-
> >dc_link);
>
> resulting from the following two changes:
>
> commit 191dc43935d1ece82bc6c9653463b3b1cd8198fb
> Author: Imre Deak 
> Date:   Thu Nov 16 15:18:31 2023 +0200
>
> drm/dp_mst: Store the MST PBN divider value in fixed point format
>
> commit 9cdef4f720376ef0fb0febce1ed2377c19e531f9
> Author: Wayne Lin 
> Date:   Mon Dec 4 10:09:33 2023 +0800
>
> drm/amd/display: pbn_div need be updated for hotplug event
>
> Cc: Wayne Lin 
> Cc: amd-gfx@lists.freedesktop.org
> Cc: dri-de...@lists.freedesktop.org
> Cc: Dave Airlie 
> Signed-off-by: Imre Deak 
> ---
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index 5d82bac1a51ab..f9714dd6fe8ed 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -7009,7 +7009,7 @@ static int dm_encoder_helper_atomic_check(struct
> drm_encoder *encoder,
>   if (IS_ERR(mst_state))
>   return PTR_ERR(mst_state);
>
> - mst_state->pbn_div = dm_mst_get_pbn_divider(aconnector-
> >mst_root->dc_link);
> + mst_state->pbn_div.full =
> dfixed_const(dm_mst_get_pbn_divider(aconnector->mst_root->dc_link));
>
>   if (!state->duplicated) {
>   int max_bpc = conn_state->max_requested_bpc;
> --
> 2.39.2



Re: [PATCH 1/1] Refactor radeon driver to use drm_gem_create_map_offset() instead of its custom implementation for associating GEM object with a fake offset. Since, we already have a generic implement

2024-01-07 Thread kernel test robot
Hi Dipam,

kernel test robot noticed the following build errors:

[auto build test ERROR on drm-misc/drm-misc-next]
[also build test ERROR on linus/master v6.7-rc8 next-20240105]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:
https://github.com/intel-lab-lkp/linux/commits/Dipam-Turkar/Refactor-radeon-driver-to-use-drm_gem_create_map_offset-instead-of-its-custom-implementation-for-associating-GEM-object-/20240106-221755
base:   git://anongit.freedesktop.org/drm/drm-misc drm-misc-next
patch link:
https://lore.kernel.org/r/20240106141422.10734-1-dipamt1729%40gmail.com
patch subject: [PATCH 1/1] Refactor radeon driver to use 
drm_gem_create_map_offset() instead of its custom implementation for 
associating GEM object with a fake offset. Since, we already have a generic 
implementation, we don't need the custom function and it is better to 
standardize the code.
config: x86_64-allmodconfig 
(https://download.01.org/0day-ci/archive/20240108/202401080333.6psotuik-...@intel.com/config)
compiler: ClangBuiltLinux clang version 17.0.6 
(https://github.com/llvm/llvm-project 6009708b4367171ccdbf4b5905cb6a803753fe18)
reproduce (this is a W=1 build): 
(https://download.01.org/0day-ci/archive/20240108/202401080333.6psotuik-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202401080333.6psotuik-...@intel.com/

All errors (new ones prefixed by >>):

>> drivers/gpu/drm/radeon/radeon_gem.c:34:10: fatal error: 'drm/dem_gem.h' file 
>> not found
  34 | #include 
 |  ^~~
   1 error generated.


vim +34 drivers/gpu/drm/radeon/radeon_gem.c

31  
32  #include 
33  #include 
  > 34  #include 
35  #include 
36  #include 
37  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


Re: [PATCH 1/1] Revert "drm/amd/display: fix bandwidth validation failure on DCN 2.1"

2024-01-07 Thread Melissa Wen
On 01/06, LIPSKI, IVAN wrote:
> [AMD Official Use Only - General]
> 
> @Siqueira, Rodrigo
> 
> From: LIPSKI, IVAN 
> Sent: January 5, 2024 7:40 PM
> To: amd-gfx@lists.freedesktop.org 
> Cc: rodrigo.siqui...@amd.com ; Choi, Nicholas 
> ; Deucher, Alexander ; 
> Koenig, Christian ; Wentland, Harry 
> ; LIPSKI, IVAN ; Melissa Wen 
> ; Mahfooz, Hamza 
> Subject: [PATCH 1/1] Revert "drm/amd/display: fix bandwidth validation 
> failure on DCN 2.1"
> 
> From: Ivan Lipski 
> 
> This commit causes dmesg-warn on several IGT tests on DCN 3.1.6:
> *ERROR* link_enc_cfg_validate: Invalid link encoder assignments - 0x1c
> 
> Affected IGT tests include:
> amdgpu/[amd_assr|amd_plane|amd_hotplug]
> kms_atomic
> kms_color
> kms_flip
> kms_properties
> kms_universal_plane
> 
> and some other tests
> 
> This reverts commit b7ebd39e2922f642c7ee63ade4a4a5a1ef675d84.

I'm not opposed to reverting this in the short term, but I don't see the
connection between doing a full validation and link encoder assignment
errors. It seems more like the change unveiled an underlying issue
rather than causing the error. I don't see those errors on DCN 2.1 and
3.0.1.

Unfortunately, I don't have a DCN 3.1.6 for debugging :( It'd deserve
further investigation.

Melissa

> 
> Cc: Melissa Wen 
> Cc: Hamza Mahfooz 
> 
> Signed-off-by: Ivan Lipski 
> 
> ---
>  drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> index d77fc79f3542..111c6f51f0ae 100644
> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> @@ -10752,7 +10752,7 @@ static int amdgpu_dm_atomic_check(struct drm_device 
> *dev,
>  DRM_DEBUG_DRIVER("drm_dp_mst_atomic_check() 
> failed\n");
>  goto fail;
>  }
> -   status = dc_validate_global_state(dc, dm_state->context, 
> false);
> +   status = dc_validate_global_state(dc, dm_state->context, 
> true);
>  if (status != DC_OK) {
>  DRM_DEBUG_DRIVER("DC global validation failure: %s 
> (%d)",
> dc_status_to_str(status), status);
> --
> 2.34.1
> 


Re: [PATCH 1/1] Refactor radeon driver to use drm_gem_create_map_offset() instead of its custom implementation for associating GEM object with a fake offset. Since, we already have a generic implement

2024-01-07 Thread kernel test robot
Hi Dipam,

kernel test robot noticed the following build errors:

[auto build test ERROR on drm-misc/drm-misc-next]
[also build test ERROR on linus/master v6.7-rc8 next-20240105]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:
https://github.com/intel-lab-lkp/linux/commits/Dipam-Turkar/Refactor-radeon-driver-to-use-drm_gem_create_map_offset-instead-of-its-custom-implementation-for-associating-GEM-object-/20240106-221755
base:   git://anongit.freedesktop.org/drm/drm-misc drm-misc-next
patch link:
https://lore.kernel.org/r/20240106141422.10734-1-dipamt1729%40gmail.com
patch subject: [PATCH 1/1] Refactor radeon driver to use 
drm_gem_create_map_offset() instead of its custom implementation for 
associating GEM object with a fake offset. Since, we already have a generic 
implementation, we don't need the custom function and it is better to 
standardize the code.
config: s390-allmodconfig 
(https://download.01.org/0day-ci/archive/20240108/202401080151.jsivh6p0-...@intel.com/config)
compiler: s390-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): 
(https://download.01.org/0day-ci/archive/20240108/202401080151.jsivh6p0-...@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot 
| Closes: 
https://lore.kernel.org/oe-kbuild-all/202401080151.jsivh6p0-...@intel.com/

All errors (new ones prefixed by >>):

>> drivers/gpu/drm/radeon/radeon_gem.c:34:10: fatal error: drm/dem_gem.h: No 
>> such file or directory
  34 | #include 
 |  ^~~
   compilation terminated.


vim +34 drivers/gpu/drm/radeon/radeon_gem.c

31  
32  #include 
33  #include 
  > 34  #include 
35  #include 
36  #include 
37  

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


RE: [PATCH v2 01/12] drm/amdgpu: implement RAS ACA driver framework

2024-01-07 Thread Zhang, Hawking
[AMD Official Use Only - General]

Series is

Reviewed-by: Hawking Zhang 

Regards,
Hawking
-Original Message-
From: Wang, Yang(Kevin) 
Sent: Thursday, January 4, 2024 19:49
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking ; Zhou1, Tao ; 
Chai, Thomas ; Wang, Yang(Kevin) 
Subject: [PATCH v2 01/12] drm/amdgpu: implement RAS ACA driver framework

v1:
implement new RAS ACA driver code framework.

v2:
- rename aca_bank_set to aca_banks.
- rename aca_source_xxx to aca_handle_xxx

v3:
- Optimize some function implementation details. (from Hawking's suggestion)

Signed-off-by: Yang Wang 
---
 drivers/gpu/drm/amd/amdgpu/Makefile|   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu.h|   4 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c| 665 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h| 196 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   6 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h|   1 +
 6 files changed, 873 insertions(+), 1 deletion(-)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
 create mode 100644 drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h

diff --git a/drivers/gpu/drm/amd/amdgpu/Makefile 
b/drivers/gpu/drm/amd/amdgpu/Makefile
index 260e32ef7bae..4c989da4d2f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/Makefile
+++ b/drivers/gpu/drm/amd/amdgpu/Makefile
@@ -80,7 +80,7 @@ amdgpu-y += amdgpu_device.o amdgpu_doorbell_mgr.o 
amdgpu_kms.o \
amdgpu_umc.o smu_v11_0_i2c.o amdgpu_fru_eeprom.o amdgpu_rap.o \
amdgpu_fw_attestation.o amdgpu_securedisplay.o \
amdgpu_eeprom.o amdgpu_mca.o amdgpu_psp_ta.o amdgpu_lsdma.o \
-   amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o
+   amdgpu_ring_mux.o amdgpu_xcp.o amdgpu_seq64.o amdgpu_aca.o

 amdgpu-$(CONFIG_PROC_FS) += amdgpu_fdinfo.o

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9da14436a373..eb182225f548 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -107,6 +107,7 @@
 #include "amdgpu_smuio.h"
 #include "amdgpu_fdinfo.h"
 #include "amdgpu_mca.h"
+#include "amdgpu_aca.h"
 #include "amdgpu_ras.h"
 #include "amdgpu_xcp.h"
 #include "amdgpu_seq64.h"
@@ -1047,6 +1048,9 @@ struct amdgpu_device {
/* MCA */
struct amdgpu_mca   mca;

+   /* ACA */
+   struct amdgpu_aca   aca;
+
struct amdgpu_ip_block  ip_blocks[AMDGPU_MAX_IP_NUM];
uint32_tharvest_ip_mask;
int num_ip_blocks;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
new file mode 100644
index 000000000000..756b40bde38b
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c
@@ -0,0 +1,665 @@
+/*
+ * Copyright 2023 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include 
+#include "amdgpu.h"
+#include "amdgpu_aca.h"
+#include "amdgpu_ras.h"
+
+#define ACA_BANK_HWID(type, hwid, mcatype) [ACA_HWIP_TYPE_##type] = {hwid, mcatype}
+
+typedef int bank_handler_t(struct aca_handle *handle, struct aca_bank *bank, enum aca_error_type type, void *data);
+
+struct aca_banks {
+   int nr_banks;
+   struct list_head list;
+};
+
+struct aca_hwip {
+   int hwid;
+   int mcatype;
+};
+
+static struct aca_hwip aca_hwid_mcatypes[ACA_HWIP_TYPE_COUNT] = {
+   ACA_BANK_HWID(SMU,  0x01,   0x01),
+   ACA_BANK_HWID(PCS_XGMI, 0x50,   0x00),
+   ACA_BANK_HWID(UMC,  0x96,   0x00),
+};
+
+static void aca_banks_init(struct aca_banks *banks) {
+   if (!banks)
+   return;
+
+   memset(banks, 0, sizeof(*banks));
+   INIT_LIST_HEAD(&banks->list);
+}
+
+static int aca_banks_add_bank(struct aca_banks *banks, struct aca_bank
+*bank) {
+   struct aca_bank_node *node;
+
+   if (!bank)
+   return -EINVAL;
+
+   node = kvzalloc(sizeof(*node), GFP

[PATCH v2 v2 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-07 Thread Hawking Zhang
Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_33    0x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1



[PATCH v2 v2 3/5] drm/amdgpu: Add ras helper to query boot errors v2

2024-01-07 Thread Hawking Zhang
Add ras helper function to query boot time gpu
errors.
v2: use aqua_vanjaram smn addressing pattern

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 15 +++-
 3 files changed, 110 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9da14436a373..df3aa69be425 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1330,6 +1330,7 @@ int emu_soc_asic_init(struct amdgpu_device *adev);
 #define WREG32_FIELD_OFFSET(reg, offset, field, val)   \
WREG32(mm##reg + offset, (RREG32(mm##reg + offset) & 
~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
 
+#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >> (l))
 /*
  * BIOS helpers.
  */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index fc42fb6ee191..a901b00d4949 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3763,3 +3763,98 @@ int amdgpu_ras_error_statistic_ce_count(struct 
ras_err_data *err_data,
 
return 0;
 }
+
+#define mmMP0_SMN_C2PMSG_92  0x1609C
+#define mmMP0_SMN_C2PMSG_126   0x160BE
+static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
+u32 instance, u32 boot_error)
+{
+   u32 socket_id, aid_id, hbm_id;
+   u32 reg_data;
+   u64 reg_addr;
+
+   socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+   aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+   hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
+   socket_id, aid_id, reg_data);
+
+   if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training 
failed\n",
+socket_id, aid_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory 
test failed\n",
+socket_id, aid_id, hbm_id);
+
+   if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
+   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist 
test failed\n",
+socket_id, aid_id, hbm_id);
+}
+
+static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
+u32 instance, u32 *boot_error)
+{
+   u32 reg_addr;
+   u32 reg_data;
+   int retry_loop;
+
+   /* The pattern for smn addressing in other SOC could be different from
+* the one for aqua_vanjaram. We should revisit the code if the pattern
+* is changed. In such case, replace the aqua_vanjaram implementation
+* with more common helper */
+   reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+  aqua_vanjaram_encode_ext_smn_addressing(instance);
+
+   for (retry_loop = 0; retry_loop < 1000; retry_loop++) {
+   reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+   if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
+   *boot_error = reg_data;
+   return 0;
+   }
+   msleep(1);
+   }
+
+   *boot_error = reg

[PATCH v2 2/5] drm/amdgpu: Init pcie_index/data address as fallback (v2)

2024-01-07 Thread Hawking Zhang
Allow this helper to be used for indirect access when
the nbio funcs are not available, for instance during
the IP discovery phase.

v2: define macro for pcie_index/data/index_hi fallback.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 23 +-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index abad5773714c..05d7cdcf28b0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -96,6 +96,9 @@ MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
 #define AMDGPU_RESUME_MS   2000
 #define AMDGPU_MAX_RETRY_LIMIT 2
 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) 
== -EINVAL)
+#define AMDGPU_PCIE_INDEX_FALLBACK (0x38 >> 2)
+#define AMDGPU_PCIE_INDEX_HI_FALLBACK (0x44 >> 2)
+#define AMDGPU_PCIE_DATA_FALLBACK (0x3C >> 2)
 
 static const struct drm_driver amdgpu_kms_driver;
 
@@ -781,12 +784,22 @@ u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device 
*adev,
void __iomem *pcie_index_hi_offset;
void __iomem *pcie_data_offset;
 
-   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
-   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
-   if ((reg_addr >> 32) && (adev->nbio.funcs->get_pcie_index_hi_offset))
-   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
-   else
+   if (unlikely(!adev->nbio.funcs)) {
+   pcie_index = AMDGPU_PCIE_INDEX_FALLBACK;
+   pcie_data = AMDGPU_PCIE_DATA_FALLBACK;
+   } else {
+   pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
+   pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
+   }
+
+   if (reg_addr >> 32) {
+   if (unlikely(!adev->nbio.funcs))
+   pcie_index_hi = AMDGPU_PCIE_INDEX_HI_FALLBACK;
+   else
+   pcie_index_hi = 
adev->nbio.funcs->get_pcie_index_hi_offset(adev);
+   } else {
pcie_index_hi = 0;
+   }
 
spin_lock_irqsave(&adev->pcie_idx_lock, flags);
pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
-- 
2.17.1



[PATCH 5/5] drm/amdgpu: Query boot status if boot failed

2024-01-07 Thread Hawking Zhang
Check and report the firmware boot status if it doesn't
reach steady state.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index 6fad451a85be..676bec2cc157 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -187,11 +187,18 @@ static int psp_v13_0_wait_for_bootloader(struct 
psp_context *psp)
 static int psp_v13_0_wait_for_bootloader_steady_state(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
+   int ret;
 
if (amdgpu_ip_version(adev, MP0_HWIP, 0) == IP_VERSION(13, 0, 6)) {
-   psp_v13_0_wait_for_vmbx_ready(psp);
+   ret = psp_v13_0_wait_for_vmbx_ready(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
+
+   ret = psp_v13_0_wait_for_bootloader(psp);
+   if (ret)
+   amdgpu_ras_query_boot_status(adev, 4);
 
-   return psp_v13_0_wait_for_bootloader(psp);
+   return ret;
}
 
return 0;
-- 
2.17.1



[PATCH 4/5] drm/amdgpu: Query boot status if discovery failed

2024-01-07 Thread Hawking Zhang
Check and report boot status if discovery failed.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index b8fde08aec8e..302b71e9f1e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -27,6 +27,7 @@
 #include "amdgpu_discovery.h"
 #include "soc15_hw_ip.h"
 #include "discovery.h"
+#include "amdgpu_ras.h"
 
 #include "soc15.h"
 #include "gfx_v9_0.h"
@@ -98,6 +99,7 @@
 #define FIRMWARE_IP_DISCOVERY "amdgpu/ip_discovery.bin"
 MODULE_FIRMWARE(FIRMWARE_IP_DISCOVERY);
 
+#define mmIP_DISCOVERY_VERSION  0x16A00
 #define mmRCC_CONFIG_MEMSIZE   0xde3
 #define mmMP0_SMN_C2PMSG_33    0x16061
 #define mmMM_INDEX 0x0
@@ -518,7 +520,9 @@ static int amdgpu_discovery_init(struct amdgpu_device *adev)
 out:
kfree(adev->mman.discovery_bin);
adev->mman.discovery_bin = NULL;
-
+   if ((amdgpu_discovery != 2) &&
+   (RREG32(mmIP_DISCOVERY_VERSION) == 4))
+   amdgpu_ras_query_boot_status(adev, 4);
return r;
 }
 
-- 
2.17.1



[PATCH 1/5] drm/amdgpu: drop psp v13 query_boot_status implementation

2024-01-07 Thread Hawking Zhang
It will be replaced with a new implementation that also
covers boot failures in the IP discovery phase.

Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c| 15 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h|  4 --
 drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 78 --
 4 files changed, 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index a39c9fea55c4..abad5773714c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1218,8 +1218,6 @@ static int amdgpu_device_asic_init(struct amdgpu_device 
*adev)
amdgpu_ip_version(adev, GC_HWIP, 0) >= IP_VERSION(11, 0, 0)) {
amdgpu_psp_wait_for_bootloader(adev);
ret = amdgpu_atomfirmware_asic_init(adev, true);
-   /* TODO: check the return val and stop device initialization if 
boot fails */
-   amdgpu_psp_query_boot_status(adev);
return ret;
} else {
return amdgpu_atom_asic_init(adev->mode_info.atom_context);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 2addbdf88394..90451cabb919 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2125,21 +2125,6 @@ int amdgpu_psp_wait_for_bootloader(struct amdgpu_device 
*adev)
return ret;
 }
 
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev)
-{
-   struct psp_context *psp = &adev->psp;
-   int ret = 0;
-
-   if (amdgpu_sriov_vf(adev) || (adev->flags & AMD_IS_APU))
-   return 0;
-
-   if (psp->funcs &&
-   psp->funcs->query_boot_status)
-   ret = psp->funcs->query_boot_status(psp);
-
-   return ret;
-}
-
 static int psp_hw_start(struct psp_context *psp)
 {
struct amdgpu_device *adev = psp->adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
index c4d9cbde55b9..09d1f8f72a9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.h
@@ -134,7 +134,6 @@ struct psp_funcs {
int (*update_spirom)(struct psp_context *psp, uint64_t fw_pri_mc_addr);
int (*vbflash_stat)(struct psp_context *psp);
int (*fatal_error_recovery_quirk)(struct psp_context *psp);
-   int (*query_boot_status)(struct psp_context *psp);
 };
 
 struct ta_funcs {
@@ -538,7 +537,4 @@ int psp_spatial_partition(struct psp_context *psp, int 
mode);
 int is_psp_fw_valid(struct psp_bin_desc bin);
 
 int amdgpu_psp_wait_for_bootloader(struct amdgpu_device *adev);
-
-int amdgpu_psp_query_boot_status(struct amdgpu_device *adev);
-
 #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
index df1844d0800f..6fad451a85be 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c
@@ -763,83 +763,6 @@ static int psp_v13_0_fatal_error_recovery_quirk(struct 
psp_context *psp)
return 0;
 }
 
-
-static void psp_v13_0_boot_error_reporting(struct amdgpu_device *adev,
-  uint32_t inst,
-  uint32_t boot_error)
-{
-   uint32_t socket_id;
-   uint32_t aid_id;
-   uint32_t hbm_id;
-   uint32_t reg_data;
-
-   socket_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, SOCKET_ID);
-   aid_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, AID_ID);
-   hbm_id = REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, HBM_ID);
-
-   reg_data = RREG32_SOC15(MP0, inst, regMP0_SMN_C2PMSG_109);
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw 
status is 0x%x\n",
-socket_id, aid_id, reg_data);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_MEM_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory 
training failed\n",
-socket_id, aid_id, hbm_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, GPU_ERR_FW_LOAD))
-   dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed 
at boot time\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_WAFL_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, wafl link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_XGMI_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training 
failed\n",
-socket_id, aid_id);
-
-   if (REG_GET_FIELD(boot_error, MP0_SMN_C2PMSG_126, 
GPU_ERR_USR_CP_LINK_TRAINING))
-   dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training 
failed\n",
- 

RE: [PATCH v2 3/5] drm/amdgpu: Add ras helper to query boot errors v2

2024-01-07 Thread Zhang, Hawking
[AMD Official Use Only - General]


Please check my comments inline.

Regards,
Hawking

-Original Message-
From: Zhang, Morris 
Sent: Wednesday, January 3, 2024 17:46
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org; 
Zhou1, Tao ; Yang, Stanley ; Wang, 
Yang(Kevin) ; Chai, Thomas ; Li, 
Candice 
Cc: Deucher, Alexander ; Ma, Le ; 
Lazar, Lijo ; Zhang, Hawking 
Subject: RE: [PATCH v2 3/5] drm/amdgpu: Add ras helper to query boot errors v2

[AMD Official Use Only - General]

--Brs,
Morris Zhang
MLSE Linux  ML SRDC
Ext. 25147

> -Original Message-
> From: amd-gfx 
> mailto:amd-gfx-boun...@lists.freedesktop.org>>
>  On Behalf Of
> Hawking Zhang
> Sent: Tuesday, January 2, 2024 10:08 PM
> To: amd-gfx@lists.freedesktop.org; 
> Zhou1, Tao mailto:tao.zh...@amd.com>>;
> Yang, Stanley mailto:stanley.y...@amd.com>>; Wang, 
> Yang(Kevin)
> mailto:kevinyang.w...@amd.com>>; Chai, Thomas 
> mailto:yipeng.c...@amd.com>>; Li,
> Candice mailto:candice...@amd.com>>
> Cc: Deucher, Alexander 
> mailto:alexander.deuc...@amd.com>>; Ma, Le
> mailto:le...@amd.com>>; Lazar, Lijo 
> mailto:lijo.la...@amd.com>>; Zhang, Hawking
> mailto:hawking.zh...@amd.com>>
> Subject: [PATCH v2 3/5] drm/amdgpu: Add ras helper to query boot
> errors v2
>
> Add ras helper function to query boot time gpu errors.
> v2: use aqua_vanjaram smn addressing pattern
>
> Signed-off-by: Hawking Zhang 
> mailto:hawking.zh...@amd.com>>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 95
> +  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h |
> 15 +++-
>  3 files changed, 110 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index 616b6c911767..cd91533d641c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1328,6 +1328,7 @@ int emu_soc_asic_init(struct amdgpu_device
> *adev);  #define WREG32_FIELD_OFFSET(reg, offset, field, val) \
>   WREG32(mm##reg + offset, (RREG32(mm##reg + offset) &
> ~REG_FIELD_MASK(reg, field)) | (val) << REG_FIELD_SHIFT(reg, field))
>
> +#define AMDGPU_GET_REG_FIELD(x, h, l) (((x) & GENMASK_ULL(h, l)) >>
> +(l))
>  /*
>   * BIOS helpers.
>   */
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> index fc42fb6ee191..a901b00d4949 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
> @@ -3763,3 +3763,98 @@ int amdgpu_ras_error_statistic_ce_count(struct
> ras_err_data *err_data,
>
>   return 0;
>  }
> +
> +#define mmMP0_SMN_C2PMSG_92  0x1609C
> +#define mmMP0_SMN_C2PMSG_126 0x160BE
> +static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device
> *adev,
> +  u32 instance, u32
> +boot_error) {
> + u32 socket_id, aid_id, hbm_id;
> + u32 reg_data;
> + u64 reg_addr;
> +
> + socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
> + aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
> + hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
> +
> + /* The pattern for smn addressing in other SOC could be different from
> +  * the one for aqua_vanjaram. We should revisit the code if the pattern
> +  * is changed. In such case, replace the aqua_vanjaram implementation
> +  * with more common helper */
> + reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
> +aqua_vanjaram_encode_ext_smn_addressing(instance);
> +
> + reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
> + dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed,
> + fw status
> is 0x%x\n",
> + socket_id, aid_id, reg_data);
> +
> + if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d,
> + memory
> training failed\n",
> +  socket_id, aid_id, hbm_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, firmware load
> + failed at
> boot time\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, wafl link
> + training
> failed\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, xgmi link
> + training
> failed\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, usr cp link
> + training
> failed\n",
> +  socket_id, aid_id);
> +
> + if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
> + dev_info(adev->dev, "socket: %d, aid: %d, usr dp link
> + training
> failed\n",
> +