from:"Guchun Chen"

[PATCH] drm/amdgpu: update sched.ready within kernel drm scheduler

2023-07-18 Thread Guchun Chen

amdgpu_ring_test_helper sets sched.ready unconditionally based on the
ring test result. For rings without a kernel scheduler, this leads to a
value mismatch between ring->sched.ready and no_scheduler after they
perform the ring test. This is confusing, because the ring's
no_scheduler is true, but ring->sched.ready is true as well.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 80d6e132e409..afa76d069d94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -630,7 +630,9 @@ int amdgpu_ring_test_helper(struct amdgpu_ring *ring)
DRM_DEV_DEBUG(adev->dev, "ring test on %s succeeded\n",
  ring->name);
 
-   ring->sched.ready = !r;
+   /* Only set sched.ready on top of kernel scheduler. */
+   if (!ring->no_scheduler)
+   ring->sched.ready = !r;
return r;
 }
 
-- 
2.25.1

[PATCH 4/4] drm/amdgpu: use a macro to define no xcp partition case

2023-07-17 Thread Guchun Chen

~0 is used in several places to mean "no xcp partition", so define a
macro for it to keep the code consistent.

Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c  | 4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h  | 2 ++
 drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c   | 4 ++--
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index a7f314ddd173..d34c3ef8f3ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1709,7 +1709,8 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
}
-   xcp_id = fpriv->xcp_id == ~0 ? 0 : fpriv->xcp_id;
+   xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
+   0 : fpriv->xcp_id;
} else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
domain = alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
alloc_flags = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index d175e862f222..9c9cca129498 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -363,7 +363,7 @@ int amdgpu_xcp_open_device(struct amdgpu_device *adev,
if (!adev->xcp_mgr)
return 0;
 
-   fpriv->xcp_id = ~0;
+   fpriv->xcp_id = AMDGPU_XCP_NO_PARTITION;
for (i = 0; i < MAX_XCP; ++i) {
if (!adev->xcp_mgr->xcp[i].ddev)
break;
@@ -381,7 +381,7 @@ int amdgpu_xcp_open_device(struct amdgpu_device *adev,
}
}
 
-   fpriv->vm.mem_id = fpriv->xcp_id == ~0 ? -1 :
+   fpriv->vm.mem_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? -1 :
adev->xcp_mgr->xcp[fpriv->xcp_id].mem_id;
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
index 0f8026d64ea5..9a1036aeec2a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
@@ -37,6 +37,8 @@
 #define AMDGPU_XCP_FL_NONE 0
 #define AMDGPU_XCP_FL_LOCKED (1 << 0)
 
+#define AMDGPU_XCP_NO_PARTITION (~0)
+
 struct amdgpu_fpriv;
 
 enum AMDGPU_XCP_IP_BLOCK {
diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index 16471b81a1f5..72b629a78c62 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -68,7 +68,7 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device 
*adev,
enum AMDGPU_XCP_IP_BLOCK ip_blk;
uint32_t inst_mask;
 
-   ring->xcp_id = ~0;
+   ring->xcp_id = AMDGPU_XCP_NO_PARTITION;
if (adev->xcp_mgr->mode == AMDGPU_XCP_MODE_NONE)
return;
 
@@ -177,7 +177,7 @@ static int aqua_vanjaram_select_scheds(
u32 sel_xcp_id;
int i;
 
-   if (fpriv->xcp_id == ~0) {
+   if (fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION) {
u32 least_ref_cnt = ~0;
 
fpriv->xcp_id = 0;
-- 
2.25.1

[PATCH 3/4] drm/amdgpu/vm: use the same xcp_id from root PD

2023-07-17 Thread Guchun Chen

Allocations of the other PDs/PTs should just use the same xcp_id as the
one stored in the root PD.

Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
Reviewed-by: Felix Kuehling 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index eb52dfe64948..83e1923f6775 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -607,7 +607,8 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
return 0;
 
amdgpu_vm_eviction_unlock(vm);
-   r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, , 0);
+   r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, ,
+   vm->root.bo->xcp_id);
amdgpu_vm_eviction_lock(vm);
if (r)
return r;
-- 
2.25.1

[PATCH 2/4] drm/amdgpu: fix slab-out-of-bounds issue in amdgpu_vm_pt_create

2023-07-17 Thread Guchun Chen

Recent code stores the xcp_id taken from the file private data (set when
opening the device) into the amdgpu bo, for accounting memory usage etc.
However, not all VMs are attached to an fpriv structure, e.g. the VM used
in amdgpu_mes_self_test; in that case KASAN complains about the
out-of-bounds access below. More importantly, VM code should not touch
the fpriv structure, so drop the fpriv handling from amdgpu_vm_pt.

[   77.292314] BUG: KASAN: slab-out-of-bounds in 
amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.293845] Read of size 4 at addr 888102c48a48 by task modprobe/1069
[   77.294146] Call Trace:
[   77.294178]  
[   77.294208]  dump_stack_lvl+0x49/0x63
[   77.294260]  print_report+0x16f/0x4a6
[   77.294307]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.295979]  ? kasan_complete_mode_report_info+0x3c/0x200
[   77.296057]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.297556]  kasan_report+0xb4/0x130
[   77.297609]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.299202]  __asan_load4+0x6f/0x90
[   77.299272]  amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.300796]  ? amdgpu_init+0x6e/0x1000 [amdgpu]
[   77.30]  ? amdgpu_vm_pt_clear+0x750/0x750 [amdgpu]
[   77.303721]  ? preempt_count_sub+0x18/0xc0
[   77.303786]  amdgpu_vm_init+0x39e/0x870 [amdgpu]
[   77.305186]  ? amdgpu_vm_wait_idle+0x90/0x90 [amdgpu]
[   77.306683]  ? kasan_set_track+0x25/0x30
[   77.306737]  ? kasan_save_alloc_info+0x1b/0x30
[   77.306795]  ? __kasan_kmalloc+0x87/0xa0
[   77.306852]  amdgpu_mes_self_test+0x169/0x620 [amdgpu]

v2: without specifying xcp partition for PD/PT bo, the xcp id is -1.

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2686
Fixes: ffc6deb773f7 ("drm/amdkfd: Store xcp partition id to amdgpu bo")
Signed-off-by: Guchun Chen 
Tested-by: Mikhail Gavrilov 
Reviewed-by: Felix Kuehling 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 11 ++-
 5 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 53a024cf0544..cab2fdd5b76a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1236,7 +1236,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct 
drm_file *file_priv)
if (r)
goto error_pasid;
 
-   r = amdgpu_vm_init(adev, >vm);
+   r = amdgpu_vm_init(adev, >vm, fpriv->xcp_id);
if (r)
goto error_pasid;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index e9091ebfe230..f808841310fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -1382,7 +1382,7 @@ int amdgpu_mes_self_test(struct amdgpu_device *adev)
goto error_pasid;
}
 
-   r = amdgpu_vm_init(adev, vm);
+   r = amdgpu_vm_init(adev, vm, -1);
if (r) {
DRM_ERROR("failed to initialize vm\n");
goto error_pasid;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 32adc31c093d..74380b21e7a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2121,13 +2121,14 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long 
timeout)
  *
  * @adev: amdgpu_device pointer
  * @vm: requested vm
+ * @xcp_id: GPU partition selection id
  *
  * Init @vm fields.
  *
  * Returns:
  * 0 for success, error for failure.
  */
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
+int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t 
xcp_id)
 {
struct amdgpu_bo *root_bo;
struct amdgpu_bo_vm *root;
@@ -2177,7 +2178,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
vm->evicting = false;
 
r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
-   false, );
+   false, , xcp_id);
if (r)
goto error_free_delayed;
root_bo = >bo;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 88ee4507f6b6..bca258c38919 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -398,7 +398,7 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
u32 pasid);
 
 long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout);
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm);
+int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t 
xcp_id);
 int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm

[PATCH 1/4] drm/amdgpu: Allocate root PD on correct partition

2023-07-17 Thread Guchun Chen

file_priv needs to be set up first; otherwise, the root PD
will always be allocated on partition 0, even when the device
is opened from another partition.

Fixes: ffc6deb773f7 ("drm/amdkfd: Store xcp partition id to amdgpu bo")
Signed-off-by: Guchun Chen 
Reviewed-by: Felix Kuehling 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 85a0d5f419c8..53a024cf0544 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1232,13 +1232,13 @@ int amdgpu_driver_open_kms(struct drm_device *dev, 
struct drm_file *file_priv)
pasid = 0;
}
 
-   r = amdgpu_vm_init(adev, >vm);
+   r = amdgpu_xcp_open_device(adev, fpriv, file_priv);
if (r)
goto error_pasid;
 
-   r = amdgpu_xcp_open_device(adev, fpriv, file_priv);
+   r = amdgpu_vm_init(adev, >vm);
if (r)
-   goto error_vm;
+   goto error_pasid;
 
r = amdgpu_vm_set_pasid(adev, >vm, pasid);
if (r)
-- 
2.25.1

[PATCH] drm/ttm: check null pointer before accessing when swapping

2023-07-17 Thread Guchun Chen

Add a check to avoid null pointer dereference.

Fixes: a2848d08742c ("drm/ttm: never consider pinned BOs for eviction")
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/ttm/ttm_bo.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 7139a522b2f3..54e3083076b7 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -519,7 +519,8 @@ static bool ttm_bo_evict_swapout_allowable(struct 
ttm_buffer_object *bo,
 
if (bo->pin_count) {
*locked = false;
-   *busy = false;
+   if (busy)
+   *busy = false;
return false;
}
 
-- 
2.25.1

[PATCH 3/4] drm/amdgpu/vm: use the same xcp_id from root PD

2023-07-16 Thread Guchun Chen

Allocations of the other PDs/PTs should just use the same xcp_id as the
one stored in the root PD.

Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index eb52dfe64948..83e1923f6775 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -607,7 +607,8 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
return 0;
 
amdgpu_vm_eviction_unlock(vm);
-   r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, , 0);
+   r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, ,
+   vm->root.bo->xcp_id);
amdgpu_vm_eviction_lock(vm);
if (r)
return r;
-- 
2.25.1

[PATCH 4/4] drm/amdgpu: use a macro to define no xcp partition case

2023-07-16 Thread Guchun Chen

~0 is used in several places to mean "no xcp partition", so define a
macro for it to keep the code consistent.

Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c  | 4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h  | 2 ++
 drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c   | 4 ++--
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index a7f314ddd173..d34c3ef8f3ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1709,7 +1709,8 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
}
-   xcp_id = fpriv->xcp_id == ~0 ? 0 : fpriv->xcp_id;
+   xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
+   0 : fpriv->xcp_id;
} else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
domain = alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
alloc_flags = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index d175e862f222..9c9cca129498 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -363,7 +363,7 @@ int amdgpu_xcp_open_device(struct amdgpu_device *adev,
if (!adev->xcp_mgr)
return 0;
 
-   fpriv->xcp_id = ~0;
+   fpriv->xcp_id = AMDGPU_XCP_NO_PARTITION;
for (i = 0; i < MAX_XCP; ++i) {
if (!adev->xcp_mgr->xcp[i].ddev)
break;
@@ -381,7 +381,7 @@ int amdgpu_xcp_open_device(struct amdgpu_device *adev,
}
}
 
-   fpriv->vm.mem_id = fpriv->xcp_id == ~0 ? -1 :
+   fpriv->vm.mem_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? -1 :
adev->xcp_mgr->xcp[fpriv->xcp_id].mem_id;
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
index 0f8026d64ea5..9a1036aeec2a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
@@ -37,6 +37,8 @@
 #define AMDGPU_XCP_FL_NONE 0
 #define AMDGPU_XCP_FL_LOCKED (1 << 0)
 
+#define AMDGPU_XCP_NO_PARTITION (~0)
+
 struct amdgpu_fpriv;
 
 enum AMDGPU_XCP_IP_BLOCK {
diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index 16471b81a1f5..72b629a78c62 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -68,7 +68,7 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device 
*adev,
enum AMDGPU_XCP_IP_BLOCK ip_blk;
uint32_t inst_mask;
 
-   ring->xcp_id = ~0;
+   ring->xcp_id = AMDGPU_XCP_NO_PARTITION;
if (adev->xcp_mgr->mode == AMDGPU_XCP_MODE_NONE)
return;
 
@@ -177,7 +177,7 @@ static int aqua_vanjaram_select_scheds(
u32 sel_xcp_id;
int i;
 
-   if (fpriv->xcp_id == ~0) {
+   if (fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION) {
u32 least_ref_cnt = ~0;
 
fpriv->xcp_id = 0;
-- 
2.25.1

[PATCH 2/4] drm/amdgpu: fix slab-out-of-bounds issue in amdgpu_vm_pt_create

2023-07-16 Thread Guchun Chen

Recent code stores the xcp_id taken from the file private data (set when
opening the device) into the amdgpu bo, for accounting memory usage etc.
However, not all VMs are attached to an fpriv structure, e.g. the VM used
in amdgpu_mes_self_test; in that case KASAN complains about the
out-of-bounds access below. More importantly, VM code should not touch
the fpriv structure, so drop the fpriv handling from amdgpu_vm_pt.

[   77.292314] BUG: KASAN: slab-out-of-bounds in 
amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.293845] Read of size 4 at addr 888102c48a48 by task modprobe/1069
[   77.294146] Call Trace:
[   77.294178]  
[   77.294208]  dump_stack_lvl+0x49/0x63
[   77.294260]  print_report+0x16f/0x4a6
[   77.294307]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.295979]  ? kasan_complete_mode_report_info+0x3c/0x200
[   77.296057]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.297556]  kasan_report+0xb4/0x130
[   77.297609]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.299202]  __asan_load4+0x6f/0x90
[   77.299272]  amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.300796]  ? amdgpu_init+0x6e/0x1000 [amdgpu]
[   77.30]  ? amdgpu_vm_pt_clear+0x750/0x750 [amdgpu]
[   77.303721]  ? preempt_count_sub+0x18/0xc0
[   77.303786]  amdgpu_vm_init+0x39e/0x870 [amdgpu]
[   77.305186]  ? amdgpu_vm_wait_idle+0x90/0x90 [amdgpu]
[   77.306683]  ? kasan_set_track+0x25/0x30
[   77.306737]  ? kasan_save_alloc_info+0x1b/0x30
[   77.306795]  ? __kasan_kmalloc+0x87/0xa0
[   77.306852]  amdgpu_mes_self_test+0x169/0x620 [amdgpu]

v2: without specifying xcp partition for PD/PT bo, the xcp id is -1.

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2686
Fixes: ffc6deb773f7 ("drm/amdkfd: Store xcp partition id to amdgpu bo")
Signed-off-by: Guchun Chen 
Tested-by: Mikhail Gavrilov 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 11 ++-
 5 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 53a024cf0544..cab2fdd5b76a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1236,7 +1236,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct 
drm_file *file_priv)
if (r)
goto error_pasid;
 
-   r = amdgpu_vm_init(adev, >vm);
+   r = amdgpu_vm_init(adev, >vm, fpriv->xcp_id);
if (r)
goto error_pasid;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index e9091ebfe230..f808841310fd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -1382,7 +1382,7 @@ int amdgpu_mes_self_test(struct amdgpu_device *adev)
goto error_pasid;
}
 
-   r = amdgpu_vm_init(adev, vm);
+   r = amdgpu_vm_init(adev, vm, -1);
if (r) {
DRM_ERROR("failed to initialize vm\n");
goto error_pasid;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 32adc31c093d..74380b21e7a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2121,13 +2121,14 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long 
timeout)
  *
  * @adev: amdgpu_device pointer
  * @vm: requested vm
+ * @xcp_id: GPU partition selection id
  *
  * Init @vm fields.
  *
  * Returns:
  * 0 for success, error for failure.
  */
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
+int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t 
xcp_id)
 {
struct amdgpu_bo *root_bo;
struct amdgpu_bo_vm *root;
@@ -2177,7 +2178,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
vm->evicting = false;
 
r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
-   false, );
+   false, , xcp_id);
if (r)
goto error_free_delayed;
root_bo = >bo;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 88ee4507f6b6..bca258c38919 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -398,7 +398,7 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
u32 pasid);
 
 long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout);
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm);
+int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t 
xcp_id);
 int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm);
 void amdgpu_vm_release_compute(struct amdgpu_devic

[PATCH 1/4] drm/amdgpu: Allocate root PD on correct partition

2023-07-16 Thread Guchun Chen

file_priv needs to be set up first; otherwise, the root PD
will always be allocated on partition 0, even when the device
is opened from another partition.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 85a0d5f419c8..53a024cf0544 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1232,13 +1232,13 @@ int amdgpu_driver_open_kms(struct drm_device *dev, 
struct drm_file *file_priv)
pasid = 0;
}
 
-   r = amdgpu_vm_init(adev, >vm);
+   r = amdgpu_xcp_open_device(adev, fpriv, file_priv);
if (r)
goto error_pasid;
 
-   r = amdgpu_xcp_open_device(adev, fpriv, file_priv);
+   r = amdgpu_vm_init(adev, >vm);
if (r)
-   goto error_vm;
+   goto error_pasid;
 
r = amdgpu_vm_set_pasid(adev, >vm, pasid);
if (r)
-- 
2.25.1

[PATCH] drm/amdgpu: use a macro to define no xcp partition case

2023-07-14 Thread Guchun Chen

~0 is used in several places to mean "no xcp partition", so define a
macro for it to keep the code consistent.

Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c  | 4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h  | 2 ++
 drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c   | 4 ++--
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index a7f314ddd173..d34c3ef8f3ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1709,7 +1709,8 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
alloc_flags |= (flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) 
?
AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED : 0;
}
-   xcp_id = fpriv->xcp_id == ~0 ? 0 : fpriv->xcp_id;
+   xcp_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ?
+   0 : fpriv->xcp_id;
} else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
domain = alloc_domain = AMDGPU_GEM_DOMAIN_GTT;
alloc_flags = 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
index d175e862f222..9c9cca129498 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.c
@@ -363,7 +363,7 @@ int amdgpu_xcp_open_device(struct amdgpu_device *adev,
if (!adev->xcp_mgr)
return 0;
 
-   fpriv->xcp_id = ~0;
+   fpriv->xcp_id = AMDGPU_XCP_NO_PARTITION;
for (i = 0; i < MAX_XCP; ++i) {
if (!adev->xcp_mgr->xcp[i].ddev)
break;
@@ -381,7 +381,7 @@ int amdgpu_xcp_open_device(struct amdgpu_device *adev,
}
}
 
-   fpriv->vm.mem_id = fpriv->xcp_id == ~0 ? -1 :
+   fpriv->vm.mem_id = fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION ? -1 :
adev->xcp_mgr->xcp[fpriv->xcp_id].mem_id;
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
index 0f8026d64ea5..9a1036aeec2a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xcp.h
@@ -37,6 +37,8 @@
 #define AMDGPU_XCP_FL_NONE 0
 #define AMDGPU_XCP_FL_LOCKED (1 << 0)
 
+#define AMDGPU_XCP_NO_PARTITION (~0)
+
 struct amdgpu_fpriv;
 
 enum AMDGPU_XCP_IP_BLOCK {
diff --git a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c 
b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
index 16471b81a1f5..72b629a78c62 100644
--- a/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
+++ b/drivers/gpu/drm/amd/amdgpu/aqua_vanjaram.c
@@ -68,7 +68,7 @@ static void aqua_vanjaram_set_xcp_id(struct amdgpu_device 
*adev,
enum AMDGPU_XCP_IP_BLOCK ip_blk;
uint32_t inst_mask;
 
-   ring->xcp_id = ~0;
+   ring->xcp_id = AMDGPU_XCP_NO_PARTITION;
if (adev->xcp_mgr->mode == AMDGPU_XCP_MODE_NONE)
return;
 
@@ -177,7 +177,7 @@ static int aqua_vanjaram_select_scheds(
u32 sel_xcp_id;
int i;
 
-   if (fpriv->xcp_id == ~0) {
+   if (fpriv->xcp_id == AMDGPU_XCP_NO_PARTITION) {
u32 least_ref_cnt = ~0;
 
fpriv->xcp_id = 0;
-- 
2.25.1

[PATCH 2/2] drm/amdgpu/vm: use the same xcp_id from root PD

2023-07-14 Thread Guchun Chen

Allocations of the other PDs/PTs should just use the same xcp_id as the
one stored in the root PD.

Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index eb52dfe64948..83e1923f6775 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -607,7 +607,8 @@ static int amdgpu_vm_pt_alloc(struct amdgpu_device *adev,
return 0;
 
amdgpu_vm_eviction_unlock(vm);
-   r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, , 0);
+   r = amdgpu_vm_pt_create(adev, vm, cursor->level, immediate, ,
+   vm->root.bo->xcp_id);
amdgpu_vm_eviction_lock(vm);
if (r)
return r;
-- 
2.25.1

[PATCH 1/2] drm/amdgpu: fix slab-out-of-bounds issue in amdgpu_vm_pt_create

2023-07-14 Thread Guchun Chen

Recent code stores the xcp_id taken from the file private data (set when
opening the device) into the amdgpu bo, for accounting memory usage etc.
However, not all VMs are attached to an fpriv structure, e.g. the VM used
in amdgpu_mes_self_test; in that case KASAN complains about the
out-of-bounds access below. More importantly, VM code should not touch
the fpriv structure, so drop the fpriv handling from amdgpu_vm_pt.

[   77.292314] BUG: KASAN: slab-out-of-bounds in 
amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.293845] Read of size 4 at addr 888102c48a48 by task modprobe/1069
[   77.294146] Call Trace:
[   77.294178]  
[   77.294208]  dump_stack_lvl+0x49/0x63
[   77.294260]  print_report+0x16f/0x4a6
[   77.294307]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.295979]  ? kasan_complete_mode_report_info+0x3c/0x200
[   77.296057]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.297556]  kasan_report+0xb4/0x130
[   77.297609]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.299202]  __asan_load4+0x6f/0x90
[   77.299272]  amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.300796]  ? amdgpu_init+0x6e/0x1000 [amdgpu]
[   77.30]  ? amdgpu_vm_pt_clear+0x750/0x750 [amdgpu]
[   77.303721]  ? preempt_count_sub+0x18/0xc0
[   77.303786]  amdgpu_vm_init+0x39e/0x870 [amdgpu]
[   77.305186]  ? amdgpu_vm_wait_idle+0x90/0x90 [amdgpu]
[   77.306683]  ? kasan_set_track+0x25/0x30
[   77.306737]  ? kasan_save_alloc_info+0x1b/0x30
[   77.306795]  ? __kasan_kmalloc+0x87/0xa0
[   77.306852]  amdgpu_mes_self_test+0x169/0x620 [amdgpu]

Fixes: ffc6deb773f7 ("drm/amdkfd: Store xcp partition id to amdgpu bo")
Signed-off-by: Guchun Chen 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  5 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 11 ++-
 5 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 85a0d5f419c8..9a5aa4318cad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1232,7 +1232,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct 
drm_file *file_priv)
pasid = 0;
}
 
-   r = amdgpu_vm_init(adev, >vm);
+   r = amdgpu_vm_init(adev, >vm, fpriv->xcp_id);
if (r)
goto error_pasid;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index e9091ebfe230..cac1d1b227f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -1382,7 +1382,7 @@ int amdgpu_mes_self_test(struct amdgpu_device *adev)
goto error_pasid;
}
 
-   r = amdgpu_vm_init(adev, vm);
+   r = amdgpu_vm_init(adev, vm, 0);
if (r) {
DRM_ERROR("failed to initialize vm\n");
goto error_pasid;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 32adc31c093d..74380b21e7a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2121,13 +2121,14 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long 
timeout)
  *
  * @adev: amdgpu_device pointer
  * @vm: requested vm
+ * @xcp_id: GPU partition selection id
  *
  * Init @vm fields.
  *
  * Returns:
  * 0 for success, error for failure.
  */
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
+int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t 
xcp_id)
 {
struct amdgpu_bo *root_bo;
struct amdgpu_bo_vm *root;
@@ -2177,7 +2178,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
vm->evicting = false;
 
r = amdgpu_vm_pt_create(adev, vm, adev->vm_manager.root_level,
-   false, );
+   false, , xcp_id);
if (r)
goto error_free_delayed;
root_bo = >bo;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 88ee4507f6b6..bca258c38919 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -398,7 +398,7 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
u32 pasid);
 
 long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout);
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm);
+int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm, int32_t 
xcp_id);
 int amdgpu_vm_make_compute(struct amdgpu_device *adev, struct amdgpu_vm *vm);
 void amdgpu_vm_release_compute(struct amdgpu_device *adev, struct amdgpu_vm 
*vm);
 void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm);
@@ -481,7 +481,8 @@ void amdgpu_vm

[PATCH v4] drm/amdgpu/vkms: relax timer deactivation by hrtimer_try_to_cancel

2023-07-11 Thread Guchun Chen

When running the screen rotation loop test below thousands of times with
virtual display enabled, a CPU hard lockup may happen, leaving the system
unresponsive and eventually crashing it.

do {
xrandr --output Virtual --rotate inverted
xrandr --output Virtual --rotate right
xrandr --output Virtual --rotate left
xrandr --output Virtual --rotate normal
} while (1);

NMI watchdog: Watchdog detected hard LOCKUP on cpu 1

? hrtimer_run_softirq+0x140/0x140
? store_vblank+0xe0/0xe0 [drm]
hrtimer_cancel+0x15/0x30
amdgpu_vkms_disable_vblank+0x15/0x30 [amdgpu]
drm_vblank_disable_and_save+0x185/0x1f0 [drm]
drm_crtc_vblank_off+0x159/0x4c0 [drm]
? record_print_text.cold+0x11/0x11
? wait_for_completion_timeout+0x232/0x280
? drm_crtc_wait_one_vblank+0x40/0x40 [drm]
? bit_wait_io_timeout+0xe0/0xe0
? wait_for_completion_interruptible+0x1d7/0x320
? mutex_unlock+0x81/0xd0
amdgpu_vkms_crtc_atomic_disable

It's caused by a lock dependency deadlock between different CPUs in this
scenario.

CPU1                              CPU2
drm_crtc_vblank_off               hrtimer_interrupt
grab event_lock (irq disabled)    __hrtimer_run_queues
grab vbl_lock/vblank_time_block   amdgpu_vkms_vblank_simulate
amdgpu_vkms_disable_vblank        drm_handle_vblank
hrtimer_cancel                    grab dev->event_lock

So CPU1 gets stuck in hrtimer_cancel, because the timer callback keeps
running on the current clock base: the timer queue on CPU2 has no chance
to finish it, as it fails to take the lock. The NMI watchdog then reports
the error once its threshold is exceeded, and all later CPUs are
impacted/blocked.

So use hrtimer_try_to_cancel to fix this, as the disable_vblank callback
does not need to wait for the handler to finish. It is also not necessary
to check the return value of hrtimer_try_to_cancel: even if it is -1,
which means the timer callback is currently running, the timer will be
reprogrammed by hrtimer_start when enable_vblank is called, so it keeps
working.

v2: only re-arm timer when vblank is enabled (Christian) and add a Fixes
tag as well

v3: drop warn printing (Christian)

v4: drop superfluous check of vblank->enabled in timer function, as it's
guaranteed in drm_handle_vblank (Christian)

Fixes: 84ec374bd580("drm/amdgpu: create amdgpu_vkms (v4)")
Cc: stable@vger.kernel.org
Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index 53ff91fc6cf6..d0748bcfad16 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -55,8 +55,9 @@ static enum hrtimer_restart 
amdgpu_vkms_vblank_simulate(struct hrtimer *timer)
DRM_WARN("%s: vblank timer overrun\n", __func__);
 
ret = drm_crtc_handle_vblank(crtc);
+   /* Don't queue timer again when vblank is disabled. */
if (!ret)
-   DRM_ERROR("amdgpu_vkms failure on handling vblank");
+   return HRTIMER_NORESTART;
 
return HRTIMER_RESTART;
 }
@@ -81,7 +82,7 @@ static void amdgpu_vkms_disable_vblank(struct drm_crtc *crtc)
 {
struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
 
-   hrtimer_cancel(&amdgpu_crtc->vblank_timer);
+   hrtimer_try_to_cancel(&amdgpu_crtc->vblank_timer);
 }
 
 static bool amdgpu_vkms_get_vblank_timestamp(struct drm_crtc *crtc,
-- 
2.25.1

[PATCH v3] drm/amdgpu/vkms: relax timer deactivation by hrtimer_try_to_cancel

2023-07-10 Thread Guchun Chen

In the screen rotation loop test below, run thousands of times with virtual
display enabled, a CPU hard lockup issue may happen, leaving the system
unresponsive and leading to a crash.

do {
xrandr --output Virtual --rotate inverted
xrandr --output Virtual --rotate right
xrandr --output Virtual --rotate left
xrandr --output Virtual --rotate normal
} while (1);

NMI watchdog: Watchdog detected hard LOCKUP on cpu 1

? hrtimer_run_softirq+0x140/0x140
? store_vblank+0xe0/0xe0 [drm]
hrtimer_cancel+0x15/0x30
amdgpu_vkms_disable_vblank+0x15/0x30 [amdgpu]
drm_vblank_disable_and_save+0x185/0x1f0 [drm]
drm_crtc_vblank_off+0x159/0x4c0 [drm]
? record_print_text.cold+0x11/0x11
? wait_for_completion_timeout+0x232/0x280
? drm_crtc_wait_one_vblank+0x40/0x40 [drm]
? bit_wait_io_timeout+0xe0/0xe0
? wait_for_completion_interruptible+0x1d7/0x320
? mutex_unlock+0x81/0xd0
amdgpu_vkms_crtc_atomic_disable

It's caused by a lock dependency deadlock in the following scenario on
different CPUs.

CPU1                                    CPU2
drm_crtc_vblank_off                     hrtimer_interrupt
  grab event_lock (irq disabled)          __hrtimer_run_queues
    grab vbl_lock/vblank_time_block         amdgpu_vkms_vblank_simulate
      amdgpu_vkms_disable_vblank              drm_handle_vblank
        hrtimer_cancel                          grab dev->event_lock

So CPU1 gets stuck in hrtimer_cancel because the timer callback keeps running
on the current clock base: the timer queue on CPU2 has no chance to finish it
because it fails to take the lock. The NMI watchdog then reports errors once
its threshold is exceeded, and all later CPUs are impacted/blocked.

So use hrtimer_try_to_cancel to fix this, as the disable_vblank callback
does not need to wait for the handler to finish. It is also not necessary
to check the return value of hrtimer_try_to_cancel, because even if it is
-1, which means the timer callback is currently running, the timer will be
reprogrammed by hrtimer_start when enable_vblank is called, so it still works.

v2: only re-arm timer when vblank is enabled (Christian) and add a Fixes
tag as well

v3: drop warn printing (Christian)

Fixes: 84ec374bd580("drm/amdgpu: create amdgpu_vkms (v4)")
Cc: stable@vger.kernel.org
Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index 53ff91fc6cf6..b870c827cbaa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -46,7 +46,10 @@ static enum hrtimer_restart 
amdgpu_vkms_vblank_simulate(struct hrtimer *timer)
struct amdgpu_crtc *amdgpu_crtc = container_of(timer, struct 
amdgpu_crtc, vblank_timer);
struct drm_crtc *crtc = &amdgpu_crtc->base;
struct amdgpu_vkms_output *output = 
drm_crtc_to_amdgpu_vkms_output(crtc);
+   struct drm_vblank_crtc *vblank;
+   struct drm_device *dev;
u64 ret_overrun;
+   unsigned int pipe;
bool ret;
 
ret_overrun = hrtimer_forward_now(&amdgpu_crtc->vblank_timer,
@@ -54,9 +57,13 @@ static enum hrtimer_restart 
amdgpu_vkms_vblank_simulate(struct hrtimer *timer)
if (ret_overrun != 1)
DRM_WARN("%s: vblank timer overrun\n", __func__);
 
+   dev = crtc->dev;
+   pipe = drm_crtc_index(crtc);
+   vblank = &dev->vblank[pipe];
ret = drm_crtc_handle_vblank(crtc);
-   if (!ret)
-   DRM_ERROR("amdgpu_vkms failure on handling vblank");
+   /* Don't queue timer again when vblank is disabled. */
+   if (!ret && !READ_ONCE(vblank->enabled))
+   return HRTIMER_NORESTART;
 
return HRTIMER_RESTART;
 }
@@ -81,7 +88,7 @@ static void amdgpu_vkms_disable_vblank(struct drm_crtc *crtc)
 {
struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
 
-   hrtimer_cancel(&amdgpu_crtc->vblank_timer);
+   hrtimer_try_to_cancel(&amdgpu_crtc->vblank_timer);
 }
 
 static bool amdgpu_vkms_get_vblank_timestamp(struct drm_crtc *crtc,
-- 
2.25.1

[PATCH v2] drm/amdgpu/vkms: relax timer deactivation by hrtimer_try_to_cancel

2023-07-10 Thread Guchun Chen

In the screen rotation loop test below, run thousands of times with virtual
display enabled, a CPU hard lockup issue may happen, leaving the system
unresponsive and leading to a crash.

do {
xrandr --output Virtual --rotate inverted
xrandr --output Virtual --rotate right
xrandr --output Virtual --rotate left
xrandr --output Virtual --rotate normal
} while (1);

NMI watchdog: Watchdog detected hard LOCKUP on cpu 1

? hrtimer_run_softirq+0x140/0x140
? store_vblank+0xe0/0xe0 [drm]
hrtimer_cancel+0x15/0x30
amdgpu_vkms_disable_vblank+0x15/0x30 [amdgpu]
drm_vblank_disable_and_save+0x185/0x1f0 [drm]
drm_crtc_vblank_off+0x159/0x4c0 [drm]
? record_print_text.cold+0x11/0x11
? wait_for_completion_timeout+0x232/0x280
? drm_crtc_wait_one_vblank+0x40/0x40 [drm]
? bit_wait_io_timeout+0xe0/0xe0
? wait_for_completion_interruptible+0x1d7/0x320
? mutex_unlock+0x81/0xd0
amdgpu_vkms_crtc_atomic_disable

It's caused by a lock dependency deadlock in the following scenario on
different CPUs.

CPU1                                    CPU2
drm_crtc_vblank_off                     hrtimer_interrupt
  grab event_lock (irq disabled)          __hrtimer_run_queues
    grab vbl_lock/vblank_time_block         amdgpu_vkms_vblank_simulate
      amdgpu_vkms_disable_vblank              drm_handle_vblank
        hrtimer_cancel                          grab dev->event_lock

So CPU1 gets stuck in hrtimer_cancel because the timer callback keeps running
on the current clock base: the timer queue on CPU2 has no chance to finish it
because it fails to take the lock. The NMI watchdog then reports errors once
its threshold is exceeded, and all later CPUs are impacted/blocked.

So use hrtimer_try_to_cancel to fix this, as the disable_vblank callback
does not need to wait for the handler to finish. It is also not necessary
to check the return value of hrtimer_try_to_cancel, because even if it is
-1, which means the timer callback is currently running, the timer will be
reprogrammed by hrtimer_start when enable_vblank is called, so it still works.

v2: only re-arm timer when vblank is enabled (Christian) and add a Fixes
tag as well

Fixes: 84ec374bd580("drm/amdgpu: create amdgpu_vkms (v4)")
Cc: stable@vger.kernel.org
Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 15 ---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index 53ff91fc6cf6..44d704306f44 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -46,7 +46,10 @@ static enum hrtimer_restart 
amdgpu_vkms_vblank_simulate(struct hrtimer *timer)
struct amdgpu_crtc *amdgpu_crtc = container_of(timer, struct 
amdgpu_crtc, vblank_timer);
struct drm_crtc *crtc = &amdgpu_crtc->base;
struct amdgpu_vkms_output *output = 
drm_crtc_to_amdgpu_vkms_output(crtc);
+   struct drm_vblank_crtc *vblank;
+   struct drm_device *dev;
u64 ret_overrun;
+   unsigned int pipe;
bool ret;
 
ret_overrun = hrtimer_forward_now(&amdgpu_crtc->vblank_timer,
@@ -54,9 +57,15 @@ static enum hrtimer_restart 
amdgpu_vkms_vblank_simulate(struct hrtimer *timer)
if (ret_overrun != 1)
DRM_WARN("%s: vblank timer overrun\n", __func__);
 
+   dev = crtc->dev;
+   pipe = drm_crtc_index(crtc);
+   vblank = &dev->vblank[pipe];
ret = drm_crtc_handle_vblank(crtc);
-   if (!ret)
-   DRM_ERROR("amdgpu_vkms failure on handling vblank");
+   if (!ret && !READ_ONCE(vblank->enabled)) {
+   /* Don't queue timer again when vblank is disabled. */
+   DRM_WARN("amdgpu_vkms failure on handling vblank\n");
+   return HRTIMER_NORESTART;
+   }
 
return HRTIMER_RESTART;
 }
@@ -81,7 +90,7 @@ static void amdgpu_vkms_disable_vblank(struct drm_crtc *crtc)
 {
struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
 
-   hrtimer_cancel(&amdgpu_crtc->vblank_timer);
+   hrtimer_try_to_cancel(&amdgpu_crtc->vblank_timer);
 }
 
 static bool amdgpu_vkms_get_vblank_timestamp(struct drm_crtc *crtc,
-- 
2.25.1

[PATCH] drm/amdgpu/vkms: drop redundant set of fb_modifiers_not_supported

2023-07-06 Thread Guchun Chen

Due to a coding typo.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index 70fb0df039e3..2a318c6d2cdf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -500,8 +500,6 @@ static int amdgpu_vkms_sw_init(void *handle)
 
adev_to_drm(adev)->mode_config.fb_modifiers_not_supported = true;
 
-   adev_to_drm(adev)->mode_config.fb_modifiers_not_supported = true;
-
r = amdgpu_display_modeset_create_props(adev);
if (r)
return r;
-- 
2.25.1

[PATCH] drm/amdgpu/vkms: relax timer deactivation by hrtimer_try_to_cancel

2023-07-06 Thread Guchun Chen

In the screen rotation loop test below, run thousands of times with virtual
display enabled, a CPU hard lockup issue may happen, leaving the system
unresponsive and leading to a crash.

do {
xrandr --output Virtual --rotate inverted
xrandr --output Virtual --rotate right
xrandr --output Virtual --rotate left
xrandr --output Virtual --rotate normal
} while (1);

NMI watchdog: Watchdog detected hard LOCKUP on cpu 4

? hrtimer_run_softirq+0x140/0x140
? store_vblank+0xe0/0xe0 [drm]
hrtimer_cancel+0x15/0x30
amdgpu_vkms_disable_vblank+0x15/0x30 [amdgpu]
drm_vblank_disable_and_save+0x185/0x1f0 [drm]
drm_crtc_vblank_off+0x159/0x4c0 [drm]
? record_print_text.cold+0x11/0x11
? wait_for_completion_timeout+0x232/0x280
? drm_crtc_wait_one_vblank+0x40/0x40 [drm]
? bit_wait_io_timeout+0xe0/0xe0
? wait_for_completion_interruptible+0x1d7/0x320
? mutex_unlock+0x81/0xd0
amdgpu_vkms_crtc_atomic_disable

It's caused by a lock dependency deadlock in the following scenario on
different CPUs.

CPU1                                    CPU2
drm_crtc_vblank_off                     hrtimer_interrupt
  grab event_lock (irq disabled)          __hrtimer_run_queues
    grab vbl_lock/vblank_time_block         amdgpu_vkms_vblank_simulate
      amdgpu_vkms_disable_vblank              drm_handle_vblank
        hrtimer_cancel                          grab dev->event_lock

So CPU1 gets stuck in hrtimer_cancel because the timer callback keeps running
on the current clock base: the timer queue on CPU2 has no chance to finish it
because it fails to take the lock. The NMI watchdog then reports errors once
its threshold is exceeded, and all later CPUs are impacted/blocked.

So use hrtimer_try_to_cancel to fix this, as the disable_vblank callback
does not need to wait for the handler to finish. It is also not necessary
to check the return value of hrtimer_try_to_cancel, because even if it is
-1, which means the timer callback is currently running, the timer will be
reprogrammed by hrtimer_start when enable_vblank is called, so it still works.

Cc: stable@vger.kernel.org
Suggested-by: Christian König 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index 53ff91fc6cf6..70fb0df039e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -81,7 +81,7 @@ static void amdgpu_vkms_disable_vblank(struct drm_crtc *crtc)
 {
struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
 
-   hrtimer_cancel(&amdgpu_crtc->vblank_timer);
+   hrtimer_try_to_cancel(&amdgpu_crtc->vblank_timer);
 }
 
 static bool amdgpu_vkms_get_vblank_timestamp(struct drm_crtc *crtc,
-- 
2.25.1

[PATCH] drm/amdgpu: keep irq count in amdgpu_irq_disable_all

2023-05-25 Thread Guchun Chen

This cleans up all irq warnings caused by unbalanced
amdgpu_irq_get/put when unplugging/unbinding the device, and leaves
the irq count decrement to each ip fini function.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
index 00f2106c17b9..f90920fbd340 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
@@ -140,7 +140,6 @@ void amdgpu_irq_disable_all(struct amdgpu_device *adev)
continue;
 
for (k = 0; k < src->num_types; ++k) {
-   atomic_set(&src->enabled_types[k], 0);
r = src->funcs->set(adev, src, k,
AMDGPU_IRQ_STATE_DISABLE);
if (r)
-- 
2.25.1

[PATCH] drm/amdgpu: add a flag to indicate if a VM is attached to fpriv

2023-05-24 Thread Guchun Chen

Recent code stores xcp_id in the amdgpu bo for accounting memory
usage or finding the correct KFD node, and this xcp_id comes from the
file private data after opening the device. However, not all VMs are
attached to this fpriv structure, e.g. the one in amdgpu_mes_self_test.
So add a flag to differentiate the cases. Otherwise, KASAN will
complain about an out-of-bounds access.

[   77.292314] BUG: KASAN: slab-out-of-bounds in 
amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.293845] Read of size 4 at addr 888102c48a48 by task modprobe/1069
[   77.294146] Call Trace:
[   77.294178]  
[   77.294208]  dump_stack_lvl+0x49/0x63
[   77.294260]  print_report+0x16f/0x4a6
[   77.294307]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.295979]  ? kasan_complete_mode_report_info+0x3c/0x200
[   77.296057]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.297556]  kasan_report+0xb4/0x130
[   77.297609]  ? amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.299202]  __asan_load4+0x6f/0x90
[   77.299272]  amdgpu_vm_pt_create+0x17e/0x4b0 [amdgpu]
[   77.300796]  ? amdgpu_init+0x6e/0x1000 [amdgpu]
[   77.30]  ? amdgpu_vm_pt_clear+0x750/0x750 [amdgpu]
[   77.303721]  ? preempt_count_sub+0x18/0xc0
[   77.303786]  amdgpu_vm_init+0x39e/0x870 [amdgpu]
[   77.305186]  ? amdgpu_vm_wait_idle+0x90/0x90 [amdgpu]
[   77.306683]  ? kasan_set_track+0x25/0x30
[   77.306737]  ? kasan_save_alloc_info+0x1b/0x30
[   77.306795]  ? __kasan_kmalloc+0x87/0xa0
[   77.306852]  amdgpu_mes_self_test+0x169/0x620 [amdgpu]

Fixes: ffc6deb773f7("drm/amdkfd: Store xcp partition id to amdgpu bo")
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c|  5 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  5 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 12 +---
 5 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 41d047e5de69..79b80f9233db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1229,7 +1229,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct 
drm_file *file_priv)
pasid = 0;
}
 
-   r = amdgpu_vm_init(adev, &fpriv->vm);
+   r = amdgpu_vm_init(adev, &fpriv->vm, true);
if (r)
goto error_pasid;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 49bb6c03d606..3be5219edf88 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -1345,7 +1345,7 @@ int amdgpu_mes_self_test(struct amdgpu_device *adev)
goto error_pasid;
}
 
-   r = amdgpu_vm_init(adev, vm);
+   r = amdgpu_vm_init(adev, vm, false);
if (r) {
DRM_ERROR("failed to initialize vm\n");
goto error_pasid;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 37b9d8a8dbec..47ffaa1526a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -2099,13 +2099,15 @@ long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long 
timeout)
  *
  * @adev: amdgpu_device pointer
  * @vm: requested vm
+ * @vm_attach_to_fpriv: flag to tell if vm is attached to file private data
  *
  * Init @vm fields.
  *
  * Returns:
  * 0 for success, error for failure.
  */
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
+int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm,
+  bool vm_attach_to_fpriv)
 {
struct amdgpu_bo *root_bo;
struct amdgpu_bo_vm *root;
@@ -2131,6 +2133,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
 
vm->pte_support_ats = false;
vm->is_compute_context = false;
+   vm->vm_attach_to_fpriv = vm_attach_to_fpriv;
 
vm->use_cpu_for_update = !!(adev->vm_manager.vm_update_mode &
AMDGPU_VM_USE_CPU_FOR_GFX);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index d551fca1780e..62ed14b1fc16 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -333,6 +333,9 @@ struct amdgpu_vm {
/* Flag to indicate if VM is used for compute */
boolis_compute_context;
 
+   /* Flag to tell if VM is attached to file private data */
+   bool vm_attach_to_fpriv;
+
/* Memory partition number, -1 means any partition */
int8_t  mem_id;
 };
@@ -392,7 +395,7 @@ int amdgpu_vm_set_pasid(struct amdgpu_device *adev, struct 
amdgpu_vm *vm,
u32 pasid);
 
 long amdgpu_vm_wait_idle(struct amdgpu_vm *vm, long timeout);
-int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm);
+int amdg

[PATCH 3/3] drm/amdgpu: switch to unified amdgpu_ring_test_helper

2023-05-18 Thread Guchun Chen

This will simplify code.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 9 ++---
 drivers/gpu/drm/amd/amdgpu/mes_v10_1.c  | 8 +---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  | 7 +--
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c  | 8 ++--
 4 files changed, 6 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index f0f00466b59f..49bb6c03d606 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -1305,14 +1305,9 @@ static int amdgpu_mes_test_queues(struct amdgpu_ring 
**added_rings)
if (!ring)
continue;
 
-   r = amdgpu_ring_test_ring(ring);
-   if (r) {
-   DRM_DEV_ERROR(ring->adev->dev,
- "ring %s test failed (%d)\n",
- ring->name, r);
+   r = amdgpu_ring_test_helper(ring);
+   if (r)
return r;
-   } else
-   DRM_INFO("ring %s test pass\n", ring->name);
 
r = amdgpu_ring_test_ib(ring, 1000 * 10);
if (r) {
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c
index 4560476c7c31..af66e985a33a 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v10_1.c
@@ -812,13 +812,7 @@ static int mes_v10_1_kiq_enable_queue(struct amdgpu_device 
*adev)
 
kiq->pmf->kiq_map_queues(kiq_ring, &adev->mes.ring);
 
-   r = amdgpu_ring_test_ring(kiq_ring);
-   if (r) {
-   DRM_ERROR("kfq enable failed\n");
-   kiq_ring->sched.ready = false;
-   }
-
-   return r;
+   return amdgpu_ring_test_helper(kiq_ring);
 }
 
 static int mes_v10_1_queue_init(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 3adb450eec07..b8eabb37d48d 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -879,12 +879,7 @@ static int mes_v11_0_kiq_enable_queue(struct amdgpu_device 
*adev)
 
kiq->pmf->kiq_map_queues(kiq_ring, &adev->mes.ring);
 
-   r = amdgpu_ring_test_ring(kiq_ring);
-   if (r) {
-   DRM_ERROR("kfq enable failed\n");
-   kiq_ring->sched.ready = false;
-   }
-   return r;
+   return amdgpu_ring_test_helper(kiq_ring);
 }
 
 static int mes_v11_0_queue_init(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 23667605c372..3f722d21b17f 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -622,13 +622,9 @@ static int sdma_v5_2_gfx_resume(struct amdgpu_device *adev)
sdma_v5_2_enable(adev, true);
}
 
-   r = amdgpu_ring_test_ring(ring);
-   if (r) {
-   ring->sched.ready = false;
+   r = amdgpu_ring_test_helper(ring);
+   if (r)
return r;
-   }
-
-   ring->sched.ready = true;
 
if (adev->mman.buffer_funcs_ring == ring)
amdgpu_ttm_set_buffer_funcs_status(adev, true);
-- 
2.25.1

[PATCH 2/3] drm/amdgpu/gfx: set sched.ready status after ring/IB test in gfx

2023-05-18 Thread Guchun Chen

sched.ready has nothing to do with ring initialization; it needs to be
set to true after the ring/IB test in amdgpu_ring_test_helper to indicate
that the ring is ready for submission.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 24 
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c  | 25 -
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c |  1 -
 5 files changed, 8 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 8e86b2c23c0a..d57671c729bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -6073,7 +6073,6 @@ static int gfx_v10_0_cp_gfx_resume(struct amdgpu_device 
*adev)
u32 tmp;
u32 rb_bufsz;
u64 rb_addr, rptr_addr, wptr_gpu_addr;
-   u32 i;
 
/* Set the write pointer delay */
WREG32_SOC15(GC, 0, mmCP_RB_WPTR_DELAY, 0);
@@ -6168,11 +6167,6 @@ static int gfx_v10_0_cp_gfx_resume(struct amdgpu_device 
*adev)
/* start the ring */
gfx_v10_0_cp_gfx_start(adev);
 
-   for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
-   ring = &adev->gfx.gfx_ring[i];
-   ring->sched.ready = true;
-   }
-
return 0;
 }
 
@@ -6470,7 +6464,7 @@ static int gfx_v10_0_cp_async_gfx_ring_resume(struct 
amdgpu_device *adev)
 
r = amdgpu_bo_reserve(ring->mqd_obj, false);
if (unlikely(r != 0))
-   goto done;
+   return r;
 
r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
if (!r) {
@@ -6480,23 +6474,14 @@ static int gfx_v10_0_cp_async_gfx_ring_resume(struct 
amdgpu_device *adev)
}
amdgpu_bo_unreserve(ring->mqd_obj);
if (r)
-   goto done;
+   return r;
}
 
r = amdgpu_gfx_enable_kgq(adev, 0);
if (r)
-   goto done;
-
-   r = gfx_v10_0_cp_gfx_start(adev);
-   if (r)
-   goto done;
+   return r;
 
-   for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
-   ring = &adev->gfx.gfx_ring[i];
-   ring->sched.ready = true;
-   }
-done:
-   return r;
+   return gfx_v10_0_cp_gfx_start(adev);
 }
 
 static int gfx_v10_0_compute_mqd_init(struct amdgpu_device *adev, void *m,
@@ -6810,7 +6795,6 @@ static int gfx_v10_0_kiq_resume(struct amdgpu_device 
*adev)
amdgpu_bo_kunmap(ring->mqd_obj);
ring->mqd_ptr = NULL;
amdgpu_bo_unreserve(ring->mqd_obj);
-   ring->sched.ready = true;
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index f9c31043..b9a4ef396628 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -3228,7 +3228,6 @@ static int gfx_v11_0_cp_gfx_resume(struct amdgpu_device 
*adev)
u32 tmp;
u32 rb_bufsz;
u64 rb_addr, rptr_addr, wptr_gpu_addr;
-   u32 i;
 
/* Set the write pointer delay */
WREG32_SOC15(GC, 0, regCP_RB_WPTR_DELAY, 0);
@@ -3320,11 +3319,6 @@ static int gfx_v11_0_cp_gfx_resume(struct amdgpu_device 
*adev)
/* start the ring */
gfx_v11_0_cp_gfx_start(adev);
 
-   for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
-   ring = &adev->gfx.gfx_ring[i];
-   ring->sched.ready = true;
-   }
-
return 0;
 }
 
@@ -3370,8 +3364,6 @@ static void gfx_v11_0_cp_compute_enable(struct 
amdgpu_device *adev, bool enable)
WREG32_SOC15(GC, 0, regCP_MEC_CNTL, data);
}
 
-   adev->gfx.kiq[0].ring.sched.ready = enable;
-
udelay(50);
 }
 
@@ -3711,7 +3703,7 @@ static int gfx_v11_0_cp_async_gfx_ring_resume(struct 
amdgpu_device *adev)
 
r = amdgpu_bo_reserve(ring->mqd_obj, false);
if (unlikely(r != 0))
-   goto done;
+   return r;
 
r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&ring->mqd_ptr);
if (!r) {
@@ -3721,23 +3713,14 @@ static int gfx_v11_0_cp_async_gfx_ring_resume(struct 
amdgpu_device *adev)
}
amdgpu_bo_unreserve(ring->mqd_obj);
if (r)
-   goto done;
+   return r;
}
 
r = amdgpu_gfx_enable_kgq(adev, 0);
if (r)
-   goto done;
-
-   r = gfx_v11_0_cp_gfx_start(adev);
-   if (r)
-   goto done;
+   return r;
 
-   for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
-   ring = &adev->gfx.gfx_ring[i];
-   ring->sched.ready = true;
-   }
-done:
-   return r;
+   return gfx_v11_0_cp_gfx_start

[PATCH 1/3] drm/amdgpu/sdma: set sched.ready status after ring/IB test in sdma

2023-05-18 Thread Guchun Chen

sched.ready has nothing to do with ring initialization; it needs to be
set to true after the ring/IB test in amdgpu_ring_test_helper to indicate
that the ring is ready for submission.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c| 2 --
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c   | 2 --
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c   | 2 --
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   | 4 
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 4 
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   | 2 --
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c   | 4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c   | 6 +-
 drivers/gpu/drm/amd/amdgpu/si_dma.c  | 2 --
 9 files changed, 3 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index 67d16236b216..52598fbc9b39 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -489,8 +489,6 @@ static int cik_sdma_gfx_resume(struct amdgpu_device *adev)
 #endif
/* enable DMA IBs */
WREG32(mmSDMA0_GFX_IB_CNTL + sdma_offsets[i], ib_cntl);
-
-   ring->sched.ready = true;
}
 
cik_sdma_enable(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index fd2a7b66ac56..51afc92994a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -466,8 +466,6 @@ static int sdma_v2_4_gfx_resume(struct amdgpu_device *adev)
 #endif
/* enable DMA IBs */
WREG32(mmSDMA0_GFX_IB_CNTL + sdma_offsets[i], ib_cntl);
-
-   ring->sched.ready = true;
}
 
sdma_v2_4_enable(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index e572389089d2..344202870aeb 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -734,8 +734,6 @@ static int sdma_v3_0_gfx_resume(struct amdgpu_device *adev)
 #endif
/* enable DMA IBs */
WREG32(mmSDMA0_GFX_IB_CNTL + sdma_offsets[i], ib_cntl);
-
-   ring->sched.ready = true;
}
 
/* unhalt the MEs */
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 9295ac7edd56..e3581852ed9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1114,8 +1114,6 @@ static void sdma_v4_0_gfx_resume(struct amdgpu_device 
*adev, unsigned int i)
 #endif
/* enable DMA IBs */
WREG32_SDMA(i, mmSDMA0_GFX_IB_CNTL, ib_cntl);
-
-   ring->sched.ready = true;
 }
 
 /**
@@ -1202,8 +1200,6 @@ static void sdma_v4_0_page_resume(struct amdgpu_device 
*adev, unsigned int i)
 #endif
/* enable DMA IBs */
WREG32_SDMA(i, mmSDMA0_PAGE_IB_CNTL, ib_cntl);
-
-   ring->sched.ready = true;
 }
 
 static void
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 64dcaa2670dd..c3517d9fc38c 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -659,8 +659,6 @@ static void sdma_v4_4_2_gfx_resume(struct amdgpu_device 
*adev, unsigned int i)
 #endif
/* enable DMA IBs */
WREG32_SDMA(i, regSDMA_GFX_IB_CNTL, ib_cntl);
-
-   ring->sched.ready = true;
 }
 
 /**
@@ -750,8 +748,6 @@ static void sdma_v4_4_2_page_resume(struct amdgpu_device 
*adev, unsigned int i)
 #endif
/* enable DMA IBs */
WREG32_SDMA(i, regSDMA_PAGE_IB_CNTL, ib_cntl);
-
-   ring->sched.ready = true;
 }
 
 static void sdma_v4_4_2_init_pg(struct amdgpu_device *adev)
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 92e1299be021..e2fe539ff3ed 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -819,8 +819,6 @@ static int sdma_v5_0_gfx_resume(struct amdgpu_device *adev)
/* enable DMA IBs */
WREG32_SOC15_IP(GC, sdma_v5_0_get_reg_offset(adev, i, 
mmSDMA0_GFX_IB_CNTL), ib_cntl);
 
-   ring->sched.ready = true;
-
if (amdgpu_sriov_vf(adev)) { /* bare-metal sequence doesn't 
need below to lines */
sdma_v5_0_ctx_switch_enable(adev, true);
sdma_v5_0_enable(adev, true);
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index ca7e8757d78e..23667605c372 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -617,8 +617,6 @@ static int sdma_v5_2_gfx_resume(struct amdgpu_device *adev)
/* enable DMA IBs */
WREG32_SOC15_IP(GC, sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_GFX_IB_CNTL), ib_cntl);
 
-   ring->sched.ready = true;
-
if (amdgpu_sriov_vf(adev)) { /* bare-metal sequence d

[PATCH 0/3] cleanup abuse of sched.ready in each ring init

2023-05-18 Thread Guchun Chen

This series intends to fix the misuse of the sched.ready flag in
each ring init process. Its status needs to be updated only after
the respective ring/IB test in init/resume.

Guchun Chen (3):
  drm/amdgpu/sdma: set sched.ready status after ring/IB test in sdma
  drm/amdgpu/gfx: set sched.ready status after ring/IB test in gfx
  drm/amdgpu: switch to unified amdgpu_ring_test_helper

 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c  |  9 ++---
 drivers/gpu/drm/amd/amdgpu/cik_sdma.c|  2 --
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   | 24 ---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c   | 25 
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c|  2 --
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c|  2 --
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c  |  1 -
 drivers/gpu/drm/amd/amdgpu/mes_v10_1.c   |  8 +---
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c   |  7 +--
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c   |  4 
 drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c |  4 
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c   |  2 --
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c   |  8 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c   |  6 +-
 drivers/gpu/drm/amd/amdgpu/si_dma.c  |  2 --
 17 files changed, 15 insertions(+), 95 deletions(-)

-- 
2.25.1

[PATCH] drm/amdgpu: skip disabling fence driver src_irqs when device is unplugged

2023-05-09 Thread Guchun Chen

When performing device unbind or halt, we have already disabled all irqs at
the very beginning, e.g. in amdgpu_pci_remove or amdgpu_device_halt. So
amdgpu_irq_put for irqs stored in the fence driver should not be called
any more; otherwise, the call trace below will appear.

[  139.114088] WARNING: CPU: 2 PID: 1550 at 
drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c:616 amdgpu_irq_put+0xf6/0x110 [amdgpu]
[  139.114655] Call Trace:
[  139.114655]  
[  139.114657]  amdgpu_fence_driver_hw_fini+0x93/0x130 [amdgpu]
[  139.114836]  amdgpu_device_fini_hw+0xb6/0x350 [amdgpu]
[  139.114955]  amdgpu_driver_unload_kms+0x51/0x70 [amdgpu]
[  139.115075]  amdgpu_pci_remove+0x63/0x160 [amdgpu]
[  139.115193]  ? __pm_runtime_resume+0x64/0x90
[  139.115195]  pci_device_remove+0x3a/0xb0
[  139.115197]  device_remove+0x43/0x70
[  139.115198]  device_release_driver_internal+0xbd/0x140

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 5d96f630a3aa..d0ed70f45db7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -581,7 +581,8 @@ void amdgpu_fence_driver_hw_fini(struct amdgpu_device *adev)
if (r)
amdgpu_fence_driver_force_completion(ring);
 
-   if (ring->fence_drv.irq_src)
+   if (!drm_dev_is_unplugged(adev_to_drm(adev)) &&
+   ring->fence_drv.irq_src)
amdgpu_irq_put(adev, ring->fence_drv.irq_src,
   ring->fence_drv.irq_type);
 
-- 
2.25.1

[PATCH v3] drm/amd/pm: avoid potential UBSAN issue on legacy asics

2023-05-09 Thread Guchun Chen

Prevent further dpm casting on legacy asics without od_enabled in
amdgpu_dpm_is_overdrive_supported. This avoids a UBSAN complaint
in the init sequence.

v2: add a macro to check legacy dpm instead of checking asic family/type
v3: refine macro name for naming consistency

Suggested-by: Evan Quan 
Signed-off-by: Guchun Chen 
Reviewed-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index 86246f69dbe1..07853162 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -36,6 +36,8 @@
 #define amdgpu_dpm_enable_bapm(adev, e) \

((adev)->powerplay.pp_funcs->enable_bapm((adev)->powerplay.pp_handle, (e)))
 
+#define amdgpu_dpm_is_legacy_dpm(adev) ((adev)->powerplay.pp_handle == (adev))
+
 int amdgpu_dpm_get_sclk(struct amdgpu_device *adev, bool low)
 {
const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
@@ -1467,8 +1469,11 @@ int amdgpu_dpm_is_overdrive_supported(struct 
amdgpu_device *adev)
} else {
struct pp_hwmgr *hwmgr;
 
-   /* SI asic does not carry od_enabled */
-   if (adev->family == AMDGPU_FAMILY_SI)
+   /*
+* dpm on some legacy asics don't carry od_enabled member
+* as its pp_handle is casted directly from adev.
+*/
+   if (amdgpu_dpm_is_legacy_dpm(adev))
return false;
 
hwmgr = (struct pp_hwmgr *)adev->powerplay.pp_handle;
-- 
2.25.1

[PATCH] drm/amd/pm: avoid potential UBSAN issue on legacy asics

2023-05-08 Thread Guchun Chen

Prevent further dpm casting on legacy asics without od_enabled in
amdgpu_dpm_is_overdrive_supported. This avoids a UBSAN complaint
in the init sequence.

v2: add a macro to check legacy dpm instead of checking asic family/type

Suggested-by: Evan Quan 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index 86246f69dbe1..4b28fd62ed7a 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -36,6 +36,8 @@
 #define amdgpu_dpm_enable_bapm(adev, e) \

((adev)->powerplay.pp_funcs->enable_bapm((adev)->powerplay.pp_handle, (e)))
 
+#define is_legacy_dpm(adev) ((adev)->powerplay.pp_handle == (adev))
+
 int amdgpu_dpm_get_sclk(struct amdgpu_device *adev, bool low)
 {
const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs;
@@ -1467,8 +1469,11 @@ int amdgpu_dpm_is_overdrive_supported(struct 
amdgpu_device *adev)
} else {
struct pp_hwmgr *hwmgr;
 
-   /* SI asic does not carry od_enabled */
-   if (adev->family == AMDGPU_FAMILY_SI)
+   /*
+* dpm on some legacy asics don't carry od_enabled member
+* as its pp_handle is casted directly from adev.
+*/
+   if (is_legacy_dpm(adev))
return false;
 
hwmgr = (struct pp_hwmgr *)adev->powerplay.pp_handle;
-- 
2.25.1

[PATCH] drm/amd/pm: avoid potential UBSAN issue on legacy asics

2023-05-08 Thread Guchun Chen

Prevent further casting on chips MULLINS/KABINI/KAVERI when calling
amdgpu_dpm_is_overdrive_supported. This avoids a UBSAN complaint
in the init sequence.

Suggested-by: Evan Quan 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index 86246f69dbe1..ccd3ea89eacf 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -1467,8 +1467,14 @@ int amdgpu_dpm_is_overdrive_supported(struct 
amdgpu_device *adev)
} else {
struct pp_hwmgr *hwmgr;
 
-   /* SI asic does not carry od_enabled */
-   if (adev->family == AMDGPU_FAMILY_SI)
+   /*
+* SI asic and chip MULLINS/KABINI/KAVERI do not carry
+* od_enabled, as its pp_handle is casted from adev.
+*/
+   if ((adev->family == AMDGPU_FAMILY_SI) ||
+   (adev->asic_type == CHIP_MULLINS) ||
+   (adev->asic_type == CHIP_KABINI) ||
+   (adev->asic_type == CHIP_KAVERI))
return false;
 
hwmgr = (struct pp_hwmgr *)adev->powerplay.pp_handle;
-- 
2.25.1

[PATCH] drm/amdgpu/gfx: disable gfx9 cp_ecc_error_irq only when enabling legacy gfx ras

2023-05-08 Thread Guchun Chen

gfx9 cp_ecc_error_irq is only enabled when legacy gfx ras is asserted.
So in gfx_v9_0_hw_fini, the interrupt disablement for cp_ecc_error_irq
should only be executed under that condition; otherwise, an
amdgpu_irq_put call trace will occur.

[ 7283.170322] RIP: 0010:amdgpu_irq_put+0x45/0x70 [amdgpu]
[ 7283.170964] RSP: 0018:9a5fc3967d00 EFLAGS: 00010246
[ 7283.170967] RAX: 98d88afd3040 RBX: 98d89da2 RCX: 
[ 7283.170969] RDX:  RSI: 98d89da2bef8 RDI: 98d89da2
[ 7283.170971] RBP: 98d89da2 R08: 98d89da2ca18 R09: 0006
[ 7283.170973] R10: d5764243c008 R11:  R12: 1050
[ 7283.170975] R13: 98d89da38978 R14: 999ae15a R15: 98d880130105
[ 7283.170978] FS:  () GS:98d996f0() 
knlGS:
[ 7283.170981] CS:  0010 DS:  ES:  CR0: 80050033
[ 7283.170983] CR2: f7a9d178 CR3: 0001c42ea000 CR4: 003506e0
[ 7283.170986] Call Trace:
[ 7283.170988]  
[ 7283.170989]  gfx_v9_0_hw_fini+0x1c/0x6d0 [amdgpu]
[ 7283.171655]  amdgpu_device_ip_suspend_phase2+0x101/0x1a0 [amdgpu]
[ 7283.172245]  amdgpu_device_suspend+0x103/0x180 [amdgpu]
[ 7283.172823]  amdgpu_pmops_freeze+0x21/0x60 [amdgpu]
[ 7283.173412]  pci_pm_freeze+0x54/0xc0
[ 7283.173419]  ? __pfx_pci_pm_freeze+0x10/0x10
[ 7283.173425]  dpm_run_callback+0x98/0x200
[ 7283.173430]  __device_suspend+0x164/0x5f0

v2: drop gfx11 as it's fixed by a different solution that retires the
cp_ecc_irq funcs (Hawking)

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2522

Signed-off-by: Guchun Chen 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index ae09fc1cfe6b..c54d05bdc2d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -3751,7 +3751,8 @@ static int gfx_v9_0_hw_fini(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-   amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
+   amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
 
-- 
2.25.1

[PATCH] drm/amdgpu: disable sdma ecc irq only when sdma RAS is enabled in suspend

2023-05-08 Thread Guchun Chen

sdma_v4_0_ip is shared by a few asics, but in sdma_v4_0_hw_fini, the
driver unconditionally disables ecc_irq, which is only enabled on
those asics that enable sdma ecc. This introduces a warning in the
suspend cycle on chips that have sdma ip v4.0 but no sdma ecc.
This patch corrects that.

[ 7283.166354] RIP: 0010:amdgpu_irq_put+0x45/0x70 [amdgpu]
[ 7283.167001] RSP: 0018:9a5fc3967d08 EFLAGS: 00010246
[ 7283.167019] RAX: 98d88afd3770 RBX: 0001 RCX: 
[ 7283.167023] RDX:  RSI: 98d89da30390 RDI: 98d89da2
[ 7283.167025] RBP: 98d89da2 R08: 00036838 R09: 0006
[ 7283.167028] R10: d5764243c008 R11:  R12: 98d89da30390
[ 7283.167030] R13: 98d89da38978 R14: 999ae15a R15: 98d880130105
[ 7283.167032] FS:  () GS:98d996f0() 
knlGS:
[ 7283.167036] CS:  0010 DS:  ES:  CR0: 80050033
[ 7283.167039] CR2: f7a9d178 CR3: 0001c42ea000 CR4: 003506e0
[ 7283.167041] Call Trace:
[ 7283.167046]  
[ 7283.167048]  sdma_v4_0_hw_fini+0x38/0xa0 [amdgpu]
[ 7283.167704]  amdgpu_device_ip_suspend_phase2+0x101/0x1a0 [amdgpu]
[ 7283.168296]  amdgpu_device_suspend+0x103/0x180 [amdgpu]
[ 7283.168875]  amdgpu_pmops_freeze+0x21/0x60 [amdgpu]
[ 7283.169464]  pci_pm_freeze+0x54/0xc0

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2522

Signed-off-by: Guchun Chen 
Reviewed-by: Tao Zhou 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index b5affba22156..8b8ddf050266 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1903,9 +1903,11 @@ static int sdma_v4_0_hw_fini(void *handle)
return 0;
}
 
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
-  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+   for (i = 0; i < adev->sdma.num_instances; i++) {
+   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
+  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   }
}
 
sdma_v4_0_ctx_switch_enable(adev, false);
-- 
2.25.1

[PATCH] drm/amdgpu/gfx: disable cp_ecc_error_irq only when gfx ras is enabled in suspend

2023-05-06 Thread Guchun Chen

cp_ecc_error_irq is only enabled when gfx ras is asserted.
So in gfx_v9_0_hw_fini, the interrupt disablement for cp_ecc_error_irq
should only be executed under that condition; otherwise, an
amdgpu_irq_put call trace will occur.

[ 7283.170322] RIP: 0010:amdgpu_irq_put+0x45/0x70 [amdgpu]
[ 7283.170964] RSP: 0018:9a5fc3967d00 EFLAGS: 00010246
[ 7283.170967] RAX: 98d88afd3040 RBX: 98d89da2 RCX: 
[ 7283.170969] RDX:  RSI: 98d89da2bef8 RDI: 98d89da2
[ 7283.170971] RBP: 98d89da2 R08: 98d89da2ca18 R09: 0006
[ 7283.170973] R10: d5764243c008 R11:  R12: 1050
[ 7283.170975] R13: 98d89da38978 R14: 999ae15a R15: 98d880130105
[ 7283.170978] FS:  () GS:98d996f0() 
knlGS:
[ 7283.170981] CS:  0010 DS:  ES:  CR0: 80050033
[ 7283.170983] CR2: f7a9d178 CR3: 0001c42ea000 CR4: 003506e0
[ 7283.170986] Call Trace:
[ 7283.170988]  
[ 7283.170989]  gfx_v9_0_hw_fini+0x1c/0x6d0 [amdgpu]
[ 7283.171655]  amdgpu_device_ip_suspend_phase2+0x101/0x1a0 [amdgpu]
[ 7283.172245]  amdgpu_device_suspend+0x103/0x180 [amdgpu]
[ 7283.172823]  amdgpu_pmops_freeze+0x21/0x60 [amdgpu]
[ 7283.173412]  pci_pm_freeze+0x54/0xc0
[ 7283.173419]  ? __pfx_pci_pm_freeze+0x10/0x10
[ 7283.173425]  dpm_run_callback+0x98/0x200
[ 7283.173430]  __device_suspend+0x164/0x5f0

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2522

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 3 ++-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index ecf8ceb53311..f6bc62a94099 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -4442,7 +4442,8 @@ static int gfx_v11_0_hw_fini(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
int r;
 
-   amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
+   amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index ae09fc1cfe6b..c54d05bdc2d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -3751,7 +3751,8 @@ static int gfx_v9_0_hw_fini(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-   amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__GFX))
+   amdgpu_irq_put(adev, &adev->gfx.cp_ecc_error_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_reg_irq, 0);
amdgpu_irq_put(adev, &adev->gfx.priv_inst_irq, 0);
 
-- 
2.25.1

[PATCH] drm/amdgpu: disable sdma ecc irq only when sdma RAS is enabled in suspend

2023-05-06 Thread Guchun Chen

sdma_v4_0_ip is shared by a few asics, but in sdma_v4_0_hw_fini, the
driver unconditionally disables ecc_irq, which is only enabled on
those asics that enable sdma ecc. This introduces a warning in the
suspend cycle on chips that have sdma ip v4.0 but no sdma ecc.
This patch corrects that.

[ 7283.166354] RIP: 0010:amdgpu_irq_put+0x45/0x70 [amdgpu]
[ 7283.167001] RSP: 0018:9a5fc3967d08 EFLAGS: 00010246
[ 7283.167019] RAX: 98d88afd3770 RBX: 0001 RCX: 
[ 7283.167023] RDX:  RSI: 98d89da30390 RDI: 98d89da2
[ 7283.167025] RBP: 98d89da2 R08: 00036838 R09: 0006
[ 7283.167028] R10: d5764243c008 R11:  R12: 98d89da30390
[ 7283.167030] R13: 98d89da38978 R14: 999ae15a R15: 98d880130105
[ 7283.167032] FS:  () GS:98d996f0() 
knlGS:
[ 7283.167036] CS:  0010 DS:  ES:  CR0: 80050033
[ 7283.167039] CR2: f7a9d178 CR3: 0001c42ea000 CR4: 003506e0
[ 7283.167041] Call Trace:
[ 7283.167046]  
[ 7283.167048]  sdma_v4_0_hw_fini+0x38/0xa0 [amdgpu]
[ 7283.167704]  amdgpu_device_ip_suspend_phase2+0x101/0x1a0 [amdgpu]
[ 7283.168296]  amdgpu_device_suspend+0x103/0x180 [amdgpu]
[ 7283.168875]  amdgpu_pmops_freeze+0x21/0x60 [amdgpu]
[ 7283.169464]  pci_pm_freeze+0x54/0xc0

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2522

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index b5affba22156..8b8ddf050266 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -1903,9 +1903,11 @@ static int sdma_v4_0_hw_fini(void *handle)
return 0;
}
 
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
-  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   if (amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__SDMA)) {
+   for (i = 0; i < adev->sdma.num_instances; i++) {
+   amdgpu_irq_put(adev, &adev->sdma.ecc_irq,
+  AMDGPU_SDMA_IRQ_INSTANCE0 + i);
+   }
}
 
sdma_v4_0_ctx_switch_enable(adev, false);
-- 
2.25.1

[PATCH v3] drm/amd/pm: parse pp_handle under appropriate conditions

2023-05-05 Thread Guchun Chen

amdgpu_dpm_is_overdrive_supported is a common API across all
asics, so we should cast pp_handle into the correct structure
for each power framework.

v2: using return directly to simplify code
v3: SI asic does not carry od_enabled member in pp_handle, and update Fixes tag

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2541
Fixes: eb4900aa4c49("drm/amdgpu: Fix kernel NULL pointer dereference in dpm 
functions")
Suggested-by: Mario Limonciello 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 20 +---
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index 300e156b924f..86246f69dbe1 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -1460,15 +1460,21 @@ int amdgpu_dpm_get_smu_prv_buf_details(struct 
amdgpu_device *adev,
 
 int amdgpu_dpm_is_overdrive_supported(struct amdgpu_device *adev)
 {
-   struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
-   struct smu_context *smu = adev->powerplay.pp_handle;
+   if (is_support_sw_smu(adev)) {
+   struct smu_context *smu = adev->powerplay.pp_handle;
+
+   return (smu->od_enabled || smu->is_apu);
+   } else {
+   struct pp_hwmgr *hwmgr;
 
-   if ((is_support_sw_smu(adev) && smu->od_enabled) ||
-   (is_support_sw_smu(adev) && smu->is_apu) ||
-   (!is_support_sw_smu(adev) && hwmgr->od_enabled))
-   return true;
+   /* SI asic does not carry od_enabled */
+   if (adev->family == AMDGPU_FAMILY_SI)
+   return false;
 
-   return false;
+   hwmgr = (struct pp_hwmgr *)adev->powerplay.pp_handle;
+
+   return hwmgr->od_enabled;
+   }
 }
 
 int amdgpu_dpm_set_pp_table(struct amdgpu_device *adev,
-- 
2.25.1

[PATCH] drm/amd/pm: parse pp_handle under appropriate conditions

2023-05-05 Thread Guchun Chen

amdgpu_dpm_is_overdrive_supported is a common API across all
asics, so we should cast pp_handle into the correct structure
for each power framework.

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2541
Fixes: ebfc253335af("drm/amd/pm: do not expose the smu_context structure used 
internally in power")
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index f0068df2d073..ae45abeab5b5 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -1455,13 +1455,15 @@ int amdgpu_dpm_get_smu_prv_buf_details(struct 
amdgpu_device *adev,
 
 int amdgpu_dpm_is_overdrive_supported(struct amdgpu_device *adev)
 {
-   struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
-   struct smu_context *smu = adev->powerplay.pp_handle;
-
-   if ((is_support_sw_smu(adev) && smu->od_enabled) ||
-   (is_support_sw_smu(adev) && smu->is_apu) ||
-   (!is_support_sw_smu(adev) && hwmgr->od_enabled))
-   return true;
+   if (is_support_sw_smu(adev)) {
+   struct smu_context *smu = adev->powerplay.pp_handle;
+   if (smu->od_enabled || smu->is_apu)
+   return true;
+   } else {
+   struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
+   if (hwmgr->od_enabled)
+   return true;
+   }
 
return false;
 }
-- 
2.25.1

[PATCH] drm/amdgpu: drop redundant sched job cleanup when cs is aborted

2023-05-03 Thread Guchun Chen

Once command submission fails due to userptr invalidation in
amdgpu_cs_submit, the legacy code performs cleanup of the scheduler
job. However, this is not needed at all, as a former commit has
integrated the job cleanup into amdgpu_job_free. Otherwise, because of
the double free, a NULL pointer dereference will occur in this scenario.

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2457
Fixes: f7d66fb2ea43 ("drm/amdgpu: cleanup scheduler job initialization v2")
Signed-off-by: Guchun Chen 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index cb771c73cd07..9879aac3bcdb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1303,7 +1303,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
r = drm_sched_job_add_dependency(&leader->base, fence);
if (r) {
dma_fence_put(fence);
-   goto error_cleanup;
+   return r;
}
}
 
@@ -1330,7 +1330,8 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
}
if (r) {
r = -EAGAIN;
-   goto error_unlock;
+   mutex_unlock(&p->adev->notifier_lock);
+   return r;
}
 
p->fence = dma_fence_get(&leader->base.s_fence->finished);
@@ -1377,14 +1378,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
mutex_unlock(&p->adev->notifier_lock);
mutex_unlock(&p->bo_list->bo_list_mutex);
return 0;
-
-error_unlock:
-   mutex_unlock(&p->adev->notifier_lock);
-
-error_cleanup:
-   for (i = 0; i < p->gang_size; ++i)
-   drm_sched_job_cleanup(&p->jobs[i]->base);
-   return r;
 }
 
 /* Cleanup the parser structure */
-- 
2.25.1

[PATCH] drm/amdgpu: drop redundant sched job cleanup when cs is aborted

2023-04-26 Thread Guchun Chen

Once command submission fails due to userptr invalidation in
amdgpu_cs_submit, the legacy code performs cleanup of the scheduler
job. However, this is not needed at all, as f7d66fb2ea43 has integrated
the job cleanup into amdgpu_job_free. Otherwise, because of the double
free, a NULL pointer dereference will occur in this scenario.

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2457
Fixes: f7d66fb2ea43("drm/amdgpu: cleanup scheduler job initialization v2")
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 08eced097bd8..2eb2c66843a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1276,7 +1276,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
r = drm_sched_job_add_dependency(&leader->base, fence);
if (r) {
dma_fence_put(fence);
-   goto error_cleanup;
+   return r;
}
}
 
@@ -1303,7 +1303,8 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
}
if (r) {
r = -EAGAIN;
-   goto error_unlock;
+   mutex_unlock(&p->adev->notifier_lock);
+   return r;
}
 
p->fence = dma_fence_get(&leader->base.s_fence->finished);
@@ -1350,14 +1351,6 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p,
mutex_unlock(&p->adev->notifier_lock);
mutex_unlock(&p->bo_list->bo_list_mutex);
return 0;
-
-error_unlock:
-   mutex_unlock(&p->adev->notifier_lock);
-
-error_cleanup:
-   for (i = 0; i < p->gang_size; ++i)
-   drm_sched_job_cleanup(&p->jobs[i]->base);
-   return r;
 }
 
 /* Cleanup the parser structure */
-- 
2.25.1

[PATCH] drm/amdgpu: mark gfx_v9_4_3_disable_gpa_mode() static

2023-04-26 Thread Guchun Chen

This was left global by accident, the corresponding functions for other 
hardware types are already static:

drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c:1072:6: error: no previous prototype 
for function 'gfx_v9_4_3_disable_gpa_mode' [-Werror,-Wmissing-prototypes]

Fixes: 86301129698b ("drm/amdgpu: split gc v9_4_3 functionality from gc v9_0")
Reported-by: kernel test robot 
Signed-off-by: Arnd Bergmann 
Reviewed-by: Guchun Chen 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
index 56a415e151d4..312491455382 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c
@@ -1075,7 +1075,7 @@ static void gfx_v9_4_3_init_pg(struct amdgpu_device 
*adev, int xcc_id)
}
 }
 
-void gfx_v9_4_3_disable_gpa_mode(struct amdgpu_device *adev, int xcc_id)
+static void gfx_v9_4_3_disable_gpa_mode(struct amdgpu_device *adev, int xcc_id)
 {
uint32_t data;
 
-- 
2.25.1

[PATCH] drm/amdgpu: check correct allocated mqd_backup object after alloc

2023-04-25 Thread Guchun Chen

Instead of the default one, check the right mqd_backup object.

Signed-off-by: Guchun Chen 
Cc: Le Ma 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 2cf1f88fde48..66b9740ec376 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -379,7 +379,7 @@ int amdgpu_gfx_kiq_init(struct amdgpu_device *adev,
 int amdgpu_gfx_mqd_sw_init(struct amdgpu_device *adev,
   unsigned mqd_size, int xcc_id)
 {
-   int r, i;
+   int r, i, j;
struct amdgpu_kiq *kiq = &adev->gfx.kiq[xcc_id];
struct amdgpu_ring *ring = &kiq->ring;
 
@@ -431,7 +431,8 @@ int amdgpu_gfx_mqd_sw_init(struct amdgpu_device *adev,
 
/* create MQD for each KCQ */
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
-   ring = &adev->gfx.compute_ring[i + xcc_id * 
adev->gfx.num_compute_rings];
+   j = i + xcc_id * adev->gfx.num_compute_rings;
+   ring = &adev->gfx.compute_ring[j];
if (!ring->mqd_obj) {
r = amdgpu_bo_create_kernel(adev, mqd_size, PAGE_SIZE,
AMDGPU_GEM_DOMAIN_GTT, 
&ring->mqd_obj,
@@ -443,8 +444,8 @@ int amdgpu_gfx_mqd_sw_init(struct amdgpu_device *adev,
 
ring->mqd_size = mqd_size;
/* prepare MQD backup */
-   adev->gfx.mec.mqd_backup[i + xcc_id * 
adev->gfx.num_compute_rings] = kmalloc(mqd_size, GFP_KERNEL);
-   if (!adev->gfx.mec.mqd_backup[i])
+   adev->gfx.mec.mqd_backup[j] = kmalloc(mqd_size, 
GFP_KERNEL);
+   if (!adev->gfx.mec.mqd_backup[j])
dev_warn(adev->dev, "no memory to create MQD 
backup for ring %s\n", ring->name);
}
}
-- 
2.25.1

[PATCH] drm/amdgpu: fix a build warning by a typo in amdgpu_gfx.c

2023-04-25 Thread Guchun Chen

This looks like a typo made when introducing multi-xx support.

Reported-by: kernel test robot 
Signed-off-by: Guchun Chen 
Cc: Le Ma 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 60bb4bba1994..2cf1f88fde48 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -470,8 +470,8 @@ void amdgpu_gfx_mqd_sw_fini(struct amdgpu_device *adev, int 
xcc_id)
 
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
j = i + xcc_id * adev->gfx.num_compute_rings;
-   ring = &adev->gfx.compute_ring[i];
-   kfree(adev->gfx.mec.mqd_backup[i]);
+   ring = &adev->gfx.compute_ring[j];
+   kfree(adev->gfx.mec.mqd_backup[j]);
amdgpu_bo_free_kernel(&ring->mqd_obj,
  &ring->mqd_gpu_addr,
  &ring->mqd_ptr);
-- 
2.25.1

[PATCH v3] drm/probe_helper: warning on poll_enabled for issue catching

2023-03-13 Thread Guchun Chen

Add a warning on poll_enabled in order to catch issues in other
drivers and ensure the proper call sequence of the polling functions.

v2: drop Fixes tag in commit message (Bert & Jani)
v3: use drm_WARN_ON instead of WARN_ON (Jani)

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2411
Reported-by: Bert Karwatzki 
Suggested-by: Dmitry Baryshkov 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/drm_probe_helper.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/drm_probe_helper.c 
b/drivers/gpu/drm/drm_probe_helper.c
index 8127be134c39..d812a4e91760 100644
--- a/drivers/gpu/drm/drm_probe_helper.c
+++ b/drivers/gpu/drm/drm_probe_helper.c
@@ -852,6 +852,8 @@ EXPORT_SYMBOL(drm_kms_helper_is_poll_worker);
  */
 void drm_kms_helper_poll_disable(struct drm_device *dev)
 {
+   drm_WARN_ON(dev, !dev->mode_config.poll_enabled);
+
if (dev->mode_config.poll_running)
drm_kms_helper_disable_hpd(dev);
 
-- 
2.25.1

[PATCH 2/2] drm/amdgpu: use drm_device pointer directly rather than convert again

2023-03-09 Thread Guchun Chen

The conversion from adev is redundant.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ef3368556b..2937912b7757 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5575,7 +5575,7 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
struct amdgpu_device *adev = drm_to_adev(dev);
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-   if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
+   if (!amdgpu_device_supports_baco(dev))
return -ENOTSUPP;
 
if (ras && adev->ras_enabled &&
@@ -5591,7 +5591,7 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
int ret = 0;
 
-   if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
+   if (!amdgpu_device_supports_baco(dev))
return -ENOTSUPP;
 
ret = amdgpu_dpm_baco_exit(adev);
-- 
2.25.1

[PATCH 1/2] drm/amdgpu: drop pm_sysfs_en flag from amdgpu_device structure

2023-03-09 Thread Guchun Chen

pm_sysfs_en overlaps with pm.sysfs_initialized, so drop it
to simplify the code (no functional change).

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h| 1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++--
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 164141bc8b4a..386729cc45d4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1023,7 +1023,6 @@ struct amdgpu_device {
boolin_runpm;
boolhas_pr3;
 
-   boolpm_sysfs_en;
boolucode_sysfs_en;
boolpsp_sysfs_en;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index da5b0258a237..41ef3368556b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3864,11 +3864,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
 
r = amdgpu_pm_sysfs_init(adev);
-   if (r) {
-   adev->pm_sysfs_en = false;
-   DRM_ERROR("registering pm debugfs failed (%d).\n", r);
-   } else
-   adev->pm_sysfs_en = true;
+   if (r)
+   DRM_ERROR("registering pm sysfs failed (%d).\n", r);
 
r = amdgpu_ucode_sysfs_init(adev);
if (r) {
@@ -4011,7 +4008,7 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
if (adev->mman.initialized)
drain_workqueue(adev->mman.bdev.wq);
 
-   if (adev->pm_sysfs_en)
+   if (adev->pm.sysfs_initialized)
amdgpu_pm_sysfs_fini(adev);
if (adev->ucode_sysfs_en)
amdgpu_ucode_sysfs_fini(adev);
-- 
2.25.1

[PATCH v3 2/2] drm/probe_helper: warning on poll_enabled for issue catching

2023-03-09 Thread Guchun Chen

Add a warning on poll_enabled in order to catch issues in other
drivers and ensure the proper call sequence of the polling functions.

v2: drop Fixes tag in commit message

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2411
Reported-by: Bert Karwatzki 
Suggested-by: Dmitry Baryshkov 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/drm_probe_helper.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/drm_probe_helper.c 
b/drivers/gpu/drm/drm_probe_helper.c
index 8127be134c39..85e0e80d4a52 100644
--- a/drivers/gpu/drm/drm_probe_helper.c
+++ b/drivers/gpu/drm/drm_probe_helper.c
@@ -852,6 +852,8 @@ EXPORT_SYMBOL(drm_kms_helper_is_poll_worker);
  */
 void drm_kms_helper_poll_disable(struct drm_device *dev)
 {
+   WARN_ON(!dev->mode_config.poll_enabled);
+
if (dev->mode_config.poll_running)
drm_kms_helper_disable_hpd(dev);
 
-- 
2.25.1

[PATCH v3 1/2] drm/amdgpu: move poll enabled/disable into non DC path

2023-03-09 Thread Guchun Chen

Some amd asics with reliable hotplug support don't call
drm_kms_helper_poll_init in the driver init sequence. However,
because the suspend/resume path is unified for all asics and
output_poll_work->func is not set for these asics, a warning
is triggered when suspending.

[   90.656049]  
[   90.656050]  ? console_unlock+0x4d/0x100
[   90.656053]  ? __irq_work_queue_local+0x27/0x60
[   90.656056]  ? irq_work_queue+0x2b/0x50
[   90.656057]  ? __wake_up_klogd+0x40/0x60
[   90.656059]  __cancel_work_timer+0xed/0x180
[   90.656061]  drm_kms_helper_poll_disable.cold+0x1f/0x2c [drm_kms_helper]
[   90.656072]  amdgpu_device_suspend+0x81/0x170 [amdgpu]
[   90.656180]  amdgpu_pmops_runtime_suspend+0xb5/0x1b0 [amdgpu]
[   90.656269]  pci_pm_runtime_suspend+0x61/0x1b0

drm_kms_helper_poll_enable/disable is only valid when poll_init has been
called in amdgpu code, which only happens in the non-DC path. So move
these calls into the non-DC path code to get rid of such warnings.

v1: introduce use_kms_poll flag in amdgpu as the poll stuff check
v2: use dc_enabled as the flag to simplify code
v3: move code into non DC path instead of relying on any flag

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2411
Fixes: a4e771729a51("drm/probe_helper: sort out poll_running vs poll_enabled")
Reported-by: Bert Karwatzki 
Suggested-by: Dmitry Baryshkov 
Suggested-by: Alex Deucher 
Signed-off-by: Guchun Chen 
Reviewed-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 4 
 drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 4 
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c4a4e2fe6681..da5b0258a237 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4145,8 +4145,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
DRM_WARN("smart shift update failed\n");
 
-   drm_kms_helper_poll_disable(dev);
-
if (fbcon)

drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
 
@@ -4243,8 +4241,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
fbcon)
if (fbcon)

drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
 
-   drm_kms_helper_poll_enable(dev);
-
amdgpu_ras_resume(adev);
 
if (adev->mode_info.num_crtc) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
index 503f89a766c3..d60fe7eb5579 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -1618,6 +1618,8 @@ int amdgpu_display_suspend_helper(struct amdgpu_device 
*adev)
struct drm_connector_list_iter iter;
int r;
 
+   drm_kms_helper_poll_disable(dev);
+
/* turn off display hw */
drm_modeset_lock_all(dev);
drm_connector_list_iter_begin(dev, &iter);
@@ -1694,6 +1696,8 @@ int amdgpu_display_resume_helper(struct amdgpu_device 
*adev)
 
drm_modeset_unlock_all(dev);
 
+   drm_kms_helper_poll_enable(dev);
+
return 0;
 }
 
-- 
2.25.1

[PATCH 2/2] drm/probe_helper: warning on poll_enabled for issue catching

2023-03-08 Thread Guchun Chen

Add a warning on poll_enabled to catch improper call sequences of the
polling functions in other drivers.

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2411
Fixes: a4e771729a51("drm/probe_helper: sort out poll_running vs poll_enabled")
Reported-by: Bert Karwatzki 
Suggested-by: Dmitry Baryshkov 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/drm_probe_helper.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/drm_probe_helper.c 
b/drivers/gpu/drm/drm_probe_helper.c
index 8127be134c39..85e0e80d4a52 100644
--- a/drivers/gpu/drm/drm_probe_helper.c
+++ b/drivers/gpu/drm/drm_probe_helper.c
@@ -852,6 +852,8 @@ EXPORT_SYMBOL(drm_kms_helper_is_poll_worker);
  */
 void drm_kms_helper_poll_disable(struct drm_device *dev)
 {
+   WARN_ON(!dev->mode_config.poll_enabled);
+
if (dev->mode_config.poll_running)
drm_kms_helper_disable_hpd(dev);
 
-- 
2.25.1

[PATCH 1/2] drm/amdgpu: move poll enabled/disable into non DC path

2023-03-08 Thread Guchun Chen

Some AMD ASICs with reliable hotplug support don't call
drm_kms_helper_poll_init in the driver init sequence. However,
the suspend/resume path is shared by all ASICs, and because
output_poll_work->func is not set for these ASICs, a warning
is triggered when suspending.

[   90.656049]  
[   90.656050]  ? console_unlock+0x4d/0x100
[   90.656053]  ? __irq_work_queue_local+0x27/0x60
[   90.656056]  ? irq_work_queue+0x2b/0x50
[   90.656057]  ? __wake_up_klogd+0x40/0x60
[   90.656059]  __cancel_work_timer+0xed/0x180
[   90.656061]  drm_kms_helper_poll_disable.cold+0x1f/0x2c [drm_kms_helper]
[   90.656072]  amdgpu_device_suspend+0x81/0x170 [amdgpu]
[   90.656180]  amdgpu_pmops_runtime_suspend+0xb5/0x1b0 [amdgpu]
[   90.656269]  pci_pm_runtime_suspend+0x61/0x1b0

drm_kms_helper_poll_enable/disable is valid when poll_init is called in
amdgpu code, which is only used in non DC path. So move such codes into
non-DC path code to get rid of such warnings.

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2411
Fixes: a4e771729a51("drm/probe_helper: sort out poll_running vs poll_enabled")
Reported-by: Bert Karwatzki 
Suggested-by: Dmitry Baryshkov 
Suggested-by: Alex Deucher 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 4 
 drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 4 
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c4a4e2fe6681..da5b0258a237 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4145,8 +4145,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
DRM_WARN("smart shift update failed\n");
 
-   drm_kms_helper_poll_disable(dev);
-
if (fbcon)

drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
 
@@ -4243,8 +4241,6 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
fbcon)
if (fbcon)

drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
 
-   drm_kms_helper_poll_enable(dev);
-
amdgpu_ras_resume(adev);
 
if (adev->mode_info.num_crtc) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
index 503f89a766c3..d60fe7eb5579 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -1618,6 +1618,8 @@ int amdgpu_display_suspend_helper(struct amdgpu_device 
*adev)
struct drm_connector_list_iter iter;
int r;
 
+   drm_kms_helper_poll_disable(dev);
+
/* turn off display hw */
drm_modeset_lock_all(dev);
drm_connector_list_iter_begin(dev, &iter);
@@ -1694,6 +1696,8 @@ int amdgpu_display_resume_helper(struct amdgpu_device 
*adev)
 
drm_modeset_unlock_all(dev);
 
+   drm_kms_helper_poll_enable(dev);
+
return 0;
 }
 
-- 
2.25.1

[PATCH 2/2] drm/probe_helper: warning on poll_enabled for issue catching

2023-03-08 Thread Guchun Chen

Add a warning on poll_enabled to catch improper call sequences of the
polling functions in other drivers.

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2411
Fixes: a4e771729a51("drm/probe_helper: sort out poll_running vs poll_enabled")
Reported-by: Bert Karwatzki 
Suggested-by: Dmitry Baryshkov 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/drm_probe_helper.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/drm_probe_helper.c 
b/drivers/gpu/drm/drm_probe_helper.c
index 8127be134c39..85e0e80d4a52 100644
--- a/drivers/gpu/drm/drm_probe_helper.c
+++ b/drivers/gpu/drm/drm_probe_helper.c
@@ -852,6 +852,8 @@ EXPORT_SYMBOL(drm_kms_helper_is_poll_worker);
  */
 void drm_kms_helper_poll_disable(struct drm_device *dev)
 {
+   WARN_ON(!dev->mode_config.poll_enabled);
+
if (dev->mode_config.poll_running)
drm_kms_helper_disable_hpd(dev);
 
-- 
2.25.1

[PATCH 1/2] drm/amdgpu: add flag to enable/disable poll in suspend/resume path

2023-03-08 Thread Guchun Chen

Some AMD ASICs with reliable hotplug support don't call
drm_kms_helper_poll_init in the driver init sequence. However,
the suspend/resume path is shared by all ASICs, and because
output_poll_work->func is not set for these ASICs, a warning
is triggered when suspending.

[   90.656049]  
[   90.656050]  ? console_unlock+0x4d/0x100
[   90.656053]  ? __irq_work_queue_local+0x27/0x60
[   90.656056]  ? irq_work_queue+0x2b/0x50
[   90.656057]  ? __wake_up_klogd+0x40/0x60
[   90.656059]  __cancel_work_timer+0xed/0x180
[   90.656061]  drm_kms_helper_poll_disable.cold+0x1f/0x2c [drm_kms_helper]
[   90.656072]  amdgpu_device_suspend+0x81/0x170 [amdgpu]
[   90.656180]  amdgpu_pmops_runtime_suspend+0xb5/0x1b0 [amdgpu]
[   90.656269]  pci_pm_runtime_suspend+0x61/0x1b0

So add use_kms_poll flag as the initialization check in amdgpu code before
calling drm_kms_helper_poll_disable/drm_kms_helper_poll_enable in suspend/resume
path.

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/2411
Fixes: a4e771729a51("drm/probe_helper: sort out poll_running vs poll_enabled")
Reported-by: Bert Karwatzki 
Suggested-by: Dmitry Baryshkov 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 6 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h   | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c   | 1 +
 drivers/gpu/drm/amd/amdgpu/dce_v10_0.c | 1 +
 drivers/gpu/drm/amd/amdgpu/dce_v11_0.c | 1 +
 drivers/gpu/drm/amd/amdgpu/dce_v6_0.c  | 1 +
 drivers/gpu/drm/amd/amdgpu/dce_v8_0.c  | 1 +
 7 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c4a4e2fe6681..74af0b8c0d08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4145,7 +4145,8 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
DRM_WARN("smart shift update failed\n");
 
-   drm_kms_helper_poll_disable(dev);
+   if (adev->mode_info.use_kms_poll)
+   drm_kms_helper_poll_disable(dev);
 
if (fbcon)

drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
@@ -4243,7 +4244,8 @@ int amdgpu_device_resume(struct drm_device *dev, bool 
fbcon)
if (fbcon)

drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
 
-   drm_kms_helper_poll_enable(dev);
+   if (adev->mode_info.use_kms_poll)
+   drm_kms_helper_poll_enable(dev);
 
amdgpu_ras_resume(adev);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
index 32fe05c810c6..d383ea3e8e94 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mode.h
@@ -343,6 +343,7 @@ struct amdgpu_mode_info {
int disp_priority;
const struct amdgpu_display_funcs *funcs;
const enum drm_plane_type *plane_type;
+   bool use_kms_poll;
 };
 
 #define AMDGPU_MAX_BL_LEVEL 0xFF
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index 53ff91fc6cf6..3277799a80bb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -518,6 +518,7 @@ static int amdgpu_vkms_sw_init(void *handle)
return r;
 
drm_kms_helper_poll_init(adev_to_drm(adev));
+   adev->mode_info.use_kms_poll = true;
 
adev->mode_info.mode_config_initialized = true;
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
index 9a24ed463abd..f4d0a7cf588b 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v10_0.c
@@ -2842,6 +2842,7 @@ static int dce_v10_0_sw_init(void *handle)
  amdgpu_display_hotplug_work_func);
 
drm_kms_helper_poll_init(adev_to_drm(adev));
+   adev->mode_info.use_kms_poll = true;
 
adev->mode_info.mode_config_initialized = true;
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
index c14b70350a51..25d0a866ca28 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v11_0.c
@@ -2961,6 +2961,7 @@ static int dce_v11_0_sw_init(void *handle)
  amdgpu_display_hotplug_work_func);
 
drm_kms_helper_poll_init(adev_to_drm(adev));
+   adev->mode_info.use_kms_poll = true;
 
adev->mode_info.mode_config_initialized = true;
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c 
b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
index 7f85ba5b726f..3936c6bfe2e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/dce_v6_0.c
@@ -2720,6 +2720,7 @@ static int dce_v6_0_sw_init(void *handl

[PATCH] drm/amd/pm: downgrade log level upon SMU IF version mismatch

2023-02-20 Thread Guchun Chen

SMU IF version mismatch as a warning message exists widely
after asic production, however, due to this log level setting,
such mismatch warning will be caught by automation test like
IGT and reported as a fake error after checking. As such mismatch
does not break anything, to reduce confusion, downgrade it from
dev_warn to dev_info.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 4 ++--
 drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c | 4 ++--
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
index 6492d69e2e60..e1ef88ee1ed3 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
@@ -256,7 +256,7 @@ int smu_v11_0_check_fw_version(struct smu_context *smu)
 * to be backward compatible.
 * 2. New fw usually brings some optimizations. But that's visible
 * only on the paired driver.
-* Considering above, we just leave user a warning message instead
+* Considering above, we just leave user a verbal message instead
 * of halt driver loading.
 */
if (if_version != smu->smc_driver_if_version) {
@@ -264,7 +264,7 @@ int smu_v11_0_check_fw_version(struct smu_context *smu)
"smu fw program = %d, version = 0x%08x (%d.%d.%d)\n",
smu->smc_driver_if_version, if_version,
smu_program, smu_version, smu_major, smu_minor, 
smu_debug);
-   dev_warn(smu->adev->dev, "SMU driver if version not matched\n");
+   dev_info(smu->adev->dev, "SMU driver if version not matched\n");
}
 
return ret;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c
index 56a02bc60cee..c788aa7a99a9 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu12/smu_v12_0.c
@@ -93,7 +93,7 @@ int smu_v12_0_check_fw_version(struct smu_context *smu)
 * to be backward compatible.
 * 2. New fw usually brings some optimizations. But that's visible
 * only on the paired driver.
-* Considering above, we just leave user a warning message instead
+* Considering above, we just leave user a verbal message instead
 * of halt driver loading.
 */
if (if_version != smu->smc_driver_if_version) {
@@ -101,7 +101,7 @@ int smu_v12_0_check_fw_version(struct smu_context *smu)
"smu fw program = %d, smu fw version = 0x%08x 
(%d.%d.%d)\n",
smu->smc_driver_if_version, if_version,
smu_program, smu_version, smu_major, smu_minor, 
smu_debug);
-   dev_warn(smu->adev->dev, "SMU driver if version not matched\n");
+   dev_info(smu->adev->dev, "SMU driver if version not matched\n");
}
 
return ret;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index 78945e79dbee..25f336829840 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -311,7 +311,7 @@ int smu_v13_0_check_fw_version(struct smu_context *smu)
 * to be backward compatible.
 * 2. New fw usually brings some optimizations. But that's visible
 * only on the paired driver.
-* Considering above, we just leave user a warning message instead
+* Considering above, we just leave user a verbal message instead
 * of halt driver loading.
 */
if (if_version != smu->smc_driver_if_version) {
@@ -319,7 +319,7 @@ int smu_v13_0_check_fw_version(struct smu_context *smu)
 "smu fw program = %d, smu fw version = 0x%08x 
(%d.%d.%d)\n",
 smu->smc_driver_if_version, if_version,
 smu_program, smu_version, smu_major, smu_minor, 
smu_debug);
-   dev_warn(adev->dev, "SMU driver if version not matched\n");
+   dev_info(adev->dev, "SMU driver if version not matched\n");
}
 
return ret;
-- 
2.25.1

[PATCH] drm/amd/pm/smu13: BACO is supported when it's in BACO state

2023-01-09 Thread Guchun Chen

This leverages the logic in smu11. No need to talk to SMU to
check BACO enablement as it's in BACO state already.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
index ccaedfcf977e..78945e79dbee 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -2243,6 +2243,10 @@ bool smu_v13_0_baco_is_support(struct smu_context *smu)
!smu_baco->platform_support)
return false;
 
+   /* return true if ASIC is in BACO state already */
+   if (smu_v13_0_baco_get_state(smu) == SMU_BACO_STATE_ENTER)
+   return true;
+
if (smu_cmn_feature_is_supported(smu, SMU_FEATURE_BACO_BIT) &&
!smu_cmn_feature_is_enabled(smu, SMU_FEATURE_BACO_BIT))
return false;
-- 
2.25.1

[PATCH] drm/amdgpu: use dev_dbg to print messages in runtime cycle

2022-11-23 Thread Guchun Chen

Runtime PM can happen pretty frequently and these messages can be
annoying, so switch them to dev_dbg.

Suggested-by: Lijo Lazar 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 8b1f6c032a2e..447e27b2e16b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2568,7 +2568,7 @@ static int amdgpu_pmops_runtime_suspend(struct device 
*dev)
amdgpu_device_baco_enter(drm_dev);
}
 
-   dev_info(&pdev->dev, "asic/device is runtime suspended\n");
+   dev_dbg(&pdev->dev, "asic/device is runtime suspended\n");
 
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 7bb2de1d11ff..4a18d1944e4f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -861,7 +861,7 @@ static int psp_tmr_unload(struct psp_context *psp)
struct psp_gfx_cmd_resp *cmd = acquire_psp_cmd_buf(psp);
 
psp_prep_tmr_unload_cmd_buf(psp, cmd);
-   dev_info(psp->adev->dev, "free PSP TMR buffer\n");
+   dev_dbg(psp->adev->dev, "free PSP TMR buffer\n");
 
ret = psp_cmd_submit_buf(psp, NULL, cmd,
 psp->fence_buf_mc_addr);
-- 
2.25.1

[PATCH 3/3] drm/amdgpu: add printing to indicate rpm completeness

2022-11-22 Thread Guchun Chen

Add an explicit message to tell when runtime PM (rpm) suspend
has finished in amdgpu.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index bf2d50c8c92a..fa42c0fcf848 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2572,6 +2572,8 @@ static int amdgpu_pmops_runtime_suspend(struct device 
*dev)
amdgpu_device_baco_enter(drm_dev);
}
 
+   dev_info(&pdev->dev, "amdgpu finishes runtime suspend\n");
+
return 0;
 }
 
-- 
2.25.1

[PATCH 2/3] drm/amd/pm/smu11: poll BACO status after RPM BACO exits

2022-11-22 Thread Guchun Chen

After executing BACO exit, driver needs to poll the status
to ensure FW has completed BACO exit sequence to prevent
timing issue.

v2: use usleep_range to replace msleep to fix checkpatch.pl warnings

Signed-off-by: Guchun Chen 
---
 .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c| 24 ++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
index ad5f6a15a1d7..ad66d57aa102 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
@@ -79,6 +79,17 @@ MODULE_FIRMWARE("amdgpu/beige_goby_smc.bin");
 #define mmTHM_BACO_CNTL_ARCT   0xA7
 #define mmTHM_BACO_CNTL_ARCT_BASE_IDX  0
 
+static void smu_v11_0_poll_baco_exit(struct smu_context *smu)
+{
+   struct amdgpu_device *adev = smu->adev;
+   uint32_t data, loop = 0;
+
+   do {
+   usleep_range(1000, 1100);
+   data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL);
+   } while ((data & 0x100) && (++loop < 100));
+}
+
 int smu_v11_0_init_microcode(struct smu_context *smu)
 {
struct amdgpu_device *adev = smu->adev;
@@ -1689,7 +1700,18 @@ int smu_v11_0_baco_enter(struct smu_context *smu)
 
 int smu_v11_0_baco_exit(struct smu_context *smu)
 {
-   return smu_v11_0_baco_set_state(smu, SMU_BACO_STATE_EXIT);
+   int ret;
+
+   ret = smu_v11_0_baco_set_state(smu, SMU_BACO_STATE_EXIT);
+   if (!ret) {
+   /*
+* Poll BACO exit status to ensure FW has completed
+* BACO exit process to avoid timing issues.
+*/
+   smu_v11_0_poll_baco_exit(smu);
+   }
+
+   return ret;
 }
 
 int smu_v11_0_mode1_reset(struct smu_context *smu)
-- 
2.25.1

[PATCH 1/3] drm/amd/pm/smu11: BACO is supported when it's in BACO state

2022-11-22 Thread Guchun Chen

Return true early if ASIC is in BACO state already, no need
to talk to SMU. It can fix the issue that driver was not
calling BACO exit at all in runtime pm resume, and a timing
issue leading to a PCI AER error happened eventually.

Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in 
get_port_device_capability()")
Suggested-by: Lijo Lazar 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
index 70b560737687..ad5f6a15a1d7 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
@@ -1588,6 +1588,10 @@ bool smu_v11_0_baco_is_support(struct smu_context *smu)
if (amdgpu_sriov_vf(smu->adev) || !smu_baco->platform_support)
return false;
 
+   /* return true if ASIC is in BACO state already */
+   if (smu_v11_0_baco_get_state(smu) == SMU_BACO_STATE_ENTER)
+   return true;
+
/* Arcturus does not support this bit mask */
if (smu_cmn_feature_is_supported(smu, SMU_FEATURE_BACO_BIT) &&
   !smu_cmn_feature_is_enabled(smu, SMU_FEATURE_BACO_BIT))
-- 
2.25.1

[PATCH 2/2] drm/amdgpu: poll BACO status after RPM BACO exits

2022-11-20 Thread Guchun Chen

After executing BACO exit, driver needs to poll the status
to ensure FW has completed BACO exit sequence to prevent
timing issue.

Signed-off-by: Guchun Chen 
---
 .../gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c| 23 ++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
index dccbd9f70723..796c8179bfbf 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
@@ -79,6 +79,16 @@ MODULE_FIRMWARE("amdgpu/beige_goby_smc.bin");
 #define mmTHM_BACO_CNTL_ARCT   0xA7
 #define mmTHM_BACO_CNTL_ARCT_BASE_IDX  0
 
+static void smu_v11_0_poll_baco_exit(struct smu_context *smu)
+{
+   uint32_t data, loop = 0;
+
+   do {
+   msleep(1);
+   data = RREG32_SOC15(THM, 0, mmTHM_BACO_CNTL);
+   } while ((data & 0x100) && (++loop < 100));
+}
+
 int smu_v11_0_init_microcode(struct smu_context *smu)
 {
struct amdgpu_device *adev = smu->adev;
@@ -1685,7 +1695,18 @@ int smu_v11_0_baco_enter(struct smu_context *smu)
 
 int smu_v11_0_baco_exit(struct smu_context *smu)
 {
-   return smu_v11_0_baco_set_state(smu, SMU_BACO_STATE_EXIT);
+   int ret = 0;
+
+   ret = smu_v11_0_baco_set_state(smu, SMU_BACO_STATE_EXIT);
+   if (!ret) {
+   /*
+* Poll BACO exit status to ensure FW has completed
+* BACO exit process to avoid timing issues.
+*/
+   smu_v11_0_poll_baco_exit(smu);
+   }
+
+   return ret;
 }
 
 int smu_v11_0_mode1_reset(struct smu_context *smu)
-- 
2.25.1

[PATCH 1/2] drm/amdgpu: use rpm_mode as runtime pm check flag

2022-11-20 Thread Guchun Chen

Driver was not calling BACO exit at all in runtime pm
resume, and it caused the timing issue leading to a PCI
AER error, as once system enters BACO, it's not reliable
to check runtime pm mode by talking to SMU. So use rpm_mode
instead as a general pm mode check to ensure driver executes
BACO exit in runtime pm resume.

Fixes: 8795e182b02d ("PCI/portdrv: Don't disable AER reporting in 
get_port_device_capability()")

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 25 +++---
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index ab8f970b2849..40af21040b47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5619,7 +5619,7 @@ int amdgpu_device_baco_enter(struct drm_device *dev)
struct amdgpu_device *adev = drm_to_adev(dev);
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
-   if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
+   if (adev->pm.rpm_mode != AMDGPU_RUNPM_BACO)
return -ENOTSUPP;
 
if (ras && adev->ras_enabled &&
@@ -5635,7 +5635,7 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
int ret = 0;
 
-   if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
+   if (adev->pm.rpm_mode != AMDGPU_RUNPM_BACO)
return -ENOTSUPP;
 
ret = amdgpu_dpm_baco_exit(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 3c9fecdd6b2f..be03f7b1cee1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2532,7 +2532,7 @@ static int amdgpu_pmops_runtime_suspend(struct device 
*dev)
}
 
adev->in_runpm = true;
-   if (amdgpu_device_supports_px(drm_dev))
+   if (adev->pm.rpm_mode == AMDGPU_RUNPM_PX)
drm_dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 
/*
@@ -2542,21 +2542,21 @@ static int amdgpu_pmops_runtime_suspend(struct device 
*dev)
 * platforms.
 * TODO: this may be also needed for PX capable platform.
 */
-   if (amdgpu_device_supports_boco(drm_dev))
+   if (adev->pm.rpm_mode == AMDGPU_RUNPM_BOCO)
adev->mp1_state = PP_MP1_STATE_UNLOAD;
 
ret = amdgpu_device_suspend(drm_dev, false);
if (ret) {
adev->in_runpm = false;
-   if (amdgpu_device_supports_boco(drm_dev))
+   if (adev->pm.rpm_mode == AMDGPU_RUNPM_BOCO)
adev->mp1_state = PP_MP1_STATE_NONE;
return ret;
}
 
-   if (amdgpu_device_supports_boco(drm_dev))
+   if (adev->pm.rpm_mode == AMDGPU_RUNPM_BOCO)
adev->mp1_state = PP_MP1_STATE_NONE;
 
-   if (amdgpu_device_supports_px(drm_dev)) {
+   if (adev->pm.rpm_mode == AMDGPU_RUNPM_PX) {
/* Only need to handle PCI state in the driver for ATPX
 * PCI core handles it for _PR3.
 */
@@ -2565,9 +2565,9 @@ static int amdgpu_pmops_runtime_suspend(struct device 
*dev)
pci_ignore_hotplug(pdev);
pci_set_power_state(pdev, PCI_D3cold);
drm_dev->switch_power_state = DRM_SWITCH_POWER_DYNAMIC_OFF;
-   } else if (amdgpu_device_supports_boco(drm_dev)) {
+   } else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BOCO) {
/* nothing to do */
-   } else if (amdgpu_device_supports_baco(drm_dev)) {
+   } else {
amdgpu_device_baco_enter(drm_dev);
}
 
@@ -2588,7 +2588,7 @@ static int amdgpu_pmops_runtime_resume(struct device *dev)
if (!pci_device_is_present(adev->pdev))
adev->no_hw_access = true;
 
-   if (amdgpu_device_supports_px(drm_dev)) {
+   if (adev->pm.rpm_mode == AMDGPU_RUNPM_PX) {
drm_dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 
/* Only need to handle PCI state in the driver for ATPX
@@ -2600,22 +2600,23 @@ static int amdgpu_pmops_runtime_resume(struct device 
*dev)
if (ret)
return ret;
pci_set_master(pdev);
-   } else if (amdgpu_device_supports_boco(drm_dev)) {
+   } else if (adev->pm.rpm_mode == AMDGPU_RUNPM_BOCO) {
/* Only need to handle PCI state in the driver for ATPX
 * PCI core handles it for _PR3.
 */
pci_set_master(pdev);
-   } else if (amdgpu_device_supports_baco(drm_dev)) {
+   } else {
amdgpu_device_baco_exit(drm_dev);
}
+
ret = amdgpu_device_resume(drm_dev, false);
if (ret) {
-   if (amd

[PATCH] drm/amdgpu: disable BACO support on more cards

2022-11-11 Thread Guchun Chen

Otherwise, some unexpected PCIE AER errors will be observed
in runtime suspend/resume cycle.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index 6212fd270857..697e98a0a20a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -379,6 +379,10 @@ static void sienna_cichlid_check_bxco_support(struct 
smu_context *smu)
((adev->pdev->device == 0x73BF) &&
(adev->pdev->revision == 0xCF)) ||
((adev->pdev->device == 0x7422) &&
+   (adev->pdev->revision == 0x00)) ||
+   ((adev->pdev->device == 0x73A3) &&
+   (adev->pdev->revision == 0x00)) ||
+   ((adev->pdev->device == 0x73E3) &&
(adev->pdev->revision == 0x00)))
smu_baco->platform_support = false;
 
-- 
2.25.1

[PATCH] drm/amdgpu: disable BACO on special BEIGE_GOBY card

2022-11-07 Thread Guchun Chen

Still avoid intermittent failure.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index 74996a8fb671..c9e0be9bb180 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -377,7 +377,9 @@ static void sienna_cichlid_check_bxco_support(struct 
smu_context *smu)
if (((adev->pdev->device == 0x73A1) &&
(adev->pdev->revision == 0x00)) ||
((adev->pdev->device == 0x73BF) &&
-   (adev->pdev->revision == 0xCF)))
+   (adev->pdev->revision == 0xCF)) ||
+   ((adev->pdev->device == 0x7422) &&
+(adev->pdev->revision == 0x00)))
smu_baco->platform_support = false;
 
}
-- 
2.25.1

[PATCH] drm/amd/pm: disable BACO entry/exit completely on several sienna cichlid cards

2022-09-07 Thread Guchun Chen

To avoid hardware intermittent failures.

Signed-off-by: Guchun Chen 
---
 .../gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index 7ed4d4265797..74996a8fb671 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -369,6 +369,17 @@ static void sienna_cichlid_check_bxco_support(struct 
smu_context *smu)
smu_baco->platform_support =
(val & RCC_BIF_STRAP0__STRAP_PX_CAPABLE_MASK) ? true :
false;
+
+   /*
+* Disable BACO entry/exit completely on below SKUs to
+* avoid hardware intermittent failures.
+*/
+   if (((adev->pdev->device == 0x73A1) &&
+   (adev->pdev->revision == 0x00)) ||
+   ((adev->pdev->device == 0x73BF) &&
+   (adev->pdev->revision == 0xCF)))
+   smu_baco->platform_support = false;
+
}
 }
 
-- 
2.25.1

[PATCH] Revert "drm/amdgpu: drop runtime pm disablement quirk on several sienna cichlid cards"

2022-09-06 Thread Guchun Chen

This reverts commit e2994d23d8afa2fb465fdb8cf544b736f67ab8ba.

Frequent BACO enter/exit will cause EMI failure, so disable runtime PM
on these server SKUs.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 1369c25448dc..4f6473faaf24 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -43,6 +43,17 @@
 #include "amdgpu_display.h"
 #include "amdgpu_ras.h"
 
+static void amdgpu_runtime_pm_quirk(struct amdgpu_device *adev)
+{
+   /*
+* Add below quirk on several sienna_cichlid cards to disable
+* runtime pm to fix EMI failures.
+*/
+   if (((adev->pdev->device == 0x73A1) && (adev->pdev->revision == 0x00)) 
||
+   ((adev->pdev->device == 0x73BF) && (adev->pdev->revision == 0xCF)))
+   adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
+}
+
 void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev)
 {
struct amdgpu_gpu_instance *gpu_instance;
@@ -176,6 +187,8 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, 
unsigned long flags)
break;
}
 
+   amdgpu_runtime_pm_quirk(adev);
+
if (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO)
dev_info(adev->dev, "Using BACO for runtime pm\n");
}
-- 
2.25.1

[PATCH] drm/amdgpu: prevent toc firmware memory leak

2022-09-02 Thread Guchun Chen

The toc firmware is never released in psp_sw_fini, so release it there.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 28ca0a94b8a5..cfcaf890a6a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -496,11 +496,14 @@ static int psp_sw_fini(void *handle)
release_firmware(psp->ta_fw);
psp->ta_fw = NULL;
}
-   if (adev->psp.cap_fw) {
+   if (psp->cap_fw) {
release_firmware(psp->cap_fw);
psp->cap_fw = NULL;
}
-
+   if (psp->toc_fw) {
+   release_firmware(psp->toc_fw);
+   psp->toc_fw = NULL;
+   }
if (adev->ip_versions[MP0_HWIP][0] == IP_VERSION(11, 0, 0) ||
adev->ip_versions[MP0_HWIP][0] == IP_VERSION(11, 0, 7))
psp_sysfs_fini(adev);
-- 
2.25.1

[PATCH] drm/amdgpu: use adev_to_drm to get drm device

2022-08-25 Thread Guchun Chen

adev_to_drm is used everywhere in amdgpu code, so modify
it to keep consistency.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index 8ee4e8491f39..6ea8980c8ad7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -402,7 +402,7 @@ static void amdgpu_ctx_fini(struct kref *ref)
}
}
 
-   if (drm_dev_enter(>ddev, )) {
+   if (drm_dev_enter(adev_to_drm(adev), )) {
amdgpu_ctx_set_stable_pstate(ctx, ctx->stable_pstate);
drm_dev_exit(idx);
}
-- 
2.25.1

[PATCH] drm/amdgpu: use dev_info to benefit mGPU case

2022-08-25 Thread Guchun Chen

'free PSP TMR buffer' happens in suspend, but sometimes
in mGPU config, it mixes with PSP resume log printing from
another GPU, which is confusing. So use dev_info instead of
DRM_INFO for printing.

[drm] PSP is resuming...
[drm] reserve 0xa0 from 0x877e00 for PSP TMR
amdgpu :e3:00.0: amdgpu: GECC is enabled
amdgpu :e3:00.0: amdgpu: SECUREDISPLAY: securedisplay ta ucode is not 
available
amdgpu :e3:00.0: amdgpu: SMU is resuming...
amdgpu :e3:00.0: amdgpu: smu driver if version = 0x0040, smu fw if 
version = 0x0041, smu fw program = 0, version = 0x003a5400 (58.84.0)
amdgpu :e3:00.0: amdgpu: SMU driver if version not matched
amdgpu :e3:00.0: amdgpu: dpm has been enabled
amdgpu :e3:00.0: amdgpu: SMU is resumed successfully!
[drm] DMUB hardware initialized: version=0x02020014
[drm] free PSP TMR buffer
[drm] kiq ring mec 2 pipe 1 q 0

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index 1036446abc30..c932bc148554 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -812,7 +812,7 @@ static int psp_tmr_unload(struct psp_context *psp)
struct psp_gfx_cmd_resp *cmd = acquire_psp_cmd_buf(psp);
 
psp_prep_tmr_unload_cmd_buf(psp, cmd);
-   DRM_INFO("free PSP TMR buffer\n");
+   dev_info(psp->adev->dev, "free PSP TMR buffer\n");
 
ret = psp_cmd_submit_buf(psp, NULL, cmd,
 psp->fence_buf_mc_addr);
-- 
2.25.1

[PATCH] drm/amdgpu: disable FRU access on special SIENNA CICHLID card

2022-08-24 Thread Guchun Chen

The driver load errors below are printed, which is not friendly to end users.

amdgpu: ATOM BIOS: 113-D603GLXE-077
[drm] FRU: Failed to get size field
[drm:amdgpu_fru_get_product_info [amdgpu]] *ERROR* Failed to read FRU 
Manufacturer, ret:-5

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c
index ecada5eadfe3..9d612b8745aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fru_eeprom.c
@@ -66,10 +66,15 @@ static bool is_fru_eeprom_supported(struct amdgpu_device 
*adev)
return true;
case CHIP_SIENNA_CICHLID:
if (strnstr(atom_ctx->vbios_version, "D603",
-   sizeof(atom_ctx->vbios_version)))
-   return true;
-   else
+   sizeof(atom_ctx->vbios_version))) {
+   if (strnstr(atom_ctx->vbios_version, "D603GLXE",
+sizeof(atom_ctx->vbios_version)))
+   return false;
+   else
+   return true;
+   } else {
return false;
+   }
default:
return false;
}
-- 
2.25.1

[PATCH] drm/amdgpu: use adev_to_drm for consistency

2022-07-25 Thread Guchun Chen

Keep code consistency when accessing drm_device from amdgpu driver.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c 
b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
index 84ac2401895a..698a59ec3dba 100644
--- a/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/vcn_v4_0.c
@@ -150,7 +150,7 @@ static int vcn_v4_0_sw_fini(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
int i, r, idx;
 
-   if (drm_dev_enter(>ddev, )) {
+   if (drm_dev_enter(adev_to_drm(adev), )) {
for (i = 0; i < adev->vcn.num_vcn_inst; i++) {
volatile struct amdgpu_vcn4_fw_shared *fw_shared;
 
-- 
2.17.1

[PATCH 4/4] drm/amdgpu: drop runpm from amdgpu_device structure

2022-07-14 Thread Guchun Chen

It's redundant now that rpm_mode is used to indicate the
runtime power management mode.

Suggested-by: Lijo Lazar 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 23 ++-
 3 files changed, 15 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 15f290c9523d..9f729a648005 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1011,7 +1011,6 @@ struct amdgpu_device {
uint64_tdf_perfmon_config_assign_mask[AMDGPU_MAX_DF_PERFMONS];
 
/* enable runtime pm on the device */
-   boolrunpm;
boolin_runpm;
boolhas_pr3;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 1cc9260e75de..70a7203a2916 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2118,7 +2118,7 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
if (ret)
DRM_ERROR("Creating debugfs files failed (%d).\n", ret);
 
-   if (adev->runpm) {
+   if (adev->pm.rpm_mode != AMDGPU_RUNPM_NONE) {
/* only need to skip on ATPX */
if (amdgpu_device_supports_px(ddev))
dev_pm_set_driver_flags(ddev->dev, 
DPM_FLAG_NO_DIRECT_COMPLETE);
@@ -2175,7 +2175,7 @@ amdgpu_pci_remove(struct pci_dev *pdev)
 
drm_dev_unplug(dev);
 
-   if (adev->runpm) {
+   if (adev->pm.rpm_mode != AMDGPU_RUNPM_NONE) {
pm_runtime_get_sync(dev->dev);
pm_runtime_forbid(dev->dev);
}
@@ -2458,7 +2458,7 @@ static int amdgpu_pmops_runtime_suspend(struct device 
*dev)
struct amdgpu_device *adev = drm_to_adev(drm_dev);
int ret, i;
 
-   if (!adev->runpm) {
+   if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE) {
pm_runtime_forbid(dev);
return -EBUSY;
}
@@ -2527,7 +2527,7 @@ static int amdgpu_pmops_runtime_resume(struct device *dev)
struct amdgpu_device *adev = drm_to_adev(drm_dev);
int ret;
 
-   if (!adev->runpm)
+   if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE)
return -EINVAL;
 
/* Avoids registers access if device is physically gone */
@@ -2571,7 +2571,7 @@ static int amdgpu_pmops_runtime_idle(struct device *dev)
/* we don't want the main rpm_idle to call suspend - we want to 
autosuspend */
int ret = 1;
 
-   if (!adev->runpm) {
+   if (adev->pm.rpm_mode == AMDGPU_RUNPM_NONE) {
pm_runtime_forbid(dev);
return -EBUSY;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 9182e81e3135..a3744c0b632b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -147,14 +147,13 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, 
unsigned long flags)
goto out;
}
 
+   adev->pm.rpm_mode = AMDGPU_RUNPM_NONE;
if (amdgpu_device_supports_px(dev) &&
-   (amdgpu_runtime_pm != 0)) { /* enable runpm by default for atpx */
-   adev->runpm = true;
+   (amdgpu_runtime_pm != 0)) { /* enable PX as runtime mode */
adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
dev_info(adev->dev, "Using ATPX for runtime pm\n");
} else if (amdgpu_device_supports_boco(dev) &&
-  (amdgpu_runtime_pm != 0)) { /* enable runpm by default for 
boco */
-   adev->runpm = true;
+  (amdgpu_runtime_pm != 0)) { /* enable boco as runtime mode */
adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
dev_info(adev->dev, "Using BOCO for runtime pm\n");
} else if (amdgpu_device_supports_baco(dev) &&
@@ -162,25 +161,23 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, 
unsigned long flags)
switch (adev->asic_type) {
case CHIP_VEGA20:
case CHIP_ARCTURUS:
-   /* enable runpm if runpm=1 */
+   /* enable BACO as runpm mode if runpm=1 */
if (amdgpu_runtime_pm > 0)
-   adev->runpm = true;
+   adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
break;
case CHIP_VEGA10:
-   /* turn runpm on if noretry=0 */
+   /* enable BACO as runpm mode if noretry=0 */
if (!adev->gmc.noretry)
-   adev->runpm =

[PATCH 3/4] drm/amdgpu: drop runtime pm disablement quirk on several sienna cichlid cards

2022-07-14 Thread Guchun Chen

This quirk is not needed any more as it's fixed by bypassing
SMU FW reloading in runtime resume.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 13 -
 1 file changed, 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index ceecb74842de..9182e81e3135 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -43,17 +43,6 @@
 #include "amdgpu_display.h"
 #include "amdgpu_ras.h"
 
-static void amdgpu_runtime_pm_quirk(struct amdgpu_device *adev)
-{
-   /*
-* Add below quirk on several sienna_cichlid cards to disable
-* runtime pm to fix EMI failures.
-*/
-   if (((adev->pdev->device == 0x73A1) && (adev->pdev->revision == 0x00)) 
||
-   ((adev->pdev->device == 0x73BF) && (adev->pdev->revision == 0xCF)))
-   adev->runpm = false;
-}
-
 void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev)
 {
struct amdgpu_gpu_instance *gpu_instance;
@@ -188,8 +177,6 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, 
unsigned long flags)
break;
}
 
-   amdgpu_runtime_pm_quirk(adev);
-
if (adev->runpm) {
adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
dev_info(adev->dev, "Using BACO for runtime pm\n");
-- 
2.17.1

[PATCH 2/4] drm/amdgpu: skip SMU FW reloading in runpm BACO case

2022-07-14 Thread Guchun Chen

SMU is always alive, so it's fine to skip SMU FW reloading
when runpm resumes from BACO. This avoids some race issues
when resuming SMU.

Suggested-by: Evan Quan 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index e9411c28d88b..6540582ecbf8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2348,6 +2348,13 @@ static int psp_load_smu_fw(struct psp_context *psp)
>firmware.ucode[AMDGPU_UCODE_ID_SMC];
struct amdgpu_ras *ras = psp->ras_context.ras;
 
+   /*
+* Skip SMU FW reloading in case of using BACO for runpm only,
+* as SMU is always alive.
+*/
+   if (adev->in_runpm && (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO))
+   return 0;
+
if (!ucode->fw || amdgpu_sriov_vf(psp->adev))
return 0;
 
-- 
2.17.1

[PATCH 1/4] drm/amdgpu: introduce runtime pm mode

2022-07-14 Thread Guchun Chen

It can benefit code consistency in future.

Suggested-by: Lijo Lazar 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 +-
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 9 +
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 4b663866d33a..ceecb74842de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -161,10 +161,12 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, 
unsigned long flags)
if (amdgpu_device_supports_px(dev) &&
(amdgpu_runtime_pm != 0)) { /* enable runpm by default for atpx */
adev->runpm = true;
+   adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
dev_info(adev->dev, "Using ATPX for runtime pm\n");
} else if (amdgpu_device_supports_boco(dev) &&
   (amdgpu_runtime_pm != 0)) { /* enable runpm by default for 
boco */
adev->runpm = true;
+   adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
dev_info(adev->dev, "Using BOCO for runtime pm\n");
} else if (amdgpu_device_supports_baco(dev) &&
   (amdgpu_runtime_pm != 0)) {
@@ -188,8 +190,10 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, 
unsigned long flags)
 
amdgpu_runtime_pm_quirk(adev);
 
-   if (adev->runpm)
+   if (adev->runpm) {
+   adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
dev_info(adev->dev, "Using BACO for runtime pm\n");
+   }
}
 
/* Call ACPI methods: require modeset init
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index 524fb09437e5..65624d091ed2 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -45,6 +45,13 @@ enum amdgpu_int_thermal_type {
THERMAL_TYPE_KV,
 };
 
+enum amdgpu_runpm_mode {
+   AMDGPU_RUNPM_NONE,
+   AMDGPU_RUNPM_PX,
+   AMDGPU_RUNPM_BOCO,
+   AMDGPU_RUNPM_BACO,
+};
+
 struct amdgpu_ps {
u32 caps; /* vbios flags */
u32 class; /* vbios flags */
@@ -355,6 +362,8 @@ struct amdgpu_pm {
struct amdgpu_ctx   *stable_pstate_ctx;
 
struct config_table_setting config_table;
+   /* runtime mode */
+   enum amdgpu_runpm_mode rpm_mode;
 };
 
 int amdgpu_dpm_read_sensor(struct amdgpu_device *adev, enum amd_pp_sensors 
sensor,
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: use cached baco flag as the check in runpm (v3)

2022-07-12 Thread Guchun Chen

SMU will perform dpm disablement when entering BACO,
and enable them later on, so talking to SMU to get
enabled features mask in runpm cycle as BACO support
check is not reliable. Hence, use a cached baco flag
to fix it.

v2: cache this flag in load sequence to simplify code (from Evan)
v3: introduce runpm mode as the check (from Lijo)

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 +++---
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h | 8 
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 4b663866d33a..ceecb74842de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -161,10 +161,12 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, 
unsigned long flags)
if (amdgpu_device_supports_px(dev) &&
(amdgpu_runtime_pm != 0)) { /* enable runpm by default for atpx */
adev->runpm = true;
+   adev->pm.rpm_mode = AMDGPU_RUNPM_PX;
dev_info(adev->dev, "Using ATPX for runtime pm\n");
} else if (amdgpu_device_supports_boco(dev) &&
   (amdgpu_runtime_pm != 0)) { /* enable runpm by default for 
boco */
adev->runpm = true;
+   adev->pm.rpm_mode = AMDGPU_RUNPM_BOCO;
dev_info(adev->dev, "Using BOCO for runtime pm\n");
} else if (amdgpu_device_supports_baco(dev) &&
   (amdgpu_runtime_pm != 0)) {
@@ -188,8 +190,10 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, 
unsigned long flags)
 
amdgpu_runtime_pm_quirk(adev);
 
-   if (adev->runpm)
+   if (adev->runpm) {
+   adev->pm.rpm_mode = AMDGPU_RUNPM_BACO;
dev_info(adev->dev, "Using BACO for runtime pm\n");
+   }
}
 
/* Call ACPI methods: require modeset init
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index de59dc051340..0d31ab5fa1d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2348,12 +2348,12 @@ static int psp_load_smu_fw(struct psp_context *psp)
>firmware.ucode[AMDGPU_UCODE_ID_SMC];
struct amdgpu_ras *ras = psp->ras_context.ras;
 
-   /* Skip SMU FW reloading in case of using BACO for runpm only,
+   /*
+* Skip SMU FW reloading in case of using BACO for runpm only,
 * as SMU is always alive.
 */
if (adev->in_runpm &&
-   !amdgpu_device_supports_boco(adev_to_drm(adev)) &&
-   amdgpu_device_supports_baco(adev_to_drm(adev)))
+   (adev->pm.rpm_mode == AMDGPU_RUNPM_BACO))
return 0;
 
if (!ucode->fw || amdgpu_sriov_vf(psp->adev))
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index 524fb09437e5..efeb3a8d20e2 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -45,6 +45,13 @@ enum amdgpu_int_thermal_type {
THERMAL_TYPE_KV,
 };
 
+enum amdgpu_runpm_mode {
+   AMDGPU_RUNPM_NONE,
+   AMDGPU_RUNPM_PX,
+   AMDGPU_RUNPM_BOCO,
+   AMDGPU_RUNPM_BACO,
+};
+
 struct amdgpu_ps {
u32 caps; /* vbios flags */
u32 class; /* vbios flags */
@@ -355,6 +362,7 @@ struct amdgpu_pm {
struct amdgpu_ctx   *stable_pstate_ctx;
 
struct config_table_setting config_table;
+   enum amdgpu_runpm_mode rpm_mode;
 };
 
 int amdgpu_dpm_read_sensor(struct amdgpu_device *adev, enum amd_pp_sensors 
sensor,
-- 
2.17.1

[PATCH 1/2] drm/amdgpu: skip SMU FW reloading in runpm BACO case (v2)

2022-07-12 Thread Guchun Chen

SMU is always alive, so it's fine to skip SMU FW reloading
when runpm resumed from BACO, this can avoid some race issues
when resuming SMU FW.

v2: Exclude boco case if an ASIC supports both boco and baco

Suggested-by: Evan Quan 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index e9411c28d88b..de59dc051340 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2348,6 +2348,14 @@ static int psp_load_smu_fw(struct psp_context *psp)
>firmware.ucode[AMDGPU_UCODE_ID_SMC];
struct amdgpu_ras *ras = psp->ras_context.ras;
 
+   /* Skip SMU FW reloading in case of using BACO for runpm only,
+* as SMU is always alive.
+*/
+   if (adev->in_runpm &&
+   !amdgpu_device_supports_boco(adev_to_drm(adev)) &&
+   amdgpu_device_supports_baco(adev_to_drm(adev)))
+   return 0;
+
if (!ucode->fw || amdgpu_sriov_vf(psp->adev))
return 0;
 
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: use cached baco flag as the check in runpm (v2)

2022-07-12 Thread Guchun Chen

SMU will perform dpm disablement when entering BACO,
and enable them later on, so talking to SMU to get
enabled features mask in runpm cycle as BACO support
check is not reliable. Hence, use a cached baco flag
to fix it.

v2: cache this flag in load sequence to simplify code (from Evan)

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 4 +++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +-
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 845d6054992a..816f813a5df2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1014,6 +1014,7 @@ struct amdgpu_device {
boolrunpm;
boolin_runpm;
boolhas_pr3;
+   boolis_baco_supported;
 
boolpm_sysfs_en;
boolucode_sysfs_en;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 1cc9260e75de..c3f870c01c47 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2513,7 +2513,7 @@ static int amdgpu_pmops_runtime_suspend(struct device 
*dev)
drm_dev->switch_power_state = DRM_SWITCH_POWER_DYNAMIC_OFF;
} else if (amdgpu_device_supports_boco(drm_dev)) {
/* nothing to do */
-   } else if (amdgpu_device_supports_baco(drm_dev)) {
+   } else if (adev->is_baco_supported) {
amdgpu_device_baco_enter(drm_dev);
}
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 4b663866d33a..532406d32fba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -188,8 +188,10 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, 
unsigned long flags)
 
amdgpu_runtime_pm_quirk(adev);
 
-   if (adev->runpm)
+   if (adev->runpm) {
dev_info(adev->dev, "Using BACO for runtime pm\n");
+   adev->is_baco_supported = true;
+   }
}
 
/* Call ACPI methods: require modeset init
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index de59dc051340..f05d7ac03122 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2353,7 +2353,7 @@ static int psp_load_smu_fw(struct psp_context *psp)
 */
if (adev->in_runpm &&
!amdgpu_device_supports_boco(adev_to_drm(adev)) &&
-   amdgpu_device_supports_baco(adev_to_drm(adev)))
+   adev->is_baco_supported)
return 0;
 
if (!ucode->fw || amdgpu_sriov_vf(psp->adev))
-- 
2.17.1

[PATCH 1/2] drm/amdgpu: skip SMU FW reloading in runpm BACO case (v2)

2022-07-12 Thread Guchun Chen

SMU is always alive, so it's fine to skip SMU FW reloading
when runpm resumed from BACO, this can avoid some race issues
when resuming SMU FW.

v2: Exclude boco case if an ASIC supports both boco and baco

Suggested-by: Evan Quan 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index e9411c28d88b..de59dc051340 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2348,6 +2348,14 @@ static int psp_load_smu_fw(struct psp_context *psp)
>firmware.ucode[AMDGPU_UCODE_ID_SMC];
struct amdgpu_ras *ras = psp->ras_context.ras;
 
+   /* Skip SMU FW reloading in case of using BACO for runpm only,
+* as SMU is always alive.
+*/
+   if (adev->in_runpm &&
+   !amdgpu_device_supports_boco(adev_to_drm(adev)) &&
+   amdgpu_device_supports_baco(adev_to_drm(adev)))
+   return 0;
+
if (!ucode->fw || amdgpu_sriov_vf(psp->adev))
return 0;
 
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: use cached SMU feature mask in runpm

2022-07-11 Thread Guchun Chen

SMU will perform dpm disablement when entering BACO,
and enable them later on, so talking to SMU to get
enabled features in runpm cycle as BACO support check
is not reliable. Hence, use a cached value to fix it.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c   | 4 
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c   | 9 +
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h   | 1 +
 drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 5 +
 drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h | 3 +++
 drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c| 8 +++-
 6 files changed, 29 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 1cc9260e75de..dc2e78bb7224 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2478,6 +2478,10 @@ static int amdgpu_pmops_runtime_suspend(struct device 
*dev)
}
 
adev->in_runpm = true;
+
+   /* cache SMU feature mask */
+   amdgpu_dpm_set_cached_feature_mask(adev);
+
if (amdgpu_device_supports_px(drm_dev))
drm_dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 
diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index 956b6ce81c84..211f73a987d6 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -1702,3 +1702,12 @@ int amdgpu_dpm_get_dpm_clock_table(struct amdgpu_device 
*adev,
 
return ret;
 }
+
+void amdgpu_dpm_set_cached_feature_mask(struct amdgpu_device *adev)
+{
+   struct smu_context *smu = adev->powerplay.pp_handle;
+
+   mutex_lock(>pm.mutex);
+   smu_set_cached_enabled_mask(smu);
+   mutex_unlock(>pm.mutex);
+}
diff --git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index 524fb09437e5..e9c002a561c2 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -543,4 +543,5 @@ enum pp_smu_status amdgpu_dpm_get_uclk_dpm_states(struct 
amdgpu_device *adev,
  unsigned int *num_states);
 int amdgpu_dpm_get_dpm_clock_table(struct amdgpu_device *adev,
   struct dpm_clocks *clock_table);
+void amdgpu_dpm_set_cached_feature_mask(struct amdgpu_device *adev);
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c 
b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
index fd79b213fab4..e8ead58a00b4 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -3130,3 +3130,8 @@ int smu_send_hbm_bad_channel_flag(struct smu_context 
*smu, uint32_t size)
 
return ret;
 }
+
+void smu_set_cached_enabled_mask(struct smu_context *smu)
+{
+   smu_feature_get_enabled_mask(smu, >cache_enabled_mask);
+}
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h 
b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
index b81c657c7386..678123b5e2bf 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -568,6 +568,8 @@ struct smu_context
u32 param_reg;
u32 msg_reg;
u32 resp_reg;
+
+   uint64_t cache_enabled_mask;
 };
 
 struct i2c_adapter;
@@ -1465,5 +1467,6 @@ int smu_stb_collect_info(struct smu_context *smu, void 
*buff, uint32_t size);
 void amdgpu_smu_stb_debug_fs_init(struct amdgpu_device *adev);
 int smu_send_hbm_bad_pages_num(struct smu_context *smu, uint32_t size);
 int smu_send_hbm_bad_channel_flag(struct smu_context *smu, uint32_t size);
+void smu_set_cached_enabled_mask(struct smu_context *smu);
 #endif
 #endif
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
index 15e4298c7cc8..b3087085622a 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
@@ -499,7 +499,13 @@ int smu_cmn_feature_is_enabled(struct smu_context *smu,
uint64_t enabled_features;
int feature_id;
 
-   if (__smu_get_enabled_features(smu, _features)) {
+   /* SMU will perform dpm disablement when entering BACO, and enable
+* them later on, so talking to SMU to get enabled features in runpm
+* stage is not reliable. Use a cache value for this instead to fix it.
+*/
+   if (adev->in_runpm) {
+   enabled_features = smu->cache_enabled_mask;
+   } else if (__smu_get_enabled_features(smu, _features)) {
dev_err(adev->dev, "Failed to retrieve enabled ppfeatures!\n");
return 0;
}
-- 
2.17.1

[PATCH 1/2] drm/amdgpu: skip SMU FW reloading in runpm BACO case

2022-07-11 Thread Guchun Chen

SMU is always alive, so it's fine to skip SMU FW reloading
when runpm resumed from BACO, this can avoid some race issues
when resuming SMU FW.

Suggested-by: Evan Quan 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index e9411c28d88b..9f17235bab83 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -2348,6 +2348,12 @@ static int psp_load_smu_fw(struct psp_context *psp)
>firmware.ucode[AMDGPU_UCODE_ID_SMC];
struct amdgpu_ras *ras = psp->ras_context.ras;
 
+   /* Skip SMU FW reloading in case of using BACO for runpm,
+* as SMU is always alive.
+*/
+   if (adev->in_runpm && amdgpu_asic_supports_baco(adev))
+   return 0;
+
if (!ucode->fw || amdgpu_sriov_vf(psp->adev))
return 0;
 
-- 
2.17.1

[PATCH] Revert "drm/amdgpu: Ensure the DMA engine is deactivated during set ups"

2022-06-05 Thread Guchun Chen

This reverts commit da38a66ac46e334f198afcd1b4d4554b4ddca0df.

This causes regression in GPU reset related test.

Cc: Alexander Deucher 
Cc: ricet...@gmail.com
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 109 ++---
 1 file changed, 45 insertions(+), 64 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 06b2635b142a..83c6ccaaa9e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -469,6 +469,7 @@ static void sdma_v5_2_ring_emit_fence(struct amdgpu_ring 
*ring, u64 addr, u64 se
}
 }
 
+
 /**
  * sdma_v5_2_gfx_stop - stop the gfx async dma engines
  *
@@ -514,21 +515,17 @@ static void sdma_v5_2_rlc_stop(struct amdgpu_device *adev)
 }
 
 /**
- * sdma_v5_2_ctx_switch_enable_for_instance - start the async dma engines
- * context switch for an instance
+ * sdma_v5_2_ctx_switch_enable - stop the async dma engines context switch
  *
  * @adev: amdgpu_device pointer
- * @instance_idx: the index of the SDMA instance
+ * @enable: enable/disable the DMA MEs context switch.
  *
- * Unhalt the async dma engines context switch.
+ * Halt or unhalt the async dma engines context switch.
  */
-static void sdma_v5_2_ctx_switch_enable_for_instance(struct amdgpu_device 
*adev, int instance_idx)
+static void sdma_v5_2_ctx_switch_enable(struct amdgpu_device *adev, bool 
enable)
 {
u32 f32_cntl, phase_quantum = 0;
-
-   if (WARN_ON(instance_idx >= adev->sdma.num_instances)) {
-   return;
-   }
+   int i;
 
if (amdgpu_sdma_phase_quantum) {
unsigned value = amdgpu_sdma_phase_quantum;
@@ -552,68 +549,50 @@ static void 
sdma_v5_2_ctx_switch_enable_for_instance(struct amdgpu_device *adev,
phase_quantum =
value << SDMA0_PHASE0_QUANTUM__VALUE__SHIFT |
unit  << SDMA0_PHASE0_QUANTUM__UNIT__SHIFT;
-
-   WREG32_SOC15_IP(GC,
-   sdma_v5_2_get_reg_offset(adev, instance_idx, 
mmSDMA0_PHASE0_QUANTUM),
-   phase_quantum);
-   WREG32_SOC15_IP(GC,
-   sdma_v5_2_get_reg_offset(adev, instance_idx, 
mmSDMA0_PHASE1_QUANTUM),
-   phase_quantum);
-   WREG32_SOC15_IP(GC,
-   sdma_v5_2_get_reg_offset(adev, instance_idx, 
mmSDMA0_PHASE2_QUANTUM),
-   phase_quantum);
}
 
-   if (!amdgpu_sriov_vf(adev)) {
-   f32_cntl = RREG32(sdma_v5_2_get_reg_offset(adev, instance_idx, 
mmSDMA0_CNTL));
-   f32_cntl = REG_SET_FIELD(f32_cntl, SDMA0_CNTL,
-   AUTO_CTXSW_ENABLE, 1);
-   WREG32(sdma_v5_2_get_reg_offset(adev, instance_idx, 
mmSDMA0_CNTL), f32_cntl);
+   for (i = 0; i < adev->sdma.num_instances; i++) {
+   if (enable && amdgpu_sdma_phase_quantum) {
+   WREG32_SOC15_IP(GC, sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_PHASE0_QUANTUM),
+  phase_quantum);
+   WREG32_SOC15_IP(GC, sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_PHASE1_QUANTUM),
+  phase_quantum);
+   WREG32_SOC15_IP(GC, sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_PHASE2_QUANTUM),
+  phase_quantum);
+   }
+
+   if (!amdgpu_sriov_vf(adev)) {
+   f32_cntl = RREG32(sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_CNTL));
+   f32_cntl = REG_SET_FIELD(f32_cntl, SDMA0_CNTL,
+   AUTO_CTXSW_ENABLE, enable ? 1 : 0);
+   WREG32(sdma_v5_2_get_reg_offset(adev, i, mmSDMA0_CNTL), 
f32_cntl);
+   }
}
+
 }
 
 /**
- * sdma_v5_2_ctx_switch_disable_all - stop the async dma engines context switch
+ * sdma_v5_2_enable - stop the async dma engines
  *
  * @adev: amdgpu_device pointer
+ * @enable: enable/disable the DMA MEs.
  *
- * Halt the async dma engines context switch.
+ * Halt or unhalt the async dma engines.
  */
-static void sdma_v5_2_ctx_switch_disable_all(struct amdgpu_device *adev)
+static void sdma_v5_2_enable(struct amdgpu_device *adev, bool enable)
 {
u32 f32_cntl;
int i;
 
-   if (amdgpu_sriov_vf(adev))
-   return;
-
-   for (i = 0; i < adev->sdma.num_instances; i++) {
-   f32_cntl = RREG32(sdma_v5_2_get_reg_offset(adev, i, 
mmSDMA0_CNTL));
-   f32_cntl = REG_SET_FIELD(f32_cntl, SDMA0_CNTL,
-   AUTO_CTXSW_ENABLE, 0);
-   WREG32(sdma_v5_2_get_reg_offset(adev, i, mmSDMA0_CNTL), 
f32_cntl);
+   if (!enable) {
+   sdma_v5_2_gfx_stop(adev);
+   sdma_v5_2_rlc_stop(adev);
}
-}
-
-/**
- * sdma_v5_2_halt - stop the async dma engines
- *
- * @adev: amdgpu_device poin

[PATCH] drm/amdgpu: disable runtime pm on several sienna cichlid cards(v2)

2022-04-27 Thread Guchun Chen

Disable runtime power management on several sienna cichlid
cards, otherwise SMU will possibly fail to be resumed from
runtime suspend. Will drop this after a clean solution between
kernel driver and SMU FW is available.

amdgpu :63:00.0: amdgpu: GECC is enabled
amdgpu :63:00.0: amdgpu: SECUREDISPLAY: securedisplay ta ucode is not 
available
amdgpu :63:00.0: amdgpu: SMU is resuming...
amdgpu :63:00.0: amdgpu: SMU: I'm not done with your command: 
SMN_C2PMSG_66:0x000E SMN_C2PMSG_82:0x0080
amdgpu :63:00.0: amdgpu: Failed to SetDriverDramAddr!
amdgpu :63:00.0: amdgpu: Failed to setup smc hw!
[drm:amdgpu_device_ip_resume_phase2 [amdgpu]] *ERROR* resume of IP block  
failed -62
amdgpu :63:00.0: amdgpu: amdgpu_device_ip_resume failed (-62)

v2: separate into a function.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 262938f0dfdb..1bf8ff71b6b5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -43,6 +43,17 @@
 #include "amdgpu_display.h"
 #include "amdgpu_ras.h"
 
+static void amdgpu_runtime_pm_quirk(struct amdgpu_device *adev)
+{
+   /*
+* Add below quirk on several sienna_cichlid cards to disable
+* runtime pm to fix EMI failures.
+*/
+   if (((adev->pdev->device == 0x73A1) && (adev->pdev->revision == 0x00)) 
||
+   ((adev->pdev->device == 0x73BF) && (adev->pdev->revision == 0xCF)))
+   adev->runpm = false;
+}
+
 void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev)
 {
struct amdgpu_gpu_instance *gpu_instance;
@@ -180,6 +191,9 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, 
unsigned long flags)
 */
if (adev->is_fw_fb)
adev->runpm = false;
+
+   amdgpu_runtime_pm_quirk(adev);
+
if (adev->runpm)
dev_info(adev->dev, "Using BACO for runtime pm\n");
}
-- 
2.17.1

[PATCH] drm/amdgpu: disable runtime pm on several sienna cichlid cards

2022-04-27 Thread Guchun Chen

Disable runtime power management on several sienna cichlid
cards, otherwise SMU will possibly fail to be resumed from
runtime suspend. Will drop this after a clean solution between
kernel driver and SMU FW is available.

amdgpu :63:00.0: amdgpu: GECC is enabled
amdgpu :63:00.0: amdgpu: SECUREDISPLAY: securedisplay ta ucode is not 
available
amdgpu :63:00.0: amdgpu: SMU is resuming...
amdgpu :63:00.0: amdgpu: SMU: I'm not done with your command: 
SMN_C2PMSG_66:0x000E SMN_C2PMSG_82:0x0080
amdgpu :63:00.0: amdgpu: Failed to SetDriverDramAddr!
amdgpu :63:00.0: amdgpu: Failed to setup smc hw!
[drm:amdgpu_device_ip_resume_phase2 [amdgpu]] *ERROR* resume of IP block  
failed -62
amdgpu :63:00.0: amdgpu: amdgpu_device_ip_resume failed (-62)

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 262938f0dfdb..9c483787c0ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -180,6 +180,15 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, 
unsigned long flags)
 */
if (adev->is_fw_fb)
adev->runpm = false;
+
+   /*
+* Add below quirk on several sienna_cichlid cards to disable
+* runtime pm to fix EMI failures.
+*/
+   if (((adev->pdev->device == 0x73A1) && (adev->pdev->revision == 
0x00)) ||
+   ((adev->pdev->device == 0x73BF) && (adev->pdev->revision == 
0xCF)))
+   adev->runpm = false;
+
if (adev->runpm)
dev_info(adev->dev, "Using BACO for runtime pm\n");
}
-- 
2.17.1

[PATCH] drm/amdgpu: control baco sequence by driver on several SIENNA_CICHLID SKUs

2022-04-22 Thread Guchun Chen

Add a quirk to switch baco trigger sequence from armd3 sequence
to control by driver itself on several SKUs, otherwise, SMU will
fail to be resumed from runtime suspend. Will drop this after a
clean solution between kernel driver and SMU FW is available.

amdgpu :63:00.0: amdgpu: GECC is enabled
amdgpu :63:00.0: amdgpu: SECUREDISPLAY: securedisplay ta ucode is not 
available
amdgpu :63:00.0: amdgpu: SMU is resuming...
amdgpu :63:00.0: amdgpu: SMU: I'm not done with your command: 
SMN_C2PMSG_66:0x000E SMN_C2PMSG_82:0x0080
amdgpu :63:00.0: amdgpu: Failed to SetDriverDramAddr!
amdgpu :63:00.0: amdgpu: Failed to setup smc hw!
[drm:amdgpu_device_ip_resume_phase2 [amdgpu]] *ERROR* resume of IP block  
failed -62
amdgpu :63:00.0: amdgpu: amdgpu_device_ip_resume failed (-62)

Signed-off-by: Guchun Chen 
---
 .../amd/pm/swsmu/smu11/sienna_cichlid_ppt.c   | 19 +--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
index b7320ecf4934..494a42c253d8 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c
@@ -2246,11 +2246,25 @@ static int sienna_cichlid_run_btc(struct smu_context 
*smu)
return res;
 }
 
+static bool sienna_cichlid_baco_trigger_quirk(struct amdgpu_device *adev)
+{
+   /*
+* Add this quirk on several SKUs to control baco enter/exit by
+* driver instead of trigger baco via BACO_SEQ_BACO in armd3 sequence.
+*/
+   if (((adev->pdev->device == 0x73A1) && (adev->pdev->revision == 0x00)) 
||
+   ((adev->pdev->device == 0x73BF) && (adev->pdev->revision == 0xCF)))
+   return true;
+   else
+   return false;
+}
+
 static int sienna_cichlid_baco_enter(struct smu_context *smu)
 {
struct amdgpu_device *adev = smu->adev;
 
-   if (adev->in_runpm && smu_cmn_is_audio_func_enabled(adev))
+   if (adev->in_runpm && smu_cmn_is_audio_func_enabled(adev) &&
+   !sienna_cichlid_baco_trigger_quirk(adev))
return smu_v11_0_baco_set_armd3_sequence(smu, BACO_SEQ_BACO);
else
return smu_v11_0_baco_enter(smu);
@@ -2260,7 +2274,8 @@ static int sienna_cichlid_baco_exit(struct smu_context 
*smu)
 {
struct amdgpu_device *adev = smu->adev;
 
-   if (adev->in_runpm && smu_cmn_is_audio_func_enabled(adev)) {
+   if (adev->in_runpm && smu_cmn_is_audio_func_enabled(adev) &&
+   !sienna_cichlid_baco_trigger_quirk(adev)) {
/* Wait for PMFW handling for the Dstate change */
msleep(10);
return smu_v11_0_baco_set_armd3_sequence(smu, BACO_SEQ_ULPS);
-- 
2.17.1

[PATCH] drm/amdgpu: move PDB bo release into a generic gmc function

2022-03-17 Thread Guchun Chen

To pair with amdgpu_gmc_pdb0_alloc as a more generic handling
in amdgpu_gmc.c, no functional change.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c | 6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 1 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c   | 2 +-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
index 7021e8f390bd..36f6b321438f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.c
@@ -136,6 +136,12 @@ uint64_t amdgpu_gmc_pd_addr(struct amdgpu_bo *bo)
return pd_addr;
 }
 
+/* amdgpu_gmc_pdb0_free - free pdb0 vram */
+void amdgpu_gmc_pdb0_free(struct amdgpu_device *adev)
+{
+   amdgpu_bo_free_kernel(>gmc.pdb0_bo, NULL, >gmc.ptr_pdb0);
+}
+
 /**
  * amdgpu_gmc_set_pte_pde - update the page tables using CPU
  *
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index 032b0313f277..6f425e3a9b6e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -299,6 +299,7 @@ static inline uint64_t amdgpu_gmc_sign_extend(uint64_t addr)
 }
 
 int amdgpu_gmc_pdb0_alloc(struct amdgpu_device *adev);
+void amdgpu_gmc_pdb0_free(struct amdgpu_device *adev);
 void amdgpu_gmc_get_pde_for_bo(struct amdgpu_bo *bo, int level,
   uint64_t *addr, uint64_t *flags);
 int amdgpu_gmc_set_pte_pde(struct amdgpu_device *adev, void *cpu_pt_addr,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 08ceabd6c853..ad600f72a51c 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1721,7 +1721,7 @@ static int gmc_v9_0_sw_fini(void *handle)
amdgpu_gem_force_release(adev);
amdgpu_vm_manager_fini(adev);
amdgpu_gart_table_vram_free(adev);
-   amdgpu_bo_free_kernel(>gmc.pdb0_bo, NULL, >gmc.ptr_pdb0);
+   amdgpu_gmc_pdb0_free(adev);
amdgpu_bo_fini(adev);
 
return 0;
-- 
2.17.1

[PATCH] drm/amdgpu: drop redundant check of harvest info

2022-03-16 Thread Guchun Chen

Harvest bit setting in IP data structure promises this,
so no need to set it explicitly.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index e4fcbb385a62..5a182288391b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -1150,13 +1150,6 @@ void amdgpu_discovery_harvest_ip(struct amdgpu_device 
*adev)
adev->harvest_ip_mask |= AMD_HARVEST_IP_VCN_MASK;
adev->harvest_ip_mask |= AMD_HARVEST_IP_JPEG_MASK;
}
-   if ((adev->pdev->device == 0x731E &&
-(adev->pdev->revision == 0xC6 || adev->pdev->revision == 0xC7)) ||
-   (adev->pdev->device == 0x7340 && adev->pdev->revision == 0xC9)  ||
-   (adev->pdev->device == 0x7360 && adev->pdev->revision == 0xC7)) {
-   adev->harvest_ip_mask |= AMD_HARVEST_IP_VCN_MASK;
-   adev->harvest_ip_mask |= AMD_HARVEST_IP_JPEG_MASK;
-   }
 }
 
 union gc_info {
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: conduct a proper cleanup of PDB bo

2022-03-15 Thread Guchun Chen

Use amdgpu_bo_free_kernel instead of amdgpu_bo_unref to
perform a proper cleanup of PDB bo.

v2: update subject to be more accurate

Signed-off-by: Guchun Chen 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 431742eb7811..e4e7e6dbc6fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1721,7 +1721,7 @@ static int gmc_v9_0_sw_fini(void *handle)
amdgpu_gem_force_release(adev);
amdgpu_vm_manager_fini(adev);
amdgpu_gart_table_vram_free(adev);
-   amdgpu_bo_unref(>gmc.pdb0_bo);
+   amdgpu_bo_free_kernel(>gmc.pdb0_bo, NULL, adev->gmc.ptr_pdb0);
amdgpu_bo_fini(adev);
 
return 0;
-- 
2.17.1

[PATCH 1/2] drm/amdgpu: prevent memory wipe in suspend/shutdown stage

2022-03-15 Thread Guchun Chen

On GPUs with RAS enabled, below call trace is observed when
suspending or shutting down device. The cause is we have enabled
memory wipe flag for BOs on such GPUs by default, and such BOs
will go to memory wipe by amdgpu_fill_buffer, however, because
ring is off already, it fails to clean up the memory and throw
this error message. So add a suspend/shutdown check before
wiping memory.

[drm:amdgpu_fill_buffer [amdgpu]] *ERROR* Trying to clear memory with ring 
turned off.

v2: fix coding style issue

Fixes: e7e7c87a205d("drm/amdgpu: Wipe all VRAM on free when RAS is enabled")
Signed-off-by: Guchun Chen 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 23c9a60693ee..c712d7f5e8a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1284,6 +1284,7 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo, uint64_t 
*vram_mem,
  */
 void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
 {
+   struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
struct dma_fence *fence = NULL;
struct amdgpu_bo *abo;
int r;
@@ -1303,7 +1304,8 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object 
*bo)
amdgpu_amdkfd_remove_fence_on_pt_pd_bos(abo);
 
if (bo->resource->mem_type != TTM_PL_VRAM ||
-   !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE))
+   !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE) ||
+   adev->in_suspend || adev->shutdown)
return;
 
if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: fix pin count leak of PDB bo when unref

2022-03-15 Thread Guchun Chen

Use amdgpu_bo_free_kernel instead of amdgpu_bo_unref to
perform a clean unreference job.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index 431742eb7811..e4e7e6dbc6fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1721,7 +1721,7 @@ static int gmc_v9_0_sw_fini(void *handle)
amdgpu_gem_force_release(adev);
amdgpu_vm_manager_fini(adev);
amdgpu_gart_table_vram_free(adev);
-   amdgpu_bo_unref(>gmc.pdb0_bo);
+   amdgpu_bo_free_kernel(>gmc.pdb0_bo, NULL, adev->gmc.ptr_pdb0);
amdgpu_bo_fini(adev);
 
return 0;
-- 
2.17.1

[PATCH 1/2] drm/amdgpu: prevent memory wipe in suspend/shutdown stage

2022-03-15 Thread Guchun Chen

On GPUs with RAS enabled, below call trace is observed when
suspending or shutting down device. The cause is we have enabled
memory wipe flag for BOs on such GPUs by default, and such BOs
will go to memory wipe by amdgpu_fill_buffer, however, because
ring is off already, it fails to clean up the memory and throw
this error message. So add a suspend/shutdown check before
wiping memory.

[drm:amdgpu_fill_buffer [amdgpu]] *ERROR* Trying to clear memory with ring 
turned off.

Fixes: e7e7c87a205d("drm/amdgpu: Wipe all VRAM on free when RAS is enabled")
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 23c9a60693ee..ed1a19be4a54 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1284,6 +1284,7 @@ void amdgpu_bo_get_memory(struct amdgpu_bo *bo, uint64_t 
*vram_mem,
  */
 void amdgpu_bo_release_notify(struct ttm_buffer_object *bo)
 {
+   struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);
struct dma_fence *fence = NULL;
struct amdgpu_bo *abo;
int r;
@@ -1303,7 +1304,8 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object 
*bo)
amdgpu_amdkfd_remove_fence_on_pt_pd_bos(abo);
 
if (bo->resource->mem_type != TTM_PL_VRAM ||
-   !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE))
+   !(abo->flags & AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE) ||
+   adev->in_suspend || adev->shutdown)
return;
 
if (WARN_ON_ONCE(!dma_resv_trylock(bo->base.resv)))
-- 
2.17.1

[PATCH] drm/amdgpu: limit harvest bit read on several ASICs

2022-02-22 Thread Guchun Chen

Due to faulty VBIOS out there, harvest bit setting is not
consistently correct especially for display IP. So far,
it's hard to work out a solution on all the legacy Navi1x
ASICs in a short time, so to avoid regression, limit harvest
bit read on several ASICs. Will revisit later once VBIOS has
corrected it in long term.

Fixes: b3f4ea887d5f("drm/amdgpu: read harvest bit per IP data on legacy GPUs")
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 11255290f117..2e0ff1ace6fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -1129,12 +1129,20 @@ void amdgpu_discovery_harvest_ip(struct amdgpu_device 
*adev)
 * so read harvest bit per IP data structure to set
 * harvest configuration.
 */
-   if (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 2, 0))
-   amdgpu_discovery_read_harvest_bit_per_ip(adev,
-   _harvest_count);
-   else
+   if (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 2, 0)) {
+   if ((adev->pdev->device == 0x731E &&
+   (adev->pdev->revision == 0xC6 ||
+adev->pdev->revision == 0xC7)) ||
+   (adev->pdev->device == 0x7340 &&
+adev->pdev->revision == 0xC9) ||
+   (adev->pdev->device == 0x7360 &&
+adev->pdev->revision == 0xC7))
+   amdgpu_discovery_read_harvest_bit_per_ip(adev,
+   _harvest_count);
+   } else {
amdgpu_disocvery_read_from_harvest_table(adev,
-   _harvest_count);
+   _harvest_count);
+   }
 
amdgpu_discovery_harvest_config_quirk(adev);
 
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: bypass tiling flag check in virtual display case (v2)

2022-02-18 Thread Guchun Chen

vkms leverages common amdgpu framebuffer creation, and
also as it does not support FB modifier, there is no need
to check tiling flags when initing framebuffer when virtual
display is enabled.

This can fix below calltrace:

amdgpu :00:08.0: GFX9+ requires FB check based on format modifier
WARNING: CPU: 0 PID: 1023 at drivers/gpu/drm/amd/amdgpu/amdgpu_display.c:1150 
amdgpu_display_framebuffer_init+0x8e7/0xb40 [amdgpu]

v2: check adev->enable_virtual_display instead as vkms can be
enabled in bare metal as well.

Signed-off-by: Leslie Shi 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
index 0d4ad1ee8348..1043e599b734 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -1150,7 +1150,7 @@ int amdgpu_display_framebuffer_init(struct drm_device 
*dev,
if (ret)
return ret;
 
-   if (!dev->mode_config.allow_fb_modifiers) {
+   if (!dev->mode_config.allow_fb_modifiers && 
!adev->enable_virtual_display) {
drm_WARN_ONCE(dev, adev->family >= AMDGPU_FAMILY_AI,
  "GFX9+ requires FB check based on format 
modifier\n");
ret = check_tiling_flags_gfx6(rfb);
-- 
2.17.1

[PATCH 1/2] Revert "drm/amdgpu: add modifiers in amdgpu_vkms_plane_init()"

2022-02-18 Thread Guchun Chen

This reverts commit 2f73d74ac4cd880beaa1c19fa9ef0296c3eb3b60.

No need to support modifier in virtual kms, otherwise, in SRIOV
mode, when launching X server, set crtc will fail due to mismatch
between primary plane modifier and framebuffer modifier.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index d99c8779b51e..5224d9a39737 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -391,7 +391,6 @@ static struct drm_plane *amdgpu_vkms_plane_init(struct 
drm_device *dev,
int index)
 {
struct drm_plane *plane;
-   uint64_t modifiers[] = {DRM_FORMAT_MOD_LINEAR, DRM_FORMAT_MOD_INVALID};
int ret;
 
plane = kzalloc(sizeof(*plane), GFP_KERNEL);
@@ -402,7 +401,7 @@ static struct drm_plane *amdgpu_vkms_plane_init(struct 
drm_device *dev,
   _vkms_plane_funcs,
   amdgpu_vkms_formats,
   ARRAY_SIZE(amdgpu_vkms_formats),
-  modifiers, type, NULL);
+  NULL, type, NULL);
if (ret) {
kfree(plane);
return ERR_PTR(ret);
-- 
2.17.1

[PATCH 2/2] drm/amdgpu: bypass tiling flag check in SRIOV

2022-02-17 Thread Guchun Chen

vkms leverages common amdgpu framebuffer creation, and
also it does not support pixel format modifier, so there
is no need to check tiling flags when initing framebuffer.
This can fix below calltrace:

amdgpu :00:08.0: GFX9+ requires FB check based on format modifier
WARNING: CPU: 0 PID: 1023 at drivers/gpu/drm/amd/amdgpu/amdgpu_display.c:1150 
amdgpu_display_framebuffer_init+0x8e7/0xb40 [amdgpu]

Signed-off-by: Leslie Shi 
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_display.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
index 0d4ad1ee8348..63cc210e4b6b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_display.c
@@ -1150,7 +1150,7 @@ int amdgpu_display_framebuffer_init(struct drm_device 
*dev,
if (ret)
return ret;
 
-   if (!dev->mode_config.allow_fb_modifiers) {
+   if (!dev->mode_config.allow_fb_modifiers && !amdgpu_sriov_vf(adev)) {
drm_WARN_ONCE(dev, adev->family >= AMDGPU_FAMILY_AI,
  "GFX9+ requires FB check based on format 
modifier\n");
ret = check_tiling_flags_gfx6(rfb);
-- 
2.17.1

[PATCH 1/2] Revert "drm/amdgpu: add modifiers in amdgpu_vkms_plane_init()"

2022-02-17 Thread Guchun Chen

This reverts commit 2f73d74ac4cd880beaa1c19fa9ef0296c3eb3b60.

No need to support modifier in virtual kms, otherwise, in SRIOV
mode, when launching X server, set crtc will fail due to mismatch
between primary plane modifier and framebuffer modifier.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index d99c8779b51e..5224d9a39737 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -391,7 +391,6 @@ static struct drm_plane *amdgpu_vkms_plane_init(struct 
drm_device *dev,
int index)
 {
struct drm_plane *plane;
-   uint64_t modifiers[] = {DRM_FORMAT_MOD_LINEAR, DRM_FORMAT_MOD_INVALID};
int ret;
 
plane = kzalloc(sizeof(*plane), GFP_KERNEL);
@@ -402,7 +401,7 @@ static struct drm_plane *amdgpu_vkms_plane_init(struct 
drm_device *dev,
   _vkms_plane_funcs,
   amdgpu_vkms_formats,
   ARRAY_SIZE(amdgpu_vkms_formats),
-  modifiers, type, NULL);
+  NULL, type, NULL);
if (ret) {
kfree(plane);
return ERR_PTR(ret);
-- 
2.17.1

[PATCH] drm/amdgpu: read harvest bit per IP data on legacy GPUs

2022-02-17 Thread Guchun Chen

Based on firmware team's input, harvest table in VBIOS does
not apply well to legacy products like Navi1x, so separate
harvest mask configuration retrieval into different places.
On legacy GPUs, scan harvest bit per IP data structures,
while for newer ones, still read IP harvest info from harvest
table.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 119 ++
 1 file changed, 93 insertions(+), 26 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 2506bcf36c87..2ccac1f1582f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -385,6 +385,87 @@ static int amdgpu_discovery_validate_ip(const struct ip 
*ip)
return 0;
 }
 
+static void amdgpu_discovery_read_harvest_bit_per_ip(struct amdgpu_device 
*adev,
+   uint32_t *vcn_harvest_count)
+{
+   struct binary_header *bhdr;
+   struct ip_discovery_header *ihdr;
+   struct die_header *dhdr;
+   struct ip *ip;
+   uint16_t die_offset, ip_offset, num_dies, num_ips;
+   int i, j;
+
+   bhdr = (struct binary_header *)adev->mman.discovery_bin;
+   ihdr = (struct ip_discovery_header *)(adev->mman.discovery_bin +
+   le16_to_cpu(bhdr->table_list[IP_DISCOVERY].offset));
+   num_dies = le16_to_cpu(ihdr->num_dies);
+
+   /* scan harvest bit of all IP data structures */
+   for (i = 0; i < num_dies; i++) {
+   die_offset = le16_to_cpu(ihdr->die_info[i].die_offset);
+   dhdr = (struct die_header *)(adev->mman.discovery_bin + 
die_offset);
+   num_ips = le16_to_cpu(dhdr->num_ips);
+   ip_offset = die_offset + sizeof(*dhdr);
+
+   for (j = 0; j < num_ips; j++) {
+   ip = (struct ip *)(adev->mman.discovery_bin + 
ip_offset);
+
+   if (amdgpu_discovery_validate_ip(ip))
+   goto next_ip;
+
+   if (le16_to_cpu(ip->harvest) == 1) {
+   switch (le16_to_cpu(ip->hw_id)) {
+   case VCN_HWID:
+   (*vcn_harvest_count)++;
+   if (ip->number_instance == 0)
+   adev->vcn.harvest_config |= 
AMDGPU_VCN_HARVEST_VCN0;
+   else
+   adev->vcn.harvest_config |= 
AMDGPU_VCN_HARVEST_VCN1;
+   break;
+   case DMU_HWID:
+   adev->harvest_ip_mask |= 
AMD_HARVEST_IP_DMU_MASK;
+   break;
+   default:
+   break;
+}
+}
+next_ip:
+   ip_offset += sizeof(*ip) + 4 * (ip->num_base_address - 
1);
+   }
+   }
+}
+
+static void amdgpu_disocvery_read_from_harvest_table(struct amdgpu_device 
*adev,
+   uint32_t *vcn_harvest_count)
+{
+   struct binary_header *bhdr;
+   struct harvest_table *harvest_info;
+   int i;
+
+   bhdr = (struct binary_header *)adev->mman.discovery_bin;
+   harvest_info = (struct harvest_table *)(adev->mman.discovery_bin +
+   le16_to_cpu(bhdr->table_list[HARVEST_INFO].offset));
+   for (i = 0; i < 32; i++) {
+   if (le16_to_cpu(harvest_info->list[i].hw_id) == 0)
+   break;
+
+   switch (le16_to_cpu(harvest_info->list[i].hw_id)) {
+   case VCN_HWID:
+   (*vcn_harvest_count)++;
+   if (harvest_info->list[i].number_instance == 0)
+   adev->vcn.harvest_config |= 
AMDGPU_VCN_HARVEST_VCN0;
+   else
+   adev->vcn.harvest_config |= 
AMDGPU_VCN_HARVEST_VCN1;
+   break;
+   case DMU_HWID:
+   adev->harvest_ip_mask |= AMD_HARVEST_IP_DMU_MASK;
+   break;
+   default:
+   break;
+   }
+   }
+}
+
 /* == */
 
 struct ip_hw_instance {
@@ -1046,33 +1127,19 @@ int amdgpu_discovery_get_ip_version(struct 
amdgpu_device *adev, int hw_id, int n
 
 void amdgpu_discovery_harvest_ip(struct amdgpu_device *adev)
 {
-   struct binary_header *bhdr;
-   struct harvest_table *harvest_info;
-   int i, vcn_harvest_count = 0;
-
-   bhdr = (struct binary_header *)adev->mman.discovery_bin;
-   harvest_info = (struct harvest_tabl

[PATCH] drm/amdgpu: no rlcg legacy read in SRIOV case

2022-02-10 Thread Guchun Chen

rlcg legacy read is not available in SRIOV configuration.
Otherwise, gmc_v9_0_flush_gpu_tlb will always complain
timeout and finally breaks driver load.

v2: bypass read in amdgpu_virt_get_rlcg_reg_access_flag (from Victor)

Fixes: 0dc4a7e75581("drm/amdgpu: switch to get_rlcg_reg_access_flag for gfx9")
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index e1288901beb6..6668d7fa89e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -836,7 +836,7 @@ static bool amdgpu_virt_get_rlcg_reg_access_flag(struct 
amdgpu_device *adev,
/* only in new version, AMDGPU_REGS_NO_KIQ and
 * AMDGPU_REGS_RLC are enabled simultaneously */
} else if ((acc_flags & AMDGPU_REGS_RLC) &&
-  !(acc_flags & AMDGPU_REGS_NO_KIQ)) {
+   !(acc_flags & AMDGPU_REGS_NO_KIQ) && write) {
*rlcg_flag = AMDGPU_RLCG_GC_WRITE_LEGACY;
ret = true;
}
@@ -940,7 +940,7 @@ void amdgpu_sriov_wreg(struct amdgpu_device *adev,
u32 rlcg_flag;
 
if (!amdgpu_sriov_runtime(adev) &&
-   amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, true, 
_flag)) {
+   amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, 
true, _flag)) {
amdgpu_virt_rlcg_reg_rw(adev, offset, value, rlcg_flag);
return;
}
@@ -957,7 +957,7 @@ u32 amdgpu_sriov_rreg(struct amdgpu_device *adev,
u32 rlcg_flag;
 
if (!amdgpu_sriov_runtime(adev) &&
-   amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, false, 
_flag))
+   amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, 
false, _flag))
return amdgpu_virt_rlcg_reg_rw(adev, offset, 0, rlcg_flag);
 
if (acc_flags & AMDGPU_REGS_NO_KIQ)
-- 
2.17.1

[PATCH] drm/amdgpu: no rlcg read access in SRIOV case for gfx v9

2022-02-09 Thread Guchun Chen

Fall back to MMIO to read registers as rlcg read is not
available for gfx v9 in SRIOV configuration. Otherwise,
gmc_v9_0_flush_gpu_tlb will always complain timeout and
finally breaks driver load.

Fixes: 0dc4a7e75581("drm/amdgpu: switch to get_rlcg_reg_access_flag for gfx9")
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index e1288901beb6..a3274fa1c7e4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -37,6 +37,16 @@
vf2pf_info->ucode_info[ucode].version = ver; \
} while (0)
 
+static bool amdgpu_virt_is_rlcg_read_supported(struct amdgpu_device *adev)
+{
+   /* rlcg read is not support in SRIOV with gfx v9 */
+   if ((adev->ip_versions[MP0_HWIP][0] == IP_VERSION(9, 0, 0)) ||
+   (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 1)))
+   return false;
+
+   return true;
+}
+
 bool amdgpu_virt_mmio_blocked(struct amdgpu_device *adev)
 {
/* By now all MMIO pages except mailbox are blocked */
@@ -957,7 +967,8 @@ u32 amdgpu_sriov_rreg(struct amdgpu_device *adev,
u32 rlcg_flag;
 
if (!amdgpu_sriov_runtime(adev) &&
-   amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, false, 
_flag))
+   amdgpu_virt_is_rlcg_read_supported(adev) &&
+   amdgpu_virt_get_rlcg_reg_access_flag(adev, acc_flags, hwip, 
false, _flag))
return amdgpu_virt_rlcg_reg_rw(adev, offset, 0, rlcg_flag);
 
if (acc_flags & AMDGPU_REGS_NO_KIQ)
-- 
2.17.1

[PATCH] drm/amdgpu: drop flood print in rlcg reg access function

2022-01-28 Thread Guchun Chen

A lot of the below messages are output in SRIOV case.
amdgpu: indirect registers access through rlcg is not supported

Also drop redundant ret set, as it's initialized to be false already.

Fixes: d4cd09ca9bce("drm/amdgpu: add helper to query rlcg reg access flag")
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 80c25176c993..b56cafb26f4a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -849,9 +849,6 @@ static bool amdgpu_virt_get_rlcg_reg_access_flag(struct 
amdgpu_device *adev,
}
break;
default:
-   dev_err(adev->dev,
-   "indirect registers access through rlcg is not 
supported\n");
-   ret = false;
break;
}
return ret;
-- 
2.17.1

[PATCH] drm/amdgpu: drop WARN_ON in amdgpu_gart_bind/unbind

2022-01-21 Thread Guchun Chen

NULL pointer check has guarded it already.

calltrace:
amdgpu_ttm_gart_bind+0x49/0xa0 [amdgpu]
amdgpu_ttm_alloc_gart+0x13f/0x180 [amdgpu]
amdgpu_bo_create_reserved+0x139/0x2c0 [amdgpu]
? amdgpu_ttm_debugfs_init+0x120/0x120 [amdgpu]
amdgpu_bo_create_kernel+0x17/0x80 [amdgpu]
amdgpu_ttm_init+0x542/0x5e0 [amdgpu]

Fixes: f0239505d6c4("drm/amdgpu: remove gart.ready flag")
Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index 53cc844346f0..91d8207336c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -161,7 +161,7 @@ void amdgpu_gart_unbind(struct amdgpu_device *adev, 
uint64_t offset,
uint64_t flags = 0;
int idx;
 
-   if (WARN_ON(!adev->gart.ptr))
+   if (!adev->gart.ptr)
return;
 
if (!drm_dev_enter(adev_to_drm(adev), ))
@@ -241,7 +241,7 @@ void amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t 
offset,
 int pages, dma_addr_t *dma_addr,
 uint64_t flags)
 {
-   if (WARN_ON(!adev->gart.ptr))
+   if (!adev->gart.ptr)
return;
 
amdgpu_gart_map(adev, offset, pages, dma_addr, flags, adev->gart.ptr);
-- 
2.17.1

[PATCH] drm/amd/pm: use dev_*** to print output in multiple GPUs

2022-01-20 Thread Guchun Chen

In a multiple GPU configuration, when sending an SMU message
fails, it's hard to figure out which GPU has the problem,
which is not user-friendly.

[40190.142181] amdgpu: [powerplay]
last message was failed ret is 65535
[40190.242420] amdgpu: [powerplay]
failed to send message 201 ret is 65535
[40190.392763] amdgpu: [powerplay]
last message was failed ret is 65535
[40190.492997] amdgpu: [powerplay]
failed to send message 200 ret is 65535
[40190.743575] amdgpu: [powerplay]
last message was failed ret is 65535
[40190.843812] amdgpu: [powerplay]
failed to send message 282 ret is 65535

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c   |  4 +++-
 .../gpu/drm/amd/pm/powerplay/smumgr/smu10_smumgr.c|  4 ++--
 drivers/gpu/drm/amd/pm/powerplay/smumgr/smu7_smumgr.c | 11 +++
 drivers/gpu/drm/amd/pm/powerplay/smumgr/smu9_smumgr.c |  2 +-
 .../gpu/drm/amd/pm/powerplay/smumgr/vega20_smumgr.c   |  4 ++--
 5 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c 
b/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c
index 93a1c7248e26..5ca3c422f7d4 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/ci_smumgr.c
@@ -208,6 +208,7 @@ static int ci_read_smc_sram_dword(struct pp_hwmgr *hwmgr, 
uint32_t smc_addr,
 
 static int ci_send_msg_to_smc(struct pp_hwmgr *hwmgr, uint16_t msg)
 {
+   struct amdgpu_device *adev = hwmgr->adev;
int ret;
 
cgs_write_register(hwmgr->device, mmSMC_RESP_0, 0);
@@ -218,7 +219,8 @@ static int ci_send_msg_to_smc(struct pp_hwmgr *hwmgr, 
uint16_t msg)
ret = PHM_READ_FIELD(hwmgr->device, SMC_RESP_0, SMC_RESP);
 
if (ret != 1)
-   pr_info("\n failed to send message %x ret is %d\n",  msg, ret);
+   dev_info(adev->dev,
+   "failed to send message %x ret is %d\n", msg,ret);
 
return 0;
 }
diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu10_smumgr.c 
b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu10_smumgr.c
index 47b34c6ca924..88a5641465dc 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu10_smumgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu10_smumgr.c
@@ -87,7 +87,7 @@ static int smu10_send_msg_to_smc(struct pp_hwmgr *hwmgr, 
uint16_t msg)
smu10_send_msg_to_smc_without_waiting(hwmgr, msg);
 
if (smu10_wait_for_response(hwmgr) == 0)
-   printk("Failed to send Message %x.\n", msg);
+   dev_err(adev->dev, "Failed to send Message %x.\n", msg);
 
return 0;
 }
@@ -108,7 +108,7 @@ static int smu10_send_msg_to_smc_with_parameter(struct 
pp_hwmgr *hwmgr,
 
 
if (smu10_wait_for_response(hwmgr) == 0)
-   printk("Failed to send Message %x.\n", msg);
+   dev_err(adev->dev, "Failed to send Message %x.\n", msg);
 
return 0;
 }
diff --git a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu7_smumgr.c 
b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu7_smumgr.c
index aae25243eb10..5a010cd38303 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu7_smumgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/smumgr/smu7_smumgr.c
@@ -165,6 +165,7 @@ bool smu7_is_smc_ram_running(struct pp_hwmgr *hwmgr)
 
 int smu7_send_msg_to_smc(struct pp_hwmgr *hwmgr, uint16_t msg)
 {
+   struct amdgpu_device *adev = hwmgr->adev;
int ret;
 
PHM_WAIT_FIELD_UNEQUAL(hwmgr, SMC_RESP_0, SMC_RESP, 0);
@@ -172,9 +173,10 @@ int smu7_send_msg_to_smc(struct pp_hwmgr *hwmgr, uint16_t 
msg)
ret = PHM_READ_FIELD(hwmgr->device, SMC_RESP_0, SMC_RESP);
 
if (ret == 0xFE)
-   pr_debug("last message was not supported\n");
+   dev_dbg(adev->dev, "last message was not supported\n");
else if (ret != 1)
-   pr_info("\n last message was failed ret is %d\n", ret);
+   dev_info(adev->dev,
+   "\nlast message was failed ret is %d\n", ret);
 
cgs_write_register(hwmgr->device, mmSMC_RESP_0, 0);
cgs_write_register(hwmgr->device, mmSMC_MESSAGE_0, msg);
@@ -184,9 +186,10 @@ int smu7_send_msg_to_smc(struct pp_hwmgr *hwmgr, uint16_t 
msg)
ret = PHM_READ_FIELD(hwmgr->device, SMC_RESP_0, SMC_RESP);
 
if (ret == 0xFE)
-   pr_debug("message %x was not supported\n", msg);
+   dev_dbg(adev->dev, "message %x was not supported\n", msg);
else if (ret != 1)
-   pr_info("\n failed to send message %x ret is %d \n",  msg, ret);
+   dev_dbg(adev->dev,
+   "failed to send message %x ret is %d \n",  msg, ret);
 
return 0;
 }
diff --git a/driv

[PATCH 2/2] drm/amdgpu: apply vcn harvest quirk

2022-01-13 Thread Guchun Chen

This is a follow-up patch to apply the workaround only on
those boards with a bad harvest table in IP discovery.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 32 ---
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 2f891ae90bad..07965ac6381b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -243,6 +243,30 @@ static inline bool 
amdgpu_discovery_verify_binary_signature(uint8_t *binary)
return (le32_to_cpu(bhdr->binary_signature) == BINARY_SIGNATURE);
 }
 
+static void amdgpu_discovery_harvest_config_quirk(struct amdgpu_device *adev)
+{
+   /*
+* So far, apply this quirk only on those Navy Flounder boards which
+* have a bad harvest table of VCN config.
+*/
+   if ((adev->ip_versions[UVD_HWIP][1] == IP_VERSION(3, 0, 1)) &&
+   (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 3, 2))) {
+   switch (adev->pdev->revision) {
+   case 0xC1:
+   case 0xC2:
+   case 0xC3:
+   case 0xC5:
+   case 0xC7:
+   case 0xCF:
+   case 0xDF:
+   adev->vcn.harvest_config |= AMDGPU_VCN_HARVEST_VCN1;
+   break;
+   default:
+   break;
+   }
+   }
+}
+
 static int amdgpu_discovery_init(struct amdgpu_device *adev)
 {
struct table_info *info;
@@ -548,11 +572,9 @@ void amdgpu_discovery_harvest_ip(struct amdgpu_device 
*adev)
break;
}
}
-   /* some IP discovery tables on Navy Flounder don't have this set 
correctly */
-   if ((adev->ip_versions[UVD_HWIP][1] == IP_VERSION(3, 0, 1)) &&
-   (adev->ip_versions[GC_HWIP][0] == IP_VERSION(10, 3, 2)) &&
-   (adev->pdev->revision != 0xFF))
-   adev->vcn.harvest_config |= AMDGPU_VCN_HARVEST_VCN1;
+
+   amdgpu_discovery_harvest_config_quirk(adev);
+
if (vcn_harvest_count == adev->vcn.num_vcn_inst) {
adev->harvest_ip_mask |= AMD_HARVEST_IP_VCN_MASK;
adev->harvest_ip_mask |= AMD_HARVEST_IP_JPEG_MASK;
-- 
2.17.1

[PATCH 1/2] drm/amdgpu: drop redundant check of ip discovery_bin

2022-01-13 Thread Guchun Chen

The early check in amdgpu_discovery_reg_base_init guarantees this.

Signed-off-by: Guchun Chen 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
index 57e001d73ec9..2f891ae90bad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c
@@ -1194,11 +1194,6 @@ int amdgpu_discovery_set_ip_blocks(struct amdgpu_device 
*adev)
return -EINVAL;
 
amdgpu_discovery_harvest_ip(adev);
-
-   if (!adev->mman.discovery_bin) {
-   DRM_ERROR("ip discovery uninitialized\n");
-   return -EINVAL;
-   }
break;
}
 
-- 
2.17.1

[PATCH] tests/amdgpu: Add VCN test support for Beige Goby

2022-01-13 Thread Guchun Chen

Add the Beige Goby chip id to the vcn test. An MR will be opened
to merge this on gitlab after review.

Signed-off-by: Guchun Chen 
---
 tests/amdgpu/vcn_tests.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/amdgpu/vcn_tests.c b/tests/amdgpu/vcn_tests.c
index 628b4910..15d573d3 100644
--- a/tests/amdgpu/vcn_tests.c
+++ b/tests/amdgpu/vcn_tests.c
@@ -142,7 +142,8 @@ CU_BOOL suite_vcn_tests_enable(void)
} else if (family_id == AMDGPU_FAMILY_NV) {
if (chip_id == (chip_rev + 0x28) ||
chip_id == (chip_rev + 0x32) ||
-   chip_id == (chip_rev + 0x3c)) {
+   chip_id == (chip_rev + 0x3c) ||
+   chip_id == (chip_rev + 0x46)) {
reg.data0 = 0x10;
reg.data1 = 0x11;
reg.cmd = 0xf;
-- 
2.17.1

[PATCH] drm/amdgpu: use spin_lock_irqsave to avoid deadlock by local interrupt

2022-01-10 Thread Guchun Chen

This is observed in SRIOV case with virtual KMS as display.

_raw_spin_lock_irqsave+0x37/0x40
drm_handle_vblank+0x69/0x350 [drm]
? try_to_wake_up+0x432/0x5c0
? amdgpu_vkms_prepare_fb+0x1c0/0x1c0 [amdgpu]
drm_crtc_handle_vblank+0x17/0x20 [drm]
amdgpu_vkms_vblank_simulate+0x4d/0x80 [amdgpu]
__hrtimer_run_queues+0xfb/0x230
hrtimer_interrupt+0x109/0x220
__sysvec_apic_timer_interrupt+0x64/0xe0
asm_call_irq_on_stack+0x12/0x20

Fixes: ba5317109d0c ("drm/amdgpu: create amdgpu_vkms (v4)")
Signed-off-by: Guchun Chen 
Acked-by: Alex Deucher 
Tested-by: Kelly Zytaruk 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
index 2dcc68e04e84..d99c8779b51e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vkms.c
@@ -144,15 +144,16 @@ static void amdgpu_vkms_crtc_atomic_disable(struct 
drm_crtc *crtc,
 static void amdgpu_vkms_crtc_atomic_flush(struct drm_crtc *crtc,
  struct drm_atomic_state *state)
 {
+   unsigned long flags;
if (crtc->state->event) {
-   spin_lock(&crtc->dev->event_lock);
+   spin_lock_irqsave(&crtc->dev->event_lock, flags);
 
if (drm_crtc_vblank_get(crtc) != 0)
drm_crtc_send_vblank_event(crtc, crtc->state->event);
else
drm_crtc_arm_vblank_event(crtc, crtc->state->event);
 
-   spin_unlock(&crtc->dev->event_lock);
+   spin_unlock_irqrestore(&crtc->dev->event_lock, flags);
 
crtc->state->event = NULL;
}
-- 
2.17.1

1 2 3 >

1 - 100 of 250 matches

Mail list logo