Re: [PATCH v2] drm/amdgpu: Fix one use-after-free of VM

2022-04-13 Thread Paul Menzel

Dear Xinhui,


Thank you for rerolling the patch.

Am 14.04.22 um 07:03 schrieb xinhui pan:

VM might already be freed when amdgpu_vm_tlb_seq_cb() is called.
We see the calltrace below.

Fix it by keeping the last flush fence around and wait for it to signal


Nit: Please add a dot/period to the end of sentences.



BUG kmalloc-4k (Not tainted): Poison overwritten

0x9c88630414e8-0x9c88630414e8 @offset=5352. First byte 0x6c
instead of 0x6b Allocated in amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
age=44 cpu=0 pid=2343
  __slab_alloc.isra.0+0x4f/0x90
  kmem_cache_alloc_trace+0x6b8/0x7a0
  amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
  drm_file_alloc+0x222/0x3e0 [drm]
  drm_open+0x11d/0x410 [drm]
Freed in amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu] age=22 cpu=1
pid=2485
  kfree+0x4a2/0x580
  amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu]
  drm_file_free+0x24e/0x3c0 [drm]
  drm_close_helper.isra.0+0x90/0xb0 [drm]
  drm_release+0x97/0x1a0 [drm]
  __fput+0xb6/0x280
  fput+0xe/0x10
  task_work_run+0x64/0xb0


The v2 annotation is missing.


Suggested-by: Christian König <christian.koenig@amd.com>
Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 17 ++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  1 +
  2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 645ce28277c2..cd5aa7edd451 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -932,9 +932,12 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
  
  	if (flush_tlb || params.table_freed) {

tlb_cb->vm = vm;
-   if (!fence || !*fence ||
-   dma_fence_add_callback(*fence, &tlb_cb->cb,
-  amdgpu_vm_tlb_seq_cb))
+   if (fence && *fence &&
+   !dma_fence_add_callback(*fence, &tlb_cb->cb,
+  amdgpu_vm_tlb_seq_cb)) {
+   dma_fence_put(vm->last_tlb_flush);
+   vm->last_tlb_flush = dma_fence_get(*fence);
+   } else
amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);


The Linux kernel coding style uses braces for all branches of a 
conditional statement, if one branch uses braces. [1]



Kind regards,

Paul



tlb_cb = NULL;
}
@@ -2094,6 +2097,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
vm->update_funcs = _vm_sdma_funcs;
vm->last_update = NULL;
vm->last_unlocked = dma_fence_get_stub();
+   vm->last_tlb_flush = dma_fence_get_stub();
  
  	mutex_init(&vm->eviction_lock);

vm->evicting = false;
@@ -2132,6 +2136,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
vm->root.bo = NULL;
  
  error_free_delayed:

+   dma_fence_put(vm->last_tlb_flush);
dma_fence_put(vm->last_unlocked);
drm_sched_entity_destroy(&vm->delayed);
  
@@ -2248,6 +2253,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)

struct amdgpu_bo_va_mapping *mapping, *tmp;
bool prt_fini_needed = !!adev->gmc.gmc_funcs->set_prt;
struct amdgpu_bo *root;
+   unsigned long flags;
int i;
  
  	amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);

@@ -2257,6 +2263,11 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
amdgpu_vm_set_pasid(adev, vm, 0);
dma_fence_wait(vm->last_unlocked, false);
dma_fence_put(vm->last_unlocked);
+   dma_fence_wait(vm->last_tlb_flush, false);
+   /* Make sure that all fence callbacks have completed */
+   spin_lock_irqsave(vm->last_tlb_flush->lock, flags);
+   spin_unlock_irqrestore(vm->last_tlb_flush->lock, flags);
+   dma_fence_put(vm->last_tlb_flush);
  
  	list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {

if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 1a814fb8..6b06a214f05f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -286,6 +286,7 @@ struct amdgpu_vm {
  
  	/* Last finished delayed update */

atomic64_t  tlb_seq;
+   struct dma_fence*last_tlb_flush;
  
  	/* Last unlocked submission to the scheduler entities */

struct dma_fence*last_unlocked;



[1]: 
https://www.kernel.org/doc/html/latest/process/coding-style.html#placing-braces-and-spaces


[PATCH v2] drm/amdgpu: Fix one use-after-free of VM

2022-04-13 Thread xinhui pan
VM might already be freed when amdgpu_vm_tlb_seq_cb() is called.
We see the calltrace below.

Fix it by keeping the last flush fence around and wait for it to signal

BUG kmalloc-4k (Not tainted): Poison overwritten

0x9c88630414e8-0x9c88630414e8 @offset=5352. First byte 0x6c
instead of 0x6b Allocated in amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
age=44 cpu=0 pid=2343
 __slab_alloc.isra.0+0x4f/0x90
 kmem_cache_alloc_trace+0x6b8/0x7a0
 amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
 drm_file_alloc+0x222/0x3e0 [drm]
 drm_open+0x11d/0x410 [drm]
Freed in amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu] age=22 cpu=1
pid=2485
 kfree+0x4a2/0x580
 amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu]
 drm_file_free+0x24e/0x3c0 [drm]
 drm_close_helper.isra.0+0x90/0xb0 [drm]
 drm_release+0x97/0x1a0 [drm]
 __fput+0xb6/0x280
 fput+0xe/0x10
 task_work_run+0x64/0xb0

Suggested-by: Christian König <christian.koenig@amd.com>
Signed-off-by: xinhui pan <xinhui.pan@amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 17 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  1 +
 2 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 645ce28277c2..cd5aa7edd451 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -932,9 +932,12 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,
 
if (flush_tlb || params.table_freed) {
tlb_cb->vm = vm;
-   if (!fence || !*fence ||
-   dma_fence_add_callback(*fence, &tlb_cb->cb,
-  amdgpu_vm_tlb_seq_cb))
+   if (fence && *fence &&
+   !dma_fence_add_callback(*fence, &tlb_cb->cb,
+  amdgpu_vm_tlb_seq_cb)) {
+   dma_fence_put(vm->last_tlb_flush);
+   vm->last_tlb_flush = dma_fence_get(*fence);
+   } else
amdgpu_vm_tlb_seq_cb(NULL, &tlb_cb->cb);
tlb_cb = NULL;
}
@@ -2094,6 +2097,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
vm->update_funcs = _vm_sdma_funcs;
vm->last_update = NULL;
vm->last_unlocked = dma_fence_get_stub();
+   vm->last_tlb_flush = dma_fence_get_stub();
 
	mutex_init(&vm->eviction_lock);
vm->evicting = false;
@@ -2132,6 +2136,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
vm->root.bo = NULL;
 
 error_free_delayed:
+   dma_fence_put(vm->last_tlb_flush);
dma_fence_put(vm->last_unlocked);
	drm_sched_entity_destroy(&vm->delayed);
 
@@ -2248,6 +2253,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
struct amdgpu_bo_va_mapping *mapping, *tmp;
bool prt_fini_needed = !!adev->gmc.gmc_funcs->set_prt;
struct amdgpu_bo *root;
+   unsigned long flags;
int i;
 
amdgpu_amdkfd_gpuvm_destroy_cb(adev, vm);
@@ -2257,6 +2263,11 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
amdgpu_vm_set_pasid(adev, vm, 0);
dma_fence_wait(vm->last_unlocked, false);
dma_fence_put(vm->last_unlocked);
+   dma_fence_wait(vm->last_tlb_flush, false);
+   /* Make sure that all fence callbacks have completed */
+   spin_lock_irqsave(vm->last_tlb_flush->lock, flags);
+   spin_unlock_irqrestore(vm->last_tlb_flush->lock, flags);
+   dma_fence_put(vm->last_tlb_flush);
 
	list_for_each_entry_safe(mapping, tmp, &vm->freed, list) {
if (mapping->flags & AMDGPU_PTE_PRT && prt_fini_needed) {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 1a814fb8..6b06a214f05f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -286,6 +286,7 @@ struct amdgpu_vm {
 
/* Last finished delayed update */
atomic64_t  tlb_seq;
+   struct dma_fence*last_tlb_flush;
 
/* Last unlocked submission to the scheduler entities */
struct dma_fence*last_unlocked;
-- 
2.25.1



[pull] amdgpu drm-fixes-5.18

2022-04-13 Thread Alex Deucher
Hi Dave, Daniel,

Fixes for 5.18.

The following changes since commit 88711fa9a14f6f473f4a7645155ca51386e36c21:

  Merge tag 'drm-misc-fixes-2022-04-07' of 
git://anongit.freedesktop.org/drm/drm-misc into drm-fixes (2022-04-08 09:22:16 
+1000)

are available in the Git repository at:

  https://gitlab.freedesktop.org/agd5f/linux.git 
tags/amd-drm-fixes-5.18-2022-04-13

for you to fetch changes up to aadaeca46ce54af9f8f494792a1ba47a6fbda7ba:

  drm/amd/display: remove dtbclk_ss compensation for dcn316 (2022-04-13 
22:23:54 -0400)


amd-drm-fixes-5.18-2022-04-13:

amdgpu:
- Fix for alpha properly in pre-multiplied mode
- Fix VCN 3.1.2 firmware name
- Suspend/resume fix
- Add a gfxoff quirk for Mac vega20 board
- DCN 3.1.6 spread spectrum fix


Alex Deucher (1):
  drm/amdgpu: fix VCN 3.1.2 firmware name

Charlene Liu (1):
  drm/amd/display: remove dtbclk_ss compensation for dcn316

Kai-Heng Feng (1):
  drm/amdgpu: Ensure HDA function is suspended before ASIC reset

Melissa Wen (1):
  drm/amd/display: don't ignore alpha property on pre-multiplied mode

Tomasz Moń (1):
  drm/amdgpu: Enable gfxoff quirk on MacBook Pro

 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c| 18 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_vcn.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  |  2 ++
 .../drm/amd/display/dc/clk_mgr/dce100/dce_clk_mgr.c|  2 +-
 .../drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c |  4 ++--
 drivers/gpu/drm/amd/display/dc/dc.h|  2 +-
 .../gpu/drm/amd/display/dc/dcn10/dcn10_hw_sequencer.c  | 14 +-
 drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hwseq.c | 14 +-
 8 files changed, 37 insertions(+), 21 deletions(-)


[PATCH] drm/amdgpu: don't runtime suspend if there are displays attached (v2)

2022-04-13 Thread Alex Deucher
We normally runtime suspend when there are displays attached if they
are in the DPMS off state, however, if something wakes the GPU
we send a hotplug event on resume (in case any displays were connected
while the GPU was in suspend) which can cause userspace to light
up the displays again soon after they were turned off.

Prior to
commit 087451f372bf76 ("drm/amdgpu: use generic fb helpers instead of setting 
up AMD own's."),
the driver took a runtime pm reference when the fbdev emulation was
enabled because we didn't implement proper shadowing support for
vram access when the device was off so the device never runtime
suspended when there was a console bound.  Once that commit landed,
we now utilize the core fb helper implementation which properly
handles the emulation, so runtime pm now suspends in cases where it did
not before.  Ultimately, we need to sort out why runtime suspend in not
working in this case for some users, but this should restore similar
behavior to before.

v2: move check into runtime_suspend

Fixes: 087451f372bf76 ("drm/amdgpu: use generic fb helpers instead of setting 
up AMD own's.")
Tested-by: Michele Ballabio 
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 107 
 1 file changed, 72 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 4efaa183abcd..97a1aa02d76e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2395,6 +2395,71 @@ static int amdgpu_pmops_restore(struct device *dev)
return amdgpu_device_resume(drm_dev, true);
 }
 
+static int amdgpu_runtime_idle_check_display(struct device *dev)
+{
+   struct pci_dev *pdev = to_pci_dev(dev);
+   struct drm_device *drm_dev = pci_get_drvdata(pdev);
+   struct amdgpu_device *adev = drm_to_adev(drm_dev);
+
+   if (adev->mode_info.num_crtc) {
+   struct drm_connector *list_connector;
+   struct drm_connector_list_iter iter;
+   int ret = 0;
+
+   /* XXX: Return busy if any displays are connected to avoid
+* possible display wake ups after runtime resume due to
+* hotplug events in case any displays were connected while
+* the GPU was in suspend.  Remove this once that is fixed.
+*/
+   mutex_lock(&drm_dev->mode_config.mutex);
+   drm_connector_list_iter_begin(drm_dev, &iter);
+   drm_for_each_connector_iter(list_connector, &iter) {
+   if (list_connector->status == connector_status_connected) {
+   ret = -EBUSY;
+   break;
+   }
+   }
+   drm_connector_list_iter_end(&iter);
+   mutex_unlock(&drm_dev->mode_config.mutex);
+
+   if (ret)
+   return ret;
+
+   if (amdgpu_device_has_dc_support(adev)) {
+   struct drm_crtc *crtc;
+
+   drm_for_each_crtc(crtc, drm_dev) {
+   drm_modeset_lock(&crtc->mutex, NULL);
+   if (crtc->state->active)
+   ret = -EBUSY;
+   drm_modeset_unlock(&crtc->mutex);
+   if (ret < 0)
+   break;
+   }
+   } else {
+   mutex_lock(&drm_dev->mode_config.mutex);
+   drm_modeset_lock(&drm_dev->mode_config.connection_mutex, NULL);
+
+   drm_connector_list_iter_begin(drm_dev, &iter);
+   drm_for_each_connector_iter(list_connector, &iter) {
+   if (list_connector->dpms ==  DRM_MODE_DPMS_ON) {
+   ret = -EBUSY;
+   break;
+   }
+   }
+
+   drm_connector_list_iter_end(&iter);
+
+   drm_modeset_unlock(&drm_dev->mode_config.connection_mutex);
+   mutex_unlock(&drm_dev->mode_config.mutex);
+   }
+   if (ret)
+   return ret;
+   }
+
+   return 0;
+}
+
 static int amdgpu_pmops_runtime_suspend(struct device *dev)
 {
struct pci_dev *pdev = to_pci_dev(dev);
@@ -2407,6 +2472,10 @@ static int amdgpu_pmops_runtime_suspend(struct device 
*dev)
return -EBUSY;
}
 
+   ret = amdgpu_runtime_idle_check_display(dev);
+   if (ret)
+   return ret;
+
/* wait for all rings to drain before suspending */
for (i = 0; i < AMDGPU_MAX_RINGS; i++) {
struct amdgpu_ring *ring = adev->rings[i];
@@ -2516,41 +2585,9 @@ static int amdgpu_pmops_runtime_idle(struct device *dev)
return -EBUSY;
}
 
-   if 

Re: AMDGPU: regression on 5.17.1

2022-04-13 Thread Michele Ballabio
On Wed, 13 Apr 2022 14:14:42 -0400
Alex Deucher  wrote:

> On Wed, Apr 13, 2022 at 1:33 PM Michele Ballabio
>  wrote:
> >
> > On Mon, 11 Apr 2022 14:34:37 -0400
> > Alex Deucher  wrote:
> >  
> > > On Sat, Apr 9, 2022 at 12:28 PM Michele Ballabio
> > >  wrote:  
> > > >
> > > > On Tue, 5 Apr 2022 10:23:16 -0400
> > > > Alex Deucher  wrote:
> > > >  
> > > > > On Mon, Apr 4, 2022 at 3:39 PM Michele Ballabio
> > > > >  wrote:  
> > > > > >
> > > > > > On Mon, 4 Apr 2022 13:03:41 -0400
> > > > > > Alex Deucher  wrote:
> > > > > >  
> > > > > > > On Sun, Apr 3, 2022 at 10:19 AM Michele Ballabio
> > > > > > >  wrote:  
> > > > > > > >
> > > > > > > > Hi,
> > > > > > > > I've hit a regression on 5.17.1 (haven't tested
> > > > > > > > 5.17.0, but 5.16-stable didn't have this problem).
> > > > > > > >
> > > > > > > > The machine is a Ryzen 5 1600 with AMD graphics (RX
> > > > > > > > 560).
> > > > > > > >
> > > > > > > > The regression I hit seems to trigger when the machine
> > > > > > > > is left idle at boot (I don't boot straight to X, I
> > > > > > > > boot to a tty, login and then start X). The machine
> > > > > > > > after a while blanks the screen. Usually, the screen
> > > > > > > > unblanks as the keyboard is hit or the mouse moves, but
> > > > > > > > with kernel 5.17.1 the screen does not wake up. The
> > > > > > > > machine seems to run mostly fine: I can login from ssh,
> > > > > > > > but I cannot reboot or halt it: a sysrq sequence is
> > > > > > > > needed for that. Note that if the screen goes blank
> > > > > > > > under X, it wakes up fine.
> > > > > > > >
> > > > > > > > Below a dmesg and two traces from syslog (they're quite
> > > > > > > > similar).  
> > > > > > >
> > > > > > > Can you bisect?  Does setting amdgpu.runpm=0 help?  
> > > > > >
> > > > > > I can try to bisect, should I narrow the search to
> > > > > > drivers/gpu/drm/ ?  
> > > > >
> > > > > I would just do a full bisect if possible in case the change
> > > > > happens to be outside of drm.
> > > > >  
> > > > > >
> > > > > > Setting amdgpu.runpm=0 works, the display now unblanks
> > > > > > without problems.  
> > > > >  
> > > >
> > > > Hi,
> > > > I bisected this, and the first bad commit is
> > > > [087451f372bf76d971184caa258807b7c35aac8f] drm/amdgpu: use
> > > > generic fb helpers instead of setting up AMD own's.
> > > >
> > > > Let me know if you need some more testing.  
> > >
> > > Thanks.  Do the attached patches fix the issue?
> > >
> > > Thanks,
> > >
> > > Alex  
> >
> > Sorry, no. I applied them both on top of 5.17.1.  
> 
> Thanks.  Please try the attached patch.
> 
> Thanks,
> 
> Alex

I applied the v2 patch on top of 5.17.1 and it works as expected.

Tested-by: Michele Ballabio 

Thanks,
Michele Ballabio



[PATCH] drm/amdgpu: Move reset domain locking in DPC handler

2022-04-13 Thread Andrey Grodzovsky
Lock reset domain unconditionally because on resume
we unlock it unconditionally.
This solved mutex deadlock when handling both FATAL
and non FATAL PCI errors one after another.

Signed-off-by: Andrey Grodzovsky 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 14 +++---
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 1cc488a767d8..c65f25e3a0fc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5531,18 +5531,18 @@ pci_ers_result_t amdgpu_pci_error_detected(struct 
pci_dev *pdev, pci_channel_sta
 
adev->pci_channel_state = state;
 
+   /*
+* Locking adev->reset_domain->sem will prevent any external access
+* to GPU during PCI error recovery
+*/
+   amdgpu_device_lock_reset_domain(adev->reset_domain);
+   amdgpu_device_set_mp1_state(adev);
+
switch (state) {
case pci_channel_io_normal:
return PCI_ERS_RESULT_CAN_RECOVER;
/* Fatal error, prepare for slot reset */
case pci_channel_io_frozen:
-   /*
-* Locking adev->reset_domain->sem will prevent any external 
access
-* to GPU during PCI error recovery
-*/
-   amdgpu_device_lock_reset_domain(adev->reset_domain);
-   amdgpu_device_set_mp1_state(adev);
-
/*
 * Block any work scheduling as we do for regular GPU reset
 * for the duration of the recovery
-- 
2.25.1



Re: AMDGPU: regression on 5.17.1

2022-04-13 Thread Alex Deucher
On Wed, Apr 13, 2022 at 1:33 PM Michele Ballabio  wrote:
>
> On Mon, 11 Apr 2022 14:34:37 -0400
> Alex Deucher  wrote:
>
> > On Sat, Apr 9, 2022 at 12:28 PM Michele Ballabio
> >  wrote:
> > >
> > > On Tue, 5 Apr 2022 10:23:16 -0400
> > > Alex Deucher  wrote:
> > >
> > > > On Mon, Apr 4, 2022 at 3:39 PM Michele Ballabio
> > > >  wrote:
> > > > >
> > > > > On Mon, 4 Apr 2022 13:03:41 -0400
> > > > > Alex Deucher  wrote:
> > > > >
> > > > > > On Sun, Apr 3, 2022 at 10:19 AM Michele Ballabio
> > > > > >  wrote:
> > > > > > >
> > > > > > > Hi,
> > > > > > > I've hit a regression on 5.17.1 (haven't tested
> > > > > > > 5.17.0, but 5.16-stable didn't have this problem).
> > > > > > >
> > > > > > > The machine is a Ryzen 5 1600 with AMD graphics (RX 560).
> > > > > > >
> > > > > > > The regression I hit seems to trigger when the machine is
> > > > > > > left idle at boot (I don't boot straight to X, I boot to a
> > > > > > > tty, login and then start X). The machine after a while
> > > > > > > blanks the screen. Usually, the screen unblanks as the
> > > > > > > keyboard is hit or the mouse moves, but with kernel 5.17.1
> > > > > > > the screen does not wake up. The machine seems to run
> > > > > > > mostly fine: I can login from ssh, but I cannot reboot or
> > > > > > > halt it: a sysrq sequence is needed for that. Note that if
> > > > > > > the screen goes blank under X, it wakes up fine.
> > > > > > >
> > > > > > > Below a dmesg and two traces from syslog (they're quite
> > > > > > > similar).
> > > > > >
> > > > > > Can you bisect?  Does setting amdgpu.runpm=0 help?
> > > > >
> > > > > I can try to bisect, should I narrow the search to
> > > > > drivers/gpu/drm/ ?
> > > >
> > > > I would just do a full bisect if possible in case the change
> > > > happens to be outside of drm.
> > > >
> > > > >
> > > > > Setting amdgpu.runpm=0 works, the display now unblanks without
> > > > > problems.
> > > >
> > >
> > > Hi,
> > > I bisected this, and the first bad commit is
> > > [087451f372bf76d971184caa258807b7c35aac8f] drm/amdgpu: use generic
> > > fb helpers instead of setting up AMD own's.
> > >
> > > Let me know if you need some more testing.
> >
> > Thanks.  Do the attached patches fix the issue?
> >
> > Thanks,
> >
> > Alex
>
> Sorry, no. I applied them both on top of 5.17.1.

Thanks.  Please try the attached patch.

Thanks,

Alex

> I'm pasting the output of dmesg and decode_stacktrace.sh:
>
> dmesg:
> ---
>
> [0.00] Linux version 5.17.1+ (m...@darkstar.example.org) (gcc (GCC) 
> 11.2.0, GNU ld version 2.38-slack151) #221 SMP PREEMPT Wed Apr 13 18:51:36 
> CEST 2022
> [0.00] Command line: BOOT_IMAGE=bzImage ro root=806 debug
> [0.00] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point 
> registers'
> [0.00] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
> [0.00] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
> [0.00] x86/fpu: xstate_offset[2]:  576, xstate_sizes[2]:  256
> [0.00] x86/fpu: Enabled xstate features 0x7, context size is 832 
> bytes, using 'compacted' format.
> [0.00] signal: max sigframe size: 1776
> [0.00] BIOS-provided physical RAM map:
> [0.00] BIOS-e820: [mem 0x-0x0009d3ff] usable
> [0.00] BIOS-e820: [mem 0x0009d400-0x0009] reserved
> [0.00] BIOS-e820: [mem 0x000e-0x000f] reserved
> [0.00] BIOS-e820: [mem 0x0010-0x03ff] usable
> [0.00] BIOS-e820: [mem 0x0400-0x04009fff] ACPI NVS
> [0.00] BIOS-e820: [mem 0x0400a000-0x09bf] usable
> [0.00] BIOS-e820: [mem 0x09c0-0x09ff] reserved
> [0.00] BIOS-e820: [mem 0x0a00-0x0aff] usable
> [0.00] BIOS-e820: [mem 0x0b00-0x0b01] reserved
> [0.00] BIOS-e820: [mem 0x0b02-0xd4434fff] usable
> [0.00] BIOS-e820: [mem 0xd4435000-0xd444] ACPI 
> data
> [0.00] BIOS-e820: [mem 0xd445-0xda830fff] usable
> [0.00] BIOS-e820: [mem 0xda831000-0xda970fff] reserved
> [0.00] BIOS-e820: [mem 0xda971000-0xda97afff] ACPI 
> data
> [0.00] BIOS-e820: [mem 0xda97b000-0xdaa82fff] usable
> [0.00] BIOS-e820: [mem 0xdaa83000-0xdae3efff] ACPI NVS
> [0.00] BIOS-e820: [mem 0xdae3f000-0xdb93efff] reserved
> [0.00] BIOS-e820: [mem 0xdb93f000-0xddff] usable
> [0.00] BIOS-e820: [mem 0xde00-0xdfff] reserved
> [0.00] BIOS-e820: [mem 0xf800-0xfbff] reserved
> [0.00] BIOS-e820: [mem 0xfd80-0xfdff] reserved
> [0.00] BIOS-e820: [mem 0xfea0-0xfea0] reserved
> [

RE: [PATCH] drm/amd/amdgpu: Not request init data for MS_HYPERV with vega10

2022-04-13 Thread Michael Kelley (LINUX)
From: Alex Deucher  Sent: Tuesday, April 12, 2022 7:13 AM
> 
> On Tue, Apr 12, 2022 at 4:01 AM Paul Menzel  wrote:
> >
> > [Cc: +x86 folks]
> >
> > Dear Alex, dear x86 folks,
> >
> >
> > x86 folks, can you think of alternatives to access `X86_HYPER_MS_HYPERV`
> > from `arch/x86/include/asm/hypervisor.h` without any preprocessor ifdef-ery?
> 
> I don't really see what problem that solves.  X86_HYPER_MS_HYPERV is
> an X86 thing.  Why do we need a processor agnostic way to handle it?
> Any code related to that is X86 specific.
> 
> Alex

Try using hv_is_hyperv_initialized() without any #ifdef'ery.  If
CONFIG_HYPERV is defined, it will tell you if you are running as a guest
on Hyper-V on x86/x64 or on ARM64.  If CONFIG_HYPERV is not defined,
it will return "false".

You'll need to #include <asm/mshyperv.h>.

Michael

> 
> >
> >
> > Am 11.04.22 um 18:28 schrieb Alex Deucher:
> > > On Mon, Apr 11, 2022 at 11:28 AM Paul Menzel wrote:
> >
> > […]
> >
> > >> Am 11.04.22 um 15:59 schrieb Yongqiang Sun:
> > >>> MS_HYPERV with vega10 doesn't have the interface to process
> > >>> request init data msg.
> > >>
> > >> Should some Hyper-V folks be added to the reviewers list too?
> > >>
> > >>> Check hypervisor type to not send the request for MS_HYPERV.
> > >>
> > >> Please add a blank line between paragraphs.
> > >>
> > >>> Signed-off-by: Yongqiang Sun 
> > >>> ---
> > >>>drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 12 ++--
> > >>>1 file changed, 10 insertions(+), 2 deletions(-)
> > >>>
> > >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> > >>> index 933c41f77c92..56b130ec44a9 100644
> > >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> > >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
> > >>> @@ -23,6 +23,10 @@
> > >>>
> > >>>#include 
> > >>>
> > >>> +#ifdef CONFIG_X86
> > >>> +#include <asm/hypervisor.h>
> > >>> +#endif
> > >>> +
> > >>>#include 
> > >>>
> > >>>#include "amdgpu.h"
> > >>> @@ -721,8 +725,12 @@ void amdgpu_detect_virtualization(struct
> amdgpu_device *adev)
> > >>>break;
> > >>>case CHIP_VEGA10:
> > >>>soc15_set_virt_ops(adev);
> > >>> - /* send a dummy GPU_INIT_DATA request to host on 
> > >>> vega10 */
> > >>> - amdgpu_virt_request_init_data(adev);
> > >>> +#ifdef CONFIG_X86
> > >>> + /* not send GPU_INIT_DATA with MS_HYPERV*/
> > >>> + if (hypervisor_is_type(X86_HYPER_MS_HYPERV) == 
> > >>> false)
> > >>> +#endif
> > >>
> > >> Why guard everything with CONFIG_X86? (If it’s needed, it should be done
> > >> in C code.)
> > >
> > > X86_HYPER_MS_HYPERV only available on x86.
> >
> > Sorry, I missed the X86 dependency when quickly looking at the Hyper-V
> > stub IOMMU driver `drivers/iommu/hyperv-iommu.c`, but missed that
> > `HYPERV_IOMMU` has `depends on HYPERV && X86`.
> >
> >
> > Kind regards,
> >
> > Paul
> >
> >
> > >>> + /* send a dummy GPU_INIT_DATA request to 
> > >>> host on vega10 */
> > >>> + amdgpu_virt_request_init_data(adev);
> > >>>break;
> > >>>case CHIP_VEGA20:
> > >>>case CHIP_ARCTURUS:
> > >>
> > >>
> > >> Kind regards,
> > >>
> > >> Paul


Re: AMDGPU: regression on 5.17.1

2022-04-13 Thread Michele Ballabio
On Mon, 11 Apr 2022 14:34:37 -0400
Alex Deucher  wrote:

> On Sat, Apr 9, 2022 at 12:28 PM Michele Ballabio
>  wrote:
> >
> > On Tue, 5 Apr 2022 10:23:16 -0400
> > Alex Deucher  wrote:
> >  
> > > On Mon, Apr 4, 2022 at 3:39 PM Michele Ballabio
> > >  wrote:  
> > > >
> > > > On Mon, 4 Apr 2022 13:03:41 -0400
> > > > Alex Deucher  wrote:
> > > >  
> > > > > On Sun, Apr 3, 2022 at 10:19 AM Michele Ballabio
> > > > >  wrote:  
> > > > > >
> > > > > > Hi,
> > > > > > I've hit a regression on 5.17.1 (haven't tested
> > > > > > 5.17.0, but 5.16-stable didn't have this problem).
> > > > > >
> > > > > > The machine is a Ryzen 5 1600 with AMD graphics (RX 560).
> > > > > >
> > > > > > The regression I hit seems to trigger when the machine is
> > > > > > left idle at boot (I don't boot straight to X, I boot to a
> > > > > > tty, login and then start X). The machine after a while
> > > > > > blanks the screen. Usually, the screen unblanks as the
> > > > > > keyboard is hit or the mouse moves, but with kernel 5.17.1
> > > > > > the screen does not wake up. The machine seems to run
> > > > > > mostly fine: I can login from ssh, but I cannot reboot or
> > > > > > halt it: a sysrq sequence is needed for that. Note that if
> > > > > > the screen goes blank under X, it wakes up fine.
> > > > > >
> > > > > > Below a dmesg and two traces from syslog (they're quite
> > > > > > similar).  
> > > > >
> > > > > Can you bisect?  Does setting amdgpu.runpm=0 help?  
> > > >
> > > > I can try to bisect, should I narrow the search to
> > > > drivers/gpu/drm/ ?  
> > >
> > > I would just do a full bisect if possible in case the change
> > > happens to be outside of drm.
> > >  
> > > >
> > > > Setting amdgpu.runpm=0 works, the display now unblanks without
> > > > problems.  
> > >  
> >
> > Hi,
> > I bisected this, and the first bad commit is
> > [087451f372bf76d971184caa258807b7c35aac8f] drm/amdgpu: use generic
> > fb helpers instead of setting up AMD own's.
> >
> > Let me know if you need some more testing.  
> 
> Thanks.  Do the attached patches fix the issue?
> 
> Thanks,
> 
> Alex

Sorry, no. I applied them both on top of 5.17.1.
I'm pasting the output of dmesg and decode_stacktrace.sh:

dmesg:
---

[0.00] Linux version 5.17.1+ (m...@darkstar.example.org) (gcc (GCC) 
11.2.0, GNU ld version 2.38-slack151) #221 SMP PREEMPT Wed Apr 13 18:51:36 CEST 
2022
[0.00] Command line: BOOT_IMAGE=bzImage ro root=806 debug
[0.00] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating point 
registers'
[0.00] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
[0.00] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
[0.00] x86/fpu: xstate_offset[2]:  576, xstate_sizes[2]:  256
[0.00] x86/fpu: Enabled xstate features 0x7, context size is 832 bytes, 
using 'compacted' format.
[0.00] signal: max sigframe size: 1776
[0.00] BIOS-provided physical RAM map:
[0.00] BIOS-e820: [mem 0x-0x0009d3ff] usable
[0.00] BIOS-e820: [mem 0x0009d400-0x0009] reserved
[0.00] BIOS-e820: [mem 0x000e-0x000f] reserved
[0.00] BIOS-e820: [mem 0x0010-0x03ff] usable
[0.00] BIOS-e820: [mem 0x0400-0x04009fff] ACPI NVS
[0.00] BIOS-e820: [mem 0x0400a000-0x09bf] usable
[0.00] BIOS-e820: [mem 0x09c0-0x09ff] reserved
[0.00] BIOS-e820: [mem 0x0a00-0x0aff] usable
[0.00] BIOS-e820: [mem 0x0b00-0x0b01] reserved
[0.00] BIOS-e820: [mem 0x0b02-0xd4434fff] usable
[0.00] BIOS-e820: [mem 0xd4435000-0xd444] ACPI data
[0.00] BIOS-e820: [mem 0xd445-0xda830fff] usable
[0.00] BIOS-e820: [mem 0xda831000-0xda970fff] reserved
[0.00] BIOS-e820: [mem 0xda971000-0xda97afff] ACPI data
[0.00] BIOS-e820: [mem 0xda97b000-0xdaa82fff] usable
[0.00] BIOS-e820: [mem 0xdaa83000-0xdae3efff] ACPI NVS
[0.00] BIOS-e820: [mem 0xdae3f000-0xdb93efff] reserved
[0.00] BIOS-e820: [mem 0xdb93f000-0xddff] usable
[0.00] BIOS-e820: [mem 0xde00-0xdfff] reserved
[0.00] BIOS-e820: [mem 0xf800-0xfbff] reserved
[0.00] BIOS-e820: [mem 0xfd80-0xfdff] reserved
[0.00] BIOS-e820: [mem 0xfea0-0xfea0] reserved
[0.00] BIOS-e820: [mem 0xfeb8-0xfec01fff] reserved
[0.00] BIOS-e820: [mem 0xfec1-0xfec10fff] reserved
[0.00] BIOS-e820: [mem 0xfec3-0xfec30fff] reserved
[0.00] BIOS-e820: [mem 0xfed0-0xfed00fff] 

Re: [EXTERNAL] [PATCH 2/2] drm/amdkfd: Add PCIe Hotplug Support for AMDKFD

2022-04-13 Thread Andrey Grodzovsky


On 2022-04-13 12:03, Shuotao Xu wrote:



On Apr 11, 2022, at 11:52 PM, Andrey Grodzovsky 
 wrote:


[Some people who received this message don't often get email 
fromandrey.grodzov...@amd.com. Learn why this is important 
athttp://aka.ms/LearnAboutSenderIdentification.]


On 2022-04-08 21:28, Shuotao Xu wrote:


On Apr 8, 2022, at 11:28 PM, Andrey Grodzovsky 
 wrote:


[Some people who received this message don't often get email from 
andrey.grodzov...@amd.com. Learn why this is important at 
http://aka.ms/LearnAboutSenderIdentification.]


On 2022-04-08 04:45, Shuotao Xu wrote:

Adding PCIe Hotplug Support for AMDKFD: the support of hot-plug of GPU
devices can open doors for many advanced applications in data center
in the next few years, such as for GPU resource
disaggregation. Current AMDKFD does not support hotplug out b/o the
following reasons:

1. During PCIe removal, decrement KFD lock which was incremented at
the beginning of hw fini; otherwise kfd_open later is going to
fail.

I assumed you read my comment last time, still you do same approach.
More in details bellow
Aha, I like your fix:) I was not familiar with drm APIs so just only 
half understood your comment last time.


BTW, I tried hot-plugging out a GPU when rocm application is still 
running.
From dmesg, application is still trying to access the removed kfd 
device, and are met with some errors.



Application us supposed to keep running, it holds the drm_device
reference as long as it has an open
FD to the device and final cleanup will come only after the app will die
thus releasing the FD and the last
drm_device reference.


Application would hang and not exiting in this case.




Actually I tried kill -7 $pid, and the process exists. The dmesg has 
some warning though.


[  711.769977] WARNING: CPU: 23 PID: 344 at 
.../amdgpu-rocm5.0.2/src/amd/amdgpu/amdgpu_object.c:1336 
amdgpu_bo_release_notify+0x150/0x160 [amdgpu]
[  711.770528] Modules linked in: amdgpu(OE) amdttm(OE) amd_sched(OE) 
amdkcl(OE) iommu_v2 nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo 
xt_addrtype br_netfilter xt_CHECKSUM iptable_mangle xt_MASQUERADE 
iptable_nat nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 
nf_defrag_ipv4 ipt_REJECT nf_reject_ipv4 xt_tcpudp bridge stp llc 
ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter 
overlay binfmt_misc intel_rapl_msr i40iw intel_rapl_common skx_edac 
nfit x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel rpcrdma 
kvm sunrpc ipmi_ssif ib_umad ib_ipoib rdma_ucm irqbypass rapl joydev 
acpi_ipmi input_leds intel_cstate ipmi_si ipmi_devintf mei_me mei 
intel_pch_thermal ipmi_msghandler ioatdma mac_hid lpc_ich dca 
acpi_power_meter acpi_pad sch_fq_codel ib_iser rdma_cm iw_cm ib_cm 
iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi pci_stub 
ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10 
raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor
[  711.779359]  raid6_pq libcrc32c raid1 raid0 multipath linear 
mlx5_ib ib_uverbs ib_core ast drm_vram_helper i2c_algo_bit 
drm_ttm_helper ttm drm_kms_helper syscopyarea crct10dif_pclmul 
crc32_pclmul ghash_clmulni_intel sysfillrect uas hid_generic sysimgblt 
aesni_intel mlx5_core fb_sys_fops crypto_simd usbhid cryptd drm i40e 
pci_hyperv_intf usb_storage glue_helper mlxfw hid ahci libahci wmi
[  711.779752] CPU: 23 PID: 344 Comm: kworker/23:1 Tainted: G        W 
 OE     5.11.0+ #1
[  711.779755] Hardware name: Supermicro 
SYS-4029GP-TRT2/X11DPG-OT-CPU, BIOS 2.1 08/14/2018

[  711.779756] Workqueue: kfd_process_wq kfd_process_wq_release [amdgpu]
[  711.779955] RIP: 0010:amdgpu_bo_release_notify+0x150/0x160 [amdgpu]
[  711.780141] Code: e8 b5 af 34 f4 e9 1f ff ff ff 48 39 c2 74 07 0f 
0b e9 69 ff ff ff 4c 89 e7 e8 3c b4 16 00 e9 5c ff ff ff e8 a2 ce fd 
f3 eb cf <0f> 0b eb cb e8 d7 02 34 f4 0f 1f 80 00 00 00 00 0f 1f 44 00 
00 55

[  711.780143] RSP: 0018:a8100dd67c30 EFLAGS: 00010282
[  711.780145] RAX: ffea RBX: 89980e792058 RCX: 

[  711.780147] RDX:  RSI: 89a8f9ad8870 RDI: 
89a8f9ad8870
[  711.780148] RBP: a8100dd67c50 R08:  R09: 
fff99b18
[  711.780149] R10: a8100dd67bd0 R11: a8100dd67908 R12: 
89980e792000
[  711.780151] R13: 89980e792058 R14: 89980e7921bc R15: 
dead0100
[  711.780152] FS:  () GS:89a8f9ac() 
knlGS:

[  711.780154] CS:  0010 DS:  ES:  CR0: 80050033
[  711.780156] CR2: 7ffddac6f71f CR3: 0030bb80a003 CR4: 
007706e0
[  711.780157] DR0:  DR1:  DR2: 

[  711.780159] DR3:  DR6: fffe0ff0 DR7: 
0400

[  711.780160] PKRU: 5554
[  711.780161] Call Trace:
[  711.780163]  ttm_bo_release+0x2ae/0x320 [amdttm]
[  711.780169]  amdttm_bo_put+0x30/0x40 [amdttm]
[  711.780357]  amdgpu_bo_unref+0x1e/0x30 [amdgpu]
[  711.780543]  

[PATCH] drm/radeon: Add build directory to include path

2022-04-13 Thread Michel Dänzer
From: Michel Dänzer 

Fixes compile errors with out-of-tree builds, e.g.

../drivers/gpu/drm/radeon/r420.c:38:10: fatal error: r420_reg_safe.h: No such 
file or directory
   38 | #include "r420_reg_safe.h"
  |  ^

Signed-off-by: Michel Dänzer 
---
 drivers/gpu/drm/radeon/Makefile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/radeon/Makefile b/drivers/gpu/drm/radeon/Makefile
index 11c97edde54d..37caf5236048 100644
--- a/drivers/gpu/drm/radeon/Makefile
+++ b/drivers/gpu/drm/radeon/Makefile
@@ -3,6 +3,8 @@
 # Makefile for the drm device driver.  This driver provides support for the
 # Direct Rendering Infrastructure (DRI) in XFree86 4.1.0 and higher.
 
+ccflags-y += -I$(src)
+
 hostprogs := mkregtable
 targets := rn50_reg_safe.h r100_reg_safe.h r200_reg_safe.h rv515_reg_safe.h 
r300_reg_safe.h r420_reg_safe.h rs600_reg_safe.h r600_reg_safe.h 
evergreen_reg_safe.h cayman_reg_safe.h
 
-- 
2.35.1



Re: [EXTERNAL] [PATCH 2/2] drm/amdkfd: Add PCIe Hotplug Support for AMDKFD

2022-04-13 Thread Shuotao Xu


On Apr 11, 2022, at 11:52 PM, Andrey Grodzovsky 
mailto:andrey.grodzov...@amd.com>> wrote:

[Some people who received this message don't often get email from 
andrey.grodzov...@amd.com. Learn why this is 
important at http://aka.ms/LearnAboutSenderIdentification.]

On 2022-04-08 21:28, Shuotao Xu wrote:

On Apr 8, 2022, at 11:28 PM, Andrey Grodzovsky 
mailto:andrey.grodzov...@amd.com>> wrote:

[Some people who received this message don't often get email from 
andrey.grodzov...@amd.com. Learn why this is 
important at http://aka.ms/LearnAboutSenderIdentification.]

On 2022-04-08 04:45, Shuotao Xu wrote:
Adding PCIe Hotplug Support for AMDKFD: the support of hot-plug of GPU
devices can open doors for many advanced applications in data center
in the next few years, such as for GPU resource
disaggregation. Current AMDKFD does not support hotplug out b/o the
following reasons:

1. During PCIe removal, decrement KFD lock which was incremented at
the beginning of hw fini; otherwise kfd_open later is going to
fail.
I assumed you read my comment last time, still you do same approach.
More in details bellow
Aha, I like your fix:) I was not familiar with drm APIs so just only half 
understood your comment last time.

BTW, I tried hot-plugging out a GPU when rocm application is still running.
From dmesg, application is still trying to access the removed kfd device, and 
are met with some errors.


Application is supposed to keep running, it holds the drm_device
reference as long as it has an open
FD to the device and final cleanup will come only after the app will die
thus releasing the FD and the last
drm_device reference.

Application would hang and not exit in this case.


Actually I tried kill -7 $pid, and the process exists. The dmesg has some 
warning though.

[  711.769977] WARNING: CPU: 23 PID: 344 at 
.../amdgpu-rocm5.0.2/src/amd/amdgpu/amdgpu_object.c:1336 
amdgpu_bo_release_notify+0x150/0x160 [amdgpu]
[  711.770528] Modules linked in: amdgpu(OE) amdttm(OE) amd_sched(OE) 
amdkcl(OE) iommu_v2 nf_conntrack_netlink nfnetlink xfrm_user xfrm_algo 
xt_addrtype br_netfilter xt_CHECKSUM iptable_mangle xt_MASQUERADE iptable_nat 
nf_nat xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 ipt_REJECT 
nf_reject_ipv4 xt_tcpudp bridge stp llc ebtable_filter ebtables ip6table_filter 
ip6_tables iptable_filter overlay binfmt_misc intel_rapl_msr i40iw 
intel_rapl_common skx_edac nfit x86_pkg_temp_thermal intel_powerclamp coretemp 
kvm_intel rpcrdma kvm sunrpc ipmi_ssif ib_umad ib_ipoib rdma_ucm irqbypass rapl 
joydev acpi_ipmi input_leds intel_cstate ipmi_si ipmi_devintf mei_me mei 
intel_pch_thermal ipmi_msghandler ioatdma mac_hid lpc_ich dca acpi_power_meter 
acpi_pad sch_fq_codel ib_iser rdma_cm iw_cm ib_cm iscsi_tcp libiscsi_tcp 
libiscsi scsi_transport_iscsi pci_stub ip_tables x_tables autofs4 btrfs 
blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy 
async_pq async_xor async_tx xor
[  711.779359]  raid6_pq libcrc32c raid1 raid0 multipath linear mlx5_ib 
ib_uverbs ib_core ast drm_vram_helper i2c_algo_bit drm_ttm_helper ttm 
drm_kms_helper syscopyarea crct10dif_pclmul crc32_pclmul ghash_clmulni_intel 
sysfillrect uas hid_generic sysimgblt aesni_intel mlx5_core fb_sys_fops 
crypto_simd usbhid cryptd drm i40e pci_hyperv_intf usb_storage glue_helper 
mlxfw hid ahci libahci wmi
[  711.779752] CPU: 23 PID: 344 Comm: kworker/23:1 Tainted: GW  OE 
5.11.0+ #1
[  711.779755] Hardware name: Supermicro SYS-4029GP-TRT2/X11DPG-OT-CPU, BIOS 
2.1 08/14/2018
[  711.779756] Workqueue: kfd_process_wq kfd_process_wq_release [amdgpu]
[  711.779955] RIP: 0010:amdgpu_bo_release_notify+0x150/0x160 [amdgpu]
[  711.780141] Code: e8 b5 af 34 f4 e9 1f ff ff ff 48 39 c2 74 07 0f 0b e9 69 
ff ff ff 4c 89 e7 e8 3c b4 16 00 e9 5c ff ff ff e8 a2 ce fd f3 eb cf <0f> 0b eb 
cb e8 d7 02 34 f4 0f 1f 80 00 00 00 00 0f 1f 44 00 00 55
[  711.780143] RSP: 0018:a8100dd67c30 EFLAGS: 00010282
[  711.780145] RAX: ffea RBX: 89980e792058 RCX: 
[  711.780147] RDX:  RSI: 89a8f9ad8870 RDI: 89a8f9ad8870
[  711.780148] RBP: a8100dd67c50 R08:  R09: fff99b18
[  711.780149] R10: a8100dd67bd0 R11: a8100dd67908 R12: 89980e792000
[  711.780151] R13: 89980e792058 R14: 89980e7921bc R15: dead0100
[  711.780152] FS:  () GS:89a8f9ac() 
knlGS:
[  711.780154] CS:  0010 DS:  ES:  CR0: 80050033
[  711.780156] CR2: 7ffddac6f71f CR3: 0030bb80a003 CR4: 007706e0
[  711.780157] DR0:  DR1:  DR2: 
[  711.780159] DR3:  DR6: fffe0ff0 DR7: 0400
[  711.780160] PKRU: 5554
[  711.780161] Call Trace:
[  711.780163]  ttm_bo_release+0x2ae/0x320 [amdttm]
[  711.780169]  amdttm_bo_put+0x30/0x40 [amdttm]
[  

[PATCH 2/2] drm/amdkfd: CRIU add support for GWS queues

2022-04-13 Thread David Yat Sin
Adding support to checkpoint/restore GWS(Global Wave Sync) queues.

Signed-off-by: David Yat Sin 
---
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h |  4 ++--
 .../amd/amdkfd/kfd_process_queue_manager.c| 22 ++-
 3 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index e1e2362841f8..b2c72ddd9c1c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1377,7 +1377,7 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
goto out_unlock;
}
 
-   retval = pqm_set_gws(>pqm, args->queue_id, args->num_gws ? dev->gws 
: NULL);
+   retval = pqm_set_gws(>pqm, args->queue_id, args->num_gws ? dev->gws 
: NULL, false);
mutex_unlock(>mutex);
 
args->first_gws = 0;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index f36062be9ca8..3ec32675210c 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1102,7 +1102,7 @@ struct kfd_criu_queue_priv_data {
uint32_t priority;
uint32_t q_percent;
uint32_t doorbell_id;
-   uint32_t is_gws;
+   uint32_t gws;
uint32_t sdma_id;
uint32_t eop_ring_buffer_size;
uint32_t ctx_save_restore_area_size;
@@ -1199,7 +1199,7 @@ int pqm_update_queue_properties(struct 
process_queue_manager *pqm, unsigned int
 int pqm_update_mqd(struct process_queue_manager *pqm, unsigned int qid,
struct mqd_update_info *minfo);
 int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
-   void *gws);
+   void *gws, bool criu_restore);
 struct kernel_queue *pqm_get_kernel_queue(struct process_queue_manager *pqm,
unsigned int qid);
 struct queue *pqm_get_user_queue(struct process_queue_manager *pqm,
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 6eca9509f2e3..3eb9d43fdaac 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -91,7 +91,7 @@ void kfd_process_dequeue_from_device(struct 
kfd_process_device *pdd)
 }
 
 int pqm_set_gws(struct process_queue_manager *pqm, unsigned int qid,
-   void *gws)
+   void *gws, bool criu_restore)
 {
struct kfd_dev *dev = NULL;
struct process_queue_node *pqn;
@@ -135,8 +135,14 @@ int pqm_set_gws(struct process_queue_manager *pqm, 
unsigned int qid,
pqn->q->gws = mem;
pdd->qpd.num_gws = gws ? dev->adev->gds.gws_size : 0;
 
-   return pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
-   pqn->q, NULL);
+   ret = pqn->q->device->dqm->ops.update_queue(pqn->q->device->dqm,
+   pqn->q, NULL);
+
+   if (!ret && criu_restore) {
+   dev->dqm->gws_queue_count++;
+   pdd->qpd.mapped_gws_queue = true;
+   }
+   return ret;
 }
 
 void kfd_process_dequeue_from_all_devices(struct kfd_process *p)
@@ -636,6 +642,8 @@ static int criu_checkpoint_queue(struct kfd_process_device 
*pdd,
q_data->ctx_save_restore_area_size =
q->properties.ctx_save_restore_area_size;
 
+   q_data->gws = !!q->gws;
+
ret = pqm_checkpoint_mqd(>process->pqm, q->properties.queue_id, 
mqd, ctl_stack);
if (ret) {
pr_err("Failed checkpoint queue_mqd (%d)\n", ret);
@@ -743,7 +751,6 @@ static void set_queue_properties_from_criu(struct 
queue_properties *qp,
  struct kfd_criu_queue_priv_data 
*q_data)
 {
qp->is_interop = false;
-   qp->is_gws = q_data->is_gws;
qp->queue_percent = q_data->q_percent;
qp->priority = q_data->priority;
qp->queue_address = q_data->q_address;
@@ -826,12 +833,15 @@ int kfd_criu_restore_queue(struct kfd_process *p,
NULL);
if (ret) {
pr_err("Failed to create new queue err:%d\n", ret);
-   ret = -EINVAL;
+   goto exit;
}
 
+   if (q_data->gws)
+   ret = pqm_set_gws(>pqm, q_data->q_id, pdd->dev->gws, true);
+
 exit:
if (ret)
-   pr_err("Failed to create queue (%d)\n", ret);
+   pr_err("Failed to restore queue (%d)\n", ret);
else
pr_debug("Queue id %d was restored successfully\n", queue_id);
 
-- 
2.30.2



[PATCH 1/2] drm/amdkfd: Fix GWS queue count

2022-04-13 Thread David Yat Sin
Queue can be inactive during process termination. This would cause
dqm->gws_queue_count to not be decremented. There can only be 1 GWS
queue per device process, so move the logic out of the loop.

Signed-off-by: David Yat Sin 
---
 .../gpu/drm/amd/amdkfd/kfd_device_queue_manager.c| 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index acf4f7975850..7c107b88d944 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -1945,17 +1945,17 @@ static int process_termination_cpsch(struct 
device_queue_manager *dqm,
else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)
deallocate_sdma_queue(dqm, q);
 
-   if (q->properties.is_active) {
+   if (q->properties.is_active)
decrement_queue_count(dqm, q->properties.type);
-   if (q->properties.is_gws) {
-   dqm->gws_queue_count--;
-   qpd->mapped_gws_queue = false;
-   }
-   }
 
dqm->total_queue_count--;
}
 
+   if (qpd->mapped_gws_queue) {
+   dqm->gws_queue_count--;
+   qpd->mapped_gws_queue = false;
+   }
+
/* Unregister process */
list_for_each_entry_safe(cur, next_dpn, >queues, list) {
if (qpd == cur->qpd) {
-- 
2.30.2



Re: [PATCHv4] drm/amdgpu: disable ASPM on Intel Alder Lake based systems

2022-04-13 Thread Nathan Chancellor
Hi Richard,

On Tue, Apr 12, 2022 at 04:50:00PM -0500, Richard Gong wrote:
> Active State Power Management (ASPM) feature is enabled since kernel 5.14.
> There are some AMD GFX cards (such as WX3200 and RX640) that won't work
> with ASPM-enabled Intel Alder Lake based systems. Using these GFX cards as
> video/display output, Intel Alder Lake based systems will hang during
> suspend/resume.
> 
> The issue was initially reported on one system (Dell Precision 3660 with
> BIOS version 0.14.81), but was later confirmed to affect at least 4 Alder
> Lake based systems.
> 
> Add extra check to disable ASPM on Intel Alder Lake based systems.
> 
> Fixes: 0064b0ce85bb ("drm/amd/pm: enable ASPM by default")
> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1885
> Reported-by: kernel test robot 
> Signed-off-by: Richard Gong 
> ---
> v4: s/CONFIG_X86_64/CONFIG_X86
> enhanced check logic
> v3: s/intel_core_asom_chk/aspm_support_quirk_check
> correct build error with W=1 option
> v2: correct commit description
> move the check from chip family to problematic platform
> ---
>  drivers/gpu/drm/amd/amdgpu/vi.c | 17 -
>  1 file changed, 16 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
> index 039b90cdc3bc..b33e0a9bee65 100644
> --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> @@ -81,6 +81,10 @@
>  #include "mxgpu_vi.h"
>  #include "amdgpu_dm.h"
>  
> +#if IS_ENABLED(CONFIG_X86)
> +#include 
> +#endif
> +
>  #define ixPCIE_LC_L1_PM_SUBSTATE 0x100100C6
>  #define PCIE_LC_L1_PM_SUBSTATE__LC_L1_SUBSTATES_OVERRIDE_EN_MASK 
> 0x0001L
>  #define PCIE_LC_L1_PM_SUBSTATE__LC_PCI_PM_L1_2_OVERRIDE_MASK 0x0002L
> @@ -1134,13 +1138,24 @@ static void vi_enable_aspm(struct amdgpu_device *adev)
>   WREG32_PCIE(ixPCIE_LC_CNTL, data);
>  }
>  
> +static bool aspm_support_quirk_check(void)
> +{
> + if (IS_ENABLED(CONFIG_X86)) {
> + struct cpuinfo_x86 *c = _data(0);
> +
> + return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
> + }

I have not seen this reported by a bot, sorry if it is a duplicate. This
breaks non-x86 builds (arm64 allmodconfig for example):

drivers/gpu/drm/amd/amdgpu/vi.c:1144:28: error: implicit declaration of 
function 'cpu_data' is invalid in C99 [-Werror,-Wimplicit-function-declaration]
struct cpuinfo_x86 *c = _data(0);
 ^
drivers/gpu/drm/amd/amdgpu/vi.c:1144:27: error: cannot take the address of an 
rvalue of type 'int'
struct cpuinfo_x86 *c = _data(0);
^~~~
drivers/gpu/drm/amd/amdgpu/vi.c:1146:13: error: incomplete definition of type 
'struct cpuinfo_x86'
return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
 ~^
drivers/gpu/drm/amd/amdgpu/vi.c:1144:10: note: forward declaration of 'struct 
cpuinfo_x86'
struct cpuinfo_x86 *c = _data(0);
   ^
drivers/gpu/drm/amd/amdgpu/vi.c:1146:28: error: incomplete definition of type 
'struct cpuinfo_x86'
return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
~^
drivers/gpu/drm/amd/amdgpu/vi.c:1144:10: note: forward declaration of 'struct 
cpuinfo_x86'
struct cpuinfo_x86 *c = _data(0);
   ^
drivers/gpu/drm/amd/amdgpu/vi.c:1146:43: error: use of undeclared identifier 
'INTEL_FAM6_ALDERLAKE'
return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
^
5 errors generated.

'struct cpuinfo_x86' is only defined for CONFIG_X86 so this section
needs to guarded with the preprocessor, which is how it was done in v2.
Please go back to that.

Cheers,
Nathan


[PATCH] drm/amd/amdgpu: Remove static from variable in RLCG Reg RW.

2022-04-13 Thread Gavin Wan
[why] These static variables save the RLC Scratch register addresses.
  When multiple GPUs are installed (for example: an XGMI setup) and
  multiple GPUs call the function at the same time, the RLC Scratch
  register addresses overwrite each other, which causes reads/writes
  to go to the wrong GPU.

[fix] Remove the static from the variables. The variables are now
  on the stack.

Signed-off-by: Gavin Wan 
Change-Id: Iee78849291d4f7a9688ecc5165bec70ee85cdfbe
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index d5eea031c3e3..d18a05a20566 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -868,11 +868,11 @@ static u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device 
*adev, u32 offset, u32 v
uint32_t timeout = 5;
uint32_t i, tmp;
uint32_t ret = 0;
-   static void *scratch_reg0;
-   static void *scratch_reg1;
-   static void *scratch_reg2;
-   static void *scratch_reg3;
-   static void *spare_int;
+   void *scratch_reg0;
+   void *scratch_reg1;
+   void *scratch_reg2;
+   void *scratch_reg3;
+   void *spare_int;
 
if (!adev->gfx.rlc.rlcg_reg_access_supported) {
dev_err(adev->dev,
-- 
2.32.0



Re: [PATCH] drm/amdkfd: potential NULL dereference in kfd_set/reset_event()

2022-04-13 Thread Felix Kuehling



Am 2022-04-13 um 03:36 schrieb Dan Carpenter:

If lookup_event_by_id() returns a NULL "ev" pointer then the
spin_lock(>lock) will crash.  This was detected by Smatch:

 drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_events.c:644 kfd_set_event()
 error: we previously assumed 'ev' could be null (see line 639)

Fixes: 5273e82c5f47 ("drm/amdkfd: Improve concurrency of event handling")
Signed-off-by: Dan Carpenter 


The patch is

Reviewed-by: Felix Kuehling 

I will apply it to amd-staging-drm-next. Thank you!



---
  drivers/gpu/drm/amd/amdkfd/kfd_events.c | 14 --
  1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 0fef24b0b915..15570e72a5dd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -634,14 +634,19 @@ int kfd_set_event(struct kfd_process *p, uint32_t 
event_id)
rcu_read_lock();
  
  	ev = lookup_event_by_id(p, event_id);

+   if (!ev) {
+   ret = -EINVAL;
+   goto unlock_rcu;
+   }
spin_lock(>lock);
  
-	if (ev && event_can_be_cpu_signaled(ev))

+   if (event_can_be_cpu_signaled(ev))
set_event(ev);
else
ret = -EINVAL;
  
  	spin_unlock(>lock);

+unlock_rcu:
rcu_read_unlock();
return ret;
  }
@@ -660,14 +665,19 @@ int kfd_reset_event(struct kfd_process *p, uint32_t 
event_id)
rcu_read_lock();
  
  	ev = lookup_event_by_id(p, event_id);

+   if (!ev) {
+   ret = -EINVAL;
+   goto unlock_rcu;
+   }
spin_lock(>lock);
  
-	if (ev && event_can_be_cpu_signaled(ev))

+   if (event_can_be_cpu_signaled(ev))
reset_event(ev);
else
ret = -EINVAL;
  
  	spin_unlock(>lock);

+unlock_rcu:
rcu_read_unlock();
return ret;
  


Re: [PATCH v3 1/1] amdgpu/pm: Clarify documentation of error handling in send_smc_mesg

2022-04-13 Thread Luben Tuikov
Looks good! Thanks.

Reviewed-by: Luben Tuikov 

On 2022-04-13 01:08, Darren Powell wrote:
> Clarify the smu_cmn_send_smc_msg_with_param documentation to mention two
> cases exist where messages are silently dropped with no error returned.
> These cases occur in unusual situations where either:
>  1. the message type is not allowed to a virtual GPU, or
>  2. a PCI recovery is underway and the HW is not yet in sync with the SW
> 
> For more details see
>  commit 4ea5081c82c4 ("drm/amd/powerplay: enable SMC message filter")
>  commit bf36b52e781d ("drm/amdgpu: Avoid accessing HW when suspending SW 
> state")
> 
> (v2)
>   Reworked with suggestions from Luben & Paul
> 
> (v3)
>   Updated wording as per Luben's feedback
>   Corrected error stating all messages denied on virtual GPU
>   (each GPU has mask of which messages are allowed)
> 
> Signed-off-by: Darren Powell 
> ---
>  drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c 
> b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> index b8d0c70ff668..f12319883a80 100644
> --- a/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> +++ b/drivers/gpu/drm/amd/pm/swsmu/smu_cmn.c
> @@ -356,9 +356,11 @@ int smu_cmn_wait_for_response(struct smu_context *smu)
>   * completion of the command, and return back a value from the SMU in
>   * @read_arg pointer.
>   *
> - * Return 0 on success, -errno on error, if we weren't able to send
> - * the message or if the message completed with some kind of
> - * error. See __smu_cmn_reg2errno() for details of the -errno.
> + * Return 0 on success, -errno when a problem is encountered sending
> + * message or receiving reply. If there is a PCI bus recovery or
> + * the destination is a virtual GPU which does not allow this message
> + * type, the message is simply dropped and success is also returned.
> + * See __smu_cmn_reg2errno() for details of the -errno.
>   *
>   * If we weren't able to send the message to the SMU, we also print
>   * the error to the standard log.
> 
> base-commit: 4585c45a6a66cb17cc97f4370457503746e540b7

Regards,
-- 
Luben


RE: [PATCHv4] drm/amdgpu: disable ASPM on Intel Alder Lake based systems

2022-04-13 Thread Limonciello, Mario
[Public]

 
> On Wed, Apr 13, 2022 at 3:43 AM Paul Menzel 
> wrote:
> >
> > Dear Richard,
> >
> >
> > Thank you for sending out v4.
> >
> > Am 12.04.22 um 23:50 schrieb Richard Gong:
> > > Active State Power Management (ASPM) feature is enabled since kernel
> 5.14.
> > > There are some AMD GFX cards (such as WX3200 and RX640) that won't
> work
> > > with ASPM-enabled Intel Alder Lake based systems. Using these GFX
> cards as
> > > video/display output, Intel Alder Lake based systems will hang during
> > > suspend/resume.
> >
> > I am still not clear, what "hang during suspend/resume" means. I guess
> > suspending works fine? During resume (S3 or S0ix?), where does it hang?
> > The system is functional, but there are only display problems?

I believe Intel would need to identify the state of the SOC to determine where
the PCIE problem actually occurs; on the way down or up.

As he said in the commit message it results in a hang.

> >
> > > The issue was initially reported on one system (Dell Precision 3660 with
> > > BIOS version 0.14.81), but was later confirmed to affect at least 4 Alder
> > > Lake based systems.
> > >
> > > Add extra check to disable ASPM on Intel Alder Lake based systems.
> > >
> > > Fixes: 0064b0ce85bb ("drm/amd/pm: enable ASPM by default")
> > > Link:
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitla
> b.freedesktop.org%2Fdrm%2Famd%2F-
> %2Fissues%2F1885data=04%7C01%7Cmario.limonciello%40amd.com%
> 7Cfe4b6b553c3b47c1288f08da1d4da9c8%7C3dd8961fe4884e608e11a82d994e
> 183d%7C0%7C0%7C637854516675116782%7CUnknown%7CTWFpbGZsb3d8ey
> JWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%
> 7C3000sdata=gvFmP1HQP%2FyzLfT0gYYCupAQBIG%2FtiDYelQNqTLAx
> ck%3Dreserved=0
> > > Reported-by: kernel test robot 
> >
> > This tag is a little confusing. Maybe clarify that it was for an issue
> > in a previous patch iteration?
> >
> > > Signed-off-by: Richard Gong 
> > > ---
> > > v4: s/CONFIG_X86_64/CONFIG_X86
> > >  enhanced check logic
> > > v3: s/intel_core_asom_chk/aspm_support_quirk_check
> > >  correct build error with W=1 option
> > > v2: correct commit description
> > >  move the check from chip family to problematic platform
> > > ---
> > >   drivers/gpu/drm/amd/amdgpu/vi.c | 17 -
> > >   1 file changed, 16 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c
> b/drivers/gpu/drm/amd/amdgpu/vi.c
> > > index 039b90cdc3bc..b33e0a9bee65 100644
> > > --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> > > +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> > > @@ -81,6 +81,10 @@
> > >   #include "mxgpu_vi.h"
> > >   #include "amdgpu_dm.h"
> > >
> > > +#if IS_ENABLED(CONFIG_X86)
> > > +#include 
> > > +#endif
> > > +
> > >   #define ixPCIE_LC_L1_PM_SUBSTATE0x100100C6
> > >   #define
> PCIE_LC_L1_PM_SUBSTATE__LC_L1_SUBSTATES_OVERRIDE_EN_MASK
> 0x0001L
> > >   #define
> PCIE_LC_L1_PM_SUBSTATE__LC_PCI_PM_L1_2_OVERRIDE_MASK
> 0x0002L
> > > @@ -1134,13 +1138,24 @@ static void vi_enable_aspm(struct
> amdgpu_device *adev)
> > >   WREG32_PCIE(ixPCIE_LC_CNTL, data);
> > >   }
> > >
> > > +static bool aspm_support_quirk_check(void)
> > > +{
> > > + if (IS_ENABLED(CONFIG_X86)) {
> > > + struct cpuinfo_x86 *c = _data(0);
> > > +
> > > + return !(c->x86 == 6 && c->x86_model ==
> INTEL_FAM6_ALDERLAKE);

Don't you need to check x86_vendor?  Although extremely unlikely if you don't
check x86_vendor nothing to stop another X86 manufacturer from having a
design that has same model # as INTEL_FAM6_ALDERLAKE.

> > > + }
> > > +
> > > + return true;
> > > +}
> > > +
> > >   static void vi_program_aspm(struct amdgpu_device *adev)
> > >   {
> > >   u32 data, data1, orig;
> > >   bool bL1SS = false;
> > >   bool bClkReqSupport = true;
> > >
> > > - if (!amdgpu_device_should_use_aspm(adev))
> > > + if (!amdgpu_device_should_use_aspm(adev) ||
> !aspm_support_quirk_check())
> > >   return;
> >
> > Can users still forcefully enable ASPM with the parameter `amdgpu.aspm`?

amdgpu.aspm is module wide not just for one card.  That is it covers all AMD 
GPU cards
in the system.  If it's set to 1 or pcie_aspm_enabled returns true it will 
enable for other
GPUs besides these.

There is the possibility to move this quirk check within " 
amdgpu_device_should_use_aspm"
and only match this combination when set to amdgpu.aspm is set to "-1" (the 
default), something
like this:

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1335,6 +1335,8 @@ bool amdgpu_device_should_use_aspm(struct amdgpu_device 
*adev)
default:
return false;
}
+   if (amdgpu_device_is_quirked_for_aspm(adev))
+   return false;
return pcie_aspm_enabled(adev->pdev);
 }


> >
> > >
> > >   if (adev->flags & AMD_IS_APU ||
> >
> > If I remember correctly, there were also newer cards, where ASPM worked
> 

Re: [PATCH v2] Revert "drm/amd/display: Pass HostVM enable flag into DCN3.1 DML"

2022-04-13 Thread Alex Deucher
On Tue, Apr 12, 2022 at 5:03 PM Rodrigo Siqueira
 wrote:
>
> This reverts commit 367b3e934f578f6c0d5d8ca5987dc6ac4cd6831d.
>
> While we were testing DCN3.1 with a hub, we noticed that only one of 2
> connected displays lights up when using some specific display
> resolution. In summary, this was the setup:
>
> 1. Displays:
>  * Sharp LQ156M1JW26 (eDP): 1080@240
>  * BENQ SW320 (DP): 4k@60
>  * BENQ EX3203R (DP): 4k@60
> 2. Hub: Club3D CSV-7300
> 3. ASIC: DCN3.1
>
> After bisecting this issue, we figured out the commit mentioned above
> introduced this issue. We are investigating why this patch introduced
> this regression, but we need to revert it for now.
>
> Cc: Harry Wentland 
> Cc: Mark Broadworth 
> Cc: Michael Strauss 
> Signed-off-by: Rodrigo Siqueira 

Acked-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c 
> b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c
> index 6cc580be7c79..5b3f0c2dfb55 100644
> --- a/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c
> +++ b/drivers/gpu/drm/amd/display/dc/dcn31/dcn31_resource.c
> @@ -1668,7 +1668,6 @@ int dcn31_populate_dml_pipes_from_context(
> pipes[pipe_cnt].pipe.src.immediate_flip = true;
>
> pipes[pipe_cnt].pipe.src.unbounded_req_mode = false;
> -   pipes[pipe_cnt].pipe.src.hostvm = 
> dc->res_pool->hubbub->riommu_active;
> pipes[pipe_cnt].pipe.src.gpuvm = true;
> pipes[pipe_cnt].pipe.src.dcc_fraction_of_zs_req_luma = 0;
> pipes[pipe_cnt].pipe.src.dcc_fraction_of_zs_req_chroma = 0;
> --
> 2.25.1
>


Re: gcc inserts __builtin_popcount, causes 'modpost: "__popcountdi2" ... amdgpu.ko] undefined'

2022-04-13 Thread Sergei Trofimovich
On Mon, Apr 11, 2022 at 10:08:15PM +0100, Sergei Trofimovich wrote:
> Current linux-5.17.1 on fresh gcc-12 fails to build with errors like:
> 
> ERROR: modpost: "__popcountdi2" 
> [drivers/net/ethernet/broadcom/bnx2x/bnx2x.ko] undefined!
> ERROR: modpost: "__popcountdi2" [drivers/gpu/drm/amd/amdgpu/amdgpu.ko] 
> undefined!
> 
> It is occasionally seen by others on previous gcc versions as well:
> 
> https://lkml.org/lkml/2021/7/11/261
> https://lkml.org/lkml/2018/10/24/403
> 
> '__popcountdi2' are inserted by gcc for code like the following
> from 'drivers/gpu/drm/amd/display/modules/hdcp/hdcp1_execution.c':
> 
> static inline enum mod_hdcp_status validate_bksv(struct mod_hdcp *hdcp)
> {
> uint64_t n = 0;
> uint8_t count = 0;
> u8 bksv[sizeof(n)] = { };
> 
> memcpy(bksv, hdcp->auth.msg.hdcp1.bksv, 
> sizeof(hdcp->auth.msg.hdcp1.bksv));
> n = *(uint64_t *)bksv;
> 
> /* Here gcc inserts 'count = __builtin_popcount(n);' */
> while (n) {
> count++;
> n &= (n - 1);
> }
> 
> return (count == 20) ? MOD_HDCP_STATUS_SUCCESS :
>MOD_HDCP_STATUS_HDCP1_INVALID_BKSV;
> }
> 
> Note that gcc can insert it regardless of -mno-* options.
> 
> How should linux.git handle it? A few options come to mind:
> 
> - Perhaps use libgcc.a directly.
> - Just implement '__popcountdi2'. Example definition from libgcc:
>   https://gcc.gnu.org/git/?p=gcc.git;a=blob;f=libgcc/libgcc2.c;hb=HEAD#l846
> - Or workaround it with -fno-builtin-popcount in Makefiles.
> 
> CCing people who can help routing it and/or deciding on the fix:
> amd-gfx@lists.freedesktop.org, Joe Perches, linux-kbu...@vger.kernel.org,
> Jakub Jelinek, Segher Boessenkool, Thomas Gleixner,Peter Zijlstra, Andy
> Lutomirski.

There is now a discussion in gcc bugzilla:

https://gcc.gnu.org/PR105253

-- 

  Sergei


Re: [PATCHv4] drm/amdgpu: disable ASPM on Intel Alder Lake based systems

2022-04-13 Thread Alex Deucher
On Wed, Apr 13, 2022 at 3:43 AM Paul Menzel  wrote:
>
> Dear Richard,
>
>
> Thank you for sending out v4.
>
> Am 12.04.22 um 23:50 schrieb Richard Gong:
> > Active State Power Management (ASPM) feature is enabled since kernel 5.14.
> > There are some AMD GFX cards (such as WX3200 and RX640) that won't work
> > with ASPM-enabled Intel Alder Lake based systems. Using these GFX cards as
> > video/display output, Intel Alder Lake based systems will hang during
> > suspend/resume.
>
> I am still not clear, what “hang during suspend/resume” means. I guess
> suspending works fine? During resume (S3 or S0ix?), where does it hang?
> The system is functional, but there are only display problems?
>
> > The issue was initially reported on one system (Dell Precision 3660 with
> > BIOS version 0.14.81), but was later confirmed to affect at least 4 Alder
> > Lake based systems.
> >
> > Add extra check to disable ASPM on Intel Alder Lake based systems.
> >
> > Fixes: 0064b0ce85bb ("drm/amd/pm: enable ASPM by default")
> > Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1885
> > Reported-by: kernel test robot 
>
> This tag is a little confusing. Maybe clarify that it was for an issue
> in a previous patch iteration?
>
> > Signed-off-by: Richard Gong 
> > ---
> > v4: s/CONFIG_X86_64/CONFIG_X86
> >  enhanced check logic
> > v3: s/intel_core_asom_chk/aspm_support_quirk_check
> >  correct build error with W=1 option
> > v2: correct commit description
> >  move the check from chip family to problematic platform
> > ---
> >   drivers/gpu/drm/amd/amdgpu/vi.c | 17 -
> >   1 file changed, 16 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c 
> > b/drivers/gpu/drm/amd/amdgpu/vi.c
> > index 039b90cdc3bc..b33e0a9bee65 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/vi.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/vi.c
> > @@ -81,6 +81,10 @@
> >   #include "mxgpu_vi.h"
> >   #include "amdgpu_dm.h"
> >
> > +#if IS_ENABLED(CONFIG_X86)
> > +#include 
> > +#endif
> > +
> >   #define ixPCIE_LC_L1_PM_SUBSTATE0x100100C6
> >   #define PCIE_LC_L1_PM_SUBSTATE__LC_L1_SUBSTATES_OVERRIDE_EN_MASK
> > 0x0001L
> >   #define PCIE_LC_L1_PM_SUBSTATE__LC_PCI_PM_L1_2_OVERRIDE_MASK
> > 0x0002L
> > @@ -1134,13 +1138,24 @@ static void vi_enable_aspm(struct amdgpu_device 
> > *adev)
> >   WREG32_PCIE(ixPCIE_LC_CNTL, data);
> >   }
> >
> > +static bool aspm_support_quirk_check(void)
> > +{
> > + if (IS_ENABLED(CONFIG_X86)) {
> > + struct cpuinfo_x86 *c = _data(0);
> > +
> > + return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
> > + }
> > +
> > + return true;
> > +}
> > +
> >   static void vi_program_aspm(struct amdgpu_device *adev)
> >   {
> >   u32 data, data1, orig;
> >   bool bL1SS = false;
> >   bool bClkReqSupport = true;
> >
> > - if (!amdgpu_device_should_use_aspm(adev))
> > + if (!amdgpu_device_should_use_aspm(adev) || 
> > !aspm_support_quirk_check())
> >   return;
>
> Can users still forcefully enable ASPM with the parameter `amdgpu.aspm`?
>
> >
> >   if (adev->flags & AMD_IS_APU ||
>
> If I remember correctly, there were also newer cards, where ASPM worked
> with Intel Alder Lake, right? Can only the problematic generations for
> WX3200 and RX640 be excluded from ASPM?

This patch only disables it for the generation that was problematic.

Alex

>
>
> Kind regards,
>
> Paul


Re: [PATCH] drm/amdgpu: Fix one use-after-free of VM

2022-04-13 Thread Daniel Vetter
On Tue, 12 Apr 2022 at 14:11, Christian König  wrote:
>
> Am 12.04.22 um 14:03 schrieb xinhui pan:
> > VM might already be freed when amdgpu_vm_tlb_seq_cb() is called.
> > We see the calltrace below.
> >
> > Fix it by keeping the last flush fence around and wait for it to signal
> >
> > BUG kmalloc-4k (Not tainted): Poison overwritten
> >
> > 0x9c88630414e8-0x9c88630414e8 @offset=5352. First byte 0x6c
> > instead of 0x6b Allocated in amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
> > age=44 cpu=0 pid=2343
> >   __slab_alloc.isra.0+0x4f/0x90
> >   kmem_cache_alloc_trace+0x6b8/0x7a0
> >   amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
> >   drm_file_alloc+0x222/0x3e0 [drm]
> >   drm_open+0x11d/0x410 [drm]
> >   drm_stub_open+0xdc/0x230 [drm]
> >   chrdev_open+0xa5/0x1e0
> >   do_dentry_open+0x16c/0x3c0
> >   vfs_open+0x2d/0x30
> >   path_openat+0x70a/0xa90
> >   do_filp_open+0xb2/0x120
> >   do_sys_openat2+0x245/0x330
> >   do_sys_open+0x46/0x80
> >   __x64_sys_openat+0x20/0x30
> >   do_syscall_64+0x38/0xc0
> >   entry_SYSCALL_64_after_hwframe+0x44/0xae
> > Freed in amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu] age=22 cpu=1
> > pid=2485
> >   kfree+0x4a2/0x580
> >   amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu]
> >   drm_file_free+0x24e/0x3c0 [drm]
> >   drm_close_helper.isra.0+0x90/0xb0 [drm]
> >   drm_release+0x97/0x1a0 [drm]
> >   __fput+0xb6/0x280
> >   fput+0xe/0x10
> >   task_work_run+0x64/0xb0
> >   do_exit+0x406/0xcf0
> >   do_group_exit+0x50/0xc0
> >   __x64_sys_exit_group+0x18/0x20
> >   do_syscall_64+0x38/0xc0
> >   entry_SYSCALL_64_after_hwframe+0x44/0xae
> >
> > Suggested-by: Christian König 
> > Signed-off-by: xinhui pan 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 22 +++---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  1 +
> >   2 files changed, 20 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index 645ce28277c2..e2486e95ca69 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -932,9 +932,12 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
> > struct amdgpu_vm *vm,
> >
> >   if (flush_tlb || params.table_freed) {
> >   tlb_cb->vm = vm;
> > - if (!fence || !*fence ||
> > - dma_fence_add_callback(*fence, _cb->cb,
> > -amdgpu_vm_tlb_seq_cb))
> > + if (fence && *fence &&
> > + !dma_fence_add_callback(*fence, _cb->cb,
> > +amdgpu_vm_tlb_seq_cb)) {
> > + dma_fence_put(vm->last_delayed_tlb_flush);
> > + vm->last_delayed_tlb_flush = dma_fence_get(*fence);
> > + } else
> >   amdgpu_vm_tlb_seq_cb(NULL, _cb->cb);
> >   tlb_cb = NULL;
> >   }
> > @@ -2258,6 +2261,19 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, 
> > struct amdgpu_vm *vm)
> >   dma_fence_wait(vm->last_unlocked, false);
> >   dma_fence_put(vm->last_unlocked);
> >
> > + if (vm->last_delayed_tlb_flush) {
>
> You can initialize last_delayed_tlb_flush() with the dummy fence, see
> how last_unlocked works.
>
> > + /* Wait until fence is signaled.
> > +  * But must double check to make sure fence cb is called.
> > +  * As dma_fence_default_wait checks 
> > DMA_FENCE_FLAG_SIGNALED_BIT without
> > +  * holding fence lock(the first test_bit).
> > +  * So call dma_fence_get_status which will hold the fence 
> > lock.
> > +  * Then we can make sure fence cb has been called.
> > +  */
>
> Uff, that is a really good point and most likely a bug in dma_fence_wait().
>
> I'm pretty sure that a couple of other callers rely on that as well.
>
> Daniel what's you opinion about that?

dma_fence_wait + dma_fence_signal provide a barrier (like the other
wake_up/wait function pairs we have), but nothing more. So you're not
really guaranteed at all that any other waiters have received the
wakeup. This is also why we had that wording that waiters need to hold
a dma_fence reference or things can go boom. This is also standard
linux and I have honestly no idea how we could guarantee more without
either making this all more expensive (forcing more refcounting would
probably be needed) or making it look like there's a guarantee that
really doesn't hold when you try to use it. wait/wake_up functions
pair really should not provide more ordering than just the barrier
(and that barrier is even conditional on "an actual wake-up has
happened").

I'm not exactly sure how to best fix this here, but I guess you either
want your own spinlock to protect the link between the fence and the
vm, or some other refcounting scheme changes (like you have the gpu
ctx that run on top of a vm hold the refence on the fence, and the
fence itself holds a full 

Re: 回复: [PATCH] drm/amdgpu: Fix one use-after-free of VM

2022-04-13 Thread Christian König

I think for now we should just have a the following code in amdgpu_vm_fini:

dma_fence_wait(vm->last_tlb_flush, false);
/* Make sure that all fence callbacks have completed*/
spinlock(vm->last_tlb_flush->lock);
spinunlock(vm->last_tlb_flush->lock);
dma_fence_put(vm->last_tlb_flush);

Cleaning that up in a larger context can come later on and is probably 
my job as DMA-buf maintainer.


Thanks,
Christian.

Am 13.04.22 um 05:07 schrieb Pan, Xinhui:

[AMD Official Use Only]

we make something like dma_fence_release does.

@@ -783,11 +783,15 @@ dma_fence_default_wait(struct dma_fence *fence, bool 
intr, signed long timeout)
 unsigned long flags;
 signed long ret = timeout ? timeout : 1;

-   if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, >flags))
+   if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, >flags) &&
+   list_empty(>cb_list))
 return ret;

 spin_lock_irqsave(fence->lock, flags);

+   if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, >flags))
+   goto out;
+
 if (intr && signal_pending(current)) {
 ret = -ERESTARTSYS;
 goto out;

发件人: Koenig, Christian 
发送时间: 2022年4月12日 20:11
收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org; Daniel Vetter
抄送: Deucher, Alexander
主题: Re: [PATCH] drm/amdgpu: Fix one use-after-free of VM

Am 12.04.22 um 14:03 schrieb xinhui pan:

VM might already be freed when amdgpu_vm_tlb_seq_cb() is called.
We see the calltrace below.

Fix it by keeping the last flush fence around and wait for it to signal

BUG kmalloc-4k (Not tainted): Poison overwritten

0x9c88630414e8-0x9c88630414e8 @offset=5352. First byte 0x6c
instead of 0x6b Allocated in amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
age=44 cpu=0 pid=2343
   __slab_alloc.isra.0+0x4f/0x90
   kmem_cache_alloc_trace+0x6b8/0x7a0
   amdgpu_driver_open_kms+0x9d/0x360 [amdgpu]
   drm_file_alloc+0x222/0x3e0 [drm]
   drm_open+0x11d/0x410 [drm]
   drm_stub_open+0xdc/0x230 [drm]
   chrdev_open+0xa5/0x1e0
   do_dentry_open+0x16c/0x3c0
   vfs_open+0x2d/0x30
   path_openat+0x70a/0xa90
   do_filp_open+0xb2/0x120
   do_sys_openat2+0x245/0x330
   do_sys_open+0x46/0x80
   __x64_sys_openat+0x20/0x30
   do_syscall_64+0x38/0xc0
   entry_SYSCALL_64_after_hwframe+0x44/0xae
Freed in amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu] age=22 cpu=1
pid=2485
   kfree+0x4a2/0x580
   amdgpu_driver_postclose_kms+0x3e9/0x550 [amdgpu]
   drm_file_free+0x24e/0x3c0 [drm]
   drm_close_helper.isra.0+0x90/0xb0 [drm]
   drm_release+0x97/0x1a0 [drm]
   __fput+0xb6/0x280
   fput+0xe/0x10
   task_work_run+0x64/0xb0
   do_exit+0x406/0xcf0
   do_group_exit+0x50/0xc0
   __x64_sys_exit_group+0x18/0x20
   do_syscall_64+0x38/0xc0
   entry_SYSCALL_64_after_hwframe+0x44/0xae

Suggested-by: Christian König 
Signed-off-by: xinhui pan 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c | 22 +++---
   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h |  1 +
   2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index 645ce28277c2..e2486e95ca69 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -932,9 +932,12 @@ int amdgpu_vm_update_range(struct amdgpu_device *adev, 
struct amdgpu_vm *vm,

   if (flush_tlb || params.table_freed) {
   tlb_cb->vm = vm;
- if (!fence || !*fence ||
- dma_fence_add_callback(*fence, _cb->cb,
-amdgpu_vm_tlb_seq_cb))
+ if (fence && *fence &&
+ !dma_fence_add_callback(*fence, _cb->cb,
+amdgpu_vm_tlb_seq_cb)) {
+ dma_fence_put(vm->last_delayed_tlb_flush);
+ vm->last_delayed_tlb_flush = dma_fence_get(*fence);
+ } else
   amdgpu_vm_tlb_seq_cb(NULL, _cb->cb);
   tlb_cb = NULL;
   }
@@ -2258,6 +2261,19 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct 
amdgpu_vm *vm)
   dma_fence_wait(vm->last_unlocked, false);
   dma_fence_put(vm->last_unlocked);

+ if (vm->last_delayed_tlb_flush) {

You can initialize last_delayed_tlb_flush() with the dummy fence, see
how last_unlocked works.


+ /* Wait until fence is signaled.
+  * But must double check to make sure fence cb is called.
+  * As dma_fence_default_wait checks DMA_FENCE_FLAG_SIGNALED_BIT 
without
+  * holding fence lock(the first test_bit).
+  * So call dma_fence_get_status which will hold the fence lock.
+  * Then we can make sure fence cb has been called.
+  */

Uff, that is a really good point and most likely a bug in dma_fence_wait().

I'm pretty sure that a couple of other callers rely on that as well.

Daniel, what's your opinion about that?

Thanks,
Christian.


+ 

Re: 回复: [PATCH] drm/amdgpu: Make sure ttm delayed work finished

2022-04-13 Thread Christian König

That warning is a bit more than a little annoying.

Before we stop the delayed delete worker we *must* absolutely make sure 
that there is nothing going on the hardware any more. Otherwise we could 
easily run into use after free issues.


There should somewhere be a amdgpu_fence_wait_empty() before the 
flush_delayed_work() call. If that isn't there we do have a problem 
elsewhere.


Thanks for investigating this,
Christian.

Am 13.04.22 um 09:47 schrieb Pan, Xinhui:

[AMD Official Use Only]

The log from tester says it is the drm framebuffer BO being busy.

I just feel there is lack of time for its fence to be signaled.
As a delay works too in my test.
But the warning is a little annoying.


发件人: Koenig, Christian 
发送时间: 2022年4月13日 15:30
收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander
主题: AW: [PATCH] drm/amdgpu: Make sure ttm delayed work finished

We don't need that.

TTM only reschedules when the BOs are still busy.

And if the BOs are still busy when you unload the driver, we have much bigger 
problems than this TTM worker :)

Regards,
Christian


Von: Pan, Xinhui 
Gesendet: Mittwoch, 13. April 2022 05:08
An: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Koenig, Christian 
; Pan, Xinhui 
Betreff: [PATCH] drm/amdgpu: Make sure ttm delayed work finished

ttm_device_delayed_workqueue would reschedule itself if there is pending
BO to be destroyed. So just one flush + cancel_sync is not enough. We
still see lru_list not empty warnging.

Fix it by waiting all BO to be destroyed.

Signed-off-by: xinhui pan 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 --
  1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6f47726f1765..e249923eb9a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3957,11 +3957,17 @@ static void amdgpu_device_unmap_mmio(struct 
amdgpu_device *adev)
   */
  void amdgpu_device_fini_hw(struct amdgpu_device *adev)
  {
+   int pending = 1;
+
  dev_info(adev->dev, "amdgpu: finishing device.\n");
  flush_delayed_work(>delayed_init_work);
-   if (adev->mman.initialized) {
+   while (adev->mman.initialized && pending) {
  flush_delayed_work(>mman.bdev.wq);
-   ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   pending = ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   if (pending) {
+   ttm_bo_unlock_delayed_workqueue(>mman.bdev, true);
+   msleep((HZ / 100) < 1) ? 1 : HZ / 100);
+   }
  }
  adev->shutdown = true;

--
2.25.1





回复: [PATCH] drm/amdgpu: Make sure ttm delayed work finished

2022-04-13 Thread Pan, Xinhui
[AMD Official Use Only]

The log from tester says it is the drm framebuffer BO being busy.

I just feel there is lack of time for its fence to be signaled.
As a delay works too in my test.
But the warning is a little annoying.


发件人: Koenig, Christian 
发送时间: 2022年4月13日 15:30
收件人: Pan, Xinhui; amd-gfx@lists.freedesktop.org
抄送: Deucher, Alexander
主题: AW: [PATCH] drm/amdgpu: Make sure ttm delayed work finished

We don't need that.

TTM only reschedules when the BOs are still busy.

And if the BOs are still busy when you unload the driver, we have much bigger 
problems than this TTM worker :)

Regards,
Christian


Von: Pan, Xinhui 
Gesendet: Mittwoch, 13. April 2022 05:08
An: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Koenig, Christian 
; Pan, Xinhui 
Betreff: [PATCH] drm/amdgpu: Make sure ttm delayed work finished

ttm_device_delayed_workqueue would reschedule itself if there is pending
BO to be destroyed. So just one flush + cancel_sync is not enough. We
still see lru_list not empty warnging.

Fix it by waiting all BO to be destroyed.

Signed-off-by: xinhui pan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6f47726f1765..e249923eb9a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3957,11 +3957,17 @@ static void amdgpu_device_unmap_mmio(struct 
amdgpu_device *adev)
  */
 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 {
+   int pending = 1;
+
 dev_info(adev->dev, "amdgpu: finishing device.\n");
 flush_delayed_work(>delayed_init_work);
-   if (adev->mman.initialized) {
+   while (adev->mman.initialized && pending) {
 flush_delayed_work(>mman.bdev.wq);
-   ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   pending = ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   if (pending) {
+   ttm_bo_unlock_delayed_workqueue(>mman.bdev, true);
+   msleep((HZ / 100) < 1) ? 1 : HZ / 100);
+   }
 }
 adev->shutdown = true;

--
2.25.1



Re: [PATCHv4] drm/amdgpu: disable ASPM on Intel Alder Lake based systems

2022-04-13 Thread Paul Menzel

Dear Richard,


Thank you for sending out v4.

Am 12.04.22 um 23:50 schrieb Richard Gong:

Active State Power Management (ASPM) feature is enabled since kernel 5.14.
There are some AMD GFX cards (such as WX3200 and RX640) that won't work
with ASPM-enabled Intel Alder Lake based systems. Using these GFX cards as
video/display output, Intel Alder Lake based systems will hang during
suspend/resume.


I am still not clear on what “hang during suspend/resume” means. I guess 
suspending works fine? During resume (S3 or S0ix?), where does it hang? 
Is the system functional, with only display problems?



The issue was initially reported on one system (Dell Precision 3660 with
BIOS version 0.14.81), but was later confirmed to affect at least 4 Alder
Lake based systems.

Add extra check to disable ASPM on Intel Alder Lake based systems.

Fixes: 0064b0ce85bb ("drm/amd/pm: enable ASPM by default")
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/1885
Reported-by: kernel test robot 


This tag is a little confusing. Maybe clarify that it was for an issue 
in a previous patch iteration?



Signed-off-by: Richard Gong 
---
v4: s/CONFIG_X86_64/CONFIG_X86
 enhanced check logic
v3: s/intel_core_asom_chk/aspm_support_quirk_check
 correct build error with W=1 option
v2: correct commit description
 move the check from chip family to problematic platform
---
  drivers/gpu/drm/amd/amdgpu/vi.c | 17 -
  1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c
index 039b90cdc3bc..b33e0a9bee65 100644
--- a/drivers/gpu/drm/amd/amdgpu/vi.c
+++ b/drivers/gpu/drm/amd/amdgpu/vi.c
@@ -81,6 +81,10 @@
  #include "mxgpu_vi.h"
  #include "amdgpu_dm.h"
  
+#if IS_ENABLED(CONFIG_X86)

+#include 
+#endif
+
  #define ixPCIE_LC_L1_PM_SUBSTATE  0x100100C6
  #define PCIE_LC_L1_PM_SUBSTATE__LC_L1_SUBSTATES_OVERRIDE_EN_MASK  
0x0001L
  #define PCIE_LC_L1_PM_SUBSTATE__LC_PCI_PM_L1_2_OVERRIDE_MASK  0x0002L
@@ -1134,13 +1138,24 @@ static void vi_enable_aspm(struct amdgpu_device *adev)
WREG32_PCIE(ixPCIE_LC_CNTL, data);
  }
  
+static bool aspm_support_quirk_check(void)

+{
+   if (IS_ENABLED(CONFIG_X86)) {
+   struct cpuinfo_x86 *c = _data(0);
+
+   return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
+   }
+
+   return true;
+}
+
  static void vi_program_aspm(struct amdgpu_device *adev)
  {
u32 data, data1, orig;
bool bL1SS = false;
bool bClkReqSupport = true;
  
-	if (!amdgpu_device_should_use_aspm(adev))

+   if (!amdgpu_device_should_use_aspm(adev) || !aspm_support_quirk_check())
return;


Can users still forcefully enable ASPM with the parameter `amdgpu.aspm`?

  
  	if (adev->flags & AMD_IS_APU ||


If I remember correctly, there were also newer cards, where ASPM worked 
with Intel Alder Lake, right? Can only the problematic generations for 
WX3200 and RX640 be excluded from ASPM?



Kind regards,

Paul


[PATCH] drm/amdkfd: potential NULL dereference in kfd_set/reset_event()

2022-04-13 Thread Dan Carpenter
If lookup_event_by_id() returns a NULL "ev" pointer then the
spin_lock(>lock) will crash.  This was detected by Smatch:

drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_events.c:644 kfd_set_event()
error: we previously assumed 'ev' could be null (see line 639)

Fixes: 5273e82c5f47 ("drm/amdkfd: Improve concurrency of event handling")
Signed-off-by: Dan Carpenter 
---
 drivers/gpu/drm/amd/amdkfd/kfd_events.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
index 0fef24b0b915..15570e72a5dd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c
@@ -634,14 +634,19 @@ int kfd_set_event(struct kfd_process *p, uint32_t 
event_id)
rcu_read_lock();
 
ev = lookup_event_by_id(p, event_id);
+   if (!ev) {
+   ret = -EINVAL;
+   goto unlock_rcu;
+   }
spin_lock(>lock);
 
-   if (ev && event_can_be_cpu_signaled(ev))
+   if (event_can_be_cpu_signaled(ev))
set_event(ev);
else
ret = -EINVAL;
 
spin_unlock(>lock);
+unlock_rcu:
rcu_read_unlock();
return ret;
 }
@@ -660,14 +665,19 @@ int kfd_reset_event(struct kfd_process *p, uint32_t 
event_id)
rcu_read_lock();
 
ev = lookup_event_by_id(p, event_id);
+   if (!ev) {
+   ret = -EINVAL;
+   goto unlock_rcu;
+   }
spin_lock(>lock);
 
-   if (ev && event_can_be_cpu_signaled(ev))
+   if (event_can_be_cpu_signaled(ev))
reset_event(ev);
else
ret = -EINVAL;
 
spin_unlock(>lock);
+unlock_rcu:
rcu_read_unlock();
return ret;
 
-- 
2.20.1



Re: [PATCH v2] drm/amdgpu: Make sure ttm delayed work finished

2022-04-13 Thread Paul Menzel

Dear xinhui,


Am 13.04.22 um 08:46 schrieb xinhui pan:

ttm_device_delayed_workqueue would reschedule itself if there is pending
BO to be destroyed. So just one flush + cancel_sync is not enough. We
still see lru_list not empty warnging.


warning

(`scripts/checkpatch.pl --codespell` should have found this.)



Fix it by waiting all BO to be destroyed.


Please explain the waiting algorithm. Why at least one millisecond?

Do you have a reproducer for this issue?


Acked-by: Guchun Chen 
Signed-off-by: xinhui pan 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 --
  1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6f47726f1765..56dcf8c3a3cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3957,11 +3957,17 @@ static void amdgpu_device_unmap_mmio(struct 
amdgpu_device *adev)
   */
  void amdgpu_device_fini_hw(struct amdgpu_device *adev)
  {
+   int pending = 1;


Could this be bool?


+
dev_info(adev->dev, "amdgpu: finishing device.\n");
flush_delayed_work(>delayed_init_work);
-   if (adev->mman.initialized) {
+   while (adev->mman.initialized && pending) {
flush_delayed_work(>mman.bdev.wq);
-   ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   pending = ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   if (pending) {
+   ttm_bo_unlock_delayed_workqueue(>mman.bdev, true);


Does this call affect `adev->mman.initialized`? If not a do-while loop 
might be better suited, so pending is only checked once.


if (adev->mman.initialized) {
do {
flush_delayed_work(>mman.bdev.wq);
ttm_bo_unlock_delayed_workqueue(>mman.bdev, true);
msleep(((HZ / 100) < 1) ? 1 : HZ / 100);
} while (ttm_bo_lock_delayed_workqueue(>mman.bdev));
}

The logic is slightly different, as 
`ttm_bo_unlock_delayed_workqueue(>mman.bdev, true);` and the sleep 
are run at least once, so the suggestion might be unsuitable.



+   msleep(((HZ / 100) < 1) ? 1 : HZ / 100);


Maybe add a comment for that formula? (No idea, if common knowledge, but 
why is a delay needed, and the loop cannot be run as fast as possible?)



+   }
}
adev->shutdown = true;
  



Kind regards,

Paul


AW: [PATCH] drm/amdgpu: Make sure ttm delayed work finished

2022-04-13 Thread Koenig, Christian
We don't need that.

TTM only reschedules when the BOs are still busy.

And if the BOs are still busy when you unload the driver, we have much bigger 
problems than this TTM worker :)

Regards,
Christian


Von: Pan, Xinhui 
Gesendet: Mittwoch, 13. April 2022 05:08
An: amd-gfx@lists.freedesktop.org 
Cc: Deucher, Alexander ; Koenig, Christian 
; Pan, Xinhui 
Betreff: [PATCH] drm/amdgpu: Make sure ttm delayed work finished

ttm_device_delayed_workqueue would reschedule itself if there is pending
BO to be destroyed. So just one flush + cancel_sync is not enough. We
still see lru_list not empty warnging.

Fix it by waiting all BO to be destroyed.

Signed-off-by: xinhui pan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6f47726f1765..e249923eb9a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3957,11 +3957,17 @@ static void amdgpu_device_unmap_mmio(struct 
amdgpu_device *adev)
  */
 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 {
+   int pending = 1;
+
 dev_info(adev->dev, "amdgpu: finishing device.\n");
 flush_delayed_work(>delayed_init_work);
-   if (adev->mman.initialized) {
+   while (adev->mman.initialized && pending) {
 flush_delayed_work(>mman.bdev.wq);
-   ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   pending = ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   if (pending) {
+   ttm_bo_unlock_delayed_workqueue(>mman.bdev, true);
+   msleep((HZ / 100) < 1) ? 1 : HZ / 100);
+   }
 }
 adev->shutdown = true;

--
2.25.1



Re: Vega 56 failing to process EDID from VR Headset

2022-04-13 Thread Paul Menzel

Dear James,


Am 13.04.22 um 00:13 schrieb James Dutton:

On Tue, 12 Apr 2022 at 07:13, Paul Menzel wrote:

Am 11.04.22 um 23:39 schrieb James Dutton:
So, did you do any changes to Linux? Why do you think the EDID is at fault?

[…]

I suggest to analyze, why `No DP link bandwidth` is logged. The macro is
`DC_NO_DP_LINK_BANDWIDTH`, and you should first check why
`dp_validate_mode_timing()` in
`drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c` returns false.



PS: Using the issue tracker [1] might make it easier to keep track of
this problem, and also to attach all the necessary information.


[1]: https://gitlab.freedesktop.org/drm/amd/-/issues/



I will do some more investigation. In addition to it not processing
the EDID particularly well...
Since my email, I have found out that it is failing to complete CR
(Clock Recovery) on Link 0,2, but it works on 1,3 at HBR2. All 4 Links
work at HBR1.   (I need the HBR2 working)
The CR negotiation in the code looks a bit wrong to me, so I will look
into that a bit more.
Looking at the current amdgpu source code   (I am using Mainline
kernel version 5.17.1), it appears to retry CR negotiation, but each
time it uses the same settings, rather than try different driver
parameters, as recommended in the DP standards and compliance test
documents.


[…]

Awesome, that you review the code with your expertise. Though I suggest 
to look at and work on agd5f/amd-staging-drm-next [1], having the latest 
code for the driver.



Once I know more, I will put all the info in the issue track, as you
suggest.


I am looking forward to it. To avoid getting lost in all the problems, one 
email or issue per problem might be a good way forward, and adding the 
people who added the code (`git blame -w`) to Cc might also help.


Happy debugging and hacking!


Kind regards,

Paul


[1]: https://gitlab.freedesktop.org/agd5f/linux/


Re: [PATCH 1/2] drm/amd/amdgpu: Update PF2VF header

2022-04-13 Thread Paul Menzel

[Removed unintended paste in second line]

Am 13.04.22 um 09:03 schrieb Paul Menzel:

Dear Bokun,


Thank you for rerolling the patch. Please add the iteration/version in 
the subject next time `[PATCH v2 1/2]` or so.


Am 12.04.22 um 23:31 schrieb Bokun Zhang:

- Add proper indentation in the header file


Please use that as the commit message summary, and the subject/summary 
as the title for the cover letter? Maybe:


drm/amd/amdgpu: Properly indent PF2VF header


Signed-off-by: Bokun Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 95 ++---
  1 file changed, 46 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h

index 7326b6c1b71c..65433cbb00c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
@@ -1,34 +1,33 @@
  /*
- * Copyright 2018-2019 Advanced Micro Devices, Inc.
+ * Copyright (c) 2018-2021 Advanced Micro Devices, Inc. All rights reserved.
   *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
   *
   * The above copyright notice and this permission notice shall be included in
   * all copies or substantial portions of the Software.
   *
   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
   */


The header change seems unrelated from indentation?

[…]


Kind regards,

Paul


Re: [PATCH 1/2] drm/amd/amdgpu: Update PF2VF header

2022-04-13 Thread Paul Menzel

Dear Bokun,
 drm/amd/amdgpu: Update PF2VF header

Thank you for rerolling the patch. Please add the iteration/version in 
the subject next time `[PATCH v2 1/2]` or so.


Am 12.04.22 um 23:31 schrieb Bokun Zhang:

- Add proper indentation in the header file


Please use that as the commit message summary, and the subject/summary 
as the title for the cover letter? Maybe:


drm/amd/amdgpu: Properly indent PF2VF header


Signed-off-by: Bokun Zhang 
---
  drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h | 95 ++---
  1 file changed, 46 insertions(+), 49 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h 
b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
index 7326b6c1b71c..65433cbb00c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h
@@ -1,34 +1,33 @@
  /*
- * Copyright 2018-2019 Advanced Micro Devices, Inc.
+ * Copyright (c) 2018-2021 Advanced Micro Devices, Inc. All rights reserved.
   *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
   *
   * The above copyright notice and this permission notice shall be included in
   * all copies or substantial portions of the Software.
   *
   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
- * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
- * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
   */


The header change seems unrelated from indentation?

[…]


Kind regards,

Paul


[PATCH v2] drm/amdgpu: Make sure ttm delayed work finished

2022-04-13 Thread xinhui pan
ttm_device_delayed_workqueue would reschedule itself if there is pending
BO to be destroyed. So just one flush + cancel_sync is not enough. We
still see lru_list not empty warnging.

Fix it by waiting all BO to be destroyed.

Acked-by: Guchun Chen 
Signed-off-by: xinhui pan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6f47726f1765..56dcf8c3a3cd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3957,11 +3957,17 @@ static void amdgpu_device_unmap_mmio(struct 
amdgpu_device *adev)
  */
 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 {
+   int pending = 1;
+
dev_info(adev->dev, "amdgpu: finishing device.\n");
flush_delayed_work(>delayed_init_work);
-   if (adev->mman.initialized) {
+   while (adev->mman.initialized && pending) {
flush_delayed_work(>mman.bdev.wq);
-   ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   pending = ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   if (pending) {
+   ttm_bo_unlock_delayed_workqueue(>mman.bdev, true);
+   msleep(((HZ / 100) < 1) ? 1 : HZ / 100);
+   }
}
adev->shutdown = true;
 
-- 
2.25.1



RE: [PATCH] drm/amdgpu: Make sure ttm delayed work finished

2022-04-13 Thread Chen, Guchun
+   msleep((HZ / 100) < 1) ? 1 : HZ / 100);

a "(" is missed. With it fixed, this patch is:
Acked-by: Guchun Chen 

Regards,
Guchun

-Original Message-
From: amd-gfx  On Behalf Of xinhui pan
Sent: Wednesday, April 13, 2022 11:09 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Pan, Xinhui 
; Koenig, Christian 
Subject: [PATCH] drm/amdgpu: Make sure ttm delayed work finished

ttm_device_delayed_workqueue would reschedule itself if there is a pending BO to
be destroyed. So just one flush + cancel_sync is not enough. We still see a
"lru_list not empty" warning.

Fix it by waiting for all BOs to be destroyed.

Signed-off-by: xinhui pan 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 6f47726f1765..e249923eb9a7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3957,11 +3957,17 @@ static void amdgpu_device_unmap_mmio(struct 
amdgpu_device *adev)
  */
 void amdgpu_device_fini_hw(struct amdgpu_device *adev)  {
+   int pending = 1;
+
dev_info(adev->dev, "amdgpu: finishing device.\n");
flush_delayed_work(>delayed_init_work);
-   if (adev->mman.initialized) {
+   while (adev->mman.initialized && pending) {
flush_delayed_work(>mman.bdev.wq);
-   ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   pending = ttm_bo_lock_delayed_workqueue(>mman.bdev);
+   if (pending) {
+   ttm_bo_unlock_delayed_workqueue(>mman.bdev, true);
+   msleep((HZ / 100) < 1) ? 1 : HZ / 100);
+   }
}
adev->shutdown = true;
 
--
2.25.1