RE: [PATCH] drm/amd/powerplay: fix the null pointer issue on switching power profile

2020-01-12 Thread Quan, Evan
There is already an equivalent fix from Felix to address this.
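
For context, the crash is the classic void *handle mismatch: the
switch_power_profile callback expects the powerplay handle it was
registered with, but the buggy call handed it the device pointer, so the
callback reinterpreted the wrong structure and dereferenced garbage. A
minimal stand-alone sketch of that pattern (the names and layout below are
illustrative only, not the actual amdgpu code):

#include <stdio.h>

/* Hypothetical powerplay-style private context (stands in for pp_handle). */
struct pp_context {
        int (*set_profile)(struct pp_context *ctx, int type, int en);
};

/* Hypothetical device object; the powerplay context hangs off it as an
 * opaque handle, like adev->powerplay.pp_handle in the patch below. */
struct device_ctx {
        void *pp_handle;
};

static int do_set_profile(struct pp_context *ctx, int type, int en)
{
        printf("profile %d -> %d\n", type, en);
        return 0;
}

/* Callback that takes a void *handle and casts it to its private context,
 * the way a pp_funcs-style table entry does. */
static int switch_power_profile(void *handle, int type, int en)
{
        struct pp_context *ctx = handle;

        /* If the caller passed the wrong object, this loads junk (or NULL)
         * at some offset and jumps through it: the oops in the log below. */
        return ctx->set_profile(ctx, type, en);
}

int main(void)
{
        struct pp_context ctx = { .set_profile = do_set_profile };
        struct device_ctx dev = { .pp_handle = &ctx };

        /* Correct: hand over the handle the callback was written for. */
        switch_power_profile(dev.pp_handle, 5, 1);

        /* Buggy variant (what the code did before the fix below):
         *   switch_power_profile(&dev, 5, 1);
         * &dev is not a struct pp_context, so ->set_profile is garbage. */
        return 0;
}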

Regards,
Evan
> -Original Message-
> From: amd-gfx  On Behalf Of Huang
> Rui
> Sent: Monday, January 13, 2020 10:35 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander ; Kuehling, Felix
> ; Huang, Ray 
> Subject: [PATCH] drm/amd/powerplay: fix the null pointer issue on switching
> power profile
> 
> Fix the wrong input parameter passed to the powerplay callback in
> amdgpu_dpm_switch_power_profile, which was hit by the kfd test.
> 
> [  176.758381] BUG: kernel NULL pointer dereference, address: 0220
> [  176.758409] #PF: supervisor read access in kernel mode
> [  176.758424] #PF: error_code(0x) - not-present page
> [  176.758440] PGD 8003f6eea067 P4D 8003f6eea067 PUD 3ce06c067 PMD 0
> [  176.758461] Oops:  [#1] SMP PTI
> [  176.758473] CPU: 5 PID: 2621 Comm: kfdtest Tainted: G   OE 5.4.0-rc7-custom #1
> [  176.758496] Hardware name: System manufacturer System Product Name/TUF Z370-PLUS GAMING, BIOS 0612 03/01/2018
> [  176.758593] RIP: 0010:pp_dpm_switch_power_profile+0x46/0x1ee [amdgpu]
> [  176.758613] Code: 00 48 89 45 d8 31 c0 48 85 ff 0f 84 9f 01 00 00 48 89 fb 41 89 f4 41 89 d5 80 7f 15 00 0f 84 96 01 00 00 48 8b 87 78 02 00 00 <48> 83 b8 20 02 00 00 00 0f 84 ba 00 00 00 83 fe 05 0f 87 82 01 00
> [  176.758663] RSP: 0018:a530c12ebb50 EFLAGS: 00010282
> [  176.758678] RAX:  RBX: 9797c0b0 RCX: 
> [  176.758698] RDX: 0001 RSI: 0005 RDI: 9797c0b0
> [  176.758718] RBP: a530c12ebb80 R08: 9797ce221548 R09: 036c
> [  176.758739] R10: 94006a80 R11:  R12: 0005
> [  176.758759] R13: 0001 R14: 9797fff4dda8 R15: 9797ce221548
> [  176.758779] FS:  7efe09cc3780() GS:97982694() knlGS:
> [  176.758802] CS:  0010 DS:  ES:  CR0: 80050033
> [  176.758819] CR2: 0220 CR3: 0003cdc70003 CR4: 003606e0
> [  176.758839] DR0:  DR1:  DR2: 
> [  176.758859] DR3:  DR6: fffe0ff0 DR7: 0400
> [  176.758879] Call Trace:
> [  176.758932]  amdgpu_dpm_switch_power_profile+0x4c/0x6f [amdgpu]
> [  176.758995]  amdgpu_amdkfd_set_compute_idle+0x1a/0x1c [amdgpu]
> [  176.759056]  kfd_inc_compute_active+0x29/0x2b [amdgpu]
> [  176.759117]  register_process+0x11c/0x14f [amdgpu]
> [  176.759177]  pqm_create_queue+0x20b/0x527 [amdgpu]
> [  176.759234]  kfd_ioctl_create_queue+0x4aa/0x5e5 [amdgpu]
> [  176.759294]  kfd_ioctl+0x235/0x366 [amdgpu]
> 
> Signed-off-by: Huang Rui 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
> index f7c0ae6..d3962e2 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
> @@ -1123,7 +1123,8 @@ int amdgpu_dpm_switch_power_profile(struct amdgpu_device *adev,
>  		ret = smu_switch_power_profile(&adev->smu, type, en);
>  	else if (adev->powerplay.pp_funcs &&
>  		 adev->powerplay.pp_funcs->switch_power_profile)
> -		ret = adev->powerplay.pp_funcs->switch_power_profile(adev, type, en);
> +		ret = adev->powerplay.pp_funcs->switch_power_profile(adev->powerplay.pp_handle,
> +								     type, en);
> 
>  	return ret;
>  }
> --
> 2.7.4
> 
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 2/2] drm/amdgpu/debugfs: properly handle runtime pm

2020-01-12 Thread Quan, Evan
Reviewed-by: Evan Quan 
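
For reference, the shape of the change in every accessor is the usual
runtime-PM bracket: take a reference before touching the hardware and drop
it on each exit path, including the early error returns. A condensed sketch
of that pattern (simplified, not the full amdgpu function; in the quoted
diff the device is adev->ddev->dev):

#include <linux/pm_runtime.h>

static int example_debugfs_access(struct device *dev)
{
        int r;

        /* Wake the GPU if it is runtime-suspended. */
        r = pm_runtime_get_sync(dev);
        if (r < 0)
                return r;

        /* ... read/write registers or query the SMU here ... */

        /* Restart the autosuspend timer and drop the reference. */
        pm_runtime_mark_last_busy(dev);
        pm_runtime_put_autosuspend(dev);

        return 0;
}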

> -Original Message-
> From: amd-gfx  On Behalf Of Alex
> Deucher
> Sent: Saturday, January 11, 2020 7:45 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander 
> Subject: [PATCH 2/2] drm/amdgpu/debugfs: properly handle runtime pm
> 
> If driver debugfs files are accessed, power up the GPU when necessary.
> 
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 133
> ++--
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c   |   8 ++
>  2 files changed, 134 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 63343bb43049..f24ed9a1a3e5 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -26,6 +26,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
> 
>  #include 
> 
> @@ -144,10 +145,17 @@ static int  amdgpu_debugfs_process_reg_op(bool
> read, struct file *f,
> 
>   *pos &= (1UL << 22) - 1;
> 
> + r = pm_runtime_get_sync(adev->ddev->dev);
> + if (r < 0)
> + return r;
> +
>   if (use_bank) {
>   if ((sh_bank != 0xFFFFFFFF && sh_bank >= adev->gfx.config.max_sh_per_se) ||
> - (se_bank != 0xFFFFFFFF && se_bank >= adev->gfx.config.max_shader_engines))
> + (se_bank != 0xFFFFFFFF && se_bank >= adev->gfx.config.max_shader_engines)) {
> + pm_runtime_mark_last_busy(adev->ddev->dev);
> + pm_runtime_put_autosuspend(adev->ddev->dev);
>   return -EINVAL;
> + }
>   mutex_lock(&adev->grbm_idx_mutex);
>   amdgpu_gfx_select_se_sh(adev, se_bank,
>   sh_bank, instance_bank);
> @@ -193,6 +201,9 @@ static int  amdgpu_debugfs_process_reg_op(bool read,
> struct file *f,
>   if (pm_pg_lock)
>   mutex_unlock(&adev->pm.mutex);
> 
> + pm_runtime_mark_last_busy(adev->ddev->dev);
> + pm_runtime_put_autosuspend(adev->ddev->dev);
> +
>   return result;
>  }
> 
> @@ -237,13 +248,20 @@ static ssize_t
> amdgpu_debugfs_regs_pcie_read(struct file *f, char __user *buf,
>   if (size & 0x3 || *pos & 0x3)
>   return -EINVAL;
> 
> + r = pm_runtime_get_sync(adev->ddev->dev);
> + if (r < 0)
> + return r;
> +
>   while (size) {
>   uint32_t value;
> 
>   value = RREG32_PCIE(*pos >> 2);
>   r = put_user(value, (uint32_t *)buf);
> - if (r)
> + if (r) {
> + pm_runtime_mark_last_busy(adev->ddev->dev);
> + pm_runtime_put_autosuspend(adev->ddev->dev);
>   return r;
> + }
> 
>   result += 4;
>   buf += 4;
> @@ -251,6 +269,9 @@ static ssize_t amdgpu_debugfs_regs_pcie_read(struct
> file *f, char __user *buf,
>   size -= 4;
>   }
> 
> + pm_runtime_mark_last_busy(adev->ddev->dev);
> + pm_runtime_put_autosuspend(adev->ddev->dev);
> +
>   return result;
>  }
> 
> @@ -276,12 +297,19 @@ static ssize_t
> amdgpu_debugfs_regs_pcie_write(struct file *f, const char __user
>   if (size & 0x3 || *pos & 0x3)
>   return -EINVAL;
> 
> + r = pm_runtime_get_sync(adev->ddev->dev);
> + if (r < 0)
> + return r;
> +
>   while (size) {
>   uint32_t value;
> 
>   r = get_user(value, (uint32_t *)buf);
> - if (r)
> + if (r) {
> + pm_runtime_mark_last_busy(adev->ddev->dev);
> + pm_runtime_put_autosuspend(adev->ddev->dev);
>   return r;
> + }
> 
>   WREG32_PCIE(*pos >> 2, value);
> 
> @@ -291,6 +319,9 @@ static ssize_t amdgpu_debugfs_regs_pcie_write(struct
> file *f, const char __user
>   size -= 4;
>   }
> 
> + pm_runtime_mark_last_busy(adev->ddev->dev);
> + pm_runtime_put_autosuspend(adev->ddev->dev);
> +
>   return result;
>  }
> 
> @@ -316,13 +347,20 @@ static ssize_t
> amdgpu_debugfs_regs_didt_read(struct file *f, char __user *buf,
>   if (size & 0x3 || *pos & 0x3)
>   return -EINVAL;
> 
> + r = pm_runtime_get_sync(adev->ddev->dev);
> + if (r < 0)
> + return r;
> +
>   while (size) {
>   uint32_t value;
> 
>   value = RREG32_DIDT(*pos >> 2);
>   r = put_user(value, (uint32_t *)buf);
> - if (r)
> + if (r) {
> + pm_runtime_mark_last_busy(adev->ddev->dev);
> + pm_runtime_put_autosuspend(adev->ddev->dev);
>   return r;
> + }
> 
>   result += 4;
>   buf += 4;
> @@ -330,6 +368,9 @@ static ssize_t amdgpu_debugfs_regs_didt_read(struct
> file *f, char __user *buf,
>   size -= 4;
>   }
> 
> + 

RE: [PATCH 1/2] drm/amdgpu/pm: properly handle runtime pm

2020-01-12 Thread Quan, Evan
The "count" from amdgpu_set_pp_feature_status() seems with type of size_t.
Then assignment "count = -EINVAL" may be improper.
With that confirmed, the patch is reviewed-by: Evan Quan 
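
To illustrate the concern: storing a negative errno in an unsigned size_t
wraps around to a huge positive value, so a caller that checks for a
negative return never sees the error. A small stand-alone example of the
pitfall (hypothetical values, not the amdgpu code itself):

#include <errno.h>
#include <stdio.h>
#include <sys/types.h>

int main(void)
{
        size_t count = 16;      /* unsigned, like the size_t "count" in question */
        ssize_t ret = 16;       /* what a sysfs store handler should return */

        count = -EINVAL;        /* wraps around to SIZE_MAX - 21 */
        ret = -EINVAL;          /* stays a real, detectable error code */

        printf("size_t  count = %zu (looks like a huge success)\n", count);
        printf("ssize_t ret   = %zd (a detectable error)\n", ret);

        /* A check like 'if (count < 0)' can never be true for the size_t
         * variable, so the error is silently lost. */
        return 0;
}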

> -Original Message-
> From: amd-gfx  On Behalf Of Alex
> Deucher
> Sent: Saturday, January 11, 2020 7:45 AM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander 
> Subject: [PATCH 1/2] drm/amdgpu/pm: properly handle runtime pm
> 
> If power management sysfs or debugfs files are accessed, power up the GPU
> when necessary.
> 
> Signed-off-by: Alex Deucher 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 822 ++-
> --
>  1 file changed, 614 insertions(+), 208 deletions(-)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> index 285d460624c8..806e731c1ff4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
> @@ -37,6 +37,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include "hwmgr.h"
>  #define WIDTH_4K 3840
> 
> @@ -158,10 +159,15 @@ static ssize_t amdgpu_get_dpm_state(struct device
> *dev,
>   struct drm_device *ddev = dev_get_drvdata(dev);
>   struct amdgpu_device *adev = ddev->dev_private;
>   enum amd_pm_state_type pm;
> + int ret;
> 
>   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
>   return 0;
> 
> + ret = pm_runtime_get_sync(ddev->dev);
> + if (ret < 0)
> + return ret;
> +
>   if (is_support_sw_smu(adev)) {
>   if (adev->smu.ppt_funcs->get_current_power_state)
>   pm = smu_get_current_power_state(&adev->smu);
> @@ -173,6 +179,9 @@ static ssize_t amdgpu_get_dpm_state(struct device
> *dev,
>   pm = adev->pm.dpm.user_state;
>   }
> 
> + pm_runtime_mark_last_busy(ddev->dev);
> + pm_runtime_put_autosuspend(ddev->dev);
> +
>   return snprintf(buf, PAGE_SIZE, "%s\n",
>   (pm == POWER_STATE_TYPE_BATTERY) ? "battery" :
>   (pm == POWER_STATE_TYPE_BALANCED) ?
> "balanced" : "performance"); @@ -186,6 +195,7 @@ static ssize_t
> amdgpu_set_dpm_state(struct device *dev,
>   struct drm_device *ddev = dev_get_drvdata(dev);
>   struct amdgpu_device *adev = ddev->dev_private;
>   enum amd_pm_state_type  state;
> + int ret;
> 
>   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
>   return -EINVAL;
> @@ -201,6 +211,10 @@ static ssize_t amdgpu_set_dpm_state(struct device
> *dev,
>   goto fail;
>   }
> 
> + ret = pm_runtime_get_sync(ddev->dev);
> + if (ret < 0)
> + return ret;
> +
>   if (is_support_sw_smu(adev)) {
>   mutex_lock(&adev->pm.mutex);
>   adev->pm.dpm.user_state = state;
> @@ -212,11 +226,12 @@ static ssize_t amdgpu_set_dpm_state(struct device
> *dev,
>   adev->pm.dpm.user_state = state;
>   mutex_unlock(&adev->pm.mutex);
> 
> - /* Can't set dpm state when the card is off */
> - if (!(adev->flags & AMD_IS_PX) ||
> - (ddev->switch_power_state == DRM_SWITCH_POWER_ON))
> - amdgpu_pm_compute_clocks(adev);
> + amdgpu_pm_compute_clocks(adev);
>   }
> + pm_runtime_mark_last_busy(ddev->dev);
> + pm_runtime_put_autosuspend(ddev->dev);
> +
> +
>  fail:
>   return count;
>  }
> @@ -288,13 +303,14 @@ static ssize_t
> amdgpu_get_dpm_forced_performance_level(struct device *dev,
>   struct drm_device *ddev = dev_get_drvdata(dev);
>   struct amdgpu_device *adev = ddev->dev_private;
>   enum amd_dpm_forced_level level = 0xff;
> + int ret;
> 
>   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
>   return 0;
> 
> - if ((adev->flags & AMD_IS_PX) &&
> - (ddev->switch_power_state != DRM_SWITCH_POWER_ON))
> - return snprintf(buf, PAGE_SIZE, "off\n");
> + ret = pm_runtime_get_sync(ddev->dev);
> + if (ret < 0)
> + return ret;
> 
>   if (is_support_sw_smu(adev))
>   level = smu_get_performance_level(&adev->smu);
> @@ -303,6 +319,9 @@ static ssize_t
> amdgpu_get_dpm_forced_performance_level(struct device *dev,
>   else
>   level = adev->pm.dpm.forced_level;
> 
> + pm_runtime_mark_last_busy(ddev->dev);
> + pm_runtime_put_autosuspend(ddev->dev);
> +
>   return snprintf(buf, PAGE_SIZE, "%s\n",
>   (level == AMD_DPM_FORCED_LEVEL_AUTO) ? "auto" :
>   (level == AMD_DPM_FORCED_LEVEL_LOW) ? "low" :
> @@ -329,11 +348,6 @@ static ssize_t
> amdgpu_set_dpm_forced_performance_level(struct device *dev,
>   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
>   return -EINVAL;
> 
> - /* Can't force performance level when the card is off */
> - if  ((adev->flags & AMD_IS_PX) &&
> -  (ddev->switch_power_state != DRM_SWITCH_POWER_ON))
> -   

Re: [PATCH] drm/amdkfd: Add a message when SW scheduler is used

2020-01-12 Thread Huang Rui
On Fri, Jan 10, 2020 at 02:24:29PM -0500, Yong Zhao wrote:
> The SW scheduler was previously called the non-HW scheduler, or non-HWS.
> This message is useful when triaging issues from dmesg.
> 
> Change-Id: I625518c88c043df5f60409d1ca520e7fc032251f
> Signed-off-by: Yong Zhao 

Acked-by: Huang Rui 

> ---
>  drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> index d7eb6ac37f62..2870553a2ce0 100644
> --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
> @@ -934,6 +934,7 @@ static void uninitialize(struct device_queue_manager *dqm)
>  
>  static int start_nocpsch(struct device_queue_manager *dqm)
>  {
> + pr_info("SW scheduler is used");
>   init_interrupts(dqm);
>   
>   if (dqm->dev->device_info->asic_family == CHIP_HAWAII)
> -- 
> 2.17.1
> 
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 3/5] drm/amdkfd: use map_queues for hiq on arcturus as well

2020-01-12 Thread Huang Rui
On Sat, Jan 11, 2020 at 07:05:40AM +0800, Kuehling, Felix wrote:
> What happens on Arcturus without this patch? Does it oops with a null 
> pointer dereference? If yes, then you should squash this patch into 
> patch 2 to avoid a broken intermediate state.

Yes, Arcturus will hit a null pointer panic without this patch because
hiq_mqd_load is not initialized in arcturus_kfd2kgd. I will squash this patch
into patch 2.
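
For reference, the failure mode is simply a NULL entry in the per-ASIC ops
table. An illustrative stand-alone sketch (not the real kfd2kgd structures;
the actual fix is to wire kgd_gfx_v9_hiq_mqd_load into arcturus_kfd2kgd, as
the patch below does):

#include <stdio.h>

/* Illustrative ops table in the style of kfd2kgd_calls. */
struct kfd2kgd_ops {
        int (*hqd_load)(int pipe, int queue);
        int (*hiq_mqd_load)(int pipe, int queue);
};

static int gfx_v9_hqd_load(int pipe, int queue) { return 0; }

/* Arcturus-style table before the fix: .hiq_mqd_load is left NULL. */
static const struct kfd2kgd_ops arcturus_ops = {
        .hqd_load = gfx_v9_hqd_load,
        /* .hiq_mqd_load missing, so it defaults to NULL */
};

static int load_hiq(const struct kfd2kgd_ops *ops, int pipe, int queue)
{
        if (!ops->hiq_mqd_load)         /* without a guard this is a NULL call */
                return -1;
        return ops->hiq_mqd_load(pipe, queue);
}

int main(void)
{
        printf("hiq load: %d\n", load_hiq(&arcturus_ops, 0, 0));
        return 0;
}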

Thanks,
Ray

> 
> Regards,
>    Felix
> 
> On 2020-01-10 1:37 a.m., Huang Rui wrote:
> > Align with gfx v9, use the map_queues packet to load hiq MQD.
> >
> > Signed-off-by: Huang Rui 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 1 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c   | 6 +++---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h   | 3 +++
> >   3 files changed, 7 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > index 3c11940..8baad42 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c
> > @@ -281,6 +281,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = {
> > .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping,
> > .init_interrupts = kgd_gfx_v9_init_interrupts,
> > .hqd_load = kgd_gfx_v9_hqd_load,
> > +   .hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load,
> > .hqd_sdma_load = kgd_hqd_sdma_load,
> > .hqd_dump = kgd_gfx_v9_hqd_dump,
> > .hqd_sdma_dump = kgd_hqd_sdma_dump,
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > index ab8c23a..d2f9396 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
> > @@ -324,9 +324,9 @@ int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, 
> > uint32_t pipe_id,
> > return 0;
> >   }
> >   
> > -static int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
> > -  uint32_t pipe_id, uint32_t queue_id,
> > -  uint32_t doorbell_off)
> > +int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
> > +   uint32_t pipe_id, uint32_t queue_id,
> > +   uint32_t doorbell_off)
> >   {
> > struct amdgpu_device *adev = get_amdgpu_device(kgd);
> > struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> > index 02b1426..32dd1a9 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h
> > @@ -33,6 +33,9 @@ int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, 
> > uint32_t pipe_id,
> > uint32_t queue_id, uint32_t __user *wptr,
> > uint32_t wptr_shift, uint32_t wptr_mask,
> > struct mm_struct *mm);
> > +int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd,
> > +   uint32_t pipe_id, uint32_t queue_id,
> > +   uint32_t doorbell_off);
> >   int kgd_gfx_v9_hqd_dump(struct kgd_dev *kgd,
> > uint32_t pipe_id, uint32_t queue_id,
> > uint32_t (**dump)[2], uint32_t *n_regs);
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amd/powerplay: fix the null pointer issue on switching power profile

2020-01-12 Thread Huang Rui
Fix the wrong input parameter passed to the powerplay callback in
amdgpu_dpm_switch_power_profile, which was hit by the kfd test.

[  176.758381] BUG: kernel NULL pointer dereference, address: 0220
[  176.758409] #PF: supervisor read access in kernel mode
[  176.758424] #PF: error_code(0x) - not-present page
[  176.758440] PGD 8003f6eea067 P4D 8003f6eea067 PUD 3ce06c067 PMD 0
[  176.758461] Oops:  [#1] SMP PTI
[  176.758473] CPU: 5 PID: 2621 Comm: kfdtest Tainted: G   OE 5.4.0-rc7-custom #1
[  176.758496] Hardware name: System manufacturer System Product Name/TUF Z370-PLUS GAMING, BIOS 0612 03/01/2018
[  176.758593] RIP: 0010:pp_dpm_switch_power_profile+0x46/0x1ee [amdgpu]
[  176.758613] Code: 00 48 89 45 d8 31 c0 48 85 ff 0f 84 9f 01 00 00 48 89 fb 41 89 f4 41 89 d5 80 7f 15 00 0f 84 96 01 00 00 48 8b 87 78 02 00 00 <48> 83 b8 20 02 00 00 00 0f 84 ba 00 00 00 83 fe 05 0f 87 82 01 00
[  176.758663] RSP: 0018:a530c12ebb50 EFLAGS: 00010282
[  176.758678] RAX:  RBX: 9797c0b0 RCX: 
[  176.758698] RDX: 0001 RSI: 0005 RDI: 9797c0b0
[  176.758718] RBP: a530c12ebb80 R08: 9797ce221548 R09: 036c
[  176.758739] R10: 94006a80 R11:  R12: 0005
[  176.758759] R13: 0001 R14: 9797fff4dda8 R15: 9797ce221548
[  176.758779] FS:  7efe09cc3780() GS:97982694() knlGS:
[  176.758802] CS:  0010 DS:  ES:  CR0: 80050033
[  176.758819] CR2: 0220 CR3: 0003cdc70003 CR4: 003606e0
[  176.758839] DR0:  DR1:  DR2: 
[  176.758859] DR3:  DR6: fffe0ff0 DR7: 0400
[  176.758879] Call Trace:
[  176.758932]  amdgpu_dpm_switch_power_profile+0x4c/0x6f [amdgpu]
[  176.758995]  amdgpu_amdkfd_set_compute_idle+0x1a/0x1c [amdgpu]
[  176.759056]  kfd_inc_compute_active+0x29/0x2b [amdgpu]
[  176.759117]  register_process+0x11c/0x14f [amdgpu]
[  176.759177]  pqm_create_queue+0x20b/0x527 [amdgpu]
[  176.759234]  kfd_ioctl_create_queue+0x4aa/0x5e5 [amdgpu]
[  176.759294]  kfd_ioctl+0x235/0x366 [amdgpu]

Signed-off-by: Huang Rui 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
index f7c0ae6..d3962e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c
@@ -1123,7 +1123,8 @@ int amdgpu_dpm_switch_power_profile(struct amdgpu_device *adev,
 		ret = smu_switch_power_profile(&adev->smu, type, en);
 	else if (adev->powerplay.pp_funcs &&
 		 adev->powerplay.pp_funcs->switch_power_profile)
-		ret = adev->powerplay.pp_funcs->switch_power_profile(adev, type, en);
+		ret = adev->powerplay.pp_funcs->switch_power_profile(adev->powerplay.pp_handle,
+								     type, en);
 
 	return ret;
 }
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: fix modprobe failure of the secondary GPU when GDDR6 training enabled(V5)

2020-01-12 Thread Tianci Yin
From: "Tianci.Yin" 

[why]
In a dual-GPU scenario, stolen_size is set to zero on the secondary GPU,
since there is no pre-OS console using that memory. The bottom region of
VRAM is then allocated as GTT, but a small part of that region is
encroached on by UMC firmware during GDDR6 BIST training, which causes a
page fault.

[how]
Force stolen_size to at least 3MB, so the bottom region of VRAM is
allocated as stolen memory and the GTT corruption is avoided.
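
For reference, the 3MB floor matches a 1024x768, 32bpp pre-OS framebuffer.
A tiny stand-alone sketch of the clamp applied in the diff below (the
constant and the max() logic come from the patch; the helper name is made
up for illustration):

#include <stdio.h>
#include <stdint.h>

/* 1024 * 768 * 4 bytes = 0x300000 = 3MB */
#define STOLEN_BIST_TRAINING_DEFAULT_SIZE (1024ULL * 768ULL * 4ULL)

static uint64_t clamp_stolen_size(uint64_t vbios_fb_size)
{
        /* Secondary GPUs report 0 here; keep at least 3MB so the region
         * touched by GDDR6 BIST training is never handed out as GTT. */
        if (vbios_fb_size < STOLEN_BIST_TRAINING_DEFAULT_SIZE)
                return STOLEN_BIST_TRAINING_DEFAULT_SIZE;
        return vbios_fb_size;
}

int main(void)
{
        printf("secondary GPU: 0x%llx\n",
               (unsigned long long)clamp_stolen_size(0));
        printf("primary GPU:   0x%llx\n",
               (unsigned long long)clamp_stolen_size(9ULL << 20));
        return 0;
}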

Change-Id: I310a72ba0402994defbe50839842a8edb025a868
Reviewed-by: Christian König 
Signed-off-by: Tianci.Yin 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h |  5 +
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c  | 27 -
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
index c91dd602d5f1..ede4a0ea0c84 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h
@@ -60,6 +60,11 @@
  */
 #define AMDGPU_GMC_FAULT_TIMEOUT   5000ULL
 
+/*
+ * Default stolen memory size, 1024 * 768 * 4
+ */
+#define AMDGPU_STOLEN_BIST_TRAINING_DEFAULT_SIZE   0x300000ULL
+
 struct firmware;
 
 /*
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 5ad89bb6f3ba..04017057f8ca 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -566,7 +566,12 @@ static int gmc_v10_0_late_init(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
int r;
 
-   amdgpu_bo_late_init(adev);
+   /*
+* Can't free the stolen VGA memory when it might be used for memory
+* training again.
+*/
+   if (!adev->fw_vram_usage.mem_train_support)
+   amdgpu_bo_late_init(adev);
 
r = amdgpu_gmc_allocate_vm_inv_eng(adev);
if (r)
@@ -750,6 +755,19 @@ static int gmc_v10_0_sw_init(void *handle)
 
adev->gmc.stolen_size = gmc_v10_0_get_vbios_fb_size(adev);
 
+   /*
+* In a dual-GPU scenario, stolen_size is set to zero on the secondary
+* GPU, since there is no pre-OS console using that memory. The bottom
+* region of VRAM is then allocated as GTT, but a small part of it is
+* encroached on by UMC firmware during GDDR6 BIST training, which
+* causes a page fault.
+* Fix this by forcing stolen_size to at least 3MB, so the bottom
+* region of VRAM is allocated as stolen memory and never handed out
+* as GTT.
+*/
+   adev->gmc.stolen_size = max(adev->gmc.stolen_size,
+   AMDGPU_STOLEN_BIST_TRAINING_DEFAULT_SIZE);
+
/* Memory manager */
r = amdgpu_bo_init(adev);
if (r)
@@ -789,6 +807,13 @@ static void gmc_v10_0_gart_fini(struct amdgpu_device *adev)
 static int gmc_v10_0_sw_fini(void *handle)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
+   void *stolen_vga_buf;
+
+   /*
+* Free the stolen memory if it wasn't already freed in late_init
+* because of memory training.
+*/
+   amdgpu_bo_free_kernel(&adev->stolen_vga_memory, NULL, &stolen_vga_buf);
 
amdgpu_vm_manager_fini(adev);
gmc_v10_0_gart_fini(adev);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 23/23] drm: Cleanup VBLANK callbacks in struct drm_driver

2020-01-12 Thread Daniel Vetter
On Sun, Jan 12, 2020 at 11:53:12PM +0100, Daniel Vetter wrote:
> On Fri, Jan 10, 2020 at 10:21:27AM +0100, Thomas Zimmermann wrote:
> > All non-legacy users of VBLANK functions in struct drm_driver have been
> > converted to use the respective interfaces in struct drm_crtc_funcs. The
> > remaining users of VBLANK callbacks in struct drm_driver are legacy drivers
> > with userspace modesetting.
> > 
> > There are no users left of get_vblank_timestamp(), so the callback is
> > being removed. The other VBLANK callbacks are being moved to the legacy
> > section at the end of struct drm_driver.
> > 
> > Signed-off-by: Thomas Zimmermann 
> 
> I think sprinkling some WARN_ON (in drm_dev_register or wherever) if new
> drivers try to use the legacy hooks would be really nice. Experience says
> someone is going to copypaste this stuff around forever otherwise.

I meant to add: As a follow-up patch.
-Daniel

> 
> Reviewed-by: Daniel Vetter 
> 
> > ---
> >  drivers/gpu/drm/drm_vblank.c |  39 +-
> >  include/drm/drm_drv.h| 101 ++-
> >  2 files changed, 17 insertions(+), 123 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c
> > index 7cf436a4b908..ceff68474d4d 100644
> > --- a/drivers/gpu/drm/drm_vblank.c
> > +++ b/drivers/gpu/drm/drm_vblank.c
> > @@ -138,10 +138,9 @@ static u32 __get_vblank_counter(struct drm_device 
> > *dev, unsigned int pipe)
> >  
> > if (crtc->funcs->get_vblank_counter)
> > return crtc->funcs->get_vblank_counter(crtc);
> > -   }
> > -
> > -   if (dev->driver->get_vblank_counter)
> > +   } else if (dev->driver->get_vblank_counter) {
> > return dev->driver->get_vblank_counter(dev, pipe);
> > +   }
> >  
> > return drm_vblank_no_hw_counter(dev, pipe);
> >  }
> > @@ -334,8 +333,7 @@ u64 drm_crtc_accurate_vblank_count(struct drm_crtc 
> > *crtc)
> > unsigned long flags;
> >  
> > WARN_ONCE(drm_debug_enabled(DRM_UT_VBL) &&
> > - !crtc->funcs->get_vblank_timestamp &&
> > - !dev->driver->get_vblank_timestamp,
> > + !crtc->funcs->get_vblank_timestamp,
> >   "This function requires support for accurate vblank 
> > timestamps.");
> >  
> > spin_lock_irqsave(&dev->vblank_time_lock, flags);
> > @@ -357,13 +355,11 @@ static void __disable_vblank(struct drm_device *dev, 
> > unsigned int pipe)
> > if (WARN_ON(!crtc))
> > return;
> >  
> > -   if (crtc->funcs->disable_vblank) {
> > +   if (crtc->funcs->disable_vblank)
> > crtc->funcs->disable_vblank(crtc);
> > -   return;
> > -   }
> > +   } else {
> > +   dev->driver->disable_vblank(dev, pipe);
> > }
> > -
> > -   dev->driver->disable_vblank(dev, pipe);
> >  }
> >  
> >  /*
> > @@ -791,9 +787,6 @@ drm_get_last_vbltimestamp(struct drm_device *dev, 
> > unsigned int pipe,
> >  
> > ret = crtc->funcs->get_vblank_timestamp(crtc, &max_error,
> > tvblank, in_vblank_irq);
> > -   } else if (dev->driver->get_vblank_timestamp && (max_error > 0)) {
> > -   ret = dev->driver->get_vblank_timestamp(dev, pipe, _error,
> > -   tvblank, in_vblank_irq);
> > }
> >  
> > /* GPU high precision timestamp query unsupported or failed.
> > @@ -1016,9 +1009,11 @@ static int __enable_vblank(struct drm_device *dev, 
> > unsigned int pipe)
> >  
> > if (crtc->funcs->enable_vblank)
> > return crtc->funcs->enable_vblank(crtc);
> > +   } else if (dev->driver->enable_vblank) {
> > +   return dev->driver->enable_vblank(dev, pipe);
> > }
> >  
> > -   return dev->driver->enable_vblank(dev, pipe);
> > +   return -EINVAL;
> >  }
> >  
> >  static int drm_vblank_enable(struct drm_device *dev, unsigned int pipe)
> > @@ -1109,13 +1104,10 @@ static bool __vblank_disable_immediate(struct 
> > drm_device *dev, unsigned int pipe
> > return false;
> >  
> > crtc = drm_crtc_from_index(dev, pipe);
> > -   if (crtc && crtc->funcs->get_vblank_timestamp)
> > -   return true;
> > -
> > -   if (dev->driver->get_vblank_timestamp)
> > -   return true;
> > +   if (!crtc || !crtc->funcs->get_vblank_timestamp)
> > +   return false;
> >  
> > -   return false;
> > +   return true;
> >  }
> >  
> >  static void drm_vblank_put(struct drm_device *dev, unsigned int pipe)
> > @@ -1798,7 +1790,6 @@ static void drm_handle_vblank_events(struct 
> > drm_device *dev, unsigned int pipe)
> > struct drm_pending_vblank_event *e, *t;
> > ktime_t now;
> > u64 seq;
> > -   bool high_prec;
> >  
> > assert_spin_locked(>event_lock);
> >  
> > @@ -1818,10 +1809,8 @@ static void drm_handle_vblank_events(struct 
> > drm_device *dev, unsigned int pipe)
> > send_vblank_event(dev, e, seq, now);
> > }
> >  
> 

Re: [PATCH 23/23] drm: Cleanup VBLANK callbacks in struct drm_driver

2020-01-12 Thread Daniel Vetter
On Fri, Jan 10, 2020 at 10:21:27AM +0100, Thomas Zimmermann wrote:
> All non-legacy users of VBLANK functions in struct drm_driver have been
> converted to use the respective interfaces in struct drm_crtc_funcs. The
> remaining users of VBLANK callbacks in struct drm_driver are legacy drivers
> with userspace modesetting.
> 
> There are no users left of get_vblank_timestamp(), so the callback is
> being removed. The other VBLANK callbacks are being moved to the legacy
> section at the end of struct drm_driver.
> 
> Signed-off-by: Thomas Zimmermann 

I think sprinkling some WARN_ON (in drm_dev_register or wherever) if new
drivers try to use the legacy hooks would be really nice. Experience says
someone is going to copypaste this stuff around forever otherwise.
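
As a rough illustration of that suggestion (not an actual patch; the
function name, placement and the exact set of hooks to check are all open):

#include <drm/drm_device.h>
#include <drm/drm_drv.h>

/* Hypothetical check, e.g. called from drm_dev_register(): modern
 * (DRIVER_MODESET) drivers should use the drm_crtc_funcs vblank hooks,
 * not the legacy drm_driver ones. */
static void drm_warn_on_legacy_vblank_hooks(struct drm_device *dev)
{
        if (!drm_core_check_feature(dev, DRIVER_MODESET))
                return; /* legacy UMS drivers may still use the old hooks */

        WARN_ON(dev->driver->get_vblank_counter);
        WARN_ON(dev->driver->enable_vblank);
        WARN_ON(dev->driver->disable_vblank);
}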

Reviewed-by: Daniel Vetter 

> ---
>  drivers/gpu/drm/drm_vblank.c |  39 +-
>  include/drm/drm_drv.h| 101 ++-
>  2 files changed, 17 insertions(+), 123 deletions(-)
> 
> diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c
> index 7cf436a4b908..ceff68474d4d 100644
> --- a/drivers/gpu/drm/drm_vblank.c
> +++ b/drivers/gpu/drm/drm_vblank.c
> @@ -138,10 +138,9 @@ static u32 __get_vblank_counter(struct drm_device *dev, 
> unsigned int pipe)
>  
>   if (crtc->funcs->get_vblank_counter)
>   return crtc->funcs->get_vblank_counter(crtc);
> - }
> -
> - if (dev->driver->get_vblank_counter)
> + } else if (dev->driver->get_vblank_counter) {
>   return dev->driver->get_vblank_counter(dev, pipe);
> + }
>  
>   return drm_vblank_no_hw_counter(dev, pipe);
>  }
> @@ -334,8 +333,7 @@ u64 drm_crtc_accurate_vblank_count(struct drm_crtc *crtc)
>   unsigned long flags;
>  
>   WARN_ONCE(drm_debug_enabled(DRM_UT_VBL) &&
> -   !crtc->funcs->get_vblank_timestamp &&
> -   !dev->driver->get_vblank_timestamp,
> +   !crtc->funcs->get_vblank_timestamp,
> "This function requires support for accurate vblank 
> timestamps.");
>  
>   spin_lock_irqsave(&dev->vblank_time_lock, flags);
> @@ -357,13 +355,11 @@ static void __disable_vblank(struct drm_device *dev, 
> unsigned int pipe)
>   if (WARN_ON(!crtc))
>   return;
>  
> - if (crtc->funcs->disable_vblank) {
> + if (crtc->funcs->disable_vblank)
>   crtc->funcs->disable_vblank(crtc);
> - return;
> - }
> + } else {
> + dev->driver->disable_vblank(dev, pipe);
>   }
> -
> - dev->driver->disable_vblank(dev, pipe);
>  }
>  
>  /*
> @@ -791,9 +787,6 @@ drm_get_last_vbltimestamp(struct drm_device *dev, 
> unsigned int pipe,
>  
>   ret = crtc->funcs->get_vblank_timestamp(crtc, &max_error,
>   tvblank, in_vblank_irq);
> - } else if (dev->driver->get_vblank_timestamp && (max_error > 0)) {
> - ret = dev->driver->get_vblank_timestamp(dev, pipe, _error,
> - tvblank, in_vblank_irq);
>   }
>  
>   /* GPU high precision timestamp query unsupported or failed.
> @@ -1016,9 +1009,11 @@ static int __enable_vblank(struct drm_device *dev, 
> unsigned int pipe)
>  
>   if (crtc->funcs->enable_vblank)
>   return crtc->funcs->enable_vblank(crtc);
> + } else if (dev->driver->enable_vblank) {
> + return dev->driver->enable_vblank(dev, pipe);
>   }
>  
> - return dev->driver->enable_vblank(dev, pipe);
> + return -EINVAL;
>  }
>  
>  static int drm_vblank_enable(struct drm_device *dev, unsigned int pipe)
> @@ -1109,13 +1104,10 @@ static bool __vblank_disable_immediate(struct 
> drm_device *dev, unsigned int pipe
>   return false;
>  
>   crtc = drm_crtc_from_index(dev, pipe);
> - if (crtc && crtc->funcs->get_vblank_timestamp)
> - return true;
> -
> - if (dev->driver->get_vblank_timestamp)
> - return true;
> + if (!crtc || !crtc->funcs->get_vblank_timestamp)
> + return false;
>  
> - return false;
> + return true;
>  }
>  
>  static void drm_vblank_put(struct drm_device *dev, unsigned int pipe)
> @@ -1798,7 +1790,6 @@ static void drm_handle_vblank_events(struct drm_device 
> *dev, unsigned int pipe)
>   struct drm_pending_vblank_event *e, *t;
>   ktime_t now;
>   u64 seq;
> - bool high_prec;
>  
>   assert_spin_locked(&dev->event_lock);
>  
> @@ -1818,10 +1809,8 @@ static void drm_handle_vblank_events(struct drm_device 
> *dev, unsigned int pipe)
>   send_vblank_event(dev, e, seq, now);
>   }
>  
> - high_prec = crtc->funcs->get_vblank_timestamp ||
> - dev->driver->get_vblank_timestamp;
> -
> - trace_drm_vblank_event(pipe, seq, now, high_prec);
> + trace_drm_vblank_event(pipe, seq, now,
> +