RE: [PATCH] drm/amd/powerplay: fix the null pointer issue on switching power profile
There was a same fix from Felix to address this. Regards, Evan > -Original Message- > From: amd-gfx On Behalf Of Huang > Rui > Sent: Monday, January 13, 2020 10:35 AM > To: amd-gfx@lists.freedesktop.org > Cc: Deucher, Alexander ; Kuehling, Felix > ; Huang, Ray > Subject: [PATCH] drm/amd/powerplay: fix the null pointer issue on switching > power profile > > Fix the wrong input parameter of powerplay callback > amdgpu_dpm_switch_power_profile which is encountered by kfd test. > > [ 176.758381] BUG: kernel NULL pointer dereference, address: > 0220 [ 176.758409] #PF: supervisor read access in kernel mode > [ 176.758424] #PF: error_code(0x) - not-present page [ 176.758440] PGD > 8003f6eea067 P4D 8003f6eea067 PUD 3ce06c067 PMD 0 > [ 176.758461] Oops: [#1] SMP PTI > [ 176.758473] CPU: 5 PID: 2621 Comm: kfdtest Tainted: G OE > 5.4.0-rc7-custom #1 > [ 176.758496] Hardware name: System manufacturer System Product > Name/TUF Z370-PLUS GAMING, BIOS 0612 03/01/2018 [ 176.758593] RIP: > 0010:pp_dpm_switch_power_profile+0x46/0x1ee [amdgpu] [ 176.758613] > Code: 00 48 89 45 d8 31 c0 48 85 ff 0f 84 9f 01 00 00 48 89 fb 41 > 89 f4 41 89 d5 80 7f 15 00 0f 84 96 01 00 00 48 8b 87 78 02 00 00 <48> 83 b8 > 20 > 02 00 00 00 0f 84 ba 00 00 00 83 fe 05 0f 87 82 01 00 [ 176.758663] RSP: > 0018:a530c12ebb50 EFLAGS: 00010282 [ 176.758678] RAX: > RBX: 9797c0b0 RCX: > [ 176.758698] RDX: 0001 RSI: 0005 RDI: > 9797c0b0 [ 176.758718] RBP: a530c12ebb80 R08: 9797ce221548 > R09: 036c [ 176.758739] R10: 94006a80 R11: > R12: 0005 [ 176.758759] R13: > 0001 R14: 9797fff4dda8 R15: 9797ce221548 [ 176.758779] > FS: 7efe09cc3780() GS:97982694() > knlGS: > [ 176.758802] CS: 0010 DS: ES: CR0: 80050033 > [ 176.758819] CR2: 0220 CR3: 0003cdc70003 CR4: > 003606e0 [ 176.758839] DR0: DR1: > DR2: [ 176.758859] DR3: > DR6: fffe0ff0 DR7: 0400 > [ 176.758879] Call Trace: > [ 176.758932] amdgpu_dpm_switch_power_profile+0x4c/0x6f [amdgpu] > [ 176.758995] amdgpu_amdkfd_set_compute_idle+0x1a/0x1c [amdgpu] > [ 176.759056] kfd_inc_compute_active+0x29/0x2b [amdgpu] [ 176.759117] > register_process+0x11c/0x14f [amdgpu] [ 176.759177] > pqm_create_queue+0x20b/0x527 [amdgpu] [ 176.759234] > kfd_ioctl_create_queue+0x4aa/0x5e5 [amdgpu] [ 176.759294] > kfd_ioctl+0x235/0x366 [amdgpu] > > Signed-off-by: Huang Rui > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c > index f7c0ae6..d3962e2 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c > @@ -1123,7 +1123,8 @@ int amdgpu_dpm_switch_power_profile(struct > amdgpu_device *adev, > ret = smu_switch_power_profile(>smu, type, en); > else if (adev->powerplay.pp_funcs && >adev->powerplay.pp_funcs->switch_power_profile) > - ret = adev->powerplay.pp_funcs->switch_power_profile(adev, > type, en); > + ret = adev->powerplay.pp_funcs->switch_power_profile(adev- > >powerplay.pp_handle, > + type, en); > > return ret; > } > -- > 2.7.4 > > ___ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.free > desktop.org%2Fmailman%2Flistinfo%2Famd- > gfxdata=02%7C01%7Cevan.quan%40amd.com%7Cce7a462c82c94d51a9 > 6908d797d12d9d%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637 > 144797057908962sdata=LLvPWFsLxMKKk2upAUQAaG4E%2FfpNtWjt6G1 > mEpr78kg%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 2/2] drm/amdgpu/debugfs: properly handle runtime pm
Reviewed-by: Evan Quan > -Original Message- > From: amd-gfx On Behalf Of Alex > Deucher > Sent: Saturday, January 11, 2020 7:45 AM > To: amd-gfx@lists.freedesktop.org > Cc: Deucher, Alexander > Subject: [PATCH 2/2] drm/amdgpu/debugfs: properly handle runtime pm > > If driver debugfs files are accessed, power up the GPU when necessary. > > Signed-off-by: Alex Deucher > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 133 > ++-- > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 8 ++ > 2 files changed, 134 insertions(+), 7 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > index 63343bb43049..f24ed9a1a3e5 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > @@ -26,6 +26,7 @@ > #include > #include > #include > +#include > > #include > > @@ -144,10 +145,17 @@ static int amdgpu_debugfs_process_reg_op(bool > read, struct file *f, > > *pos &= (1UL << 22) - 1; > > + r = pm_runtime_get_sync(adev->ddev->dev); > + if (r < 0) > + return r; > + > if (use_bank) { > if ((sh_bank != 0x && sh_bank >= adev- > >gfx.config.max_sh_per_se) || > - (se_bank != 0x && se_bank >= adev- > >gfx.config.max_shader_engines)) > + (se_bank != 0x && se_bank >= adev- > >gfx.config.max_shader_engines)) { > + pm_runtime_mark_last_busy(adev->ddev->dev); > + pm_runtime_put_autosuspend(adev->ddev->dev); > return -EINVAL; > + } > mutex_lock(>grbm_idx_mutex); > amdgpu_gfx_select_se_sh(adev, se_bank, > sh_bank, instance_bank); > @@ -193,6 +201,9 @@ static int amdgpu_debugfs_process_reg_op(bool read, > struct file *f, > if (pm_pg_lock) > mutex_unlock(>pm.mutex); > > + pm_runtime_mark_last_busy(adev->ddev->dev); > + pm_runtime_put_autosuspend(adev->ddev->dev); > + > return result; > } > > @@ -237,13 +248,20 @@ static ssize_t > amdgpu_debugfs_regs_pcie_read(struct file *f, char __user *buf, > if (size & 0x3 || *pos & 0x3) > return -EINVAL; > > + r = pm_runtime_get_sync(adev->ddev->dev); > + if (r < 0) > + return r; > + > while (size) { > uint32_t value; > > value = RREG32_PCIE(*pos >> 2); > r = put_user(value, (uint32_t *)buf); > - if (r) > + if (r) { > + pm_runtime_mark_last_busy(adev->ddev->dev); > + pm_runtime_put_autosuspend(adev->ddev->dev); > return r; > + } > > result += 4; > buf += 4; > @@ -251,6 +269,9 @@ static ssize_t amdgpu_debugfs_regs_pcie_read(struct > file *f, char __user *buf, > size -= 4; > } > > + pm_runtime_mark_last_busy(adev->ddev->dev); > + pm_runtime_put_autosuspend(adev->ddev->dev); > + > return result; > } > > @@ -276,12 +297,19 @@ static ssize_t > amdgpu_debugfs_regs_pcie_write(struct file *f, const char __user > if (size & 0x3 || *pos & 0x3) > return -EINVAL; > > + r = pm_runtime_get_sync(adev->ddev->dev); > + if (r < 0) > + return r; > + > while (size) { > uint32_t value; > > r = get_user(value, (uint32_t *)buf); > - if (r) > + if (r) { > + pm_runtime_mark_last_busy(adev->ddev->dev); > + pm_runtime_put_autosuspend(adev->ddev->dev); > return r; > + } > > WREG32_PCIE(*pos >> 2, value); > > @@ -291,6 +319,9 @@ static ssize_t amdgpu_debugfs_regs_pcie_write(struct > file *f, const char __user > size -= 4; > } > > + pm_runtime_mark_last_busy(adev->ddev->dev); > + pm_runtime_put_autosuspend(adev->ddev->dev); > + > return result; > } > > @@ -316,13 +347,20 @@ static ssize_t > amdgpu_debugfs_regs_didt_read(struct file *f, char __user *buf, > if (size & 0x3 || *pos & 0x3) > return -EINVAL; > > + r = pm_runtime_get_sync(adev->ddev->dev); > + if (r < 0) > + return r; > + > while (size) { > uint32_t value; > > value = RREG32_DIDT(*pos >> 2); > r = put_user(value, (uint32_t *)buf); > - if (r) > + if (r) { > + pm_runtime_mark_last_busy(adev->ddev->dev); > + pm_runtime_put_autosuspend(adev->ddev->dev); > return r; > + } > > result += 4; > buf += 4; > @@ -330,6 +368,9 @@ static ssize_t amdgpu_debugfs_regs_didt_read(struct > file *f, char __user *buf, > size -= 4; > } > > +
RE: [PATCH 1/2] drm/amdgpu/pm: properly handle runtime pm
The "count" from amdgpu_set_pp_feature_status() seems with type of size_t. Then assignment "count = -EINVAL" may be improper. With that confirmed, the patch is reviewed-by: Evan Quan > -Original Message- > From: amd-gfx On Behalf Of Alex > Deucher > Sent: Saturday, January 11, 2020 7:45 AM > To: amd-gfx@lists.freedesktop.org > Cc: Deucher, Alexander > Subject: [PATCH 1/2] drm/amdgpu/pm: properly handle runtime pm > > If power management sysfs or debugfs files are accessed, power up the GPU > when necessary. > > Signed-off-by: Alex Deucher > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 822 ++- > -- > 1 file changed, 614 insertions(+), 208 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c > index 285d460624c8..806e731c1ff4 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c > @@ -37,6 +37,7 @@ > #include > #include > #include > +#include > #include "hwmgr.h" > #define WIDTH_4K 3840 > > @@ -158,10 +159,15 @@ static ssize_t amdgpu_get_dpm_state(struct device > *dev, > struct drm_device *ddev = dev_get_drvdata(dev); > struct amdgpu_device *adev = ddev->dev_private; > enum amd_pm_state_type pm; > + int ret; > > if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) > return 0; > > + ret = pm_runtime_get_sync(ddev->dev); > + if (ret < 0) > + return ret; > + > if (is_support_sw_smu(adev)) { > if (adev->smu.ppt_funcs->get_current_power_state) > pm = smu_get_current_power_state(>smu); > @@ -173,6 +179,9 @@ static ssize_t amdgpu_get_dpm_state(struct device > *dev, > pm = adev->pm.dpm.user_state; > } > > + pm_runtime_mark_last_busy(ddev->dev); > + pm_runtime_put_autosuspend(ddev->dev); > + > return snprintf(buf, PAGE_SIZE, "%s\n", > (pm == POWER_STATE_TYPE_BATTERY) ? "battery" : > (pm == POWER_STATE_TYPE_BALANCED) ? > "balanced" : "performance"); @@ -186,6 +195,7 @@ static ssize_t > amdgpu_set_dpm_state(struct device *dev, > struct drm_device *ddev = dev_get_drvdata(dev); > struct amdgpu_device *adev = ddev->dev_private; > enum amd_pm_state_type state; > + int ret; > > if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) > return -EINVAL; > @@ -201,6 +211,10 @@ static ssize_t amdgpu_set_dpm_state(struct device > *dev, > goto fail; > } > > + ret = pm_runtime_get_sync(ddev->dev); > + if (ret < 0) > + return ret; > + > if (is_support_sw_smu(adev)) { > mutex_lock(>pm.mutex); > adev->pm.dpm.user_state = state; > @@ -212,11 +226,12 @@ static ssize_t amdgpu_set_dpm_state(struct device > *dev, > adev->pm.dpm.user_state = state; > mutex_unlock(>pm.mutex); > > - /* Can't set dpm state when the card is off */ > - if (!(adev->flags & AMD_IS_PX) || > - (ddev->switch_power_state == DRM_SWITCH_POWER_ON)) > - amdgpu_pm_compute_clocks(adev); > + amdgpu_pm_compute_clocks(adev); > } > + pm_runtime_mark_last_busy(ddev->dev); > + pm_runtime_put_autosuspend(ddev->dev); > + > + > fail: > return count; > } > @@ -288,13 +303,14 @@ static ssize_t > amdgpu_get_dpm_forced_performance_level(struct device *dev, > struct drm_device *ddev = dev_get_drvdata(dev); > struct amdgpu_device *adev = ddev->dev_private; > enum amd_dpm_forced_level level = 0xff; > + int ret; > > if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) > return 0; > > - if ((adev->flags & AMD_IS_PX) && > - (ddev->switch_power_state != DRM_SWITCH_POWER_ON)) > - return snprintf(buf, PAGE_SIZE, "off\n"); > + ret = pm_runtime_get_sync(ddev->dev); > + if (ret < 0) > + return ret; > > if (is_support_sw_smu(adev)) > level = smu_get_performance_level(>smu); > @@ -303,6 +319,9 @@ static ssize_t > amdgpu_get_dpm_forced_performance_level(struct device *dev, > else > level = adev->pm.dpm.forced_level; > > + pm_runtime_mark_last_busy(ddev->dev); > + pm_runtime_put_autosuspend(ddev->dev); > + > return snprintf(buf, PAGE_SIZE, "%s\n", > (level == AMD_DPM_FORCED_LEVEL_AUTO) ? "auto" : > (level == AMD_DPM_FORCED_LEVEL_LOW) ? "low" : > @@ -329,11 +348,6 @@ static ssize_t > amdgpu_set_dpm_forced_performance_level(struct device *dev, > if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) > return -EINVAL; > > - /* Can't force performance level when the card is off */ > - if ((adev->flags & AMD_IS_PX) && > - (ddev->switch_power_state != DRM_SWITCH_POWER_ON)) > -
Re: [PATCH] drm/amdkfd: Add a message when SW scheduler is used
On Fri, Jan 10, 2020 at 02:24:29PM -0500, Yong Zhao wrote: > SW scheduler is previously called non HW scheduler, or non HWS. This > message is useful when triaging issues from dmesg. > > Change-Id: I625518c88c043df5f60409d1ca520e7fc032251f > Signed-off-by: Yong Zhao Acked-by: Huang Rui > --- > drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > index d7eb6ac37f62..2870553a2ce0 100644 > --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c > @@ -934,6 +934,7 @@ static void uninitialize(struct device_queue_manager *dqm) > > static int start_nocpsch(struct device_queue_manager *dqm) > { > + pr_info("SW scheduler is used"); > init_interrupts(dqm); > > if (dqm->dev->device_info->asic_family == CHIP_HAWAII) > -- > 2.17.1 > > ___ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cray.huang%40amd.com%7C6b815ca66a604cf8bdc808d79602bf9c%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637142810865612294sdata=smKBx880vj61EOJ3iHXQdSCb3upww6EyHKjD3xfWdX4%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 3/5] drm/amdkfd: use map_queues for hiq on arcturus as well
On Sat, Jan 11, 2020 at 07:05:40AM +0800, Kuehling, Felix wrote: > What happens on Arcturus without this patch? Does it oops with a null > pointer dereference? If yes, then you should squash this patch into > patch 2 to avoid a broken intermediate state. Yes, Arcturus will get a null pointer panic without this patch becasue hiq_mqd_load is not inited in arcturus_kfd2kgd. I will squash this patch into the patch 2. Thanks, Ray > > Regards, > Felix > > On 2020-01-10 1:37 a.m., Huang Rui wrote: > > Align with gfx v9, use the map_queues packet to load hiq MQD. > > > > Signed-off-by: Huang Rui > > --- > > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c | 1 + > > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 6 +++--- > > drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h | 3 +++ > > 3 files changed, 7 insertions(+), 3 deletions(-) > > > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > index 3c11940..8baad42 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_arcturus.c > > @@ -281,6 +281,7 @@ const struct kfd2kgd_calls arcturus_kfd2kgd = { > > .set_pasid_vmid_mapping = kgd_gfx_v9_set_pasid_vmid_mapping, > > .init_interrupts = kgd_gfx_v9_init_interrupts, > > .hqd_load = kgd_gfx_v9_hqd_load, > > + .hiq_mqd_load = kgd_gfx_v9_hiq_mqd_load, > > .hqd_sdma_load = kgd_hqd_sdma_load, > > .hqd_dump = kgd_gfx_v9_hqd_dump, > > .hqd_sdma_dump = kgd_hqd_sdma_dump, > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > > index ab8c23a..d2f9396 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c > > @@ -324,9 +324,9 @@ int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, > > uint32_t pipe_id, > > return 0; > > } > > > > -static int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd, > > - uint32_t pipe_id, uint32_t queue_id, > > - uint32_t doorbell_off) > > +int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd, > > + uint32_t pipe_id, uint32_t queue_id, > > + uint32_t doorbell_off) > > { > > struct amdgpu_device *adev = get_amdgpu_device(kgd); > > struct amdgpu_ring *kiq_ring = >gfx.kiq.ring; > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > > b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > > index 02b1426..32dd1a9 100644 > > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.h > > @@ -33,6 +33,9 @@ int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, > > uint32_t pipe_id, > > uint32_t queue_id, uint32_t __user *wptr, > > uint32_t wptr_shift, uint32_t wptr_mask, > > struct mm_struct *mm); > > +int kgd_gfx_v9_hiq_mqd_load(struct kgd_dev *kgd, void *mqd, > > + uint32_t pipe_id, uint32_t queue_id, > > + uint32_t doorbell_off); > > int kgd_gfx_v9_hqd_dump(struct kgd_dev *kgd, > > uint32_t pipe_id, uint32_t queue_id, > > uint32_t (**dump)[2], uint32_t *n_regs); ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amd/powerplay: fix the null pointer issue on switching power profile
Fix the wrong input parameter of powerplay callback amdgpu_dpm_switch_power_profile which is encountered by kfd test. [ 176.758381] BUG: kernel NULL pointer dereference, address: 0220 [ 176.758409] #PF: supervisor read access in kernel mode [ 176.758424] #PF: error_code(0x) - not-present page [ 176.758440] PGD 8003f6eea067 P4D 8003f6eea067 PUD 3ce06c067 PMD 0 [ 176.758461] Oops: [#1] SMP PTI [ 176.758473] CPU: 5 PID: 2621 Comm: kfdtest Tainted: G OE 5.4.0-rc7-custom #1 [ 176.758496] Hardware name: System manufacturer System Product Name/TUF Z370-PLUS GAMING, BIOS 0612 03/01/2018 [ 176.758593] RIP: 0010:pp_dpm_switch_power_profile+0x46/0x1ee [amdgpu] [ 176.758613] Code: 00 48 89 45 d8 31 c0 48 85 ff 0f 84 9f 01 00 00 48 89 fb 41 89 f4 41 89 d5 80 7f 15 00 0f 84 96 01 00 00 48 8b 87 78 02 00 00 <48> 83 b8 20 02 00 00 00 0f 84 ba 00 00 00 83 fe 05 0f 87 82 01 00 [ 176.758663] RSP: 0018:a530c12ebb50 EFLAGS: 00010282 [ 176.758678] RAX: RBX: 9797c0b0 RCX: [ 176.758698] RDX: 0001 RSI: 0005 RDI: 9797c0b0 [ 176.758718] RBP: a530c12ebb80 R08: 9797ce221548 R09: 036c [ 176.758739] R10: 94006a80 R11: R12: 0005 [ 176.758759] R13: 0001 R14: 9797fff4dda8 R15: 9797ce221548 [ 176.758779] FS: 7efe09cc3780() GS:97982694() knlGS: [ 176.758802] CS: 0010 DS: ES: CR0: 80050033 [ 176.758819] CR2: 0220 CR3: 0003cdc70003 CR4: 003606e0 [ 176.758839] DR0: DR1: DR2: [ 176.758859] DR3: DR6: fffe0ff0 DR7: 0400 [ 176.758879] Call Trace: [ 176.758932] amdgpu_dpm_switch_power_profile+0x4c/0x6f [amdgpu] [ 176.758995] amdgpu_amdkfd_set_compute_idle+0x1a/0x1c [amdgpu] [ 176.759056] kfd_inc_compute_active+0x29/0x2b [amdgpu] [ 176.759117] register_process+0x11c/0x14f [amdgpu] [ 176.759177] pqm_create_queue+0x20b/0x527 [amdgpu] [ 176.759234] kfd_ioctl_create_queue+0x4aa/0x5e5 [amdgpu] [ 176.759294] kfd_ioctl+0x235/0x366 [amdgpu] Signed-off-by: Huang Rui --- drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c index f7c0ae6..d3962e2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dpm.c @@ -1123,7 +1123,8 @@ int amdgpu_dpm_switch_power_profile(struct amdgpu_device *adev, ret = smu_switch_power_profile(>smu, type, en); else if (adev->powerplay.pp_funcs && adev->powerplay.pp_funcs->switch_power_profile) - ret = adev->powerplay.pp_funcs->switch_power_profile(adev, type, en); + ret = adev->powerplay.pp_funcs->switch_power_profile(adev->powerplay.pp_handle, +type, en); return ret; } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: fix modprobe failure of the secondary GPU when GDDR6 training enabled(V5)
From: "Tianci.Yin" [why] In dual GPUs scenario, stolen_size is assigned to zero on the secondary GPU, since there is no pre-OS console using that memory. Then the bottom region of VRAM was allocated as GTT, unfortunately a small region of bottom VRAM was encroached by UMC firmware during GDDR6 BIST training, this cause page fault. [how] Forcing stolen_size to 3MB, then the bottom region of VRAM was allocated as stolen memory, GTT corruption avoid. Change-Id: I310a72ba0402994defbe50839842a8edb025a868 Reviewed-by: Christian König Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h | 5 + drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 27 - 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h index c91dd602d5f1..ede4a0ea0c84 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gmc.h @@ -60,6 +60,11 @@ */ #define AMDGPU_GMC_FAULT_TIMEOUT 5000ULL +/* + * Default stolen memory size, 1024 * 768 * 4 + */ +#define AMDGPU_STOLEN_BIST_TRAINING_DEFAULT_SIZE 0x30ULL + struct firmware; /* diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 5ad89bb6f3ba..04017057f8ca 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -566,7 +566,12 @@ static int gmc_v10_0_late_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; int r; - amdgpu_bo_late_init(adev); + /* +* Can't free the stolen VGA memory when it might be used for memory +* training again. +*/ + if (!adev->fw_vram_usage.mem_train_support) + amdgpu_bo_late_init(adev); r = amdgpu_gmc_allocate_vm_inv_eng(adev); if (r) @@ -750,6 +755,19 @@ static int gmc_v10_0_sw_init(void *handle) adev->gmc.stolen_size = gmc_v10_0_get_vbios_fb_size(adev); + /* +* In dual GPUs scenario, stolen_size is assigned to zero on the +* secondary GPU, since there is no pre-OS console using that memory. +* Then the bottom region of VRAM was allocated as GTT, unfortunately a +* small region of bottom VRAM was encroached by UMC firmware during +* GDDR6 BIST training, this cause page fault. +* The page fault can be fixed by forcing stolen_size to 3MB, then the +* bottom region of VRAM was allocated as stolen memory, GTT corruption +* avoid. +*/ + adev->gmc.stolen_size = max(adev->gmc.stolen_size, + AMDGPU_STOLEN_BIST_TRAINING_DEFAULT_SIZE); + /* Memory manager */ r = amdgpu_bo_init(adev); if (r) @@ -789,6 +807,13 @@ static void gmc_v10_0_gart_fini(struct amdgpu_device *adev) static int gmc_v10_0_sw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + void *stolen_vga_buf; + + /* +* Free the stolen memory if it wasn't already freed in late_init +* because of memory training. +*/ + amdgpu_bo_free_kernel(>stolen_vga_memory, NULL, _vga_buf); amdgpu_vm_manager_fini(adev); gmc_v10_0_gart_fini(adev); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 23/23] drm: Cleanup VBLANK callbacks in struct drm_driver
On Sun, Jan 12, 2020 at 11:53:12PM +0100, Daniel Vetter wrote: > On Fri, Jan 10, 2020 at 10:21:27AM +0100, Thomas Zimmermann wrote: > > All non-legacy users of VBLANK functions in struct drm_driver have been > > converted to use the respective interfaces in struct drm_crtc_funcs. The > > remaining users of VBLANK callbacks in struct drm_driver are legacy drivers > > with userspace modesetting. > > > > There are no users left of get_vblank_timestamp(), so the callback is > > being removed. The other VBLANK callbacks are being moved to the legacy > > section at the end of struct drm_driver. > > > > Signed-off-by: Thomas Zimmermann > > I think sprinkling some WARN_ON (in drm_dev_register or wherever) if new > drivers try to use the legacy hooks would be really nice. Experience says > someone is going to copypaste this stuff around forever otherwise. I meant to add: As a follow-up patch. -Daniel > > Reviewed-by: Daniel Vetter > > > --- > > drivers/gpu/drm/drm_vblank.c | 39 +- > > include/drm/drm_drv.h| 101 ++- > > 2 files changed, 17 insertions(+), 123 deletions(-) > > > > diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c > > index 7cf436a4b908..ceff68474d4d 100644 > > --- a/drivers/gpu/drm/drm_vblank.c > > +++ b/drivers/gpu/drm/drm_vblank.c > > @@ -138,10 +138,9 @@ static u32 __get_vblank_counter(struct drm_device > > *dev, unsigned int pipe) > > > > if (crtc->funcs->get_vblank_counter) > > return crtc->funcs->get_vblank_counter(crtc); > > - } > > - > > - if (dev->driver->get_vblank_counter) > > + } else if (dev->driver->get_vblank_counter) { > > return dev->driver->get_vblank_counter(dev, pipe); > > + } > > > > return drm_vblank_no_hw_counter(dev, pipe); > > } > > @@ -334,8 +333,7 @@ u64 drm_crtc_accurate_vblank_count(struct drm_crtc > > *crtc) > > unsigned long flags; > > > > WARN_ONCE(drm_debug_enabled(DRM_UT_VBL) && > > - !crtc->funcs->get_vblank_timestamp && > > - !dev->driver->get_vblank_timestamp, > > + !crtc->funcs->get_vblank_timestamp, > > "This function requires support for accurate vblank > > timestamps."); > > > > spin_lock_irqsave(>vblank_time_lock, flags); > > @@ -357,13 +355,11 @@ static void __disable_vblank(struct drm_device *dev, > > unsigned int pipe) > > if (WARN_ON(!crtc)) > > return; > > > > - if (crtc->funcs->disable_vblank) { > > + if (crtc->funcs->disable_vblank) > > crtc->funcs->disable_vblank(crtc); > > - return; > > - } > > + } else { > > + dev->driver->disable_vblank(dev, pipe); > > } > > - > > - dev->driver->disable_vblank(dev, pipe); > > } > > > > /* > > @@ -791,9 +787,6 @@ drm_get_last_vbltimestamp(struct drm_device *dev, > > unsigned int pipe, > > > > ret = crtc->funcs->get_vblank_timestamp(crtc, _error, > > tvblank, in_vblank_irq); > > - } else if (dev->driver->get_vblank_timestamp && (max_error > 0)) { > > - ret = dev->driver->get_vblank_timestamp(dev, pipe, _error, > > - tvblank, in_vblank_irq); > > } > > > > /* GPU high precision timestamp query unsupported or failed. > > @@ -1016,9 +1009,11 @@ static int __enable_vblank(struct drm_device *dev, > > unsigned int pipe) > > > > if (crtc->funcs->enable_vblank) > > return crtc->funcs->enable_vblank(crtc); > > + } else if (dev->driver->enable_vblank) { > > + return dev->driver->enable_vblank(dev, pipe); > > } > > > > - return dev->driver->enable_vblank(dev, pipe); > > + return -EINVAL; > > } > > > > static int drm_vblank_enable(struct drm_device *dev, unsigned int pipe) > > @@ -1109,13 +1104,10 @@ static bool __vblank_disable_immediate(struct > > drm_device *dev, unsigned int pipe > > return false; > > > > crtc = drm_crtc_from_index(dev, pipe); > > - if (crtc && crtc->funcs->get_vblank_timestamp) > > - return true; > > - > > - if (dev->driver->get_vblank_timestamp) > > - return true; > > + if (!crtc || !crtc->funcs->get_vblank_timestamp) > > + return false; > > > > - return false; > > + return true; > > } > > > > static void drm_vblank_put(struct drm_device *dev, unsigned int pipe) > > @@ -1798,7 +1790,6 @@ static void drm_handle_vblank_events(struct > > drm_device *dev, unsigned int pipe) > > struct drm_pending_vblank_event *e, *t; > > ktime_t now; > > u64 seq; > > - bool high_prec; > > > > assert_spin_locked(>event_lock); > > > > @@ -1818,10 +1809,8 @@ static void drm_handle_vblank_events(struct > > drm_device *dev, unsigned int pipe) > > send_vblank_event(dev, e, seq, now); > > } > > >
Re: [PATCH 23/23] drm: Cleanup VBLANK callbacks in struct drm_driver
On Fri, Jan 10, 2020 at 10:21:27AM +0100, Thomas Zimmermann wrote: > All non-legacy users of VBLANK functions in struct drm_driver have been > converted to use the respective interfaces in struct drm_crtc_funcs. The > remaining users of VBLANK callbacks in struct drm_driver are legacy drivers > with userspace modesetting. > > There are no users left of get_vblank_timestamp(), so the callback is > being removed. The other VBLANK callbacks are being moved to the legacy > section at the end of struct drm_driver. > > Signed-off-by: Thomas Zimmermann I think sprinkling some WARN_ON (in drm_dev_register or wherever) if new drivers try to use the legacy hooks would be really nice. Experience says someone is going to copypaste this stuff around forever otherwise. Reviewed-by: Daniel Vetter > --- > drivers/gpu/drm/drm_vblank.c | 39 +- > include/drm/drm_drv.h| 101 ++- > 2 files changed, 17 insertions(+), 123 deletions(-) > > diff --git a/drivers/gpu/drm/drm_vblank.c b/drivers/gpu/drm/drm_vblank.c > index 7cf436a4b908..ceff68474d4d 100644 > --- a/drivers/gpu/drm/drm_vblank.c > +++ b/drivers/gpu/drm/drm_vblank.c > @@ -138,10 +138,9 @@ static u32 __get_vblank_counter(struct drm_device *dev, > unsigned int pipe) > > if (crtc->funcs->get_vblank_counter) > return crtc->funcs->get_vblank_counter(crtc); > - } > - > - if (dev->driver->get_vblank_counter) > + } else if (dev->driver->get_vblank_counter) { > return dev->driver->get_vblank_counter(dev, pipe); > + } > > return drm_vblank_no_hw_counter(dev, pipe); > } > @@ -334,8 +333,7 @@ u64 drm_crtc_accurate_vblank_count(struct drm_crtc *crtc) > unsigned long flags; > > WARN_ONCE(drm_debug_enabled(DRM_UT_VBL) && > - !crtc->funcs->get_vblank_timestamp && > - !dev->driver->get_vblank_timestamp, > + !crtc->funcs->get_vblank_timestamp, > "This function requires support for accurate vblank > timestamps."); > > spin_lock_irqsave(>vblank_time_lock, flags); > @@ -357,13 +355,11 @@ static void __disable_vblank(struct drm_device *dev, > unsigned int pipe) > if (WARN_ON(!crtc)) > return; > > - if (crtc->funcs->disable_vblank) { > + if (crtc->funcs->disable_vblank) > crtc->funcs->disable_vblank(crtc); > - return; > - } > + } else { > + dev->driver->disable_vblank(dev, pipe); > } > - > - dev->driver->disable_vblank(dev, pipe); > } > > /* > @@ -791,9 +787,6 @@ drm_get_last_vbltimestamp(struct drm_device *dev, > unsigned int pipe, > > ret = crtc->funcs->get_vblank_timestamp(crtc, _error, > tvblank, in_vblank_irq); > - } else if (dev->driver->get_vblank_timestamp && (max_error > 0)) { > - ret = dev->driver->get_vblank_timestamp(dev, pipe, _error, > - tvblank, in_vblank_irq); > } > > /* GPU high precision timestamp query unsupported or failed. > @@ -1016,9 +1009,11 @@ static int __enable_vblank(struct drm_device *dev, > unsigned int pipe) > > if (crtc->funcs->enable_vblank) > return crtc->funcs->enable_vblank(crtc); > + } else if (dev->driver->enable_vblank) { > + return dev->driver->enable_vblank(dev, pipe); > } > > - return dev->driver->enable_vblank(dev, pipe); > + return -EINVAL; > } > > static int drm_vblank_enable(struct drm_device *dev, unsigned int pipe) > @@ -1109,13 +1104,10 @@ static bool __vblank_disable_immediate(struct > drm_device *dev, unsigned int pipe > return false; > > crtc = drm_crtc_from_index(dev, pipe); > - if (crtc && crtc->funcs->get_vblank_timestamp) > - return true; > - > - if (dev->driver->get_vblank_timestamp) > - return true; > + if (!crtc || !crtc->funcs->get_vblank_timestamp) > + return false; > > - return false; > + return true; > } > > static void drm_vblank_put(struct drm_device *dev, unsigned int pipe) > @@ -1798,7 +1790,6 @@ static void drm_handle_vblank_events(struct drm_device > *dev, unsigned int pipe) > struct drm_pending_vblank_event *e, *t; > ktime_t now; > u64 seq; > - bool high_prec; > > assert_spin_locked(>event_lock); > > @@ -1818,10 +1809,8 @@ static void drm_handle_vblank_events(struct drm_device > *dev, unsigned int pipe) > send_vblank_event(dev, e, seq, now); > } > > - high_prec = crtc->funcs->get_vblank_timestamp || > - dev->driver->get_vblank_timestamp; > - > - trace_drm_vblank_event(pipe, seq, now, high_prec); > + trace_drm_vblank_event(pipe, seq, now, > +