RE: [PATCH 8/8] drm/amdgpu: for nv12 always need smu ip
Please check whether this is needed also for the following code: if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT && !amdgpu_sriov_vf(adev)) amdgpu_device_ip_block_add(adev, _v11_0_ip_block); Other than that, this one and patch1-4 are acked-by: Evan Quan -Original Message- From: amd-gfx On Behalf Of Monk Liu Sent: Thursday, April 23, 2020 3:02 PM To: amd-gfx@lists.freedesktop.org Cc: Liu, Monk Subject: [PATCH 8/8] drm/amdgpu: for nv12 always need smu ip because nv12 SRIOV support one vf mode Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/nv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index 995bdec..9c42316 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -498,8 +498,7 @@ int nv_set_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, _v10_0_ip_block); amdgpu_device_ip_block_add(adev, _ih_ip_block); amdgpu_device_ip_block_add(adev, _v11_0_ip_block); - if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP && - !amdgpu_sriov_vf(adev)) + if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) amdgpu_device_ip_block_add(adev, _v11_0_ip_block); if (adev->enable_virtual_display || amdgpu_sriov_vf(adev)) amdgpu_device_ip_block_add(adev, _virtual_ip_block); -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cevan.quan%40amd.com%7C0776cfa8343f4992679608d7e7543a7e%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637232221889995485sdata=L4BeRmyYxVu4to1tk%2BtQANvWg3fxTtjyv0zroWUJ3jA%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 7/8] drm/amdgpu: skip sysfs node not belong to one vf mode
-Original Message- From: amd-gfx On Behalf Of Monk Liu Sent: Thursday, April 23, 2020 3:02 PM To: amd-gfx@lists.freedesktop.org Cc: Liu, Monk Subject: [PATCH 7/8] drm/amdgpu: skip sysfs node not belong to one vf mode Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 48 -- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c index 49e2e43..c762deb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c @@ -3271,26 +3271,27 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev) return ret; } - - ret = device_create_file(adev->dev, _attr_pp_num_states); - if (ret) { - DRM_ERROR("failed to create device file pp_num_states\n"); - return ret; - } - ret = device_create_file(adev->dev, _attr_pp_cur_state); - if (ret) { - DRM_ERROR("failed to create device file pp_cur_state\n"); - return ret; - } - ret = device_create_file(adev->dev, _attr_pp_force_state); - if (ret) { - DRM_ERROR("failed to create device file pp_force_state\n"); - return ret; - } - ret = device_create_file(adev->dev, _attr_pp_table); - if (ret) { - DRM_ERROR("failed to create device file pp_table\n"); - return ret; + if (!amdgpu_sriov_vf(adev)) { + ret = device_create_file(adev->dev, _attr_pp_num_states); + if (ret) { + DRM_ERROR("failed to create device file pp_num_states\n"); + return ret; + } + ret = device_create_file(adev->dev, _attr_pp_cur_state); + if (ret) { + DRM_ERROR("failed to create device file pp_cur_state\n"); + return ret; + } + ret = device_create_file(adev->dev, _attr_pp_force_state); + if (ret) { + DRM_ERROR("failed to create device file pp_force_state\n"); + return ret; + } + ret = device_create_file(adev->dev, _attr_pp_table); + if (ret) { + DRM_ERROR("failed to create device file pp_table\n"); + return ret; + } } [Evan] Please add this for amdgpu_pm_sysfs_init also. 
ret = device_create_file(adev->dev, _attr_pp_dpm_sclk); @@ -3337,6 +3338,13 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev) return ret; } } + + /* the reset are not needed for SRIOV one vf mode */ [Evan] Typo: 'reset' should be 'rest'. + if (amdgpu_sriov_vf(adev)) { + adev->pm.sysfs_initialized = true; + return ret; + } + if (adev->asic_type != CHIP_ARCTURUS) { ret = device_create_file(adev->dev, _attr_pp_dpm_pcie); if (ret) { -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cevan.quan%40amd.com%7C2fcb995e57fb4ceeb23608d7e7543adc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637232221351766801sdata=waHsAN5irYjlYb%2FrY8UT5h2eLu2wjaEw67DcaWJaUDs%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 6/8] drm/amdgpu: enable one vf mode for nv12
-Original Message- From: amd-gfx On Behalf Of Monk Liu Sent: Thursday, April 23, 2020 3:02 PM To: amd-gfx@lists.freedesktop.org Cc: Liu, Monk Subject: [PATCH 6/8] drm/amdgpu: enable one vf mode for nv12 Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 12 +++- drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 6 +++- drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 49 +- 3 files changed, 52 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 361a5b6..5964d63 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -347,13 +347,13 @@ int smu_get_dpm_freq_by_index(struct smu_context *smu, enum smu_clk_type clk_typ param = (uint32_t)(((clk_id & 0x) << 16) | (level & 0x)); ret = smu_send_smc_msg_with_param(smu, SMU_MSG_GetDpmFreqByIndex, - param, ); + param, value); if (ret) return ret; /* BIT31: 0 - Fine grained DPM, 1 - Dicrete DPM * now, we un-support it */ - *value = param & 0x7fff; + *value = *value & 0x7fff; return ret; } @@ -535,7 +535,6 @@ int smu_update_table(struct smu_context *smu, enum smu_table_id table_index, int int table_id = smu_table_get_index(smu, table_index); uint32_t table_size; int ret = 0; - if (!table_data || table_id >= SMU_TABLE_COUNT || table_id < 0) return -EINVAL; @@ -691,7 +690,6 @@ int smu_feature_is_enabled(struct smu_context *smu, enum smu_feature_mask mask) if (smu->is_apu) return 1; - feature_id = smu_feature_get_index(smu, mask); if (feature_id < 0) return 0; @@ -1339,6 +1337,9 @@ static int smu_hw_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct smu_context *smu = >smu; + if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) + return 0; + ret = smu_start_smc_engine(smu); if (ret) { pr_err("SMU is not ready yet!\n"); @@ -1352,9 +1353,6 @@ static int smu_hw_init(void *handle) smu_set_gfx_cgpg(>smu, true); } - if (amdgpu_sriov_vf(adev) && 
!amdgpu_sriov_is_pp_one_vf(adev)) - return 0; - if (!smu->pm_enabled) return 0; diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index c94270f..2184d24 100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -1817,7 +1817,8 @@ static int navi10_get_power_limit(struct smu_context *smu, int power_src; if (!smu->power_limit) { - if (smu_feature_is_enabled(smu, SMU_FEATURE_PPT_BIT)) { + if (smu_feature_is_enabled(smu, SMU_FEATURE_PPT_BIT) && + !amdgpu_sriov_vf(smu->adev)) { power_src = smu_power_get_index(smu, SMU_POWER_SOURCE_AC); if (power_src < 0) return -EINVAL; @@ -1960,6 +1961,9 @@ static int navi10_set_default_od_settings(struct smu_context *smu, bool initiali OverDriveTable_t *od_table, *boot_od_table; int ret = 0; + if (amdgpu_sriov_vf(smu->adev)) + return 0; + ret = smu_v11_0_set_default_od_settings(smu, initialize, sizeof(OverDriveTable_t)); if (ret) return ret; diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c index a97b296..3e1b3ed 100644 --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c @@ -57,7 +57,7 @@ static int smu_v11_0_send_msg_without_waiting(struct smu_context *smu, uint16_t msg) { struct amdgpu_device *adev = smu->adev; - WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg); + WREG32_SOC15_NO_KIQ(MP1, 0, mmMP1_SMN_C2PMSG_66, msg); return 0; } @@ -65,7 +65,7 @@ static int smu_v11_0_read_arg(struct smu_context *smu, uint32_t *arg) { struct amdgpu_device *adev = smu->adev; - *arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82); + *arg = RREG32_SOC15_NO_KIQ(MP1, 0, mmMP1_SMN_C2PMSG_82); return 0; } @@ -75,7 +75,7 @@ static int smu_v11_0_wait_for_response(struct smu_context *smu) uint32_t cur_value, i, timeout = adev->usec_timeout * 10; for (i = 0; i < timeout; i++) { - cur_value = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90); + cur_value = RREG32_SOC15_NO_KIQ(MP1, 0, 
mmMP1_SMN_C2PMSG_90); if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0) return
RE: [PATCH 5/8] drm/amdgpu: clear the messed up checking logic
Please make this the last one of the series. Other than that, this is acked-by: Evan Quan -Original Message- From: amd-gfx On Behalf Of Monk Liu Sent: Thursday, April 23, 2020 3:02 PM To: amd-gfx@lists.freedesktop.org Cc: Liu, Monk Subject: [PATCH 5/8] drm/amdgpu: clear the messed up checking logic for MI100 + ASICS, we always support SW_SMU for bare-metal and for SRIOV one_vf_mode Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 2bb1e0c..361a5b6 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -571,15 +571,10 @@ bool is_support_sw_smu(struct amdgpu_device *adev) if (adev->asic_type == CHIP_VEGA20) return (amdgpu_dpm == 2) ? true : false; else if (adev->asic_type >= CHIP_ARCTURUS) { - if (amdgpu_sriov_vf(adev) && - !(adev->asic_type == CHIP_ARCTURUS && - amdgpu_sriov_is_pp_one_vf(adev))) - - return false; - else + if (amdgpu_sriov_is_pp_one_vf(adev) || !amdgpu_sriov_vf(adev)) return true; - } else - return false; + } + return false; } bool is_support_sw_smu_xgmi(struct amdgpu_device *adev) -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cevan.quan%40amd.com%7C88cd44199dc3451d709608d7e75438b0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C63723119291353sdata=rDamzIdmtaTo%2FCiEzyIo015JHJcZIV5ZyVLnkrJHcec%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH] drm/amdgpu: check ring type for secure IBs
[AMD Official Use Only - Internal Distribution Only] Reviewed-by: Aaron Liu -- Best Regards Aaron Liu -Original Message- From: amd-gfx On Behalf Of Alex Deucher Sent: Friday, April 24, 2020 4:47 AM To: amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander Subject: [PATCH] drm/amdgpu: check ring type for secure IBs We don't support secure operation on compute rings at the moment so reject them. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index ec2c5e164cd3..b91853fd66d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -161,6 +161,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, return -EINVAL; } + if ((ib->flags & AMDGPU_IB_FLAGS_SECURE) && + (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)) { + dev_err(adev->dev, "secure submissions not supported on compute rings\n"); + return -EINVAL; + } + alloc_size = ring->funcs->emit_frame_size + num_ibs * ring->funcs->emit_ib_size; -- 2.25.3 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Caaron.liu%40amd.com%7C4237b46dff1743425aa708d7e7c78c83%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637232717308400132sdata=0%2BSeNlm5n2%2F3OaMf%2Bw0ArpVcyenjJ2fO%2FwaRymsbZxI%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdgpu: check ring type for secure IBs
Reviewed-by: Andrey Grodzovsky Andrey On 4/23/20 4:47 PM, Alex Deucher wrote: We don't support secure operation on compute rings at the moment so reject them. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index ec2c5e164cd3..b91853fd66d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -161,6 +161,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, return -EINVAL; } + if ((ib->flags & AMDGPU_IB_FLAGS_SECURE) && + (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)) { + dev_err(adev->dev, "secure submissions not supported on compute rings\n"); + return -EINVAL; + } + alloc_size = ring->funcs->emit_frame_size + num_ibs * ring->funcs->emit_ib_size; ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: check ring type for secure IBs
We don't support secure operation on compute rings at the moment so reject them. Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c index ec2c5e164cd3..b91853fd66d3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c @@ -161,6 +161,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned num_ibs, return -EINVAL; } + if ((ib->flags & AMDGPU_IB_FLAGS_SECURE) && + (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)) { + dev_err(adev->dev, "secure submissions not supported on compute rings\n"); + return -EINVAL; + } + alloc_size = ring->funcs->emit_frame_size + num_ibs * ring->funcs->emit_ib_size; -- 2.25.3 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] PCI/P2PDMA: Add additional AMD ZEN root ports to the whitelist
On Mon, Apr 06, 2020 at 03:42:01PM -0400, Alex Deucher wrote: > According to the hw architect, pre-ZEN parts support > p2p writes and ZEN parts support both p2p reads and writes. > > Add entries for Zen parts Raven (0x15d0) and Renoir (0x1630). > > Cc: Christian König > Acked-by: Christian König > Signed-off-by: Alex Deucher Applied with Huang's ack to pci/p2pdma for v5.8, thanks! > --- > drivers/pci/p2pdma.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c > index 9a8a38384121..91a4c987399d 100644 > --- a/drivers/pci/p2pdma.c > +++ b/drivers/pci/p2pdma.c > @@ -282,6 +282,8 @@ static const struct pci_p2pdma_whitelist_entry { > } pci_p2pdma_whitelist[] = { > /* AMD ZEN */ > {PCI_VENDOR_ID_AMD, 0x1450, 0}, > + {PCI_VENDOR_ID_AMD, 0x15d0, 0}, > + {PCI_VENDOR_ID_AMD, 0x1630, 0}, > > /* Intel Xeon E5/Core i7 */ > {PCI_VENDOR_ID_INTEL, 0x3c00, REQ_SAME_HOST_BRIDGE}, > -- > 2.25.1 > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] PCI/P2PDMA: Add additional AMD ZEN root ports to the whitelist
+ Bjorn Can chance I can get this picked up for -next? Thanks, Alex On Mon, Apr 6, 2020 at 3:42 PM Alex Deucher wrote: > > According to the hw architect, pre-ZEN parts support > p2p writes and ZEN parts support both p2p reads and writes. > > Add entries for Zen parts Raven (0x15d0) and Renoir (0x1630). > > Cc: Christian König > Acked-by: Christian König > Signed-off-by: Alex Deucher > --- > drivers/pci/p2pdma.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c > index 9a8a38384121..91a4c987399d 100644 > --- a/drivers/pci/p2pdma.c > +++ b/drivers/pci/p2pdma.c > @@ -282,6 +282,8 @@ static const struct pci_p2pdma_whitelist_entry { > } pci_p2pdma_whitelist[] = { > /* AMD ZEN */ > {PCI_VENDOR_ID_AMD, 0x1450, 0}, > + {PCI_VENDOR_ID_AMD, 0x15d0, 0}, > + {PCI_VENDOR_ID_AMD, 0x1630, 0}, > > /* Intel Xeon E5/Core i7 */ > {PCI_VENDOR_ID_INTEL, 0x3c00, REQ_SAME_HOST_BRIDGE}, > -- > 2.25.1 > ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH][next] drm/amd/display: remove redundant assignment to variable ret
On Thu, Apr 23, 2020 at 10:18 AM Colin King wrote: > > From: Colin Ian King > > The variable ret is being initialized with a value that is never read > and it is being updated later with a new value. The initialization is > redundant and can be removed. > > Addresses-Coverity: ("Unused value") > Signed-off-by: Colin Ian King Applied. Thanks! Alex > --- > drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c > b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c > index d5b306384d79..9ef9e50a34fa 100644 > --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c > +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c > @@ -4231,7 +4231,7 @@ void dpcd_set_source_specific_data(struct dc_link *link) > { > const uint32_t post_oui_delay = 30; // 30ms > uint8_t dspc = 0; > - enum dc_status ret = DC_ERROR_UNEXPECTED; > + enum dc_status ret; > > ret = core_link_read_dpcd(link, DP_DOWN_STREAM_PORT_COUNT, , > sizeof(dspc)); > -- > 2.25.1 > > ___ > dri-devel mailing list > dri-de...@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/dri-devel ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH][next] drm/amd/display: fix incorrect assignment due to a typo
Hi Colin, Sorry for any confusion of this code. I think in this case, it seems like the comment is wrong (but original implementation is somewhat wrong as well). Probably the original code implementation makes it unclear. There are three scenarios: 1. Variable refresh active, targeting a fixed rate In this case, the min = max = fixed rate 2. Variable refresh active, with a variable range In this case, the min = minimum refresh rate of the range. And the max = maximum refresh rate of the range. 3. Variable refresh rate is disabled (The case you are modifying) In the disabled case, we want to indicate to the display that the refresh rate is fixed, so we want to program min = max = the base refresh rate. Today there seems to be an implication that max refresh = base refresh, which is not necessarily true. I guess to make the code more clear and correct, the min and max should both be programmed equal to the base refresh rate (nominal field rate from mod_freesync_calc_nominal_field_rate) Does that make sense? Thanks, Anthony -Original Message- From: Colin King Sent: Thursday, April 23, 2020 10:03 AM To: Wentland, Harry ; Li, Sun peng (Leo) ; Deucher, Alexander ; Koenig, Christian ; Zhou, David(ChunMing) ; David Airlie ; Daniel Vetter ; Koo, Anthony ; amd-gfx@lists.freedesktop.org; dri-de...@lists.freedesktop.org Cc: kernel-janit...@vger.kernel.org; linux-ker...@vger.kernel.org Subject: [PATCH][next] drm/amd/display: fix incorrect assignment due to a typo From: Colin Ian King The assignment to infopacket->sb[7] looks incorrect, the comment states it is the minimum refresh rate yet it is being assigned a value from the maximum refresh rate max_refresh_in_uhz. Fix this by using min_refresh_in_uhz instead. 
Addresses-Coverity: ("Copy-paste error") Fixes: d2bacc38f6ca ("drm/amd/display: Change infopacket type programming") Signed-off-by: Colin Ian King --- drivers/gpu/drm/amd/display/modules/freesync/freesync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c index eb7421e83b86..fe11436536e8 100644 --- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c +++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c @@ -587,7 +587,7 @@ static void build_vrr_infopacket_data_v3(const struct mod_vrr_params *vrr, } else { // Non-fs case, program nominal range /* PB7 = FreeSync Minimum refresh rate (Hz) */ - infopacket->sb[7] = (unsigned char)((vrr->max_refresh_in_uhz + 50) / 100); + infopacket->sb[7] = (unsigned char)((vrr->min_refresh_in_uhz + +50) / 100); /* PB8 = FreeSync Maximum refresh rate (Hz) */ infopacket->sb[8] = (unsigned char)((vrr->max_refresh_in_uhz + 50) / 100); } -- 2.25.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test failed
On Thu, Apr 23, 2020 at 10:55 AM Christian König wrote: > > Yeah, we certainly could try this again. But maybe split that up into > individual patches for gfx7/8/9. > > In other words make it easy to revert if this still doesn't work well on > gfx7 or some other generation. Yeah, unless there is a good reason, I don't think we should do this. IIRC, compute rings randomly fail to recover on a lot of hw generations. Alex > > Christian. > > Am 23.04.20 um 15:43 schrieb Zhang, Hawking: > > [AMD Official Use Only - Internal Distribution Only] > > > > Would you mind to enable this and try it again? The recent gpu reset > > testing on vega20 looks very positive. > > > > Regards, > > Hawking > > -Original Message- > > From: Christian König > > Sent: Thursday, April 23, 2020 20:31 > > To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org > > Subject: Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test > > failed > > > > Am 23.04.20 um 11:01 schrieb Hawking Zhang: > >> driver should stop cp resume once compute ring test failed > > Mhm intentionally ignored those errors because the compute rings sometimes > > doesn't come up again after a GPU reset. > > > > We even have the necessary logic in the SW scheduler to redirect the jobs > > to another compute ring when one fails to come up again. > > > > Christian. 
> > > >> Change-Id: I4cd3328f38e0755d0c877484936132d204c9fe50 > >> Signed-off-by: Hawking Zhang > >> --- > >>drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +++- > >>drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +++- > >>drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +++- > >>3 files changed, 9 insertions(+), 3 deletions(-) > >> > >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > >> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > >> index b2f10e3..fcee758 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > >> @@ -3132,7 +3132,9 @@ static int gfx_v7_0_cp_compute_resume(struct > >> amdgpu_device *adev) > >> > >> for (i = 0; i < adev->gfx.num_compute_rings; i++) { > >> ring = >gfx.compute_ring[i]; > >> -amdgpu_ring_test_helper(ring); > >> +r = amdgpu_ring_test_helper(ring); > >> +if (r) > >> +return r; > >> } > >> > >> return 0; > >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > >> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > >> index 6c56ced..8dc8e90 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > >> @@ -4781,7 +4781,9 @@ static int gfx_v8_0_cp_test_all_rings(struct > >> amdgpu_device *adev) > >> > >> for (i = 0; i < adev->gfx.num_compute_rings; i++) { > >> ring = >gfx.compute_ring[i]; > >> -amdgpu_ring_test_helper(ring); > >> +r = amdgpu_ring_test_helper(ring); > >> +if (r) > >> +return r; > >> } > >> > >> return 0; > >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > >> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > >> index 09aa5f5..20937059 100644 > >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > >> @@ -3846,7 +3846,9 @@ static int gfx_v9_0_cp_resume(struct > >> amdgpu_device *adev) > >> > >> for (i = 0; i < adev->gfx.num_compute_rings; i++) { > >> ring = >gfx.compute_ring[i]; > >> -amdgpu_ring_test_helper(ring); > >> +r = amdgpu_ring_test_helper(ring); > >> +if (r) > >> +return r; > >> } > >> > >> 
gfx_v9_0_enable_gui_idle_interrupt(adev, true); > > ___ > > amd-gfx mailing list > > amd-gfx@lists.freedesktop.org > > https://lists.freedesktop.org/mailman/listinfo/amd-gfx > > ___ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://lists.freedesktop.org/mailman/listinfo/amd-gfx ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test failed
Yeah, we certainly could try this again. But maybe split that up into individual patches for gfx7/8/9. In other words make it easy to revert if this still doesn't work well on gfx7 or some other generation. Christian. Am 23.04.20 um 15:43 schrieb Zhang, Hawking: [AMD Official Use Only - Internal Distribution Only] Would you mind to enable this and try it again? The recent gpu reset testing on vega20 looks very positive. Regards, Hawking -Original Message- From: Christian König Sent: Thursday, April 23, 2020 20:31 To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org Subject: Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test failed Am 23.04.20 um 11:01 schrieb Hawking Zhang: driver should stop cp resume once compute ring test failed Mhm intentionally ignored those errors because the compute rings sometimes doesn't come up again after a GPU reset. We even have the necessary logic in the SW scheduler to redirect the jobs to another compute ring when one fails to come up again. Christian. 
Change-Id: I4cd3328f38e0755d0c877484936132d204c9fe50 Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +++- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +++- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index b2f10e3..fcee758 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -3132,7 +3132,9 @@ static int gfx_v7_0_cp_compute_resume(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = >gfx.compute_ring[i]; - amdgpu_ring_test_helper(ring); + r = amdgpu_ring_test_helper(ring); + if (r) + return r; } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 6c56ced..8dc8e90 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -4781,7 +4781,9 @@ static int gfx_v8_0_cp_test_all_rings(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = >gfx.compute_ring[i]; - amdgpu_ring_test_helper(ring); + r = amdgpu_ring_test_helper(ring); + if (r) + return r; } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 09aa5f5..20937059 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3846,7 +3846,9 @@ static int gfx_v9_0_cp_resume(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = >gfx.compute_ring[i]; - amdgpu_ring_test_helper(ring); + r = amdgpu_ring_test_helper(ring); + if (r) + return r; } gfx_v9_0_enable_gui_idle_interrupt(adev, true); ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH][next] drm/amd/display: remove redundant assignment to variable ret
From: Colin Ian King The variable ret is being initialized with a value that is never read and it is being updated later with a new value. The initialization is redundant and can be removed. Addresses-Coverity: ("Unused value") Signed-off-by: Colin Ian King --- drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index d5b306384d79..9ef9e50a34fa 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -4231,7 +4231,7 @@ void dpcd_set_source_specific_data(struct dc_link *link) { const uint32_t post_oui_delay = 30; // 30ms uint8_t dspc = 0; - enum dc_status ret = DC_ERROR_UNEXPECTED; + enum dc_status ret; ret = core_link_read_dpcd(link, DP_DOWN_STREAM_PORT_COUNT, , sizeof(dspc)); -- 2.25.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH][next] drm/amd/display: fix incorrect assignment due to a typo
From: Colin Ian King The assignment to infopacket->sb[7] looks incorrect, the comment states it is the minimum refresh rate yet it is being assigned a value from the maximum refresh rate max_refresh_in_uhz. Fix this by using min_refresh_in_uhz instead. Addresses-Coverity: ("Copy-paste error") Fixes: d2bacc38f6ca ("drm/amd/display: Change infopacket type programming") Signed-off-by: Colin Ian King --- drivers/gpu/drm/amd/display/modules/freesync/freesync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c index eb7421e83b86..fe11436536e8 100644 --- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c +++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c @@ -587,7 +587,7 @@ static void build_vrr_infopacket_data_v3(const struct mod_vrr_params *vrr, } else { // Non-fs case, program nominal range /* PB7 = FreeSync Minimum refresh rate (Hz) */ - infopacket->sb[7] = (unsigned char)((vrr->max_refresh_in_uhz + 50) / 100); + infopacket->sb[7] = (unsigned char)((vrr->min_refresh_in_uhz + 50) / 100); /* PB8 = FreeSync Maximum refresh rate (Hz) */ infopacket->sb[8] = (unsigned char)((vrr->max_refresh_in_uhz + 50) / 100); } -- 2.25.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test failed
[AMD Official Use Only - Internal Distribution Only] Would you mind to enable this and try it again? The recent gpu reset testing on vega20 looks very positive. Regards, Hawking -Original Message- From: Christian König Sent: Thursday, April 23, 2020 20:31 To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org Subject: Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test failed Am 23.04.20 um 11:01 schrieb Hawking Zhang: > driver should stop cp resume once compute ring test failed Mhm intentionally ignored those errors because the compute rings sometimes doesn't come up again after a GPU reset. We even have the necessary logic in the SW scheduler to redirect the jobs to another compute ring when one fails to come up again. Christian. > > Change-Id: I4cd3328f38e0755d0c877484936132d204c9fe50 > Signed-off-by: Hawking Zhang > --- > drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +++- > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +++- > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +++- > 3 files changed, 9 insertions(+), 3 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > index b2f10e3..fcee758 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c > @@ -3132,7 +3132,9 @@ static int gfx_v7_0_cp_compute_resume(struct > amdgpu_device *adev) > > for (i = 0; i < adev->gfx.num_compute_rings; i++) { > ring = >gfx.compute_ring[i]; > - amdgpu_ring_test_helper(ring); > + r = amdgpu_ring_test_helper(ring); > + if (r) > + return r; > } > > return 0; > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > index 6c56ced..8dc8e90 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c > @@ -4781,7 +4781,9 @@ static int gfx_v8_0_cp_test_all_rings(struct > amdgpu_device *adev) > > for (i = 0; i < adev->gfx.num_compute_rings; i++) { > ring = >gfx.compute_ring[i]; > - amdgpu_ring_test_helper(ring); > + r 
= amdgpu_ring_test_helper(ring); > + if (r) > + return r; > } > > return 0; > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > index 09aa5f5..20937059 100644 > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c > @@ -3846,7 +3846,9 @@ static int gfx_v9_0_cp_resume(struct > amdgpu_device *adev) > > for (i = 0; i < adev->gfx.num_compute_rings; i++) { > ring = >gfx.compute_ring[i]; > - amdgpu_ring_test_helper(ring); > + r = amdgpu_ring_test_helper(ring); > + if (r) > + return r; > } > > gfx_v9_0_enable_gui_idle_interrupt(adev, true); ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test failed
Am 23.04.20 um 11:01 schrieb Hawking Zhang: driver should stop cp resume once compute ring test failed Mhm intentionally ignored those errors because the compute rings sometimes doesn't come up again after a GPU reset. We even have the necessary logic in the SW scheduler to redirect the jobs to another compute ring when one fails to come up again. Christian. Change-Id: I4cd3328f38e0755d0c877484936132d204c9fe50 Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +++- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +++- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index b2f10e3..fcee758 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -3132,7 +3132,9 @@ static int gfx_v7_0_cp_compute_resume(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = &adev->gfx.compute_ring[i]; - amdgpu_ring_test_helper(ring); + r = amdgpu_ring_test_helper(ring); + if (r) + return r; } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 6c56ced..8dc8e90 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -4781,7 +4781,9 @@ static int gfx_v8_0_cp_test_all_rings(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = &adev->gfx.compute_ring[i]; - amdgpu_ring_test_helper(ring); + r = amdgpu_ring_test_helper(ring); + if (r) + return r; } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 09aa5f5..20937059 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3846,7 +3846,9 @@ static int gfx_v9_0_cp_resume(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = &adev->gfx.compute_ring[i]; - 
amdgpu_ring_test_helper(ring); + r = amdgpu_ring_test_helper(ring); + if (r) + return r; } gfx_v9_0_enable_gui_idle_interrupt(adev, true); ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH] drm/amdgpu: protect ring overrun
Hi Christian Thanks. I will remove the initialization of r. Best Regards Yintian Tao -Original Message- From: Christian König Sent: 2020年4月23日 20:22 To: Tao, Yintian ; Koenig, Christian ; Liu, Monk ; Liu, Shaoyun Cc: amd-gfx@lists.freedesktop.org Subject: Re: [PATCH] drm/amdgpu: protect ring overrun Am 23.04.20 um 11:06 schrieb Yintian Tao: > Wait for the oldest sequence on the ring to be signaled in order to > make sure there will be no command overrun. > > v2: fix coding stype and remove abs operation One nit pick below, with that fixed the patch is Reviewed-by: Christian König > > Signed-off-by: Yintian Tao > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 10 +- > drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 22 ++ > drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 3 ++- > drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 8 +++- > drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 1 - > drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 1 - > drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 14 +++--- > drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c| 8 +++- > drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 +++- > 9 files changed, 61 insertions(+), 14 deletions(-) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > index 7531527067df..397bd5fa77cb 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c > @@ -192,14 +192,22 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct > dma_fence **f, >* Used For polling fence. >* Returns 0 on success, -ENOMEM on failure. >*/ > -int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s) > +int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s, > + uint32_t timeout) > { > uint32_t seq; > + signed long r = 0; Please drop the initialization of r here. That is usually seen as rather bad style because it prevents the compiler from raising an warning when this really isn't initialized. Regards, Christian. 
> > if (!s) > return -EINVAL; > > seq = ++ring->fence_drv.sync_seq; > + r = amdgpu_fence_wait_polling(ring, > + seq - ring->fence_drv.num_fences_mask, > + timeout); > + if (r < 1) > + return -ETIMEDOUT; > + > amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, > seq, 0); > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > index a721b0e0ff69..0103acc57474 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c > @@ -675,13 +675,15 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device > *adev, uint32_t reg) > > spin_lock_irqsave(>ring_lock, flags); > if (amdgpu_device_wb_get(adev, _val_offs)) { > - spin_unlock_irqrestore(>ring_lock, flags); > pr_err("critical bug! too many kiq readers\n"); > - goto failed_kiq_read; > + goto failed_unlock; > } > amdgpu_ring_alloc(ring, 32); > amdgpu_ring_emit_rreg(ring, reg, reg_val_offs); > - amdgpu_fence_emit_polling(ring, ); > + r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT); > + if (r) > + goto failed_undo; > + > amdgpu_ring_commit(ring); > spin_unlock_irqrestore(>ring_lock, flags); > > @@ -712,7 +714,13 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, > uint32_t reg) > amdgpu_device_wb_free(adev, reg_val_offs); > return value; > > +failed_undo: > + amdgpu_ring_undo(ring); > +failed_unlock: > + spin_unlock_irqrestore(>ring_lock, flags); > failed_kiq_read: > + if (reg_val_offs) > + amdgpu_device_wb_free(adev, reg_val_offs); > pr_err("failed to read reg:%x\n", reg); > return ~0; > } > @@ -730,7 +738,10 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, > uint32_t reg, uint32_t v) > spin_lock_irqsave(>ring_lock, flags); > amdgpu_ring_alloc(ring, 32); > amdgpu_ring_emit_wreg(ring, reg, v); > - amdgpu_fence_emit_polling(ring, ); > + r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT); > + if (r) > + goto failed_undo; > + > amdgpu_ring_commit(ring); > spin_unlock_irqrestore(>ring_lock, flags); > > @@ -759,6 +770,9 @@ void 
amdgpu_kiq_wreg(struct amdgpu_device *adev, > uint32_t reg, uint32_t v) > > return; > > +failed_undo: > + amdgpu_ring_undo(ring); > + spin_unlock_irqrestore(>ring_lock, flags); > failed_kiq_write: > pr_err("failed to write reg:%x\n", reg); > } > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > index 137d3d2b46e8..be218754629a 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h > +++
Re: [PATCH 4/8] drm/amdgpu: provide RREG32_SOC15_NO_KIQ, will be used later
Am 23.04.20 um 09:01 schrieb Monk Liu: Signed-off-by: Monk Liu Yeah, I also stumbled over that recently. Patch is Acked-by: Christian König . --- drivers/gpu/drm/amd/amdgpu/soc15_common.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h b/drivers/gpu/drm/amd/amdgpu/soc15_common.h index c893c64..56d02aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h @@ -35,6 +35,9 @@ #define RREG32_SOC15(ip, inst, reg) \ RREG32(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg) +#define RREG32_SOC15_NO_KIQ(ip, inst, reg) \ + RREG32_NO_KIQ(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg) + #define RREG32_SOC15_OFFSET(ip, inst, reg, offset) \ RREG32((adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg) + offset) ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdgpu: protect ring overrun
Am 23.04.20 um 11:06 schrieb Yintian Tao: Wait for the oldest sequence on the ring to be signaled in order to make sure there will be no command overrun. v2: fix coding stype and remove abs operation One nit pick below, with that fixed the patch is Reviewed-by: Christian König Signed-off-by: Yintian Tao --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 10 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 22 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 8 +++- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 1 - drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 1 - drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 14 +++--- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c| 8 +++- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 +++- 9 files changed, 61 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 7531527067df..397bd5fa77cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -192,14 +192,22 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, * Used For polling fence. * Returns 0 on success, -ENOMEM on failure. */ -int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s) +int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s, + uint32_t timeout) { uint32_t seq; + signed long r = 0; Please drop the initialization of r here. That is usually seen as rather bad style because it prevents the compiler from raising an warning when this really isn't initialized. Regards, Christian. 
if (!s) return -EINVAL; seq = ++ring->fence_drv.sync_seq; + r = amdgpu_fence_wait_polling(ring, + seq - ring->fence_drv.num_fences_mask, + timeout); + if (r < 1) + return -ETIMEDOUT; + amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, seq, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index a721b0e0ff69..0103acc57474 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -675,13 +675,15 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) spin_lock_irqsave(>ring_lock, flags); if (amdgpu_device_wb_get(adev, _val_offs)) { - spin_unlock_irqrestore(>ring_lock, flags); pr_err("critical bug! too many kiq readers\n"); - goto failed_kiq_read; + goto failed_unlock; } amdgpu_ring_alloc(ring, 32); amdgpu_ring_emit_rreg(ring, reg, reg_val_offs); - amdgpu_fence_emit_polling(ring, ); + r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT); + if (r) + goto failed_undo; + amdgpu_ring_commit(ring); spin_unlock_irqrestore(>ring_lock, flags); @@ -712,7 +714,13 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) amdgpu_device_wb_free(adev, reg_val_offs); return value; +failed_undo: + amdgpu_ring_undo(ring); +failed_unlock: + spin_unlock_irqrestore(>ring_lock, flags); failed_kiq_read: + if (reg_val_offs) + amdgpu_device_wb_free(adev, reg_val_offs); pr_err("failed to read reg:%x\n", reg); return ~0; } @@ -730,7 +738,10 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) spin_lock_irqsave(>ring_lock, flags); amdgpu_ring_alloc(ring, 32); amdgpu_ring_emit_wreg(ring, reg, v); - amdgpu_fence_emit_polling(ring, ); + r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT); + if (r) + goto failed_undo; + amdgpu_ring_commit(ring); spin_unlock_irqrestore(>ring_lock, flags); @@ -759,6 +770,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) return; +failed_undo: + amdgpu_ring_undo(ring); + 
spin_unlock_irqrestore(>ring_lock, flags); failed_kiq_write: pr_err("failed to write reg:%x\n", reg); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 137d3d2b46e8..be218754629a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -118,7 +118,8 @@ void amdgpu_fence_driver_suspend(struct amdgpu_device *adev); void amdgpu_fence_driver_resume(struct amdgpu_device *adev); int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, unsigned flags); -int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s); +int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s, +
Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset
Am 23.04.20 um 12:44 schrieb Zhao, Jiange: [AMD Official Use Only - Internal Distribution Only] Hi Christian, Here are some explanations, (1) registered means that an app is listening to this node, completion means that this app has finished a dump. Yeah and both are the same thing. E.g. when an app is listening the reset code needs to wait and when no app is listening it doesn't. We could rename the member to something like app_listening if that makes things more clearer. (2) after a dump, listening app would close this node. If it wants to get another reset hang, it needs to open the node again. In this way, release() function can wrap up and make it ready for another dump. Yeah, but you are forgetting to reset the completion structure. With the current implementation the second timeout will reset the GPU immediately without waiting for the app to do the core dump. (3) At the same time, considering the use case of this node, I believe that only the first GPU reset is worthy of a dump. That is probably not such a bad idea, but we should leave this decision to the end user which can either restart or app or leave it like this. (4) I didn't implement race condition guard because I believe that this node caters for a cautious super-user and a single client is enough to do all the work. I can add the logic if you think it is necessary. That is a very dangerous assumption you made here. All kernel code must in general be free of such things. Regards, Christian. Jiange -Original Message- From: Koenig, Christian Sent: Thursday, April 23, 2020 4:53 PM To: Zhao, Jiange ; amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix ; Pelloux-prayer, Pierre-eric ; Deucher, Alexander ; Zhang, Hawking ; Liu, Monk ; Zhao, Jiange Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset Am 23.04.20 um 09:19 schrieb jia...@amd.com: From: Jiange Zhao When GPU got timeout, it would notify an interested part of an opportunity to dump info before actual GPU reset. 
A usermode app would open 'autodump' node under debugfs system and poll() for readable/writable. When a GPU reset is due, amdgpu would notify usermode app through wait_queue_head and give it 10 minutes to dump info. After usermode app has done its work, this 'autodump' node is closed. On node closure, amdgpu gets to know the dump is done through the completion that is triggered in release(). There is no write or read callback because necessary info can be obtained through dmesg and umr. Messages back and forth between usermode app and amdgpu are unnecessary. Signed-off-by: Jiange Zhao --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 + drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 + 4 files changed, 97 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index bc1e0fd71a09..a505b547f242 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -724,6 +724,13 @@ struct amd_powerplay { const struct amd_pm_funcs *pp_funcs; }; +struct amdgpu_autodump { + boolregistered; + struct completion completed; Registered and completed seems to have the same meaning. 
+ struct dentry *dentry; + struct wait_queue_head gpu_hang_wait; +}; + #define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_MAX_DF_PERFMONS 4 struct amdgpu_device { @@ -990,6 +997,8 @@ struct amdgpu_device { charproduct_number[16]; charproduct_name[32]; charserial[16]; + + struct amdgpu_autodump autodump; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 1a4894fa3693..cdd4bf00adee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev, return 0; } +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if +defined(CONFIG_DEBUG_FS) + int ret; + unsigned long tmo = 600*HZ; In general please declare constant lines first and variable like "i" or "r" last. + + if (!adev->autodump.registered) + return 0; + + wake_up_interruptible(>autodump.gpu_hang_wait); + + ret = +wait_for_completion_interruptible_timeout(>autodump.completed, +tmo); This is racy, in other words it can happen that a new client opens up the debugfs file without being signaled but blocks the reset here. You could use two completion structures to avoid that. + if (ret == 0) { /*
RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset
[AMD Official Use Only - Internal Distribution Only] Hi Christian, Here are some explanations, (1) registered means that an app is listening to this node, completion means that this app has finished a dump. (2) after a dump, listening app would close this node. If it wants to get another reset hang, it needs to open the node again. In this way, release() function can wrap up and make it ready for another dump. (3) At the same time, considering the use case of this node, I believe that only the first GPU reset is worthy of a dump. (4) I didn't implement race condition guard because I believe that this node caters for a cautious super-user and a single client is enough to do all the work. I can add the logic if you think it is necessary. Jiange -Original Message- From: Koenig, Christian Sent: Thursday, April 23, 2020 4:53 PM To: Zhao, Jiange ; amd-gfx@lists.freedesktop.org Cc: Kuehling, Felix ; Pelloux-prayer, Pierre-eric ; Deucher, Alexander ; Zhang, Hawking ; Liu, Monk ; Zhao, Jiange Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset Am 23.04.20 um 09:19 schrieb jia...@amd.com: > From: Jiange Zhao > > When GPU got timeout, it would notify an interested part of an > opportunity to dump info before actual GPU reset. > > A usermode app would open 'autodump' node under debugfs system and > poll() for readable/writable. When a GPU reset is due, amdgpu would > notify usermode app through wait_queue_head and give it 10 minutes to > dump info. > > After usermode app has done its work, this 'autodump' node is closed. > On node closure, amdgpu gets to know the dump is done through the > completion that is triggered in release(). > > There is no write or read callback because necessary info can be > obtained through dmesg and umr. Messages back and forth between > usermode app and amdgpu are unnecessary. 
> > Signed-off-by: Jiange Zhao > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 + > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 + > 4 files changed, 97 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index bc1e0fd71a09..a505b547f242 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -724,6 +724,13 @@ struct amd_powerplay { > const struct amd_pm_funcs *pp_funcs; > }; > > +struct amdgpu_autodump { > + boolregistered; > + struct completion completed; Registered and completed seems to have the same meaning. > + struct dentry *dentry; > + struct wait_queue_head gpu_hang_wait; > +}; > + > #define AMDGPU_RESET_MAGIC_NUM 64 > #define AMDGPU_MAX_DF_PERFMONS 4 > struct amdgpu_device { > @@ -990,6 +997,8 @@ struct amdgpu_device { > charproduct_number[16]; > charproduct_name[32]; > charserial[16]; > + > + struct amdgpu_autodump autodump; > }; > > static inline struct amdgpu_device *amdgpu_ttm_adev(struct > ttm_bo_device *bdev) diff --git > a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > index 1a4894fa3693..cdd4bf00adee 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > @@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev, > return 0; > } > > +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if > +defined(CONFIG_DEBUG_FS) > + int ret; > + unsigned long tmo = 600*HZ; In general please declare constant lines first and variable like "i" or "r" last. 
> + > + if (!adev->autodump.registered) > + return 0; > + > + wake_up_interruptible(>autodump.gpu_hang_wait); > + > + ret = > +wait_for_completion_interruptible_timeout(>autodump.completed, > +tmo); This is racy, in other words it can happen that a new client opens up the debugfs file without being signaled but blocks the reset here. You could use two completion structures to avoid that. > + if (ret == 0) { /* time out and dump tool still not finish its dump*/ > + pr_err("autodump: timeout before dump finished, move on to gpu > recovery\n"); > + return -ETIMEDOUT; > + } > +#endif > + return 0; > +} > + > #if defined(CONFIG_DEBUG_FS) > > +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct > +file *file) { > + int ret; > + struct amdgpu_device *adev; > + > + ret = simple_open(inode, file); > + if (ret) > + return ret; > + > + adev = file->private_data; > + if (adev->autodump.registered == true) > + return -EINVAL; Probably better to return -EBUSY here.
Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset
Hi, The build fails for me with this patch applied (poll_wait, POLLIN, POLLRDNORM and POLLWRNORM are undeclared). Adding "#include " to amdgpu_debugfs.c fixes it. Pierre-Eric On 23/04/2020 09:19, jia...@amd.com wrote: > From: Jiange Zhao > > When GPU got timeout, it would notify an interested part > of an opportunity to dump info before actual GPU reset. > > A usermode app would open 'autodump' node under debugfs system > and poll() for readable/writable. When a GPU reset is due, > amdgpu would notify usermode app through wait_queue_head and give > it 10 minutes to dump info. > > After usermode app has done its work, this 'autodump' node is closed. > On node closure, amdgpu gets to know the dump is done through > the completion that is triggered in release(). > > There is no write or read callback because necessary info can be > obtained through dmesg and umr. Messages back and forth between > usermode app and amdgpu are unnecessary. > > Signed-off-by: Jiange Zhao > --- > drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 +++ > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 + > drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 1 + > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 + > 4 files changed, 97 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > index bc1e0fd71a09..a505b547f242 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h > @@ -724,6 +724,13 @@ struct amd_powerplay { > const struct amd_pm_funcs *pp_funcs; > }; > > +struct amdgpu_autodump { > + boolregistered; > + struct completion completed; > + struct dentry *dentry; > + struct wait_queue_head gpu_hang_wait; > +}; > + > #define AMDGPU_RESET_MAGIC_NUM 64 > #define AMDGPU_MAX_DF_PERFMONS 4 > struct amdgpu_device { > @@ -990,6 +997,8 @@ struct amdgpu_device { > charproduct_number[16]; > charproduct_name[32]; > charserial[16]; > + > + struct amdgpu_autodump autodump; > }; > > static inline struct amdgpu_device 
*amdgpu_ttm_adev(struct ttm_bo_device > *bdev) > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > index 1a4894fa3693..cdd4bf00adee 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c > @@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev, > return 0; > } > > +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) > +{ > +#if defined(CONFIG_DEBUG_FS) > + int ret; > + unsigned long tmo = 600*HZ; > + > + if (!adev->autodump.registered) > + return 0; > + > + wake_up_interruptible(>autodump.gpu_hang_wait); > + > + ret = > wait_for_completion_interruptible_timeout(>autodump.completed, tmo); > + if (ret == 0) { /* time out and dump tool still not finish its dump*/ > + pr_err("autodump: timeout before dump finished, move on to gpu > recovery\n"); > + return -ETIMEDOUT; > + } > +#endif > + return 0; > +} > + > #if defined(CONFIG_DEBUG_FS) > > +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file > *file) > +{ > + int ret; > + struct amdgpu_device *adev; > + > + ret = simple_open(inode, file); > + if (ret) > + return ret; > + > + adev = file->private_data; > + if (adev->autodump.registered == true) > + return -EINVAL; > + > + adev->autodump.registered = true; > + > + return 0; > +} > + > +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file > *file) > +{ > + struct amdgpu_device *adev = file->private_data; > + > + complete(>autodump.completed); > + adev->autodump.registered = false; > + > + return 0; > +} > + > +unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct > poll_table_struct *poll_table) > +{ > + struct amdgpu_device *adev = file->private_data; > + > + poll_wait(file, >autodump.gpu_hang_wait, poll_table); > + > + if (adev->in_gpu_reset) > + return POLLIN | POLLRDNORM | POLLWRNORM; > + > + return 0; > +} > + > +static const struct file_operations autodump_debug_fops = { > 
+ .owner = THIS_MODULE, > + .open = amdgpu_debugfs_autodump_open, > + .poll = amdgpu_debugfs_autodump_poll, > + .release = amdgpu_debugfs_autodump_release, > +}; > + > +static int amdgpu_debugfs_autodump_init(struct amdgpu_device *adev) > +{ > + struct dentry *entry; > + > + init_completion(>autodump.completed); > + init_waitqueue_head(>autodump.gpu_hang_wait); > + adev->autodump.registered = false; > + > + entry = debugfs_create_file("autodump", 0600, > + adev->ddev->primary->debugfs_root, >
RE: [PATCH 2/2] drm/amdgpu: drop the unused local variable
Hi Hawking Can you help also remove the same local variable kiq for gfx_v10_0_ring_emit_rreg? Thanks in advance. After that , Reviewed-by: Yintian Tao Best Regards Yintian Tao -Original Message- From: amd-gfx On Behalf Of Hawking Zhang Sent: 2020年4月23日 17:02 To: amd-gfx@lists.freedesktop.org Cc: Zhang, Hawking Subject: [PATCH 2/2] drm/amdgpu: drop the unused local variable local variable kiq won't be used in function gfx_v8_0_ring_emit_rreg Change-Id: I6229987c8ce43ff0d55e1fae15ede9cb0827f76d Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 8dc8e90..9644614 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -6395,7 +6395,6 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg, uint32_t reg_val_offs) { struct amdgpu_device *adev = ring->adev; - struct amdgpu_kiq *kiq = >gfx.kiq; amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4)); amdgpu_ring_write(ring, 0 | /* src: register*/ -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cyintian.tao%40amd.com%7C3e351ebdd7ff45259e6108d7e76502d0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637232293332601376sdata=dH2o%2Bl9%2FjqzP2IIN43qIDrbpmmQpXjPwgGvyaAc%2B1L8%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: protect ring overrun
Wait for the oldest sequence on the ring to be signaled in order to make sure there will be no command overrun. v2: fix coding stype and remove abs operation Signed-off-by: Yintian Tao --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 10 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 22 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 3 ++- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 8 +++- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c| 1 - drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 1 - drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 14 +++--- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c| 8 +++- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 +++- 9 files changed, 61 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 7531527067df..397bd5fa77cb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -192,14 +192,22 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **f, * Used For polling fence. * Returns 0 on success, -ENOMEM on failure. */ -int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s) +int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s, + uint32_t timeout) { uint32_t seq; + signed long r = 0; if (!s) return -EINVAL; seq = ++ring->fence_drv.sync_seq; + r = amdgpu_fence_wait_polling(ring, + seq - ring->fence_drv.num_fences_mask, + timeout); + if (r < 1) + return -ETIMEDOUT; + amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, seq, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index a721b0e0ff69..0103acc57474 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -675,13 +675,15 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) spin_lock_irqsave(>ring_lock, flags); if (amdgpu_device_wb_get(adev, _val_offs)) { - spin_unlock_irqrestore(>ring_lock, flags); pr_err("critical bug! 
too many kiq readers\n"); - goto failed_kiq_read; + goto failed_unlock; } amdgpu_ring_alloc(ring, 32); amdgpu_ring_emit_rreg(ring, reg, reg_val_offs); - amdgpu_fence_emit_polling(ring, ); + r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT); + if (r) + goto failed_undo; + amdgpu_ring_commit(ring); spin_unlock_irqrestore(>ring_lock, flags); @@ -712,7 +714,13 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) amdgpu_device_wb_free(adev, reg_val_offs); return value; +failed_undo: + amdgpu_ring_undo(ring); +failed_unlock: + spin_unlock_irqrestore(>ring_lock, flags); failed_kiq_read: + if (reg_val_offs) + amdgpu_device_wb_free(adev, reg_val_offs); pr_err("failed to read reg:%x\n", reg); return ~0; } @@ -730,7 +738,10 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) spin_lock_irqsave(>ring_lock, flags); amdgpu_ring_alloc(ring, 32); amdgpu_ring_emit_wreg(ring, reg, v); - amdgpu_fence_emit_polling(ring, ); + r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT); + if (r) + goto failed_undo; + amdgpu_ring_commit(ring); spin_unlock_irqrestore(>ring_lock, flags); @@ -759,6 +770,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) return; +failed_undo: + amdgpu_ring_undo(ring); + spin_unlock_irqrestore(>ring_lock, flags); failed_kiq_write: pr_err("failed to write reg:%x\n", reg); } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h index 137d3d2b46e8..be218754629a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h @@ -118,7 +118,8 @@ void amdgpu_fence_driver_suspend(struct amdgpu_device *adev); void amdgpu_fence_driver_resume(struct amdgpu_device *adev); int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence, unsigned flags); -int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s); +int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s, + uint32_t timeout); 
bool amdgpu_fence_process(struct amdgpu_ring *ring); int amdgpu_fence_wait_empty(struct amdgpu_ring *ring); signed long amdgpu_fence_wait_polling(struct amdgpu_ring *ring, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 8c10084f44ef..cbbb8d02535a 100644
[PATCH 2/2] drm/amdgpu: drop the unused local variable
local variable kiq won't be used in function gfx_v8_0_ring_emit_rreg Change-Id: I6229987c8ce43ff0d55e1fae15ede9cb0827f76d Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 8dc8e90..9644614 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -6395,7 +6395,6 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg, uint32_t reg_val_offs) { struct amdgpu_device *adev = ring->adev; - struct amdgpu_kiq *kiq = >gfx.kiq; amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4)); amdgpu_ring_write(ring, 0 | /* src: register*/ -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH 2/2] drm/amdgpu: limit smu_set_mp1_state to pp_one_vf or bare-metal
Drop this one _ Monk Liu|GPU Virtualization Team |AMD -Original Message- From: Monk Liu Sent: Thursday, April 23, 2020 4:13 PM To: amd-gfx@lists.freedesktop.org Cc: Liu, Monk Subject: [PATCH 2/2] drm/amdgpu: limit smu_set_mp1_state to pp_one_vf or bare-metal Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3d601d5..810141f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2465,7 +2465,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) } adev->ip_blocks[i].status.hw = false; /* handle putting the SMC in the appropriate state */ - if(!amdgpu_sriov_vf(adev)){ + if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_is_pp_one_vf(adev)) { if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); if (r) { -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test fails
driver should stop cp resume once compute ring test failed Change-Id: I4cd3328f38e0755d0c877484936132d204c9fe50 Signed-off-by: Hawking Zhang --- drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +++- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +++- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +++- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c index b2f10e3..fcee758 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c @@ -3132,7 +3132,9 @@ static int gfx_v7_0_cp_compute_resume(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = >gfx.compute_ring[i]; - amdgpu_ring_test_helper(ring); + r = amdgpu_ring_test_helper(ring); + if (r) + return r; } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 6c56ced..8dc8e90 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -4781,7 +4781,9 @@ static int gfx_v8_0_cp_test_all_rings(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = >gfx.compute_ring[i]; - amdgpu_ring_test_helper(ring); + r = amdgpu_ring_test_helper(ring); + if (r) + return r; } return 0; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 09aa5f5..20937059 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3846,7 +3846,9 @@ static int gfx_v9_0_cp_resume(struct amdgpu_device *adev) for (i = 0; i < adev->gfx.num_compute_rings; i++) { ring = >gfx.compute_ring[i]; - amdgpu_ring_test_helper(ring); + r = amdgpu_ring_test_helper(ring); + if (r) + return r; } gfx_v9_0_enable_gui_idle_interrupt(adev, true); -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset
Am 23.04.20 um 09:19 schrieb jia...@amd.com: From: Jiange Zhao When GPU got timeout, it would notify an interested part of an opportunity to dump info before actual GPU reset. A usermode app would open 'autodump' node under debugfs system and poll() for readable/writable. When a GPU reset is due, amdgpu would notify usermode app through wait_queue_head and give it 10 minutes to dump info. After usermode app has done its work, this 'autodump' node is closed. On node closure, amdgpu gets to know the dump is done through the completion that is triggered in release(). There is no write or read callback because necessary info can be obtained through dmesg and umr. Messages back and forth between usermode app and amdgpu are unnecessary. Signed-off-by: Jiange Zhao --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 + drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 + 4 files changed, 97 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index bc1e0fd71a09..a505b547f242 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -724,6 +724,13 @@ struct amd_powerplay { const struct amd_pm_funcs *pp_funcs; }; +struct amdgpu_autodump { + boolregistered; + struct completion completed; Registered and completed seems to have the same meaning. 
+ struct dentry *dentry; + struct wait_queue_head gpu_hang_wait; +}; + #define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_MAX_DF_PERFMONS 4 struct amdgpu_device { @@ -990,6 +997,8 @@ struct amdgpu_device { charproduct_number[16]; charproduct_name[32]; charserial[16]; + + struct amdgpu_autodump autodump; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 1a4894fa3693..cdd4bf00adee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev, return 0; } +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) +{ +#if defined(CONFIG_DEBUG_FS) + int ret; + unsigned long tmo = 600*HZ; In general please declare constant lines first and variable like "i" or "r" last. + + if (!adev->autodump.registered) + return 0; + + wake_up_interruptible(>autodump.gpu_hang_wait); + + ret = wait_for_completion_interruptible_timeout(>autodump.completed, tmo); This is racy, in other words it can happen that a new client opens up the debugfs file without being signaled but blocks the reset here. You could use two completion structures to avoid that. + if (ret == 0) { /* time out and dump tool still not finish its dump*/ + pr_err("autodump: timeout before dump finished, move on to gpu recovery\n"); + return -ETIMEDOUT; + } +#endif + return 0; +} + #if defined(CONFIG_DEBUG_FS) +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file) +{ + int ret; + struct amdgpu_device *adev; + + ret = simple_open(inode, file); + if (ret) + return ret; + + adev = file->private_data; + if (adev->autodump.registered == true) + return -EINVAL; Probably better to return -EBUSY here. And this is racy, and might need a lock e.g. multiple clients could open the file at the same time. 
If we use a struct completion for registered we could use the spinlock of that one for protection here. + + adev->autodump.registered = true; You also need to reset the completion structure here otherwise only the first GPU reset would work with this. + + return 0; +} + +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file) +{ + struct amdgpu_device *adev = file->private_data; + + complete(>autodump.completed); + adev->autodump.registered = false; + + return 0; +} + +unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table) +{ + struct amdgpu_device *adev = file->private_data; + + poll_wait(file, >autodump.gpu_hang_wait, poll_table); + + if (adev->in_gpu_reset) + return POLLIN | POLLRDNORM | POLLWRNORM; + + return 0; +} + +static const struct file_operations autodump_debug_fops = { + .owner = THIS_MODULE, + .open = amdgpu_debugfs_autodump_open, + .poll = amdgpu_debugfs_autodump_poll, + .release = amdgpu_debugfs_autodump_release, +}; +
[PATCH 1/2] drm/amdgpu: extend threshold of waiting for FLR_COMPLETE
to 5s to satisfy WHOLE GPU reset which need 3+ seconds to finish Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 2 +- drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h index 52a6975..83b453f5 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h @@ -26,7 +26,7 @@ #define AI_MAILBOX_POLL_ACK_TIMEDOUT 500 #define AI_MAILBOX_POLL_MSG_TIMEDOUT 12000 -#define AI_MAILBOX_POLL_FLR_TIMEDOUT 500 +#define AI_MAILBOX_POLL_FLR_TIMEDOUT 5000 enum idh_request { IDH_REQ_GPU_INIT_ACCESS = 1, diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h index 45bcf43..52605e1 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h @@ -26,7 +26,7 @@ #define NV_MAILBOX_POLL_ACK_TIMEDOUT 500 #define NV_MAILBOX_POLL_MSG_TIMEDOUT 6000 -#define NV_MAILBOX_POLL_FLR_TIMEDOUT 500 +#define NV_MAILBOX_POLL_FLR_TIMEDOUT 5000 enum idh_request { IDH_REQ_GPU_INIT_ACCESS = 1, -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/2] drm/amdgpu: limit smu_set_mp1_state to pp_one_vf or bare-metal
Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 3d601d5..810141f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2465,7 +2465,7 @@ static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev) } adev->ip_blocks[i].status.hw = false; /* handle putting the SMC in the appropriate state */ - if(!amdgpu_sriov_vf(adev)){ + if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_is_pp_one_vf(adev)) { if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) { r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state); if (r) { -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset
Yes, an open file descriptor holds a reference to the driver module. So it shouldn't be possible to unload the driver while it is open. Christian. Am 23.04.20 um 09:54 schrieb Liu, Monk: Oh, looks if the daemon is opening the node KMD don't have a chance to enter the path of shutdown/unload driver, thus no chance to return "kmd unloading" to the app... _ Monk Liu|GPU Virtualization Team |AMD -Original Message- From: amd-gfx On Behalf Of Liu, Monk Sent: Thursday, April 23, 2020 3:52 PM To: Zhao, Jiange ; amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander ; Pelloux-prayer, Pierre-eric ; Kuehling, Felix ; Koenig, Christian ; Zhang, Hawking Subject: RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset Hi Christian Do you think we need to kill the daemon app if we do KMD unloading ? usually user need to close the app first and then the KMD could be unloaded If we don't want to manually shutdown the daemon app we can do a "KILL" signal send to that process, or we can implement "read" and let app call "read()" to fetch information like: 1) xxx process hang 2) kmd unloading And daemon can close() the node if it receives "kmd unloading" instead of doing the dump Thanks _ Monk Liu|GPU Virtualization Team |AMD -Original Message- From: Zhao, Jiange Sent: Thursday, April 23, 2020 3:20 PM To: amd-gfx@lists.freedesktop.org Cc: Koenig, Christian ; Kuehling, Felix ; Pelloux-prayer, Pierre-eric ; Deucher, Alexander ; Zhang, Hawking ; Liu, Monk ; Zhao, Jiange Subject: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset From: Jiange Zhao When GPU got timeout, it would notify an interested part of an opportunity to dump info before actual GPU reset. A usermode app would open 'autodump' node under debugfs system and poll() for readable/writable. When a GPU reset is due, amdgpu would notify usermode app through wait_queue_head and give it 10 minutes to dump info. After usermode app has done its work, this 'autodump' node is closed. 
On node closure, amdgpu gets to know the dump is done through the completion that is triggered in release(). There is no write or read callback because necessary info can be obtained through dmesg and umr. Messages back and forth between usermode app and amdgpu are unnecessary. Signed-off-by: Jiange Zhao --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 + drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 + 4 files changed, 97 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index bc1e0fd71a09..a505b547f242 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -724,6 +724,13 @@ struct amd_powerplay { const struct amd_pm_funcs *pp_funcs; }; +struct amdgpu_autodump { + boolregistered; + struct completion completed; + struct dentry *dentry; + struct wait_queue_head gpu_hang_wait; +}; + #define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_MAX_DF_PERFMONS 4 struct amdgpu_device { @@ -990,6 +997,8 @@ struct amdgpu_device { charproduct_number[16]; charproduct_name[32]; charserial[16]; + + struct amdgpu_autodump autodump; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 1a4894fa3693..cdd4bf00adee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev, return 0; } +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if +defined(CONFIG_DEBUG_FS) + int ret; + unsigned long tmo = 600*HZ; + + if (!adev->autodump.registered) + return 0; + + wake_up_interruptible(>autodump.gpu_hang_wait); + + ret = wait_for_completion_interruptible_timeout(>autodump.completed, tmo); + if (ret == 0) { /* time out and dump tool still not finish its 
dump*/ + pr_err("autodump: timeout before dump finished, move on to gpu recovery\n"); + return -ETIMEDOUT; + } +#endif + return 0; +} + #if defined(CONFIG_DEBUG_FS) +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct +file *file) { + int ret; + struct amdgpu_device *adev; + + ret = simple_open(inode, file); + if (ret) + return ret; + + adev = file->private_data; + if (adev->autodump.registered == true) + return -EINVAL;
RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset
Oh, looks if the daemon is opening the node KMD don't have a chance to enter the path of shutdown/unload driver, thus no chance to return "kmd unloading" to the app... _ Monk Liu|GPU Virtualization Team |AMD -Original Message- From: amd-gfx On Behalf Of Liu, Monk Sent: Thursday, April 23, 2020 3:52 PM To: Zhao, Jiange ; amd-gfx@lists.freedesktop.org Cc: Deucher, Alexander ; Pelloux-prayer, Pierre-eric ; Kuehling, Felix ; Koenig, Christian ; Zhang, Hawking Subject: RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset Hi Christian Do you think we need to kill the daemon app if we do KMD unloading ? usually user need to close the app first and then the KMD could be unloaded If we don't want to manually shutdown the daemon app we can do a "KILL" signal send to that process, or we can implement "read" and let app call "read()" to fetch information like: 1) xxx process hang 2) kmd unloading And daemon can close() the node if it receives "kmd unloading" instead of doing the dump Thanks _ Monk Liu|GPU Virtualization Team |AMD -Original Message- From: Zhao, Jiange Sent: Thursday, April 23, 2020 3:20 PM To: amd-gfx@lists.freedesktop.org Cc: Koenig, Christian ; Kuehling, Felix ; Pelloux-prayer, Pierre-eric ; Deucher, Alexander ; Zhang, Hawking ; Liu, Monk ; Zhao, Jiange Subject: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset From: Jiange Zhao When GPU got timeout, it would notify an interested part of an opportunity to dump info before actual GPU reset. A usermode app would open 'autodump' node under debugfs system and poll() for readable/writable. When a GPU reset is due, amdgpu would notify usermode app through wait_queue_head and give it 10 minutes to dump info. After usermode app has done its work, this 'autodump' node is closed. On node closure, amdgpu gets to know the dump is done through the completion that is triggered in release(). There is no write or read callback because necessary info can be obtained through dmesg and umr. 
Messages back and forth between usermode app and amdgpu are unnecessary. Signed-off-by: Jiange Zhao --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 + drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 + 4 files changed, 97 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index bc1e0fd71a09..a505b547f242 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -724,6 +724,13 @@ struct amd_powerplay { const struct amd_pm_funcs *pp_funcs; }; +struct amdgpu_autodump { + boolregistered; + struct completion completed; + struct dentry *dentry; + struct wait_queue_head gpu_hang_wait; +}; + #define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_MAX_DF_PERFMONS 4 struct amdgpu_device { @@ -990,6 +997,8 @@ struct amdgpu_device { charproduct_number[16]; charproduct_name[32]; charserial[16]; + + struct amdgpu_autodump autodump; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 1a4894fa3693..cdd4bf00adee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev, return 0; } +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if +defined(CONFIG_DEBUG_FS) + int ret; + unsigned long tmo = 600*HZ; + + if (!adev->autodump.registered) + return 0; + + wake_up_interruptible(>autodump.gpu_hang_wait); + + ret = wait_for_completion_interruptible_timeout(>autodump.completed, tmo); + if (ret == 0) { /* time out and dump tool still not finish its dump*/ + pr_err("autodump: timeout before dump finished, move on to gpu recovery\n"); + return -ETIMEDOUT; + } +#endif + return 0; +} + #if defined(CONFIG_DEBUG_FS) +static int 
amdgpu_debugfs_autodump_open(struct inode *inode, struct +file *file) { + int ret; + struct amdgpu_device *adev; + + ret = simple_open(inode, file); + if (ret) + return ret; + + adev = file->private_data; + if (adev->autodump.registered == true) + return -EINVAL; + + adev->autodump.registered = true; + + return 0; +} + +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct +file *file) { + struct amdgpu_device *adev =
[PATCH] drm/amdgpu/display: Fix dc_sink refcnt leak in dc_link_detect_helper
dc_link_detect_helper() invokes dc_sink_retain(), which increases the refcount of the "prev_sink". When dc_link_detect_helper() returns, local variable "prev_sink" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in one exception handling path of dc_link_detect_helper(). When alt mode times out, the function forgets to decrease the refcnt increased by dc_sink_retain(), causing a refcnt leak. Fix this issue by calling dc_sink_release() when alt mode times out. Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan --- drivers/gpu/drm/amd/display/dc/core/dc_link.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c b/drivers/gpu/drm/amd/display/dc/core/dc_link.c index a09119c10d7c..91550d9a1abb 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c @@ -832,6 +832,8 @@ static bool dc_link_detect_helper(struct dc_link *link, /* if alt mode times out, return false */ if (wait_for_alt_mode(link) == false) { + if (prev_sink != NULL) + dc_sink_release(prev_sink); return false; } } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu/display: Fix dc_sink refcnt leak when detecting link
emulated_link_detect() invokes dc_sink_retain(), which increases the refcount of the "prev_sink". When emulated_link_detect() returns, local variable "prev_sink" becomes invalid, so the refcount should be decreased to keep refcount balanced. The reference counting issue happens in all paths of emulated_link_detect(), which forgets to decrease the refcnt increased by dc_sink_retain(), causing a refcnt leak. Fix this issue by adding a "err_sink_put" label and calling dc_sink_release() before emulated_link_detect() returns. Signed-off-by: Xiyu Yang Signed-off-by: Xin Tan --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index e997251a8b57..1b0c4f11b9b1 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1633,7 +1633,7 @@ static void emulated_link_detect(struct dc_link *link) default: DC_ERROR("Invalid connector type! signal:%d\n", link->connector_signal); - return; + goto err_sink_put; } sink_init_data.link = link; @@ -1642,7 +1642,7 @@ static void emulated_link_detect(struct dc_link *link) sink = dc_sink_create(_init_data); if (!sink) { DC_ERROR("Failed to create sink!\n"); - return; + goto err_sink_put; } /* dc_sink_create returns a new reference */ @@ -1655,6 +1655,9 @@ static void emulated_link_detect(struct dc_link *link) if (edid_status != EDID_OK) DC_ERROR("Failed to read EDID"); +err_sink_put: + if (prev_sink != NULL) + dc_sink_release(prev_sink); } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset
Hi Christian Do you think we need to kill the daemon app if we do KMD unloading ? usually user need to close the app first and then the KMD could be unloaded If we don't want to manually shutdown the daemon app we can do a "KILL" signal send to that process, or we can implement "read" and let app call "read()" to fetch information like: 1) xxx process hang 2) kmd unloading And daemon can close() the node if it receives "kmd unloading" instead of doing the dump Thanks _ Monk Liu|GPU Virtualization Team |AMD -Original Message- From: Zhao, Jiange Sent: Thursday, April 23, 2020 3:20 PM To: amd-gfx@lists.freedesktop.org Cc: Koenig, Christian ; Kuehling, Felix ; Pelloux-prayer, Pierre-eric ; Deucher, Alexander ; Zhang, Hawking ; Liu, Monk ; Zhao, Jiange Subject: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset From: Jiange Zhao When GPU got timeout, it would notify an interested part of an opportunity to dump info before actual GPU reset. A usermode app would open 'autodump' node under debugfs system and poll() for readable/writable. When a GPU reset is due, amdgpu would notify usermode app through wait_queue_head and give it 10 minutes to dump info. After usermode app has done its work, this 'autodump' node is closed. On node closure, amdgpu gets to know the dump is done through the completion that is triggered in release(). There is no write or read callback because necessary info can be obtained through dmesg and umr. Messages back and forth between usermode app and amdgpu are unnecessary. 
Signed-off-by: Jiange Zhao --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 + drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 + 4 files changed, 97 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index bc1e0fd71a09..a505b547f242 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -724,6 +724,13 @@ struct amd_powerplay { const struct amd_pm_funcs *pp_funcs; }; +struct amdgpu_autodump { + boolregistered; + struct completion completed; + struct dentry *dentry; + struct wait_queue_head gpu_hang_wait; +}; + #define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_MAX_DF_PERFMONS 4 struct amdgpu_device { @@ -990,6 +997,8 @@ struct amdgpu_device { charproduct_number[16]; charproduct_name[32]; charserial[16]; + + struct amdgpu_autodump autodump; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 1a4894fa3693..cdd4bf00adee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev, return 0; } +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if +defined(CONFIG_DEBUG_FS) + int ret; + unsigned long tmo = 600*HZ; + + if (!adev->autodump.registered) + return 0; + + wake_up_interruptible(>autodump.gpu_hang_wait); + + ret = wait_for_completion_interruptible_timeout(>autodump.completed, tmo); + if (ret == 0) { /* time out and dump tool still not finish its dump*/ + pr_err("autodump: timeout before dump finished, move on to gpu recovery\n"); + return -ETIMEDOUT; + } +#endif + return 0; +} + #if defined(CONFIG_DEBUG_FS) +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct +file *file) { + int ret; + struct 
amdgpu_device *adev; + + ret = simple_open(inode, file); + if (ret) + return ret; + + adev = file->private_data; + if (adev->autodump.registered == true) + return -EINVAL; + + adev->autodump.registered = true; + + return 0; +} + +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct +file *file) { + struct amdgpu_device *adev = file->private_data; + + complete(>autodump.completed); + adev->autodump.registered = false; + + return 0; +} + +unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct +poll_table_struct *poll_table) { + struct amdgpu_device *adev = file->private_data; + + poll_wait(file, >autodump.gpu_hang_wait, poll_table); + + if (adev->in_gpu_reset) + return POLLIN | POLLRDNORM | POLLWRNORM; + + return 0; +} + +static const struct file_operations autodump_debug_fops = { + .owner = THIS_MODULE, + .open =
Re: [PATCH] drm/amdgpu: protect ring overrun
Am 23.04.20 um 06:22 schrieb Yintian Tao: Wait for the oldest sequence on the ring to be signaled in order to make sure there will be no command overrun. One technical problem and a few style suggestions below. Apart from that looks good to me. Signed-off-by: Yintian Tao --- drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 7 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 17 +++-- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 8 +++- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 9 - drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c| 8 +++- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 8 +++- 6 files changed, 51 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c index 7531527067df..5462ea83d8b2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c @@ -200,6 +200,13 @@ int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s) return -EINVAL; seq = ++ring->fence_drv.sync_seq; + if ((abs(seq - ring->fence_drv.num_fences_mask) > + ring->fence_drv.num_fences_mask) && That is a rather bad idea and won't work. The sequence is a 32bit value and supposed to wrap around. Just dropping the extra check should do it should work. + (amdgpu_fence_wait_polling(ring, + seq - ring->fence_drv.num_fences_mask, + MAX_KIQ_REG_WAIT) < 1)) Maybe we should make the timeout a parameter here. And it is usually better to use the style like this: r = amdgpu_fence if (r < 1) . +return -ETIME; We usually use -ETIMEDOUT because this is not really bound to a specific timer. 
+ amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr, seq, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index a721b0e0ff69..7087333681f6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -681,7 +681,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg) } amdgpu_ring_alloc(ring, 32); amdgpu_ring_emit_rreg(ring, reg, reg_val_offs); - amdgpu_fence_emit_polling(ring, ); + r = amdgpu_fence_emit_polling(ring, ); + if (r) { + amdgpu_ring_undo(ring); + amdgpu_device_wb_free(adev, reg_val_offs); + spin_unlock_irqrestore(>ring_lock, flags); + goto failed_kiq_read; + } + If we already have goto style error handling we should probably just add a new label to handle it. Same for the other cases below where you already have a goto. Regards, Christian. amdgpu_ring_commit(ring); spin_unlock_irqrestore(>ring_lock, flags); @@ -730,7 +737,13 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v) spin_lock_irqsave(>ring_lock, flags); amdgpu_ring_alloc(ring, 32); amdgpu_ring_emit_wreg(ring, reg, v); - amdgpu_fence_emit_polling(ring, ); + r = amdgpu_fence_emit_polling(ring, ); + if (r) { + amdgpu_ring_undo(ring); + spin_unlock_irqrestore(>ring_lock, flags); + goto failed_kiq_write; + } + amdgpu_ring_commit(ring); spin_unlock_irqrestore(>ring_lock, flags); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index 8c10084f44ef..12d181ac7e78 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -60,7 +60,13 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device *adev, amdgpu_ring_alloc(ring, 32); amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1, ref, mask); - amdgpu_fence_emit_polling(ring, ); + r = amdgpu_fence_emit_polling(ring, ); + if (r) { + amdgpu_ring_undo(ring); + spin_unlock_irqrestore(>ring_lock, flags); + goto failed_kiq; + 
} + amdgpu_ring_commit(ring); spin_unlock_irqrestore(>ring_lock, flags); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 5b1549f167b0..650b7a67d3bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -4068,7 +4068,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct amdgpu_device *adev) reg_val_offs * 4)); amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr + reg_val_offs * 4)); - amdgpu_fence_emit_polling(ring, ); + r = amdgpu_fence_emit_polling(ring, ); + if (r) { + amdgpu_ring_undo(ring); + amdgpu_device_wb_free(adev,
Re: [PATCH hmm 5/5] mm/hmm: remove the customizable pfn format from hmm_range_fault
On Wed, Apr 22, 2020 at 09:39:11AM -0300, Jason Gunthorpe wrote: > > Nice callchain from hell.. Unfortunately such "code listings" tend to > > get out of date very quickly, so I'm not sure it is worth keeping in > > the code. What would be really worthile is consolidating the two > > different sets of defines (NVIF_VMM_PFNMAP_V0_ vs NVKM_VMM_PFN_) > > to make the code a little easier to follow. > > I was mainly concerned that this function is using hmm properly, > becuase it sure looks like it is just forming the CPU physical address > into a HW specific data. But it turns out it is just an internal data > for some other code and the dma_map is impossibly far away > > It took forever to find, I figured I'd leave a hint for the next poor > soul that has to look at this.. > > Also, I think it shows there is no 'performance' argument here, if > this path needs more performance the above should be cleaned > before we abuse hmm_range_fault. > > Put it in the commit message instead? Yes, the graph itself sounds reasonable for the commit log as a point of time information. > > > npages = (range->end - range->start) >> PAGE_SHIFT; > > > for (i = 0; i < npages; ++i) { > > > struct page *page; > > > > > > + if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) { > > > + ioctl_addr[i] = 0; > > > continue; > > > + } > > > > Can't we rely on the caller pre-zeroing the array? > > This ends up as args.phys in nouveau_svm_fault - I didn't see a > zeroing? > > I think it makes sense that this routine fully sets the output array > and does not assume pre-initialize Ok. ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset
From: Jiange Zhao When GPU got timeout, it would notify an interested part of an opportunity to dump info before actual GPU reset. A usermode app would open 'autodump' node under debugfs system and poll() for readable/writable. When a GPU reset is due, amdgpu would notify usermode app through wait_queue_head and give it 10 minutes to dump info. After usermode app has done its work, this 'autodump' node is closed. On node closure, amdgpu gets to know the dump is done through the completion that is triggered in release(). There is no write or read callback because necessary info can be obtained through dmesg and umr. Messages back and forth between usermode app and amdgpu are unnecessary. Signed-off-by: Jiange Zhao --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 9 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 + drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 + 4 files changed, 97 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index bc1e0fd71a09..a505b547f242 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -724,6 +724,13 @@ struct amd_powerplay { const struct amd_pm_funcs *pp_funcs; }; +struct amdgpu_autodump { + boolregistered; + struct completion completed; + struct dentry *dentry; + struct wait_queue_head gpu_hang_wait; +}; + #define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_MAX_DF_PERFMONS 4 struct amdgpu_device { @@ -990,6 +997,8 @@ struct amdgpu_device { charproduct_number[16]; charproduct_name[32]; charserial[16]; + + struct amdgpu_autodump autodump; }; static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 1a4894fa3693..cdd4bf00adee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -74,8 +74,91 @@ int 
amdgpu_debugfs_add_files(struct amdgpu_device *adev, return 0; } +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) +{ +#if defined(CONFIG_DEBUG_FS) + int ret; + unsigned long tmo = 600*HZ; + + if (!adev->autodump.registered) + return 0; + + wake_up_interruptible(>autodump.gpu_hang_wait); + + ret = wait_for_completion_interruptible_timeout(>autodump.completed, tmo); + if (ret == 0) { /* time out and dump tool still not finish its dump*/ + pr_err("autodump: timeout before dump finished, move on to gpu recovery\n"); + return -ETIMEDOUT; + } +#endif + return 0; +} + #if defined(CONFIG_DEBUG_FS) +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file) +{ + int ret; + struct amdgpu_device *adev; + + ret = simple_open(inode, file); + if (ret) + return ret; + + adev = file->private_data; + if (adev->autodump.registered == true) + return -EINVAL; + + adev->autodump.registered = true; + + return 0; +} + +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file *file) +{ + struct amdgpu_device *adev = file->private_data; + + complete(>autodump.completed); + adev->autodump.registered = false; + + return 0; +} + +unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct poll_table_struct *poll_table) +{ + struct amdgpu_device *adev = file->private_data; + + poll_wait(file, >autodump.gpu_hang_wait, poll_table); + + if (adev->in_gpu_reset) + return POLLIN | POLLRDNORM | POLLWRNORM; + + return 0; +} + +static const struct file_operations autodump_debug_fops = { + .owner = THIS_MODULE, + .open = amdgpu_debugfs_autodump_open, + .poll = amdgpu_debugfs_autodump_poll, + .release = amdgpu_debugfs_autodump_release, +}; + +static int amdgpu_debugfs_autodump_init(struct amdgpu_device *adev) +{ + struct dentry *entry; + + init_completion(>autodump.completed); + init_waitqueue_head(>autodump.gpu_hang_wait); + adev->autodump.registered = false; + + entry = debugfs_create_file("autodump", 0600, + adev->ddev->primary->debugfs_root, + 
adev, &autodump_debug_fops); + adev->autodump.dentry = entry; + + return 0; +} + /** * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes * @ @@ -1434,6 +1517,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev) amdgpu_ras_debugfs_create_all(adev); + amdgpu_debugfs_autodump_init(adev); + return amdgpu_debugfs_add_files(adev,
RE: [PATCH 1/8] drm/amdgpu: ignore TA ucode for SRIOV
Series is Acked-by: Yintian Tao -Original Message- From: amd-gfx On Behalf Of Monk Liu Sent: 2020年4月23日 15:02 To: amd-gfx@lists.freedesktop.org Cc: Liu, Monk Subject: [PATCH 1/8] drm/amdgpu: ignore TA ucode for SRIOV Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index 0afd610..b4b0242 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -194,6 +194,8 @@ static int psp_v11_0_init_microcode(struct psp_context *psp) case CHIP_NAVI10: case CHIP_NAVI14: case CHIP_NAVI12: + if (amdgpu_sriov_vf(adev)) + break; snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_ta.bin", chip_name); err = request_firmware(>psp.ta_fw, fw_name, adev->dev); if (err) { -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cyintian.tao%40amd.com%7C591f6154216d45f81a0c08d7e7543671%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637232221913347447sdata=jFv%2FMqS%2B0lHnuaN0%2B1GG6iSfyyAFPZbFBxa%2BiEL2tsg%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 8/8] drm/amdgpu: for nv12 always need smu ip
because nv12 SRIOV support one vf mode Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/nv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c index 995bdec..9c42316 100644 --- a/drivers/gpu/drm/amd/amdgpu/nv.c +++ b/drivers/gpu/drm/amd/amdgpu/nv.c @@ -498,8 +498,7 @@ int nv_set_ip_blocks(struct amdgpu_device *adev) amdgpu_device_ip_block_add(adev, &gmc_v10_0_ip_block); amdgpu_device_ip_block_add(adev, &navi10_ih_ip_block); amdgpu_device_ip_block_add(adev, &psp_v11_0_ip_block); - if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP && - !amdgpu_sriov_vf(adev)) + if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) amdgpu_device_ip_block_add(adev, &smu_v11_0_ip_block); if (adev->enable_virtual_display || amdgpu_sriov_vf(adev)) amdgpu_device_ip_block_add(adev, &dce_virtual_ip_block); -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 2/8] drm/amdgpu: skip cg/pg set for SRIOV
Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 8a579ce..909ef08 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -7095,6 +7095,10 @@ static int gfx_v10_0_set_powergating_state(void *handle, { struct amdgpu_device *adev = (struct amdgpu_device *)handle; bool enable = (state == AMD_PG_STATE_GATE); + + if (amdgpu_sriov_vf(adev)) + return 0; + switch (adev->asic_type) { case CHIP_NAVI10: case CHIP_NAVI14: @@ -7115,6 +7119,9 @@ static int gfx_v10_0_set_clockgating_state(void *handle, { struct amdgpu_device *adev = (struct amdgpu_device *)handle; + if (amdgpu_sriov_vf(adev)) + return 0; + switch (adev->asic_type) { case CHIP_NAVI10: case CHIP_NAVI14: -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 5/8] drm/amdgpu: clear the messed up checking logic
for MI100 + ASICS, we always support SW_SMU for bare-metal and for SRIOV one_vf_mode Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++ 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 2bb1e0c..361a5b6 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -571,15 +571,10 @@ bool is_support_sw_smu(struct amdgpu_device *adev) if (adev->asic_type == CHIP_VEGA20) return (amdgpu_dpm == 2) ? true : false; else if (adev->asic_type >= CHIP_ARCTURUS) { - if (amdgpu_sriov_vf(adev) && - !(adev->asic_type == CHIP_ARCTURUS && - amdgpu_sriov_is_pp_one_vf(adev))) - - return false; - else + if (amdgpu_sriov_is_pp_one_vf(adev) || !amdgpu_sriov_vf(adev)) return true; - } else - return false; + } + return false; } bool is_support_sw_smu_xgmi(struct amdgpu_device *adev) -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 4/8] drm/amdgpu: provide RREG32_SOC15_NO_KIQ, will be used later
Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/soc15_common.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h b/drivers/gpu/drm/amd/amdgpu/soc15_common.h index c893c64..56d02aa 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h +++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h @@ -35,6 +35,9 @@ #define RREG32_SOC15(ip, inst, reg) \ RREG32(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg) +#define RREG32_SOC15_NO_KIQ(ip, inst, reg) \ + RREG32_NO_KIQ(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg) + #define RREG32_SOC15_OFFSET(ip, inst, reg, offset) \ RREG32((adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg) + offset) -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 7/8] drm/amdgpu: skip sysfs node not belong to one vf mode
Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 48 -- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c index 49e2e43..c762deb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c @@ -3271,26 +3271,27 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev) return ret; } - - ret = device_create_file(adev->dev, _attr_pp_num_states); - if (ret) { - DRM_ERROR("failed to create device file pp_num_states\n"); - return ret; - } - ret = device_create_file(adev->dev, _attr_pp_cur_state); - if (ret) { - DRM_ERROR("failed to create device file pp_cur_state\n"); - return ret; - } - ret = device_create_file(adev->dev, _attr_pp_force_state); - if (ret) { - DRM_ERROR("failed to create device file pp_force_state\n"); - return ret; - } - ret = device_create_file(adev->dev, _attr_pp_table); - if (ret) { - DRM_ERROR("failed to create device file pp_table\n"); - return ret; + if (!amdgpu_sriov_vf(adev)) { + ret = device_create_file(adev->dev, _attr_pp_num_states); + if (ret) { + DRM_ERROR("failed to create device file pp_num_states\n"); + return ret; + } + ret = device_create_file(adev->dev, _attr_pp_cur_state); + if (ret) { + DRM_ERROR("failed to create device file pp_cur_state\n"); + return ret; + } + ret = device_create_file(adev->dev, _attr_pp_force_state); + if (ret) { + DRM_ERROR("failed to create device file pp_force_state\n"); + return ret; + } + ret = device_create_file(adev->dev, _attr_pp_table); + if (ret) { + DRM_ERROR("failed to create device file pp_table\n"); + return ret; + } } ret = device_create_file(adev->dev, _attr_pp_dpm_sclk); @@ -3337,6 +3338,13 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev) return ret; } } + + /* the reset are not needed for SRIOV one vf mode */ + if (amdgpu_sriov_vf(adev)) { + adev->pm.sysfs_initialized = true; + return ret; + } + if (adev->asic_type != CHIP_ARCTURUS) { ret = 
device_create_file(adev->dev, &dev_attr_pp_dpm_pcie); if (ret) { -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 6/8] drm/amdgpu: enable one vf mode for nv12
Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 12 +++- drivers/gpu/drm/amd/powerplay/navi10_ppt.c | 6 +++- drivers/gpu/drm/amd/powerplay/smu_v11_0.c | 49 +- 3 files changed, 52 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 361a5b6..5964d63 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -347,13 +347,13 @@ int smu_get_dpm_freq_by_index(struct smu_context *smu, enum smu_clk_type clk_typ param = (uint32_t)(((clk_id & 0x) << 16) | (level & 0x)); ret = smu_send_smc_msg_with_param(smu, SMU_MSG_GetDpmFreqByIndex, - param, ); + param, value); if (ret) return ret; /* BIT31: 0 - Fine grained DPM, 1 - Dicrete DPM * now, we un-support it */ - *value = param & 0x7fff; + *value = *value & 0x7fff; return ret; } @@ -535,7 +535,6 @@ int smu_update_table(struct smu_context *smu, enum smu_table_id table_index, int int table_id = smu_table_get_index(smu, table_index); uint32_t table_size; int ret = 0; - if (!table_data || table_id >= SMU_TABLE_COUNT || table_id < 0) return -EINVAL; @@ -691,7 +690,6 @@ int smu_feature_is_enabled(struct smu_context *smu, enum smu_feature_mask mask) if (smu->is_apu) return 1; - feature_id = smu_feature_get_index(smu, mask); if (feature_id < 0) return 0; @@ -1339,6 +1337,9 @@ static int smu_hw_init(void *handle) struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct smu_context *smu = >smu; + if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) + return 0; + ret = smu_start_smc_engine(smu); if (ret) { pr_err("SMU is not ready yet!\n"); @@ -1352,9 +1353,6 @@ static int smu_hw_init(void *handle) smu_set_gfx_cgpg(>smu, true); } - if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev)) - return 0; - if (!smu->pm_enabled) return 0; diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c index c94270f..2184d24 
100644 --- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c +++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c @@ -1817,7 +1817,8 @@ static int navi10_get_power_limit(struct smu_context *smu, int power_src; if (!smu->power_limit) { - if (smu_feature_is_enabled(smu, SMU_FEATURE_PPT_BIT)) { + if (smu_feature_is_enabled(smu, SMU_FEATURE_PPT_BIT) && + !amdgpu_sriov_vf(smu->adev)) { power_src = smu_power_get_index(smu, SMU_POWER_SOURCE_AC); if (power_src < 0) return -EINVAL; @@ -1960,6 +1961,9 @@ static int navi10_set_default_od_settings(struct smu_context *smu, bool initiali OverDriveTable_t *od_table, *boot_od_table; int ret = 0; + if (amdgpu_sriov_vf(smu->adev)) + return 0; + ret = smu_v11_0_set_default_od_settings(smu, initialize, sizeof(OverDriveTable_t)); if (ret) return ret; diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c index a97b296..3e1b3ed 100644 --- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c +++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c @@ -57,7 +57,7 @@ static int smu_v11_0_send_msg_without_waiting(struct smu_context *smu, uint16_t msg) { struct amdgpu_device *adev = smu->adev; - WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg); + WREG32_SOC15_NO_KIQ(MP1, 0, mmMP1_SMN_C2PMSG_66, msg); return 0; } @@ -65,7 +65,7 @@ static int smu_v11_0_read_arg(struct smu_context *smu, uint32_t *arg) { struct amdgpu_device *adev = smu->adev; - *arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82); + *arg = RREG32_SOC15_NO_KIQ(MP1, 0, mmMP1_SMN_C2PMSG_82); return 0; } @@ -75,7 +75,7 @@ static int smu_v11_0_wait_for_response(struct smu_context *smu) uint32_t cur_value, i, timeout = adev->usec_timeout * 10; for (i = 0; i < timeout; i++) { - cur_value = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90); + cur_value = RREG32_SOC15_NO_KIQ(MP1, 0, mmMP1_SMN_C2PMSG_90); if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0) return cur_value == 0x1 ? 
0 : -EIO; @@ -83,7 +83,10 @@ static int smu_v11_0_wait_for_response(struct smu_context *smu) } /* timeout means wrong logic */ - return -ETIME; + if (i == timeout) +
[PATCH 3/8] drm/amdgpu: sriov is forbidden to call disable DPM
Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c index 88b4e56..2bb1e0c 100644 --- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c @@ -1403,6 +1403,9 @@ static int smu_hw_init(void *handle) static int smu_stop_dpms(struct smu_context *smu) { + if (amdgpu_sriov_vf(smu->adev)) + return 0; + return smu_system_features_control(smu, false); } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH 1/8] drm/amdgpu: ignore TA ucode for SRIOV
Signed-off-by: Monk Liu --- drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index 0afd610..b4b0242 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -194,6 +194,8 @@ static int psp_v11_0_init_microcode(struct psp_context *psp) case CHIP_NAVI10: case CHIP_NAVI14: case CHIP_NAVI12: + if (amdgpu_sriov_vf(adev)) + break; snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_ta.bin", chip_name); err = request_firmware(&adev->psp.ta_fw, fw_name, adev->dev); if (err) { -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx