RE: [PATCH 8/8] drm/amdgpu: for nv12 always need smu ip

2020-04-23 Thread Quan, Evan
Please check whether this is needed also for the following code:
if (adev->firmware.load_type == AMDGPU_FW_LOAD_DIRECT &&
!amdgpu_sriov_vf(adev))
amdgpu_device_ip_block_add(adev, _v11_0_ip_block);

Other than that, this one and patch1-4 are acked-by: Evan Quan 


-Original Message-
From: amd-gfx  On Behalf Of Monk Liu
Sent: Thursday, April 23, 2020 3:02 PM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Monk 
Subject: [PATCH 8/8] drm/amdgpu: for nv12 always need smu ip

because nv12 SRIOV support one vf mode

Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/nv.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c 
index 995bdec..9c42316 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -498,8 +498,7 @@ int nv_set_ip_blocks(struct amdgpu_device *adev)
amdgpu_device_ip_block_add(adev, _v10_0_ip_block);
amdgpu_device_ip_block_add(adev, _ih_ip_block);
amdgpu_device_ip_block_add(adev, _v11_0_ip_block);
-   if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP &&
-   !amdgpu_sriov_vf(adev))
+   if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP)
amdgpu_device_ip_block_add(adev, _v11_0_ip_block);
if (adev->enable_virtual_display || amdgpu_sriov_vf(adev))
amdgpu_device_ip_block_add(adev, _virtual_ip_block);
--
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cevan.quan%40amd.com%7C0776cfa8343f4992679608d7e7543a7e%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637232221889995485sdata=L4BeRmyYxVu4to1tk%2BtQANvWg3fxTtjyv0zroWUJ3jA%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 7/8] drm/amdgpu: skip sysfs node not belong to one vf mode

2020-04-23 Thread Quan, Evan



-Original Message-
From: amd-gfx  On Behalf Of Monk Liu
Sent: Thursday, April 23, 2020 3:02 PM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Monk 
Subject: [PATCH 7/8] drm/amdgpu: skip sysfs node not belong to one vf mode

Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 48 --
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
index 49e2e43..c762deb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
@@ -3271,26 +3271,27 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev)
return ret;
}
 
-
-   ret = device_create_file(adev->dev, _attr_pp_num_states);
-   if (ret) {
-   DRM_ERROR("failed to create device file pp_num_states\n");
-   return ret;
-   }
-   ret = device_create_file(adev->dev, _attr_pp_cur_state);
-   if (ret) {
-   DRM_ERROR("failed to create device file pp_cur_state\n");
-   return ret;
-   }
-   ret = device_create_file(adev->dev, _attr_pp_force_state);
-   if (ret) {
-   DRM_ERROR("failed to create device file pp_force_state\n");
-   return ret;
-   }
-   ret = device_create_file(adev->dev, _attr_pp_table);
-   if (ret) {
-   DRM_ERROR("failed to create device file pp_table\n");
-   return ret;
+   if (!amdgpu_sriov_vf(adev)) {
+   ret = device_create_file(adev->dev, _attr_pp_num_states);
+   if (ret) {
+   DRM_ERROR("failed to create device file 
pp_num_states\n");
+   return ret;
+   }
+   ret = device_create_file(adev->dev, _attr_pp_cur_state);
+   if (ret) {
+   DRM_ERROR("failed to create device file 
pp_cur_state\n");
+   return ret;
+   }
+   ret = device_create_file(adev->dev, _attr_pp_force_state);
+   if (ret) {
+   DRM_ERROR("failed to create device file 
pp_force_state\n");
+   return ret;
+   }
+   ret = device_create_file(adev->dev, _attr_pp_table);
+   if (ret) {
+   DRM_ERROR("failed to create device file pp_table\n");
+   return ret;
+   }
}
 [Evan] Please add the matching guard for amdgpu_pm_sysfs_fini also.
ret = device_create_file(adev->dev, _attr_pp_dpm_sclk); @@ -3337,6 
+3338,13 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev)
return ret;
}
}
+
+   /* the reset are not needed for SRIOV one vf mode */
[Evan] Typo: 'reset' should be 'rest'.
+   if (amdgpu_sriov_vf(adev)) {
+   adev->pm.sysfs_initialized = true;
+   return ret;
+   }
+
if (adev->asic_type != CHIP_ARCTURUS) {
ret = device_create_file(adev->dev, _attr_pp_dpm_pcie);
if (ret) {
--
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cevan.quan%40amd.com%7C2fcb995e57fb4ceeb23608d7e7543adc%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637232221351766801sdata=waHsAN5irYjlYb%2FrY8UT5h2eLu2wjaEw67DcaWJaUDs%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 6/8] drm/amdgpu: enable one vf mode for nv12

2020-04-23 Thread Quan, Evan



-Original Message-
From: amd-gfx  On Behalf Of Monk Liu
Sent: Thursday, April 23, 2020 3:02 PM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Monk 
Subject: [PATCH 6/8] drm/amdgpu: enable one vf mode for nv12

Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 12 +++-  
drivers/gpu/drm/amd/powerplay/navi10_ppt.c |  6 +++-  
drivers/gpu/drm/amd/powerplay/smu_v11_0.c  | 49 +-
 3 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 361a5b6..5964d63 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -347,13 +347,13 @@ int smu_get_dpm_freq_by_index(struct smu_context *smu, 
enum smu_clk_type clk_typ
param = (uint32_t)(((clk_id & 0x) << 16) | (level & 0x));
 
ret = smu_send_smc_msg_with_param(smu, SMU_MSG_GetDpmFreqByIndex,
- param, );
+ param, value);
if (ret)
return ret;
 
/* BIT31:  0 - Fine grained DPM, 1 - Dicrete DPM
 * now, we un-support it */
-   *value = param & 0x7fff;
+   *value = *value & 0x7fff;
 
return ret;
 }
@@ -535,7 +535,6 @@ int smu_update_table(struct smu_context *smu, enum 
smu_table_id table_index, int
int table_id = smu_table_get_index(smu, table_index);
uint32_t table_size;
int ret = 0;
-
if (!table_data || table_id >= SMU_TABLE_COUNT || table_id < 0)
return -EINVAL;
 
@@ -691,7 +690,6 @@ int smu_feature_is_enabled(struct smu_context *smu, enum 
smu_feature_mask mask)
 
if (smu->is_apu)
return 1;
-
feature_id = smu_feature_get_index(smu, mask);
if (feature_id < 0)
return 0;
@@ -1339,6 +1337,9 @@ static int smu_hw_init(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
struct smu_context *smu = >smu;
 
+   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
+   return 0;
+
ret = smu_start_smc_engine(smu);
if (ret) {
pr_err("SMU is not ready yet!\n");
@@ -1352,9 +1353,6 @@ static int smu_hw_init(void *handle)
smu_set_gfx_cgpg(>smu, true);
}
 
-   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
-   return 0;
-
if (!smu->pm_enabled)
return 0;
 
diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
index c94270f..2184d24 100644
--- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
@@ -1817,7 +1817,8 @@ static int navi10_get_power_limit(struct smu_context *smu,
int power_src;
 
if (!smu->power_limit) {
-   if (smu_feature_is_enabled(smu, SMU_FEATURE_PPT_BIT)) {
+   if (smu_feature_is_enabled(smu, SMU_FEATURE_PPT_BIT) &&
+   !amdgpu_sriov_vf(smu->adev)) {
power_src = smu_power_get_index(smu, 
SMU_POWER_SOURCE_AC);
if (power_src < 0)
return -EINVAL;
@@ -1960,6 +1961,9 @@ static int navi10_set_default_od_settings(struct 
smu_context *smu, bool initiali
OverDriveTable_t *od_table, *boot_od_table;
int ret = 0;
 
+   if (amdgpu_sriov_vf(smu->adev))
+   return 0;
+
ret = smu_v11_0_set_default_od_settings(smu, initialize, 
sizeof(OverDriveTable_t));
if (ret)
return ret;
diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c 
b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
index a97b296..3e1b3ed 100644
--- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
@@ -57,7 +57,7 @@ static int smu_v11_0_send_msg_without_waiting(struct 
smu_context *smu,
  uint16_t msg)
 {
struct amdgpu_device *adev = smu->adev;
-   WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
+   WREG32_SOC15_NO_KIQ(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
return 0;
 }
 
@@ -65,7 +65,7 @@ static int smu_v11_0_read_arg(struct smu_context *smu, 
uint32_t *arg)  {
struct amdgpu_device *adev = smu->adev;
 
-   *arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
+   *arg = RREG32_SOC15_NO_KIQ(MP1, 0, mmMP1_SMN_C2PMSG_82);
return 0;
 }
 
@@ -75,7 +75,7 @@ static int smu_v11_0_wait_for_response(struct smu_context 
*smu)
uint32_t cur_value, i, timeout = adev->usec_timeout * 10;
 
for (i = 0; i < timeout; i++) {
-   cur_value = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
+   cur_value = RREG32_SOC15_NO_KIQ(MP1, 0, mmMP1_SMN_C2PMSG_90);
if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
return 

RE: [PATCH 5/8] drm/amdgpu: clear the messed up checking logic

2020-04-23 Thread Quan, Evan
Please make this the last one of the series.
Other than that, this is acked-by: Evan Quan 

-Original Message-
From: amd-gfx  On Behalf Of Monk Liu
Sent: Thursday, April 23, 2020 3:02 PM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Monk 
Subject: [PATCH 5/8] drm/amdgpu: clear the messed up checking logic

for MI100+ ASICs, we always support SW_SMU for bare-metal and for SRIOV 
one_vf_mode

Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 2bb1e0c..361a5b6 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -571,15 +571,10 @@ bool is_support_sw_smu(struct amdgpu_device *adev)
if (adev->asic_type == CHIP_VEGA20)
return (amdgpu_dpm == 2) ? true : false;
else if (adev->asic_type >= CHIP_ARCTURUS) {
-   if (amdgpu_sriov_vf(adev) &&
-   !(adev->asic_type == CHIP_ARCTURUS &&
- amdgpu_sriov_is_pp_one_vf(adev)))
-
-   return false;
-   else
+ if (amdgpu_sriov_is_pp_one_vf(adev) || !amdgpu_sriov_vf(adev))
return true;
-   } else
-   return false;
+   }
+   return false;
 }
 
 bool is_support_sw_smu_xgmi(struct amdgpu_device *adev)
--
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cevan.quan%40amd.com%7C88cd44199dc3451d709608d7e75438b0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C63723119291353sdata=rDamzIdmtaTo%2FCiEzyIo015JHJcZIV5ZyVLnkrJHcec%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: check ring type for secure IBs

2020-04-23 Thread Liu, Aaron
[AMD Official Use Only - Internal Distribution Only]

Reviewed-by: Aaron Liu 

--
Best Regards
Aaron Liu

-Original Message-
From: amd-gfx  On Behalf Of Alex Deucher
Sent: Friday, April 24, 2020 4:47 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander 
Subject: [PATCH] drm/amdgpu: check ring type for secure IBs

We don't support secure operation on compute rings at the moment so reject them.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index ec2c5e164cd3..b91853fd66d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -161,6 +161,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned 
num_ibs,
return -EINVAL;
}
 
+   if ((ib->flags & AMDGPU_IB_FLAGS_SECURE) &&
+   (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)) {
+   dev_err(adev->dev, "secure submissions not supported on compute 
rings\n");
+   return -EINVAL;
+   }
+
alloc_size = ring->funcs->emit_frame_size + num_ibs *
ring->funcs->emit_ib_size;
 
--
2.25.3

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Caaron.liu%40amd.com%7C4237b46dff1743425aa708d7e7c78c83%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637232717308400132sdata=0%2BSeNlm5n2%2F3OaMf%2Bw0ArpVcyenjJ2fO%2FwaRymsbZxI%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: check ring type for secure IBs

2020-04-23 Thread Andrey Grodzovsky

Reviewed-by: Andrey Grodzovsky 

Andrey

On 4/23/20 4:47 PM, Alex Deucher wrote:

We don't support secure operation on compute rings at the
moment so reject them.

Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 6 ++
  1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index ec2c5e164cd3..b91853fd66d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -161,6 +161,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned 
num_ibs,
return -EINVAL;
}
  
+	if ((ib->flags & AMDGPU_IB_FLAGS_SECURE) &&

+   (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)) {
+   dev_err(adev->dev, "secure submissions not supported on compute 
rings\n");
+   return -EINVAL;
+   }
+
alloc_size = ring->funcs->emit_frame_size + num_ibs *
ring->funcs->emit_ib_size;
  

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: check ring type for secure IBs

2020-04-23 Thread Alex Deucher
We don't support secure operation on compute rings at the
moment so reject them.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index ec2c5e164cd3..b91853fd66d3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -161,6 +161,12 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned 
num_ibs,
return -EINVAL;
}
 
+   if ((ib->flags & AMDGPU_IB_FLAGS_SECURE) &&
+   (ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)) {
+   dev_err(adev->dev, "secure submissions not supported on compute 
rings\n");
+   return -EINVAL;
+   }
+
alloc_size = ring->funcs->emit_frame_size + num_ibs *
ring->funcs->emit_ib_size;
 
-- 
2.25.3

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] PCI/P2PDMA: Add additional AMD ZEN root ports to the whitelist

2020-04-23 Thread Bjorn Helgaas
On Mon, Apr 06, 2020 at 03:42:01PM -0400, Alex Deucher wrote:
> According to the hw architect, pre-ZEN parts support
> p2p writes and ZEN parts support both p2p reads and writes.
> 
> Add entries for Zen parts Raven (0x15d0) and Renoir (0x1630).
> 
> Cc: Christian König 
> Acked-by: Christian König 
> Signed-off-by: Alex Deucher 

Applied with Huang's ack to pci/p2pdma for v5.8, thanks!

> ---
>  drivers/pci/p2pdma.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 9a8a38384121..91a4c987399d 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -282,6 +282,8 @@ static const struct pci_p2pdma_whitelist_entry {
>  } pci_p2pdma_whitelist[] = {
>   /* AMD ZEN */
>   {PCI_VENDOR_ID_AMD, 0x1450, 0},
> + {PCI_VENDOR_ID_AMD, 0x15d0, 0},
> + {PCI_VENDOR_ID_AMD, 0x1630, 0},
>  
>   /* Intel Xeon E5/Core i7 */
>   {PCI_VENDOR_ID_INTEL,   0x3c00, REQ_SAME_HOST_BRIDGE},
> -- 
> 2.25.1
> 
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] PCI/P2PDMA: Add additional AMD ZEN root ports to the whitelist

2020-04-23 Thread Alex Deucher
+ Bjorn

Any chance I can get this picked up for -next?

Thanks,

Alex

On Mon, Apr 6, 2020 at 3:42 PM Alex Deucher  wrote:
>
> According to the hw architect, pre-ZEN parts support
> p2p writes and ZEN parts support both p2p reads and writes.
>
> Add entries for Zen parts Raven (0x15d0) and Renoir (0x1630).
>
> Cc: Christian König 
> Acked-by: Christian König 
> Signed-off-by: Alex Deucher 
> ---
>  drivers/pci/p2pdma.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index 9a8a38384121..91a4c987399d 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -282,6 +282,8 @@ static const struct pci_p2pdma_whitelist_entry {
>  } pci_p2pdma_whitelist[] = {
> /* AMD ZEN */
> {PCI_VENDOR_ID_AMD, 0x1450, 0},
> +   {PCI_VENDOR_ID_AMD, 0x15d0, 0},
> +   {PCI_VENDOR_ID_AMD, 0x1630, 0},
>
> /* Intel Xeon E5/Core i7 */
> {PCI_VENDOR_ID_INTEL,   0x3c00, REQ_SAME_HOST_BRIDGE},
> --
> 2.25.1
>
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH][next] drm/amd/display: remove redundant assignment to variable ret

2020-04-23 Thread Alex Deucher
On Thu, Apr 23, 2020 at 10:18 AM Colin King  wrote:
>
> From: Colin Ian King 
>
> The variable ret is being initialized with a value that is never read
> and it is being updated later with a new value. The initialization is
> redundant and can be removed.
>
> Addresses-Coverity: ("Unused value")
> Signed-off-by: Colin Ian King 

Applied.  Thanks!

Alex

> ---
>  drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c 
> b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
> index d5b306384d79..9ef9e50a34fa 100644
> --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
> +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
> @@ -4231,7 +4231,7 @@ void dpcd_set_source_specific_data(struct dc_link *link)
>  {
> const uint32_t post_oui_delay = 30; // 30ms
> uint8_t dspc = 0;
> -   enum dc_status ret = DC_ERROR_UNEXPECTED;
> +   enum dc_status ret;
>
> ret = core_link_read_dpcd(link, DP_DOWN_STREAM_PORT_COUNT, ,
>   sizeof(dspc));
> --
> 2.25.1
>
> ___
> dri-devel mailing list
> dri-de...@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/dri-devel
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH][next] drm/amd/display: fix incorrect assignment due to a typo

2020-04-23 Thread Koo, Anthony
Hi Colin,

Sorry for any confusion of this code.
I think in this case, it seems like the comment is wrong (but original 
implementation is somewhat wrong as well). Probably the original code 
implementation makes it unclear.

There are three scenarios:
1. Variable refresh active, targeting a fixed rate
In this case, the min = max = fixed rate

2. Variable refresh active, with a variable range
In this case, the min = minimum refresh rate of the range.
And the max = maximum refresh rate of the range.

3. Variable refresh rate is disabled (The case you are modifying)
In the disabled case, we want to indicate to the display that the refresh rate 
is fixed, so we want to program min = max = the base refresh rate.
Today there seems to be an implication that max refresh = base refresh, which 
is not necessarily true.
I guess to make the code more clear and correct, the min and max should both be 
programmed equal to the base refresh rate (nominal field rate from 
mod_freesync_calc_nominal_field_rate)

Does that make sense?

Thanks,
Anthony

-Original Message-
From: Colin King  
Sent: Thursday, April 23, 2020 10:03 AM
To: Wentland, Harry ; Li, Sun peng (Leo) 
; Deucher, Alexander ; Koenig, 
Christian ; Zhou, David(ChunMing) 
; David Airlie ; Daniel Vetter 
; Koo, Anthony ; 
amd-gfx@lists.freedesktop.org; dri-de...@lists.freedesktop.org
Cc: kernel-janit...@vger.kernel.org; linux-ker...@vger.kernel.org
Subject: [PATCH][next] drm/amd/display: fix incorrect assignment due to a typo

From: Colin Ian King 

The assignment to infopacket->sb[7] looks incorrect, the comment states it is 
the minimum refresh rate yet it is being assigned a value from the maximum 
refresh rate max_refresh_in_uhz. Fix this by using min_refresh_in_uhz instead.

Addresses-Coverity: ("Copy-paste error")
Fixes: d2bacc38f6ca ("drm/amd/display: Change infopacket type programming")
Signed-off-by: Colin Ian King 
---
 drivers/gpu/drm/amd/display/modules/freesync/freesync.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c 
b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
index eb7421e83b86..fe11436536e8 100644
--- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
+++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
@@ -587,7 +587,7 @@ static void build_vrr_infopacket_data_v3(const struct 
mod_vrr_params *vrr,
} else {
// Non-fs case, program nominal range
/* PB7 = FreeSync Minimum refresh rate (Hz) */
-   infopacket->sb[7] = (unsigned char)((vrr->max_refresh_in_uhz + 
50) / 100);
+   infopacket->sb[7] = (unsigned char)((vrr->min_refresh_in_uhz + 
+50) / 100);
/* PB8 = FreeSync Maximum refresh rate (Hz) */
infopacket->sb[8] = (unsigned char)((vrr->max_refresh_in_uhz + 
50) / 100);
}
--
2.25.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test failed

2020-04-23 Thread Alex Deucher
On Thu, Apr 23, 2020 at 10:55 AM Christian König
 wrote:
>
> Yeah, we certainly could try this again. But maybe split that up into
> individual patches for gfx7/8/9.
>
> In other words make it easy to revert if this still doesn't work well on
> gfx7 or some other generation.

Yeah, unless there is a good reason, I don't think we should do this.
IIRC, compute rings randomly fail to recover on a lot of hw
generations.

Alex

>
> Christian.
>
> Am 23.04.20 um 15:43 schrieb Zhang, Hawking:
> > [AMD Official Use Only - Internal Distribution Only]
> >
> > Would you mind to enable this and try it again? The recent gpu reset 
> > testing on vega20 looks very positive.
> >
> > Regards,
> > Hawking
> > -Original Message-
> > From: Christian König 
> > Sent: Thursday, April 23, 2020 20:31
> > To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org
> > Subject: Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test 
> > failed
> >
> > Am 23.04.20 um 11:01 schrieb Hawking Zhang:
> >> driver should stop cp resume once compute ring test failed
> > Mhm intentionally ignored those errors because the compute rings sometimes 
> > doesn't come up again after a GPU reset.
> >
> > We even have the necessary logic in the SW scheduler to redirect the jobs 
> > to another compute ring when one fails to come up again.
> >
> > Christian.
> >
> >> Change-Id: I4cd3328f38e0755d0c877484936132d204c9fe50
> >> Signed-off-by: Hawking Zhang 
> >> ---
> >>drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +++-
> >>drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +++-
> >>drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +++-
> >>3 files changed, 9 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> >> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> >> index b2f10e3..fcee758 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> >> @@ -3132,7 +3132,9 @@ static int gfx_v7_0_cp_compute_resume(struct
> >> amdgpu_device *adev)
> >>
> >>  for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> >>  ring = >gfx.compute_ring[i];
> >> -amdgpu_ring_test_helper(ring);
> >> +r = amdgpu_ring_test_helper(ring);
> >> +if (r)
> >> +return r;
> >>  }
> >>
> >>  return 0;
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >> index 6c56ced..8dc8e90 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> >> @@ -4781,7 +4781,9 @@ static int gfx_v8_0_cp_test_all_rings(struct
> >> amdgpu_device *adev)
> >>
> >>  for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> >>  ring = >gfx.compute_ring[i];
> >> -amdgpu_ring_test_helper(ring);
> >> +r = amdgpu_ring_test_helper(ring);
> >> +if (r)
> >> +return r;
> >>  }
> >>
> >>  return 0;
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> >> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> >> index 09aa5f5..20937059 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> >> @@ -3846,7 +3846,9 @@ static int gfx_v9_0_cp_resume(struct
> >> amdgpu_device *adev)
> >>
> >>  for (i = 0; i < adev->gfx.num_compute_rings; i++) {
> >>  ring = >gfx.compute_ring[i];
> >> -amdgpu_ring_test_helper(ring);
> >> +r = amdgpu_ring_test_helper(ring);
> >> +if (r)
> >> +return r;
> >>  }
> >>
> >>  gfx_v9_0_enable_gui_idle_interrupt(adev, true);
> > ___
> > amd-gfx mailing list
> > amd-gfx@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/amd-gfx
>
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/amd-gfx
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test failed

2020-04-23 Thread Christian König
Yeah, we certainly could try this again. But maybe split that up into 
individual patches for gfx7/8/9.


In other words make it easy to revert if this still doesn't work well on 
gfx7 or some other generation.


Christian.

Am 23.04.20 um 15:43 schrieb Zhang, Hawking:

[AMD Official Use Only - Internal Distribution Only]

Would you mind to enable this and try it again? The recent gpu reset testing on 
vega20 looks very positive.

Regards,
Hawking
-Original Message-
From: Christian König 
Sent: Thursday, April 23, 2020 20:31
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test 
failed

Am 23.04.20 um 11:01 schrieb Hawking Zhang:

driver should stop cp resume once compute ring test failed

Mhm intentionally ignored those errors because the compute rings sometimes 
doesn't come up again after a GPU reset.

We even have the necessary logic in the SW scheduler to redirect the jobs to 
another compute ring when one fails to come up again.

Christian.


Change-Id: I4cd3328f38e0755d0c877484936132d204c9fe50
Signed-off-by: Hawking Zhang 
---
   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +++-
   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +++-
   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +++-
   3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index b2f10e3..fcee758 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -3132,7 +3132,9 @@ static int gfx_v7_0_cp_compute_resume(struct
amdgpu_device *adev)
   
   	for (i = 0; i < adev->gfx.num_compute_rings; i++) {

ring = >gfx.compute_ring[i];
-   amdgpu_ring_test_helper(ring);
+   r = amdgpu_ring_test_helper(ring);
+   if (r)
+   return r;
}
   
   	return 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 6c56ced..8dc8e90 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -4781,7 +4781,9 @@ static int gfx_v8_0_cp_test_all_rings(struct
amdgpu_device *adev)
   
   	for (i = 0; i < adev->gfx.num_compute_rings; i++) {

ring = >gfx.compute_ring[i];
-   amdgpu_ring_test_helper(ring);
+   r = amdgpu_ring_test_helper(ring);
+   if (r)
+   return r;
}
   
   	return 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 09aa5f5..20937059 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -3846,7 +3846,9 @@ static int gfx_v9_0_cp_resume(struct
amdgpu_device *adev)
   
   	for (i = 0; i < adev->gfx.num_compute_rings; i++) {

ring = >gfx.compute_ring[i];
-   amdgpu_ring_test_helper(ring);
+   r = amdgpu_ring_test_helper(ring);
+   if (r)
+   return r;
}
   
   	gfx_v9_0_enable_gui_idle_interrupt(adev, true);

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH][next] drm/amd/display: remove redundant assignment to variable ret

2020-04-23 Thread Colin King
From: Colin Ian King 

The variable ret is being initialized with a value that is never read
and it is being updated later with a new value. The initialization is
redundant and can be removed.

Addresses-Coverity: ("Unused value")
Signed-off-by: Colin Ian King 
---
 drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c 
b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
index d5b306384d79..9ef9e50a34fa 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c
@@ -4231,7 +4231,7 @@ void dpcd_set_source_specific_data(struct dc_link *link)
 {
const uint32_t post_oui_delay = 30; // 30ms
uint8_t dspc = 0;
-   enum dc_status ret = DC_ERROR_UNEXPECTED;
+   enum dc_status ret;
 
ret = core_link_read_dpcd(link, DP_DOWN_STREAM_PORT_COUNT, ,
  sizeof(dspc));
-- 
2.25.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH][next] drm/amd/display: fix incorrect assignment due to a typo

2020-04-23 Thread Colin King
From: Colin Ian King 

The assignment to infopacket->sb[7] looks incorrect, the comment states it
is the minimum refresh rate yet it is being assigned a value from the
maximum refresh rate max_refresh_in_uhz. Fix this by using min_refresh_in_uhz
instead.

Addresses-Coverity: ("Copy-paste error")
Fixes: d2bacc38f6ca ("drm/amd/display: Change infopacket type programming")
Signed-off-by: Colin Ian King 
---
 drivers/gpu/drm/amd/display/modules/freesync/freesync.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c 
b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
index eb7421e83b86..fe11436536e8 100644
--- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
+++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c
@@ -587,7 +587,7 @@ static void build_vrr_infopacket_data_v3(const struct 
mod_vrr_params *vrr,
} else {
// Non-fs case, program nominal range
/* PB7 = FreeSync Minimum refresh rate (Hz) */
-   infopacket->sb[7] = (unsigned char)((vrr->max_refresh_in_uhz + 
50) / 100);
+   infopacket->sb[7] = (unsigned char)((vrr->min_refresh_in_uhz + 
50) / 100);
/* PB8 = FreeSync Maximum refresh rate (Hz) */
infopacket->sb[8] = (unsigned char)((vrr->max_refresh_in_uhz + 
50) / 100);
}
-- 
2.25.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test failed

2020-04-23 Thread Zhang, Hawking
[AMD Official Use Only - Internal Distribution Only]

Would you mind enabling this and trying it again? The recent gpu reset testing on 
vega20 looks very positive.

Regards,
Hawking
-Original Message-
From: Christian König  
Sent: Thursday, April 23, 2020 20:31
To: Zhang, Hawking ; amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test 
failed

Am 23.04.20 um 11:01 schrieb Hawking Zhang:
> driver should stop cp resume once compute ring test failed

Mhm, we intentionally ignored those errors because the compute rings sometimes 
don't come up again after a GPU reset.

We even have the necessary logic in the SW scheduler to redirect the jobs to 
another compute ring when one fails to come up again.

Christian.

>
> Change-Id: I4cd3328f38e0755d0c877484936132d204c9fe50
> Signed-off-by: Hawking Zhang 
> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +++-
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +++-
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +++-
>   3 files changed, 9 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> index b2f10e3..fcee758 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
> @@ -3132,7 +3132,9 @@ static int gfx_v7_0_cp_compute_resume(struct 
> amdgpu_device *adev)
>   
>   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>   ring = >gfx.compute_ring[i];
> - amdgpu_ring_test_helper(ring);
> + r = amdgpu_ring_test_helper(ring);
> + if (r)
> + return r;
>   }
>   
>   return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> index 6c56ced..8dc8e90 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
> @@ -4781,7 +4781,9 @@ static int gfx_v8_0_cp_test_all_rings(struct 
> amdgpu_device *adev)
>   
>   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>   ring = >gfx.compute_ring[i];
> - amdgpu_ring_test_helper(ring);
> + r = amdgpu_ring_test_helper(ring);
> + if (r)
> + return r;
>   }
>   
>   return 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 09aa5f5..20937059 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -3846,7 +3846,9 @@ static int gfx_v9_0_cp_resume(struct 
> amdgpu_device *adev)
>   
>   for (i = 0; i < adev->gfx.num_compute_rings; i++) {
>   ring = >gfx.compute_ring[i];
> - amdgpu_ring_test_helper(ring);
> + r = amdgpu_ring_test_helper(ring);
> + if (r)
> + return r;
>   }
>   
>   gfx_v9_0_enable_gui_idle_interrupt(adev, true);
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test failed

2020-04-23 Thread Christian König

Am 23.04.20 um 11:01 schrieb Hawking Zhang:

driver should stop cp resume once compute
ring test failed


Mhm, we intentionally ignored those errors because the compute rings 
sometimes don't come up again after a GPU reset.


We even have the necessary logic in the SW scheduler to redirect the 
jobs to another compute ring when one fails to come up again.


Christian.



Change-Id: I4cd3328f38e0755d0c877484936132d204c9fe50
Signed-off-by: Hawking Zhang 
---
  drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +++-
  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +++-
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +++-
  3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index b2f10e3..fcee758 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -3132,7 +3132,9 @@ static int gfx_v7_0_cp_compute_resume(struct 
amdgpu_device *adev)
  
  	for (i = 0; i < adev->gfx.num_compute_rings; i++) {

ring = >gfx.compute_ring[i];
-   amdgpu_ring_test_helper(ring);
+   r = amdgpu_ring_test_helper(ring);
+   if (r)
+   return r;
}
  
  	return 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 6c56ced..8dc8e90 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -4781,7 +4781,9 @@ static int gfx_v8_0_cp_test_all_rings(struct 
amdgpu_device *adev)
  
  	for (i = 0; i < adev->gfx.num_compute_rings; i++) {

ring = >gfx.compute_ring[i];
-   amdgpu_ring_test_helper(ring);
+   r = amdgpu_ring_test_helper(ring);
+   if (r)
+   return r;
}
  
  	return 0;

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 09aa5f5..20937059 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -3846,7 +3846,9 @@ static int gfx_v9_0_cp_resume(struct amdgpu_device *adev)
  
  	for (i = 0; i < adev->gfx.num_compute_rings; i++) {

ring = >gfx.compute_ring[i];
-   amdgpu_ring_test_helper(ring);
+   r = amdgpu_ring_test_helper(ring);
+   if (r)
+   return r;
}
  
  	gfx_v9_0_enable_gui_idle_interrupt(adev, true);


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: protect ring overrun

2020-04-23 Thread Tao, Yintian
Hi  Christian

Thanks. I will remove the initialization of r.

Best Regards
Yintian Tao
-Original Message-
From: Christian König  
Sent: 2020年4月23日 20:22
To: Tao, Yintian ; Koenig, Christian 
; Liu, Monk ; Liu, Shaoyun 

Cc: amd-gfx@lists.freedesktop.org
Subject: Re: [PATCH] drm/amdgpu: protect ring overrun

Am 23.04.20 um 11:06 schrieb Yintian Tao:
> Wait for the oldest sequence on the ring to be signaled in order to 
> make sure there will be no command overrun.
>
v2: fix coding style and remove abs operation

One nit pick below, with that fixed the patch is Reviewed-by: Christian König 


>
> Signed-off-by: Yintian Tao 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 10 +-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   | 22 ++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  3 ++-
>   drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c  |  8 +++-
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c|  1 -
>   drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c |  1 -
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 14 +++---
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|  8 +++-
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  8 +++-
>   9 files changed, 61 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> index 7531527067df..397bd5fa77cb 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
> @@ -192,14 +192,22 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
> dma_fence **f,
>* Used For polling fence.
>* Returns 0 on success, -ENOMEM on failure.
>*/
> -int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s)
> +int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
> +   uint32_t timeout)
>   {
>   uint32_t seq;
> + signed long r = 0;

Please drop the initialization of r here. That is usually seen as rather bad 
style because it prevents the compiler from raising a warning when this really 
isn't initialized.

Regards,
Christian.

>   
>   if (!s)
>   return -EINVAL;
>   
>   seq = ++ring->fence_drv.sync_seq;
> + r = amdgpu_fence_wait_polling(ring,
> +   seq - ring->fence_drv.num_fences_mask,
> +   timeout);
> + if (r < 1)
> + return -ETIMEDOUT;
> +
>   amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
>  seq, 0);
>   
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index a721b0e0ff69..0103acc57474 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -675,13 +675,15 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device 
> *adev, uint32_t reg)
>   
>   spin_lock_irqsave(>ring_lock, flags);
>   if (amdgpu_device_wb_get(adev, _val_offs)) {
> - spin_unlock_irqrestore(>ring_lock, flags);
>   pr_err("critical bug! too many kiq readers\n");
> - goto failed_kiq_read;
> + goto failed_unlock;
>   }
>   amdgpu_ring_alloc(ring, 32);
>   amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
> - amdgpu_fence_emit_polling(ring, );
> + r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT);
> + if (r)
> + goto failed_undo;
> +
>   amdgpu_ring_commit(ring);
>   spin_unlock_irqrestore(>ring_lock, flags);
>   
> @@ -712,7 +714,13 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, 
> uint32_t reg)
>   amdgpu_device_wb_free(adev, reg_val_offs);
>   return value;
>   
> +failed_undo:
> + amdgpu_ring_undo(ring);
> +failed_unlock:
> + spin_unlock_irqrestore(>ring_lock, flags);
>   failed_kiq_read:
> + if (reg_val_offs)
> + amdgpu_device_wb_free(adev, reg_val_offs);
>   pr_err("failed to read reg:%x\n", reg);
>   return ~0;
>   }
> @@ -730,7 +738,10 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, 
> uint32_t reg, uint32_t v)
>   spin_lock_irqsave(>ring_lock, flags);
>   amdgpu_ring_alloc(ring, 32);
>   amdgpu_ring_emit_wreg(ring, reg, v);
> - amdgpu_fence_emit_polling(ring, );
> + r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT);
> + if (r)
> + goto failed_undo;
> +
>   amdgpu_ring_commit(ring);
>   spin_unlock_irqrestore(>ring_lock, flags);
>   
> @@ -759,6 +770,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, 
> uint32_t reg, uint32_t v)
>   
>   return;
>   
> +failed_undo:
> + amdgpu_ring_undo(ring);
> + spin_unlock_irqrestore(>ring_lock, flags);
>   failed_kiq_write:
>   pr_err("failed to write reg:%x\n", reg);
>   }
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> index 137d3d2b46e8..be218754629a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
> +++ 

Re: [PATCH 4/8] drm/amdgpu: provide RREG32_SOC15_NO_KIQ, will be used later

2020-04-23 Thread Christian König

Am 23.04.20 um 09:01 schrieb Monk Liu:

Signed-off-by: Monk Liu 



Yeah, I also stumbled over that recently. Patch is Acked-by: Christian 
König .



---
  drivers/gpu/drm/amd/amdgpu/soc15_common.h | 3 +++
  1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h 
b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
index c893c64..56d02aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
@@ -35,6 +35,9 @@
  #define RREG32_SOC15(ip, inst, reg) \
RREG32(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg)
  
+#define RREG32_SOC15_NO_KIQ(ip, inst, reg) \

+   RREG32_NO_KIQ(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg)
+
  #define RREG32_SOC15_OFFSET(ip, inst, reg, offset) \
RREG32((adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg) + 
offset)
  


___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: protect ring overrun

2020-04-23 Thread Christian König

Am 23.04.20 um 11:06 schrieb Yintian Tao:

Wait for the oldest sequence on the ring
to be signaled in order to make sure there
will be no command overrun.

v2: fix coding style and remove abs operation


One nit pick below, with that fixed the patch is Reviewed-by: Christian 
König 




Signed-off-by: Yintian Tao 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 10 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   | 22 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  3 ++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c  |  8 +++-
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c|  1 -
  drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c |  1 -
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 14 +++---
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|  8 +++-
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  8 +++-
  9 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 7531527067df..397bd5fa77cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -192,14 +192,22 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
dma_fence **f,
   * Used For polling fence.
   * Returns 0 on success, -ENOMEM on failure.
   */
-int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s)
+int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
+ uint32_t timeout)
  {
uint32_t seq;
+   signed long r = 0;


Please drop the initialization of r here. That is usually seen as rather 
bad style because it prevents the compiler from raising a warning when 
this really isn't initialized.


Regards,
Christian.

  
  	if (!s)

return -EINVAL;
  
  	seq = ++ring->fence_drv.sync_seq;

+   r = amdgpu_fence_wait_polling(ring,
+ seq - ring->fence_drv.num_fences_mask,
+ timeout);
+   if (r < 1)
+   return -ETIMEDOUT;
+
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
   seq, 0);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c

index a721b0e0ff69..0103acc57474 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -675,13 +675,15 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, 
uint32_t reg)
  
  	spin_lock_irqsave(>ring_lock, flags);

if (amdgpu_device_wb_get(adev, _val_offs)) {
-   spin_unlock_irqrestore(>ring_lock, flags);
pr_err("critical bug! too many kiq readers\n");
-   goto failed_kiq_read;
+   goto failed_unlock;
}
amdgpu_ring_alloc(ring, 32);
amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
-   amdgpu_fence_emit_polling(ring, );
+   r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT);
+   if (r)
+   goto failed_undo;
+
amdgpu_ring_commit(ring);
spin_unlock_irqrestore(>ring_lock, flags);
  
@@ -712,7 +714,13 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, uint32_t reg)

amdgpu_device_wb_free(adev, reg_val_offs);
return value;
  
+failed_undo:

+   amdgpu_ring_undo(ring);
+failed_unlock:
+   spin_unlock_irqrestore(>ring_lock, flags);
  failed_kiq_read:
+   if (reg_val_offs)
+   amdgpu_device_wb_free(adev, reg_val_offs);
pr_err("failed to read reg:%x\n", reg);
return ~0;
  }
@@ -730,7 +738,10 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t 
reg, uint32_t v)
spin_lock_irqsave(>ring_lock, flags);
amdgpu_ring_alloc(ring, 32);
amdgpu_ring_emit_wreg(ring, reg, v);
-   amdgpu_fence_emit_polling(ring, );
+   r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT);
+   if (r)
+   goto failed_undo;
+
amdgpu_ring_commit(ring);
spin_unlock_irqrestore(>ring_lock, flags);
  
@@ -759,6 +770,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
  
  	return;
  
+failed_undo:

+   amdgpu_ring_undo(ring);
+   spin_unlock_irqrestore(>ring_lock, flags);
  failed_kiq_write:
pr_err("failed to write reg:%x\n", reg);
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 137d3d2b46e8..be218754629a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -118,7 +118,8 @@ void amdgpu_fence_driver_suspend(struct amdgpu_device 
*adev);
  void amdgpu_fence_driver_resume(struct amdgpu_device *adev);
  int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
  unsigned flags);
-int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s);
+int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
+  

Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

2020-04-23 Thread Christian König

Am 23.04.20 um 12:44 schrieb Zhao, Jiange:

[AMD Official Use Only - Internal Distribution Only]

Hi Christian,

Here are some explanations,
(1) registered means that an app is listening to this node, completion means 
that this app has finished a dump.


Yeah and both are the same thing. E.g. when an app is listening the 
reset code needs to wait and when no app is listening it doesn't.


We could rename the member to something like app_listening if that makes 
things more clearer.



(2) after a dump, listening app would close this node. If it wants to get 
another reset hang, it needs to open the node again. In this way, release() 
function can wrap up and make it ready for another dump.


Yeah, but you are forgetting to reset the completion structure. With the 
current implementation the second timeout will reset the GPU immediately 
without waiting for the app to do the core dump.



(3) At the same time, considering the use case of this node, I believe that 
only the first GPU reset is worthy of a dump.


That is probably not such a bad idea, but we should leave this decision 
to the end user, who can either restart the app or leave it like this.



(4) I didn't implement race condition guard because I believe that this node 
caters for a cautious super-user and a single client is enough to do all the 
work. I can add the logic if you think it is necessary.


That is a very dangerous assumption you made here. All kernel code must 
in general be free of such things.


Regards,
Christian.



Jiange

-Original Message-
From: Koenig, Christian 
Sent: Thursday, April 23, 2020 4:53 PM
To: Zhao, Jiange ; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Pelloux-prayer, Pierre-eric 
; Deucher, Alexander ; Zhang, Hawking 
; Liu, Monk ; Zhao, Jiange 
Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

Am 23.04.20 um 09:19 schrieb jia...@amd.com:

From: Jiange Zhao 

When GPU got timeout, it would notify an interested part of an
opportunity to dump info before actual GPU reset.

A usermode app would open 'autodump' node under debugfs system and
poll() for readable/writable. When a GPU reset is due, amdgpu would
notify usermode app through wait_queue_head and give it 10 minutes to
dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through the
completion that is triggered in release().

There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.

Signed-off-by: Jiange Zhao 
---
   drivers/gpu/drm/amd/amdgpu/amdgpu.h |  9 +++
   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  1 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
   4 files changed, 97 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..a505b547f242 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -724,6 +724,13 @@ struct amd_powerplay {
const struct amd_pm_funcs *pp_funcs;
   };
   
+struct amdgpu_autodump {

+   boolregistered;
+   struct completion   completed;

Registered and completed seems to have the same meaning.


+   struct dentry   *dentry;
+   struct wait_queue_head  gpu_hang_wait;
+};
+
   #define AMDGPU_RESET_MAGIC_NUM 64
   #define AMDGPU_MAX_DF_PERFMONS 4
   struct amdgpu_device {
@@ -990,6 +997,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
   };
   
   static inline struct amdgpu_device *amdgpu_ttm_adev(struct

ttm_bo_device *bdev) diff --git
a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..cdd4bf00adee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
   }
   
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if

+defined(CONFIG_DEBUG_FS)
+   int ret;
+   unsigned long tmo = 600*HZ;

In general please declare constants first and variables like "i" or "r" 
last.


+
+   if (!adev->autodump.registered)
+   return 0;
+
+   wake_up_interruptible(>autodump.gpu_hang_wait);
+
+   ret =
+wait_for_completion_interruptible_timeout(>autodump.completed,
+tmo);

This is racy, in other words it can happen that a new client opens up the 
debugfs file without being signaled but blocks the reset here.

You could use two completion structures to avoid that.


+   if (ret == 0) { /* 

RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

2020-04-23 Thread Zhao, Jiange
[AMD Official Use Only - Internal Distribution Only]

Hi Christian,

Here are some explanations,
(1) registered means that an app is listening to this node, completion means 
that this app has finished a dump.
(2) after a dump, listening app would close this node. If it wants to get 
another reset hang, it needs to open the node again. In this way, release() 
function can wrap up and make it ready for another dump.
(3) At the same time, considering the use case of this node, I believe that 
only the first GPU reset is worthy of a dump.
(4) I didn't implement race condition guard because I believe that this node 
caters for a cautious super-user and a single client is enough to do all the 
work. I can add the logic if you think it is necessary.

Jiange

-Original Message-
From: Koenig, Christian  
Sent: Thursday, April 23, 2020 4:53 PM
To: Zhao, Jiange ; amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Pelloux-prayer, Pierre-eric 
; Deucher, Alexander 
; Zhang, Hawking ; Liu, Monk 
; Zhao, Jiange 
Subject: Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

Am 23.04.20 um 09:19 schrieb jia...@amd.com:
> From: Jiange Zhao 
>
> When GPU got timeout, it would notify an interested part of an 
> opportunity to dump info before actual GPU reset.
>
> A usermode app would open 'autodump' node under debugfs system and 
> poll() for readable/writable. When a GPU reset is due, amdgpu would 
> notify usermode app through wait_queue_head and give it 10 minutes to 
> dump info.
>
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through the 
> completion that is triggered in release().
>
> There is no write or read callback because necessary info can be 
> obtained through dmesg and umr. Messages back and forth between 
> usermode app and amdgpu are unnecessary.
>
> Signed-off-by: Jiange Zhao 
> ---
>   drivers/gpu/drm/amd/amdgpu/amdgpu.h |  9 +++
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  1 +
>   drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>   4 files changed, 97 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index bc1e0fd71a09..a505b547f242 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -724,6 +724,13 @@ struct amd_powerplay {
>   const struct amd_pm_funcs *pp_funcs;
>   };
>   
> +struct amdgpu_autodump {
> + boolregistered;
> + struct completion   completed;

Registered and completed seems to have the same meaning.

> + struct dentry   *dentry;
> + struct wait_queue_head  gpu_hang_wait;
> +};
> +
>   #define AMDGPU_RESET_MAGIC_NUM 64
>   #define AMDGPU_MAX_DF_PERFMONS 4
>   struct amdgpu_device {
> @@ -990,6 +997,8 @@ struct amdgpu_device {
>   charproduct_number[16];
>   charproduct_name[32];
>   charserial[16];
> +
> + struct amdgpu_autodump  autodump;
>   };
>   
>   static inline struct amdgpu_device *amdgpu_ttm_adev(struct 
> ttm_bo_device *bdev) diff --git 
> a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..cdd4bf00adee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   return 0;
>   }
>   
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if 
> +defined(CONFIG_DEBUG_FS)
> + int ret;
> + unsigned long tmo = 600*HZ;

In general please declare constants first and variables like "i" or "r" 
last.

> +
> + if (!adev->autodump.registered)
> + return 0;
> +
> + wake_up_interruptible(>autodump.gpu_hang_wait);
> +
> + ret = 
> +wait_for_completion_interruptible_timeout(>autodump.completed, 
> +tmo);

This is racy, in other words it can happen that a new client opens up the 
debugfs file without being signaled but blocks the reset here.

You could use two completion structures to avoid that.

> + if (ret == 0) { /* time out and dump tool still not finish its dump*/
> + pr_err("autodump: timeout before dump finished, move on to gpu 
> recovery\n");
> + return -ETIMEDOUT;
> + }
> +#endif
> + return 0;
> +}
> +
>   #if defined(CONFIG_DEBUG_FS)
>   
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct 
> +file *file) {
> + int ret;
> + struct amdgpu_device *adev;
> +
> + ret = simple_open(inode, file);
> + if (ret)
> + return ret;
> +
> + adev = file->private_data;
> + if (adev->autodump.registered == true)
> + return -EINVAL;

Probably better to return -EBUSY here. 

Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

2020-04-23 Thread Pierre-Eric Pelloux-Prayer
Hi,

The build fails for me with this patch applied (poll_wait, POLLIN,
POLLRDNORM and POLLWRNORM are undeclared).

Adding "#include " to amdgpu_debugfs.c fixes it.

Pierre-Eric

On 23/04/2020 09:19, jia...@amd.com wrote:
> From: Jiange Zhao 
> 
> When GPU got timeout, it would notify an interested part
> of an opportunity to dump info before actual GPU reset.
> 
> A usermode app would open 'autodump' node under debugfs system
> and poll() for readable/writable. When a GPU reset is due,
> amdgpu would notify usermode app through wait_queue_head and give
> it 10 minutes to dump info.
> 
> After usermode app has done its work, this 'autodump' node is closed.
> On node closure, amdgpu gets to know the dump is done through
> the completion that is triggered in release().
> 
> There is no write or read callback because necessary info can be
> obtained through dmesg and umr. Messages back and forth between
> usermode app and amdgpu are unnecessary.
> 
> Signed-off-by: Jiange Zhao 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  9 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  1 +
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
>  4 files changed, 97 insertions(+)
> 
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index bc1e0fd71a09..a505b547f242 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -724,6 +724,13 @@ struct amd_powerplay {
>   const struct amd_pm_funcs *pp_funcs;
>  };
>  
> +struct amdgpu_autodump {
> + boolregistered;
> + struct completion   completed;
> + struct dentry   *dentry;
> + struct wait_queue_head  gpu_hang_wait;
> +};
> +
>  #define AMDGPU_RESET_MAGIC_NUM 64
>  #define AMDGPU_MAX_DF_PERFMONS 4
>  struct amdgpu_device {
> @@ -990,6 +997,8 @@ struct amdgpu_device {
>   charproduct_number[16];
>   charproduct_name[32];
>   charserial[16];
> +
> + struct amdgpu_autodump  autodump;
>  };
>  
>  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device 
> *bdev)
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> index 1a4894fa3693..cdd4bf00adee 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> @@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>   return 0;
>  }
>  
> +int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
> +{
> +#if defined(CONFIG_DEBUG_FS)
> + int ret;
> + unsigned long tmo = 600*HZ;
> +
> + if (!adev->autodump.registered)
> + return 0;
> +
> + wake_up_interruptible(>autodump.gpu_hang_wait);
> +
> + ret = 
> wait_for_completion_interruptible_timeout(>autodump.completed, tmo);
> + if (ret == 0) { /* time out and dump tool still not finish its dump*/
> + pr_err("autodump: timeout before dump finished, move on to gpu 
> recovery\n");
> + return -ETIMEDOUT;
> + }
> +#endif
> + return 0;
> +}
> +
>  #if defined(CONFIG_DEBUG_FS)
>  
> +static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file 
> *file)
> +{
> + int ret;
> + struct amdgpu_device *adev;
> +
> + ret = simple_open(inode, file);
> + if (ret)
> + return ret;
> +
> + adev = file->private_data;
> + if (adev->autodump.registered == true)
> + return -EINVAL;
> +
> + adev->autodump.registered = true;
> +
> + return 0;
> +}
> +
> +static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file 
> *file)
> +{
> + struct amdgpu_device *adev = file->private_data;
> +
> + complete(>autodump.completed);
> + adev->autodump.registered = false;
> +
> + return 0;
> +}
> +
> +unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct 
> poll_table_struct *poll_table)
> +{
> + struct amdgpu_device *adev = file->private_data;
> +
> + poll_wait(file, >autodump.gpu_hang_wait, poll_table);
> +
> + if (adev->in_gpu_reset)
> + return POLLIN | POLLRDNORM | POLLWRNORM;
> +
> + return 0;
> +}
> +
> +static const struct file_operations autodump_debug_fops = {
> + .owner = THIS_MODULE,
> + .open = amdgpu_debugfs_autodump_open,
> + .poll = amdgpu_debugfs_autodump_poll,
> + .release = amdgpu_debugfs_autodump_release,
> +};
> +
> +static int amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
> +{
> + struct dentry *entry;
> +
> + init_completion(>autodump.completed);
> + init_waitqueue_head(>autodump.gpu_hang_wait);
> + adev->autodump.registered = false;
> +
> + entry = debugfs_create_file("autodump", 0600,
> + adev->ddev->primary->debugfs_root,
> 

RE: [PATCH 2/2] drm/amdgpu: drop the unused local variable

2020-04-23 Thread Tao, Yintian
Hi  Hawking

Can you help also remove the same local variable kiq for 
gfx_v10_0_ring_emit_rreg? Thanks in advance.
After that , Reviewed-by: Yintian Tao 



Best Regards
Yintian Tao
-Original Message-
From: amd-gfx  On Behalf Of Hawking Zhang
Sent: 2020年4月23日 17:02
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Hawking 
Subject: [PATCH 2/2] drm/amdgpu: drop the unused local variable

local variable kiq won't be used in function gfx_v8_0_ring_emit_rreg

Change-Id: I6229987c8ce43ff0d55e1fae15ede9cb0827f76d
Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 8dc8e90..9644614 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6395,7 +6395,6 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring 
*ring, uint32_t reg,
uint32_t reg_val_offs)
 {
struct amdgpu_device *adev = ring->adev;
-   struct amdgpu_kiq *kiq = >gfx.kiq;
 
amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
amdgpu_ring_write(ring, 0 | /* src: register*/
--
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cyintian.tao%40amd.com%7C3e351ebdd7ff45259e6108d7e76502d0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637232293332601376sdata=dH2o%2Bl9%2FjqzP2IIN43qIDrbpmmQpXjPwgGvyaAc%2B1L8%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: protect ring overrun

2020-04-23 Thread Yintian Tao
Wait for the oldest sequence on the ring
to be signaled in order to make sure there
will be no command overrun.

v2: fix coding style and remove abs operation

Signed-off-by: Yintian Tao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   | 22 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h  |  3 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c  |  8 +++-
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c|  1 -
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c |  1 -
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 14 +++---
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|  8 +++-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  8 +++-
 9 files changed, 61 insertions(+), 14 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 7531527067df..397bd5fa77cb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -192,14 +192,22 @@ int amdgpu_fence_emit(struct amdgpu_ring *ring, struct 
dma_fence **f,
  * Used For polling fence.
  * Returns 0 on success, -ENOMEM on failure.
  */
-int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s)
+int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
+ uint32_t timeout)
 {
uint32_t seq;
+   signed long r = 0;
 
if (!s)
return -EINVAL;
 
seq = ++ring->fence_drv.sync_seq;
+   r = amdgpu_fence_wait_polling(ring,
+ seq - ring->fence_drv.num_fences_mask,
+ timeout);
+   if (r < 1)
+   return -ETIMEDOUT;
+
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
   seq, 0);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index a721b0e0ff69..0103acc57474 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -675,13 +675,15 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, 
uint32_t reg)
 
spin_lock_irqsave(>ring_lock, flags);
if (amdgpu_device_wb_get(adev, _val_offs)) {
-   spin_unlock_irqrestore(>ring_lock, flags);
pr_err("critical bug! too many kiq readers\n");
-   goto failed_kiq_read;
+   goto failed_unlock;
}
amdgpu_ring_alloc(ring, 32);
amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
-   amdgpu_fence_emit_polling(ring, );
+   r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT);
+   if (r)
+   goto failed_undo;
+
amdgpu_ring_commit(ring);
spin_unlock_irqrestore(>ring_lock, flags);
 
@@ -712,7 +714,13 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, 
uint32_t reg)
amdgpu_device_wb_free(adev, reg_val_offs);
return value;
 
+failed_undo:
+   amdgpu_ring_undo(ring);
+failed_unlock:
+   spin_unlock_irqrestore(>ring_lock, flags);
 failed_kiq_read:
+   if (reg_val_offs)
+   amdgpu_device_wb_free(adev, reg_val_offs);
pr_err("failed to read reg:%x\n", reg);
return ~0;
 }
@@ -730,7 +738,10 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t 
reg, uint32_t v)
spin_lock_irqsave(>ring_lock, flags);
amdgpu_ring_alloc(ring, 32);
amdgpu_ring_emit_wreg(ring, reg, v);
-   amdgpu_fence_emit_polling(ring, );
+   r = amdgpu_fence_emit_polling(ring, , MAX_KIQ_REG_WAIT);
+   if (r)
+   goto failed_undo;
+
amdgpu_ring_commit(ring);
spin_unlock_irqrestore(>ring_lock, flags);
 
@@ -759,6 +770,9 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t 
reg, uint32_t v)
 
return;
 
+failed_undo:
+   amdgpu_ring_undo(ring);
+   spin_unlock_irqrestore(>ring_lock, flags);
 failed_kiq_write:
pr_err("failed to write reg:%x\n", reg);
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 137d3d2b46e8..be218754629a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -118,7 +118,8 @@ void amdgpu_fence_driver_suspend(struct amdgpu_device 
*adev);
 void amdgpu_fence_driver_resume(struct amdgpu_device *adev);
 int amdgpu_fence_emit(struct amdgpu_ring *ring, struct dma_fence **fence,
  unsigned flags);
-int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s);
+int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, uint32_t *s,
+ uint32_t timeout);
 bool amdgpu_fence_process(struct amdgpu_ring *ring);
 int amdgpu_fence_wait_empty(struct amdgpu_ring *ring);
 signed long amdgpu_fence_wait_polling(struct amdgpu_ring *ring,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
index 8c10084f44ef..cbbb8d02535a 100644

[PATCH 2/2] drm/amdgpu: drop the unused local variable

2020-04-23 Thread Hawking Zhang
local variable kiq won't be used in function
gfx_v8_0_ring_emit_rreg

Change-Id: I6229987c8ce43ff0d55e1fae15ede9cb0827f76d
Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 8dc8e90..9644614 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6395,7 +6395,6 @@ static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring 
*ring, uint32_t reg,
uint32_t reg_val_offs)
 {
struct amdgpu_device *adev = ring->adev;
-   struct amdgpu_kiq *kiq = >gfx.kiq;
 
amdgpu_ring_write(ring, PACKET3(PACKET3_COPY_DATA, 4));
amdgpu_ring_write(ring, 0 | /* src: register*/
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH 2/2] drm/amdgpu: limit smu_set_mp1_state to pp_one_vf or bare-metal

2020-04-23 Thread Liu, Monk
Drop this one

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Monk Liu  
Sent: Thursday, April 23, 2020 4:13 PM
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Monk 
Subject: [PATCH 2/2] drm/amdgpu: limit smu_set_mp1_state to pp_one_vf or 
bare-metal

Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3d601d5..810141f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2465,7 +2465,7 @@ static int amdgpu_device_ip_suspend_phase2(struct 
amdgpu_device *adev)
}
adev->ip_blocks[i].status.hw = false;
/* handle putting the SMC in the appropriate state */
-   if(!amdgpu_sriov_vf(adev)){
+   if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_is_pp_one_vf(adev)) {
if (adev->ip_blocks[i].version->type == 
AMD_IP_BLOCK_TYPE_SMC) {
r = amdgpu_dpm_set_mp1_state(adev, 
adev->mp1_state);
if (r) {
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/2] drm/amdgpu: stop cp resume when compute ring test failed

2020-04-23 Thread Hawking Zhang
driver should stop cp resume once the compute
ring test fails

Change-Id: I4cd3328f38e0755d0c877484936132d204c9fe50
Signed-off-by: Hawking Zhang 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c | 4 +++-
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 4 +++-
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 +++-
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
index b2f10e3..fcee758 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v7_0.c
@@ -3132,7 +3132,9 @@ static int gfx_v7_0_cp_compute_resume(struct 
amdgpu_device *adev)
 
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
ring = >gfx.compute_ring[i];
-   amdgpu_ring_test_helper(ring);
+   r = amdgpu_ring_test_helper(ring);
+   if (r)
+   return r;
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 6c56ced..8dc8e90 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -4781,7 +4781,9 @@ static int gfx_v8_0_cp_test_all_rings(struct 
amdgpu_device *adev)
 
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
ring = >gfx.compute_ring[i];
-   amdgpu_ring_test_helper(ring);
+   r = amdgpu_ring_test_helper(ring);
+   if (r)
+   return r;
}
 
return 0;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 09aa5f5..20937059 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -3846,7 +3846,9 @@ static int gfx_v9_0_cp_resume(struct amdgpu_device *adev)
 
for (i = 0; i < adev->gfx.num_compute_rings; i++) {
ring = >gfx.compute_ring[i];
-   amdgpu_ring_test_helper(ring);
+   r = amdgpu_ring_test_helper(ring);
+   if (r)
+   return r;
}
 
gfx_v9_0_enable_gui_idle_interrupt(adev, true);
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

2020-04-23 Thread Christian König

Am 23.04.20 um 09:19 schrieb jia...@amd.com:

From: Jiange Zhao 

When a GPU timeout occurs, it would notify an interested party
of an opportunity to dump info before the actual GPU reset.

A usermode app would open 'autodump' node under debugfs system
and poll() for readable/writable. When a GPU reset is due,
amdgpu would notify usermode app through wait_queue_head and give
it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through
the completion that is triggered in release().

There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.

Signed-off-by: Jiange Zhao 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  9 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
  4 files changed, 97 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..a505b547f242 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -724,6 +724,13 @@ struct amd_powerplay {
const struct amd_pm_funcs *pp_funcs;
  };
  
+struct amdgpu_autodump {

+   boolregistered;
+   struct completion   completed;


Registered and completed seems to have the same meaning.


+   struct dentry   *dentry;
+   struct wait_queue_head  gpu_hang_wait;
+};
+
  #define AMDGPU_RESET_MAGIC_NUM 64
  #define AMDGPU_MAX_DF_PERFMONS 4
  struct amdgpu_device {
@@ -990,6 +997,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
  };
  
  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..cdd4bf00adee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
  }
  
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)

+{
+#if defined(CONFIG_DEBUG_FS)
+   int ret;
+   unsigned long tmo = 600*HZ;


In general please declare constant lines first and variable like "i" or 
"r" last.



+
+   if (!adev->autodump.registered)
+   return 0;
+
+   wake_up_interruptible(>autodump.gpu_hang_wait);
+
+   ret = 
wait_for_completion_interruptible_timeout(>autodump.completed, tmo);


This is racy, in other words it can happen that a new client opens up 
the debugfs file without being signaled but blocks the reset here.


You could use two completion structures to avoid that.


+   if (ret == 0) { /* time out and dump tool still not finish its dump*/
+   pr_err("autodump: timeout before dump finished, move on to gpu 
recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
  #if defined(CONFIG_DEBUG_FS)
  
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)

+{
+   int ret;
+   struct amdgpu_device *adev;
+
+   ret = simple_open(inode, file);
+   if (ret)
+   return ret;
+
+   adev = file->private_data;
+   if (adev->autodump.registered == true)
+   return -EINVAL;


Probably better to return -EBUSY here. And this is racy, and might need 
a lock e.g. multiple clients could open the file at the same time.


If we use a struct completion for registered we could use the spinlock 
of that one for protection here.



+
+   adev->autodump.registered = true;


You also need to reset the completion structure here otherwise only the 
first GPU reset would work with this.



+
+   return 0;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file 
*file)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+   complete(>autodump.completed);
+   adev->autodump.registered = false;
+
+   return 0;
+}
+
+unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct 
poll_table_struct *poll_table)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+   poll_wait(file, >autodump.gpu_hang_wait, poll_table);
+
+   if (adev->in_gpu_reset)
+   return POLLIN | POLLRDNORM | POLLWRNORM;
+
+   return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+   .owner = THIS_MODULE,
+   .open = amdgpu_debugfs_autodump_open,
+   .poll = amdgpu_debugfs_autodump_poll,
+   .release = amdgpu_debugfs_autodump_release,
+};
+

[PATCH 1/2] drm/amdgpu: extend threshold of waiting FLR_COMPLETE

2020-04-23 Thread Monk Liu
to 5s to satisfy a WHOLE GPU reset, which needs 3+ seconds to
finish

Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h | 2 +-
 drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
index 52a6975..83b453f5 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.h
@@ -26,7 +26,7 @@
 
 #define AI_MAILBOX_POLL_ACK_TIMEDOUT   500
 #define AI_MAILBOX_POLL_MSG_TIMEDOUT   12000
-#define AI_MAILBOX_POLL_FLR_TIMEDOUT   500
+#define AI_MAILBOX_POLL_FLR_TIMEDOUT   5000
 
 enum idh_request {
IDH_REQ_GPU_INIT_ACCESS = 1,
diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h 
b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
index 45bcf43..52605e1 100644
--- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
+++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h
@@ -26,7 +26,7 @@
 
 #define NV_MAILBOX_POLL_ACK_TIMEDOUT   500
 #define NV_MAILBOX_POLL_MSG_TIMEDOUT   6000
-#define NV_MAILBOX_POLL_FLR_TIMEDOUT   500
+#define NV_MAILBOX_POLL_FLR_TIMEDOUT   5000
 
 enum idh_request {
IDH_REQ_GPU_INIT_ACCESS = 1,
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/2] drm/amdgpu: limit smu_set_mp1_state to pp_one_vf or bare-metal

2020-04-23 Thread Monk Liu
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3d601d5..810141f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2465,7 +2465,7 @@ static int amdgpu_device_ip_suspend_phase2(struct 
amdgpu_device *adev)
}
adev->ip_blocks[i].status.hw = false;
/* handle putting the SMC in the appropriate state */
-   if(!amdgpu_sriov_vf(adev)){
+   if (!amdgpu_sriov_vf(adev) || amdgpu_sriov_is_pp_one_vf(adev)) {
if (adev->ip_blocks[i].version->type == 
AMD_IP_BLOCK_TYPE_SMC) {
r = amdgpu_dpm_set_mp1_state(adev, 
adev->mp1_state);
if (r) {
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

2020-04-23 Thread Christian König

Yes, an open file descriptor holds a reference to the driver module.

So it shouldn't be possible to unload the driver while it is open.

Christian.

Am 23.04.20 um 09:54 schrieb Liu, Monk:

Oh, it looks like if the daemon is opening the node, KMD doesn't have a chance to enter the path of 
shutdown/unload driver, thus no chance to return "kmd unloading" to the app...

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: amd-gfx  On Behalf Of Liu, Monk
Sent: Thursday, April 23, 2020 3:52 PM
To: Zhao, Jiange ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Pelloux-prayer, Pierre-eric 
; Kuehling, Felix ; Koenig, Christian 
; Zhang, Hawking 
Subject: RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

Hi Christian

Do you think we need to kill the daemon app if we do KMD unloading ? usually 
user need to close the app first and then the KMD could be unloaded

If we don't want to manually shutdown the daemon app we can do a "KILL" signal send to that 
process, or we can implement "read" and let app call "read()" to fetch information like:
1) xxx process hang
2) kmd unloading

And daemon can close() the node if it receives "kmd unloading" instead of doing 
the dump

Thanks

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Zhao, Jiange 
Sent: Thursday, April 23, 2020 3:20 PM
To: amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian ; Kuehling, Felix ; Pelloux-prayer, 
Pierre-eric ; Deucher, Alexander ; Zhang, 
Hawking ; Liu, Monk ; Zhao, Jiange 
Subject: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

From: Jiange Zhao 

When a GPU timeout occurs, it would notify an interested party of an opportunity to 
dump info before the actual GPU reset.

A usermode app would open 'autodump' node under debugfs system and poll() for 
readable/writable. When a GPU reset is due, amdgpu would notify usermode app 
through wait_queue_head and give it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through the completion 
that is triggered in release().

There is no write or read callback because necessary info can be obtained 
through dmesg and umr. Messages back and forth between usermode app and amdgpu 
are unnecessary.

Signed-off-by: Jiange Zhao 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  9 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 +  
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  1 +  
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
  4 files changed, 97 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..a505b547f242 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -724,6 +724,13 @@ struct amd_powerplay {
const struct amd_pm_funcs *pp_funcs;
  };
  
+struct amdgpu_autodump {

+   boolregistered;
+   struct completion   completed;
+   struct dentry   *dentry;
+   struct wait_queue_head  gpu_hang_wait;
+};
+
  #define AMDGPU_RESET_MAGIC_NUM 64
  #define AMDGPU_MAX_DF_PERFMONS 4
  struct amdgpu_device {
@@ -990,6 +997,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
  };
  
  static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 1a4894fa3693..cdd4bf00adee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
  }
  
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if

+defined(CONFIG_DEBUG_FS)
+   int ret;
+   unsigned long tmo = 600*HZ;
+
+   if (!adev->autodump.registered)
+   return 0;
+
+   wake_up_interruptible(>autodump.gpu_hang_wait);
+
+   ret = 
wait_for_completion_interruptible_timeout(>autodump.completed, tmo);
+   if (ret == 0) { /* time out and dump tool still not finish its dump*/
+   pr_err("autodump: timeout before dump finished, move on to gpu 
recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
  #if defined(CONFIG_DEBUG_FS)
  
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct

+file *file) {
+   int ret;
+   struct amdgpu_device *adev;
+
+   ret = simple_open(inode, file);
+   if (ret)
+   return ret;
+
+   adev = file->private_data;
+   if (adev->autodump.registered == true)
+   return -EINVAL;

RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

2020-04-23 Thread Liu, Monk
Oh, it looks like if the daemon is opening the node, KMD doesn't have a chance to enter 
the path of shutdown/unload driver, thus no chance to return "kmd unloading" to 
the app...

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: amd-gfx  On Behalf Of Liu, Monk
Sent: Thursday, April 23, 2020 3:52 PM
To: Zhao, Jiange ; amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Pelloux-prayer, Pierre-eric 
; Kuehling, Felix ; 
Koenig, Christian ; Zhang, Hawking 

Subject: RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

Hi Christian 

Do you think we need to kill the daemon app if we do KMD unloading ? usually 
user need to close the app first and then the KMD could be unloaded 

If we don't want to manually shutdown the daemon app we can do a "KILL" signal 
send to that process, or we can implement "read" and let app call "read()" to 
fetch information like:
1) xxx process hang
2) kmd unloading

And daemon can close() the node if it receives "kmd unloading" instead of doing 
the dump 

Thanks 

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Zhao, Jiange 
Sent: Thursday, April 23, 2020 3:20 PM
To: amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian ; Kuehling, Felix 
; Pelloux-prayer, Pierre-eric 
; Deucher, Alexander 
; Zhang, Hawking ; Liu, Monk 
; Zhao, Jiange 
Subject: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

From: Jiange Zhao 

When a GPU timeout occurs, it would notify an interested party of an opportunity to 
dump info before the actual GPU reset.

A usermode app would open 'autodump' node under debugfs system and poll() for 
readable/writable. When a GPU reset is due, amdgpu would notify usermode app 
through wait_queue_head and give it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through the completion 
that is triggered in release().

There is no write or read callback because necessary info can be obtained 
through dmesg and umr. Messages back and forth between usermode app and amdgpu 
are unnecessary.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  9 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 +  
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  1 +  
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 97 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..a505b547f242 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -724,6 +724,13 @@ struct amd_powerplay {
const struct amd_pm_funcs *pp_funcs;
 };
 
+struct amdgpu_autodump {
+   boolregistered;
+   struct completion   completed;
+   struct dentry   *dentry;
+   struct wait_queue_head  gpu_hang_wait;
+};
+
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
 struct amdgpu_device {
@@ -990,6 +997,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device 
*bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..cdd4bf00adee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if
+defined(CONFIG_DEBUG_FS)
+   int ret;
+   unsigned long tmo = 600*HZ;
+
+   if (!adev->autodump.registered)
+   return 0;
+
+   wake_up_interruptible(>autodump.gpu_hang_wait);
+
+   ret = 
wait_for_completion_interruptible_timeout(>autodump.completed, tmo);
+   if (ret == 0) { /* time out and dump tool still not finish its dump*/
+   pr_err("autodump: timeout before dump finished, move on to gpu 
recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct 
+file *file) {
+   int ret;
+   struct amdgpu_device *adev;
+
+   ret = simple_open(inode, file);
+   if (ret)
+   return ret;
+
+   adev = file->private_data;
+   if (adev->autodump.registered == true)
+   return -EINVAL;
+
+   adev->autodump.registered = true;
+
+   return 0;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct 
+file *file) {
+   struct amdgpu_device *adev = 

[PATCH] drm/amdgpu/display: Fix dc_sink refcnt leak in dc_link_detect_helper

2020-04-23 Thread Xiyu Yang
dc_link_detect_helper() invokes dc_sink_retain(), which increases the
refcount of the "prev_sink".

When dc_link_detect_helper() returns, local variable "prev_sink" becomes
invalid, so the refcount should be decreased to keep refcount balanced.

The reference counting issue happens in one exception handling path of
dc_link_detect_helper(). When alt mode times out, the function forgets
to decrease the refcnt increased by dc_sink_retain(), causing a refcnt
leak.

Fix this issue by calling dc_sink_release() when alt mode times out.

Signed-off-by: Xiyu Yang 
Signed-off-by: Xin Tan 
---
 drivers/gpu/drm/amd/display/dc/core/dc_link.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link.c 
b/drivers/gpu/drm/amd/display/dc/core/dc_link.c
index a09119c10d7c..91550d9a1abb 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc_link.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc_link.c
@@ -832,6 +832,8 @@ static bool dc_link_detect_helper(struct dc_link *link,
 
/* if alt mode times out, return false */
if (wait_for_alt_mode(link) == false) {
+   if (prev_sink != NULL)
+   dc_sink_release(prev_sink);
return false;
}
}
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu/display: Fix dc_sink refcnt leak when detecting link

2020-04-23 Thread Xiyu Yang
emulated_link_detect() invokes dc_sink_retain(), which increases the
refcount of the "prev_sink".

When emulated_link_detect() returns, local variable "prev_sink" becomes
invalid, so the refcount should be decreased to keep refcount balanced.

The reference counting issue happens in all paths of
emulated_link_detect(), which forgets to decrease the refcnt increased
by dc_sink_retain(), causing a refcnt leak.

Fix this issue by adding a "err_sink_put" label and calling
dc_sink_release() before emulated_link_detect() returns.

Signed-off-by: Xiyu Yang 
Signed-off-by: Xin Tan 
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index e997251a8b57..1b0c4f11b9b1 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -1633,7 +1633,7 @@ static void emulated_link_detect(struct dc_link *link)
default:
DC_ERROR("Invalid connector type! signal:%d\n",
link->connector_signal);
-   return;
+   goto err_sink_put;
}
 
sink_init_data.link = link;
@@ -1642,7 +1642,7 @@ static void emulated_link_detect(struct dc_link *link)
sink = dc_sink_create(_init_data);
if (!sink) {
DC_ERROR("Failed to create sink!\n");
-   return;
+   goto err_sink_put;
}
 
/* dc_sink_create returns a new reference */
@@ -1655,6 +1655,9 @@ static void emulated_link_detect(struct dc_link *link)
 
if (edid_status != EDID_OK)
DC_ERROR("Failed to read EDID");
+err_sink_put:
+   if (prev_sink != NULL)
+   dc_sink_release(prev_sink);
 
 }
 
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

2020-04-23 Thread Liu, Monk
Hi Christian 

Do you think we need to kill the daemon app if we do KMD unloading ? usually 
user need to close the app first and then the KMD could be unloaded 

If we don't want to manually shutdown the daemon app we can do a "KILL" signal 
send to that process, or we can implement "read" and let app call "read()" to 
fetch information like:
1) xxx process hang
2) kmd unloading

And daemon can close() the node if it receives "kmd unloading" instead of doing 
the dump 

Thanks 

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Zhao, Jiange  
Sent: Thursday, April 23, 2020 3:20 PM
To: amd-gfx@lists.freedesktop.org
Cc: Koenig, Christian ; Kuehling, Felix 
; Pelloux-prayer, Pierre-eric 
; Deucher, Alexander 
; Zhang, Hawking ; Liu, Monk 
; Zhao, Jiange 
Subject: [PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

From: Jiange Zhao 

When a GPU timeout occurs, it would notify an interested party of an opportunity to 
dump info before the actual GPU reset.

A usermode app would open 'autodump' node under debugfs system and poll() for 
readable/writable. When a GPU reset is due, amdgpu would notify usermode app 
through wait_queue_head and give it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through the completion 
that is triggered in release().

There is no write or read callback because necessary info can be obtained 
through dmesg and umr. Messages back and forth between usermode app and amdgpu 
are unnecessary.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  9 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 +  
drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  1 +  
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 97 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..a505b547f242 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -724,6 +724,13 @@ struct amd_powerplay {
const struct amd_pm_funcs *pp_funcs;
 };
 
+struct amdgpu_autodump {
+   boolregistered;
+   struct completion   completed;
+   struct dentry   *dentry;
+   struct wait_queue_head  gpu_hang_wait;
+};
+
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
 struct amdgpu_device {
@@ -990,6 +997,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device 
*bdev) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..cdd4bf00adee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev) { #if 
+defined(CONFIG_DEBUG_FS)
+   int ret;
+   unsigned long tmo = 600*HZ;
+
+   if (!adev->autodump.registered)
+   return 0;
+
+   wake_up_interruptible(>autodump.gpu_hang_wait);
+
+   ret = 
wait_for_completion_interruptible_timeout(>autodump.completed, tmo);
+   if (ret == 0) { /* time out and dump tool still not finish its dump*/
+   pr_err("autodump: timeout before dump finished, move on to gpu 
recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct 
+file *file) {
+   int ret;
+   struct amdgpu_device *adev;
+
+   ret = simple_open(inode, file);
+   if (ret)
+   return ret;
+
+   adev = file->private_data;
+   if (adev->autodump.registered == true)
+   return -EINVAL;
+
+   adev->autodump.registered = true;
+
+   return 0;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct 
+file *file) {
+   struct amdgpu_device *adev = file->private_data;
+
+   complete(>autodump.completed);
+   adev->autodump.registered = false;
+
+   return 0;
+}
+
+unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct 
+poll_table_struct *poll_table) {
+   struct amdgpu_device *adev = file->private_data;
+
+   poll_wait(file, >autodump.gpu_hang_wait, poll_table);
+
+   if (adev->in_gpu_reset)
+   return POLLIN | POLLRDNORM | POLLWRNORM;
+
+   return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+   .owner = THIS_MODULE,
+   .open = 

Re: [PATCH] drm/amdgpu: protect ring overrun

2020-04-23 Thread Christian König

Am 23.04.20 um 06:22 schrieb Yintian Tao:

Wait for the oldest sequence on the ring
to be signaled in order to make sure there
will be no command overrun.


One technical problem and a few style suggestions below. Apart from that 
looks good to me.




Signed-off-by: Yintian Tao 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c |  7 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c   | 17 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c  |  8 +++-
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c |  9 -
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c|  8 +++-
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |  8 +++-
  6 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
index 7531527067df..5462ea83d8b2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_fence.c
@@ -200,6 +200,13 @@ int amdgpu_fence_emit_polling(struct amdgpu_ring *ring, 
uint32_t *s)
return -EINVAL;
  
  	seq = ++ring->fence_drv.sync_seq;

+   if ((abs(seq - ring->fence_drv.num_fences_mask) >
+   ring->fence_drv.num_fences_mask) &&


That is a rather bad idea and won't work. The sequence is a 32bit value 
and supposed to wrap around. Just dropping the extra check should do it 
should work.



+   (amdgpu_fence_wait_polling(ring,
+  seq - ring->fence_drv.num_fences_mask,
+  MAX_KIQ_REG_WAIT) < 1))


Maybe we should make the timeout a parameter here.

And it is usually better to use the style like this:

r = amdgpu_fence
if (r < 1)
    .


+return -ETIME;


We usually use -ETIMEDOUT because this is not really bound to a specific 
timer.



+
amdgpu_ring_emit_fence(ring, ring->fence_drv.gpu_addr,
   seq, 0);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c

index a721b0e0ff69..7087333681f6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -681,7 +681,14 @@ uint32_t amdgpu_kiq_rreg(struct amdgpu_device *adev, 
uint32_t reg)
}
amdgpu_ring_alloc(ring, 32);
amdgpu_ring_emit_rreg(ring, reg, reg_val_offs);
-   amdgpu_fence_emit_polling(ring, );
+   r = amdgpu_fence_emit_polling(ring, );
+   if (r) {
+   amdgpu_ring_undo(ring);
+   amdgpu_device_wb_free(adev, reg_val_offs);
+   spin_unlock_irqrestore(>ring_lock, flags);
+   goto failed_kiq_read;
+   }
+


If we already have goto style error handling we should probably just add 
a new label to handle it.


Same for the other cases below where you already have a goto.

Regards,
Christian.


amdgpu_ring_commit(ring);
spin_unlock_irqrestore(>ring_lock, flags);
  
@@ -730,7 +737,13 @@ void amdgpu_kiq_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)

spin_lock_irqsave(>ring_lock, flags);
amdgpu_ring_alloc(ring, 32);
amdgpu_ring_emit_wreg(ring, reg, v);
-   amdgpu_fence_emit_polling(ring, );
+   r = amdgpu_fence_emit_polling(ring, );
+   if (r) {
+   amdgpu_ring_undo(ring);
+   spin_unlock_irqrestore(>ring_lock, flags);
+   goto failed_kiq_write;
+   }
+
amdgpu_ring_commit(ring);
spin_unlock_irqrestore(>ring_lock, flags);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

index 8c10084f44ef..12d181ac7e78 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c
@@ -60,7 +60,13 @@ void amdgpu_virt_kiq_reg_write_reg_wait(struct amdgpu_device 
*adev,
amdgpu_ring_alloc(ring, 32);
amdgpu_ring_emit_reg_write_reg_wait(ring, reg0, reg1,
ref, mask);
-   amdgpu_fence_emit_polling(ring, );
+   r = amdgpu_fence_emit_polling(ring, );
+   if (r) {
+   amdgpu_ring_undo(ring);
+   spin_unlock_irqrestore(>ring_lock, flags);
+   goto failed_kiq;
+   }
+
amdgpu_ring_commit(ring);
spin_unlock_irqrestore(>ring_lock, flags);
  
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c

index 5b1549f167b0..650b7a67d3bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -4068,7 +4068,14 @@ static uint64_t gfx_v9_0_kiq_read_clock(struct 
amdgpu_device *adev)
reg_val_offs * 4));
amdgpu_ring_write(ring, upper_32_bits(adev->wb.gpu_addr +
reg_val_offs * 4));
-   amdgpu_fence_emit_polling(ring, );
+   r = amdgpu_fence_emit_polling(ring, );
+   if (r) {
+   amdgpu_ring_undo(ring);
+   amdgpu_device_wb_free(adev, 

Re: [PATCH hmm 5/5] mm/hmm: remove the customizable pfn format from hmm_range_fault

2020-04-23 Thread Christoph Hellwig
On Wed, Apr 22, 2020 at 09:39:11AM -0300, Jason Gunthorpe wrote:
> > Nice callchain from hell..  Unfortunately such "code listings" tend to
> > get out of date very quickly, so I'm not sure it is worth keeping in
> > the code.  What would be really worthile is consolidating the two
> > different sets of defines (NVIF_VMM_PFNMAP_V0_ vs NVKM_VMM_PFN_)
> > to make the code a little easier to follow.
> 
> I was mainly concerned that this function is using hmm properly,
> because it sure looks like it is just forming the CPU physical address
> into a HW specific data. But it turns out it is just an internal data
> for some other code and the dma_map is impossibly far away
> 
> It took forever to find, I figured I'd leave a hint for the next poor
> soul that has to look at this.. 
> 
> Also, I think it shows there is no 'performance' argument here, if
> this path needs more performance the above should be cleaned
> before we abuse hmm_range_fault.
> 
> Put it in the commit message instead?

Yes, the graph itself sounds reasonable for the commit log as a point
of time information.

> > >   npages = (range->end - range->start) >> PAGE_SHIFT;
> > >   for (i = 0; i < npages; ++i) {
> > >   struct page *page;
> > >  
> > > + if (!(range->hmm_pfns[i] & HMM_PFN_VALID)) {
> > > + ioctl_addr[i] = 0;
> > >   continue;
> > > + }
> > 
> > Can't we rely on the caller pre-zeroing the array?
> 
> This ends up as args.phys in nouveau_svm_fault - I didn't see a
> zeroing?
> 
> I think it makes sense that this routine fully sets the output array
> and does not assume pre-initialize

Ok.
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: Add autodump debugfs node for gpu reset

2020-04-23 Thread jianzh
From: Jiange Zhao 

When the GPU gets a timeout, it notifies an interested party
of an opportunity to dump info before the actual GPU reset.

A usermode app would open 'autodump' node under debugfs system
and poll() for readable/writable. When a GPU reset is due,
amdgpu would notify usermode app through wait_queue_head and give
it 10 minutes to dump info.

After usermode app has done its work, this 'autodump' node is closed.
On node closure, amdgpu gets to know the dump is done through
the completion that is triggered in release().

There is no write or read callback because necessary info can be
obtained through dmesg and umr. Messages back and forth between
usermode app and amdgpu are unnecessary.

Signed-off-by: Jiange Zhao 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h |  9 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 85 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  2 +
 4 files changed, 97 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index bc1e0fd71a09..a505b547f242 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -724,6 +724,13 @@ struct amd_powerplay {
const struct amd_pm_funcs *pp_funcs;
 };
 
+struct amdgpu_autodump {
+   boolregistered;
+   struct completion   completed;
+   struct dentry   *dentry;
+   struct wait_queue_head  gpu_hang_wait;
+};
+
 #define AMDGPU_RESET_MAGIC_NUM 64
 #define AMDGPU_MAX_DF_PERFMONS 4
 struct amdgpu_device {
@@ -990,6 +997,8 @@ struct amdgpu_device {
charproduct_number[16];
charproduct_name[32];
charserial[16];
+
+   struct amdgpu_autodump  autodump;
 };
 
 static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 1a4894fa3693..cdd4bf00adee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -74,8 +74,91 @@ int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
return 0;
 }
 
+int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)
+{
+#if defined(CONFIG_DEBUG_FS)
+   int ret;
+   unsigned long tmo = 600*HZ;
+
+   if (!adev->autodump.registered)
+   return 0;
+
+   wake_up_interruptible(>autodump.gpu_hang_wait);
+
+   ret = 
wait_for_completion_interruptible_timeout(>autodump.completed, tmo);
+   if (ret == 0) { /* time out and dump tool still not finish its dump*/
+   pr_err("autodump: timeout before dump finished, move on to gpu 
recovery\n");
+   return -ETIMEDOUT;
+   }
+#endif
+   return 0;
+}
+
 #if defined(CONFIG_DEBUG_FS)
 
+static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
+{
+   int ret;
+   struct amdgpu_device *adev;
+
+   ret = simple_open(inode, file);
+   if (ret)
+   return ret;
+
+   adev = file->private_data;
+   if (adev->autodump.registered == true)
+   return -EINVAL;
+
+   adev->autodump.registered = true;
+
+   return 0;
+}
+
+static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file 
*file)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+   complete(>autodump.completed);
+   adev->autodump.registered = false;
+
+   return 0;
+}
+
+unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct 
poll_table_struct *poll_table)
+{
+   struct amdgpu_device *adev = file->private_data;
+
+   poll_wait(file, >autodump.gpu_hang_wait, poll_table);
+
+   if (adev->in_gpu_reset)
+   return POLLIN | POLLRDNORM | POLLWRNORM;
+
+   return 0;
+}
+
+static const struct file_operations autodump_debug_fops = {
+   .owner = THIS_MODULE,
+   .open = amdgpu_debugfs_autodump_open,
+   .poll = amdgpu_debugfs_autodump_poll,
+   .release = amdgpu_debugfs_autodump_release,
+};
+
+static int amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
+{
+   struct dentry *entry;
+
+   init_completion(>autodump.completed);
+   init_waitqueue_head(>autodump.gpu_hang_wait);
+   adev->autodump.registered = false;
+
+   entry = debugfs_create_file("autodump", 0600,
+   adev->ddev->primary->debugfs_root,
+   adev, _debug_fops);
+   adev->autodump.dentry = entry;
+
+   return 0;
+}
+
 /**
  * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
  *
@@ -1434,6 +1517,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
 
amdgpu_ras_debugfs_create_all(adev);
 
+   amdgpu_debugfs_autodump_init(adev);
+
return amdgpu_debugfs_add_files(adev, 

RE: [PATCH 1/8] drm/amdgpu: ignore TA ucode for SRIOV

2020-04-23 Thread Tao, Yintian
Series is Acked-by: Yintian Tao 

-Original Message-
From: amd-gfx  On Behalf Of Monk Liu
Sent: 2020年4月23日 15:02
To: amd-gfx@lists.freedesktop.org
Cc: Liu, Monk 
Subject: [PATCH 1/8] drm/amdgpu: ignore TA ucode for SRIOV

Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
index 0afd610..b4b0242 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
@@ -194,6 +194,8 @@ static int psp_v11_0_init_microcode(struct psp_context *psp)
case CHIP_NAVI10:
case CHIP_NAVI14:
case CHIP_NAVI12:
+   if (amdgpu_sriov_vf(adev))
+   break;
snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_ta.bin", 
chip_name);
err = request_firmware(>psp.ta_fw, fw_name, adev->dev);
if (err) {
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cyintian.tao%40amd.com%7C591f6154216d45f81a0c08d7e7543671%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637232221913347447sdata=jFv%2FMqS%2B0lHnuaN0%2B1GG6iSfyyAFPZbFBxa%2BiEL2tsg%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 8/8] drm/amdgpu: for nv12 always need smu ip

2020-04-23 Thread Monk Liu
because nv12 SRIOV supports one vf mode

Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/nv.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/nv.c b/drivers/gpu/drm/amd/amdgpu/nv.c
index 995bdec..9c42316 100644
--- a/drivers/gpu/drm/amd/amdgpu/nv.c
+++ b/drivers/gpu/drm/amd/amdgpu/nv.c
@@ -498,8 +498,7 @@ int nv_set_ip_blocks(struct amdgpu_device *adev)
amdgpu_device_ip_block_add(adev, _v10_0_ip_block);
amdgpu_device_ip_block_add(adev, _ih_ip_block);
amdgpu_device_ip_block_add(adev, _v11_0_ip_block);
-   if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP &&
-   !amdgpu_sriov_vf(adev))
+   if (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP)
amdgpu_device_ip_block_add(adev, _v11_0_ip_block);
if (adev->enable_virtual_display || amdgpu_sriov_vf(adev))
amdgpu_device_ip_block_add(adev, _virtual_ip_block);
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 2/8] drm/amdgpu: skip cg/pg set for SRIOV

2020-04-23 Thread Monk Liu
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 8a579ce..909ef08 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -7095,6 +7095,10 @@ static int gfx_v10_0_set_powergating_state(void *handle,
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
bool enable = (state == AMD_PG_STATE_GATE);
+
+   if (amdgpu_sriov_vf(adev))
+   return 0;
+
switch (adev->asic_type) {
case CHIP_NAVI10:
case CHIP_NAVI14:
@@ -7115,6 +7119,9 @@ static int gfx_v10_0_set_clockgating_state(void *handle,
 {
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+   if (amdgpu_sriov_vf(adev))
+   return 0;
+
switch (adev->asic_type) {
case CHIP_NAVI10:
case CHIP_NAVI14:
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 5/8] drm/amdgpu: clear the messed up checking logic

2020-04-23 Thread Monk Liu
for MI100 and later ASICs, we always support SW_SMU for bare-metal
and for SRIOV one_vf_mode

Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 2bb1e0c..361a5b6 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -571,15 +571,10 @@ bool is_support_sw_smu(struct amdgpu_device *adev)
if (adev->asic_type == CHIP_VEGA20)
return (amdgpu_dpm == 2) ? true : false;
else if (adev->asic_type >= CHIP_ARCTURUS) {
-   if (amdgpu_sriov_vf(adev) &&
-   !(adev->asic_type == CHIP_ARCTURUS &&
- amdgpu_sriov_is_pp_one_vf(adev)))
-
-   return false;
-   else
+ if (amdgpu_sriov_is_pp_one_vf(adev) || !amdgpu_sriov_vf(adev))
return true;
-   } else
-   return false;
+   }
+   return false;
 }
 
 bool is_support_sw_smu_xgmi(struct amdgpu_device *adev)
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 4/8] drm/amdgpu: provide RREG32_SOC15_NO_KIQ, will be used later

2020-04-23 Thread Monk Liu
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/soc15_common.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/soc15_common.h 
b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
index c893c64..56d02aa 100644
--- a/drivers/gpu/drm/amd/amdgpu/soc15_common.h
+++ b/drivers/gpu/drm/amd/amdgpu/soc15_common.h
@@ -35,6 +35,9 @@
 #define RREG32_SOC15(ip, inst, reg) \
RREG32(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg)
 
+#define RREG32_SOC15_NO_KIQ(ip, inst, reg) \
+   RREG32_NO_KIQ(adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg)
+
 #define RREG32_SOC15_OFFSET(ip, inst, reg, offset) \
RREG32((adev->reg_offset[ip##_HWIP][inst][reg##_BASE_IDX] + reg) + 
offset)
 
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 7/8] drm/amdgpu: skip sysfs node not belong to one vf mode

2020-04-23 Thread Monk Liu
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c | 48 --
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
index 49e2e43..c762deb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_pm.c
@@ -3271,26 +3271,27 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev)
return ret;
}
 
-
-   ret = device_create_file(adev->dev, _attr_pp_num_states);
-   if (ret) {
-   DRM_ERROR("failed to create device file pp_num_states\n");
-   return ret;
-   }
-   ret = device_create_file(adev->dev, _attr_pp_cur_state);
-   if (ret) {
-   DRM_ERROR("failed to create device file pp_cur_state\n");
-   return ret;
-   }
-   ret = device_create_file(adev->dev, _attr_pp_force_state);
-   if (ret) {
-   DRM_ERROR("failed to create device file pp_force_state\n");
-   return ret;
-   }
-   ret = device_create_file(adev->dev, _attr_pp_table);
-   if (ret) {
-   DRM_ERROR("failed to create device file pp_table\n");
-   return ret;
+   if (!amdgpu_sriov_vf(adev)) {
+   ret = device_create_file(adev->dev, _attr_pp_num_states);
+   if (ret) {
+   DRM_ERROR("failed to create device file 
pp_num_states\n");
+   return ret;
+   }
+   ret = device_create_file(adev->dev, _attr_pp_cur_state);
+   if (ret) {
+   DRM_ERROR("failed to create device file 
pp_cur_state\n");
+   return ret;
+   }
+   ret = device_create_file(adev->dev, _attr_pp_force_state);
+   if (ret) {
+   DRM_ERROR("failed to create device file 
pp_force_state\n");
+   return ret;
+   }
+   ret = device_create_file(adev->dev, _attr_pp_table);
+   if (ret) {
+   DRM_ERROR("failed to create device file pp_table\n");
+   return ret;
+   }
}
 
ret = device_create_file(adev->dev, _attr_pp_dpm_sclk);
@@ -3337,6 +3338,13 @@ int amdgpu_pm_sysfs_init(struct amdgpu_device *adev)
return ret;
}
}
+
+   /* the reset are not needed for SRIOV one vf mode */
+   if (amdgpu_sriov_vf(adev)) {
+   adev->pm.sysfs_initialized = true;
+   return ret;
+   }
+
if (adev->asic_type != CHIP_ARCTURUS) {
ret = device_create_file(adev->dev, _attr_pp_dpm_pcie);
if (ret) {
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 6/8] drm/amdgpu: enable one vf mode for nv12

2020-04-23 Thread Monk Liu
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 12 +++-
 drivers/gpu/drm/amd/powerplay/navi10_ppt.c |  6 +++-
 drivers/gpu/drm/amd/powerplay/smu_v11_0.c  | 49 +-
 3 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 361a5b6..5964d63 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -347,13 +347,13 @@ int smu_get_dpm_freq_by_index(struct smu_context *smu, 
enum smu_clk_type clk_typ
param = (uint32_t)(((clk_id & 0x) << 16) | (level & 0x));
 
ret = smu_send_smc_msg_with_param(smu, SMU_MSG_GetDpmFreqByIndex,
- param, );
+ param, value);
if (ret)
return ret;
 
/* BIT31:  0 - Fine grained DPM, 1 - Dicrete DPM
 * now, we un-support it */
-   *value = param & 0x7fff;
+   *value = *value & 0x7fff;
 
return ret;
 }
@@ -535,7 +535,6 @@ int smu_update_table(struct smu_context *smu, enum 
smu_table_id table_index, int
int table_id = smu_table_get_index(smu, table_index);
uint32_t table_size;
int ret = 0;
-
if (!table_data || table_id >= SMU_TABLE_COUNT || table_id < 0)
return -EINVAL;
 
@@ -691,7 +690,6 @@ int smu_feature_is_enabled(struct smu_context *smu, enum 
smu_feature_mask mask)
 
if (smu->is_apu)
return 1;
-
feature_id = smu_feature_get_index(smu, mask);
if (feature_id < 0)
return 0;
@@ -1339,6 +1337,9 @@ static int smu_hw_init(void *handle)
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
struct smu_context *smu = >smu;
 
+   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
+   return 0;
+
ret = smu_start_smc_engine(smu);
if (ret) {
pr_err("SMU is not ready yet!\n");
@@ -1352,9 +1353,6 @@ static int smu_hw_init(void *handle)
smu_set_gfx_cgpg(>smu, true);
}
 
-   if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_is_pp_one_vf(adev))
-   return 0;
-
if (!smu->pm_enabled)
return 0;
 
diff --git a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c 
b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
index c94270f..2184d24 100644
--- a/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
+++ b/drivers/gpu/drm/amd/powerplay/navi10_ppt.c
@@ -1817,7 +1817,8 @@ static int navi10_get_power_limit(struct smu_context *smu,
int power_src;
 
if (!smu->power_limit) {
-   if (smu_feature_is_enabled(smu, SMU_FEATURE_PPT_BIT)) {
+   if (smu_feature_is_enabled(smu, SMU_FEATURE_PPT_BIT) &&
+   !amdgpu_sriov_vf(smu->adev)) {
power_src = smu_power_get_index(smu, 
SMU_POWER_SOURCE_AC);
if (power_src < 0)
return -EINVAL;
@@ -1960,6 +1961,9 @@ static int navi10_set_default_od_settings(struct 
smu_context *smu, bool initiali
OverDriveTable_t *od_table, *boot_od_table;
int ret = 0;
 
+   if (amdgpu_sriov_vf(smu->adev))
+   return 0;
+
ret = smu_v11_0_set_default_od_settings(smu, initialize, 
sizeof(OverDriveTable_t));
if (ret)
return ret;
diff --git a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c 
b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
index a97b296..3e1b3ed 100644
--- a/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/powerplay/smu_v11_0.c
@@ -57,7 +57,7 @@ static int smu_v11_0_send_msg_without_waiting(struct 
smu_context *smu,
  uint16_t msg)
 {
struct amdgpu_device *adev = smu->adev;
-   WREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
+   WREG32_SOC15_NO_KIQ(MP1, 0, mmMP1_SMN_C2PMSG_66, msg);
return 0;
 }
 
@@ -65,7 +65,7 @@ static int smu_v11_0_read_arg(struct smu_context *smu, 
uint32_t *arg)
 {
struct amdgpu_device *adev = smu->adev;
 
-   *arg = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_82);
+   *arg = RREG32_SOC15_NO_KIQ(MP1, 0, mmMP1_SMN_C2PMSG_82);
return 0;
 }
 
@@ -75,7 +75,7 @@ static int smu_v11_0_wait_for_response(struct smu_context 
*smu)
uint32_t cur_value, i, timeout = adev->usec_timeout * 10;
 
for (i = 0; i < timeout; i++) {
-   cur_value = RREG32_SOC15(MP1, 0, mmMP1_SMN_C2PMSG_90);
+   cur_value = RREG32_SOC15_NO_KIQ(MP1, 0, mmMP1_SMN_C2PMSG_90);
if ((cur_value & MP1_C2PMSG_90__CONTENT_MASK) != 0)
return cur_value == 0x1 ? 0 : -EIO;
 
@@ -83,7 +83,10 @@ static int smu_v11_0_wait_for_response(struct smu_context 
*smu)
}
 
/* timeout means wrong logic */
-   return -ETIME;
+   if (i == timeout)
+  

[PATCH 3/8] drm/amdgpu: sriov is forbidden to call disable DPM

2020-04-23 Thread Monk Liu
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/powerplay/amdgpu_smu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c 
b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
index 88b4e56..2bb1e0c 100644
--- a/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/powerplay/amdgpu_smu.c
@@ -1403,6 +1403,9 @@ static int smu_hw_init(void *handle)
 
 static int smu_stop_dpms(struct smu_context *smu)
 {
+   if (amdgpu_sriov_vf(smu->adev))
+   return 0;
+
return smu_system_features_control(smu, false);
 }
 
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH 1/8] drm/amdgpu: ignore TA ucode for SRIOV

2020-04-23 Thread Monk Liu
Signed-off-by: Monk Liu 
---
 drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
index 0afd610..b4b0242 100644
--- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c
@@ -194,6 +194,8 @@ static int psp_v11_0_init_microcode(struct psp_context *psp)
case CHIP_NAVI10:
case CHIP_NAVI14:
case CHIP_NAVI12:
+   if (amdgpu_sriov_vf(adev))
+   break;
snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_ta.bin", 
chip_name);
err = request_firmware(>psp.ta_fw, fw_name, adev->dev);
if (err) {
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx