Hi Christian,

The feature is for GPU virtualization and has been checked in, you can refer to 
the following patch or commit 75b126427778218b36cfb68637e4f8d0e584b8ef.

From 2316518efc459928ad1d3d2d3511ea5fbda19475 Mon Sep 17 00:00:00 2001
From: pding <[email protected]>
Date: Mon, 23 Oct 2017 17:22:09 +0800
Subject: [PATCH 001/121] drm/amdgpu: retry init if it fails due to exclusive 
mode timeout (v3)

The exclusive mode has real-time limitation in reality, such like being
done in 300ms. It's easy observed if running many VF/VMs in single host
with heavy CPU workload.

If we find the init fails due to exclusive mode timeout, try it again.

v2:
 - rewrite the condition for readable value.

v3:
 - fix typo, add comments for sleep

Acked-by: Alex Deucher <[email protected]>
Signed-off-by: pding <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
Signed-off-by: Gary Sun <[email protected]>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   10 ++++++++++
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    |   15 +++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 125f77d..385b10e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2303,6 +2303,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 
        r = amdgpu_init(adev);
        if (r) {
+               /* failed in exclusive mode due to timeout */
+               if (amdgpu_sriov_vf(adev) &&
+                   !amdgpu_sriov_runtime(adev) &&
+                   amdgpu_virt_mmio_blocked(adev) &&
+                   !amdgpu_virt_wait_reset(adev)) {
+                       dev_err(adev->dev, "VF exclusive mode timeout\n");
+                       r = -EAGAIN;
+                       goto failed;
+               }
                dev_err(adev->dev, "amdgpu_init failed\n");
                amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 
0);
                amdgpu_fini(adev);
@@ -2390,6 +2399,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        amdgpu_vf_error_trans_all(adev);
        if (runtime)
                vga_switcheroo_fini_domain_pm_ops(adev->dev);
+
        return r;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 720139e..f313eee 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -86,7 +86,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
 int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
 {
        struct amdgpu_device *adev;
-       int r, acpi_status;
+       int r, acpi_status, retry = 0;
 
 #ifdef CONFIG_DRM_AMDGPU_SI
        if (!amdgpu_si_support) {
@@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned 
long flags)
                }
        }
 #endif
+retry_init:
 
        adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
        if (adev == NULL) {
@@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, 
unsigned long flags)
         * VRAM allocation
         */
        r = amdgpu_device_init(adev, dev, dev->pdev, flags);
-       if (r) {
+       if (r == -EAGAIN && ++retry <= 3) {
+               adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
+               adev->virt.ops = NULL;
+               amdgpu_device_fini(adev);
+               kfree(adev);
+               dev->dev_private = NULL;
+               /* Don't request EX mode too frequently which is attacking */
+               msleep(5000);
+               dev_err(&dev->pdev->dev, "retry init %d\n", retry);
+               goto retry_init;
+       } else if (r) {
                dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
                goto out;
        }
-- 
1.7.1


Regards,
Gary


-----Original Message-----
From: Koenig, Christian 
Sent: Tuesday, November 07, 2017 3:48 PM
To: Sun, Gary <[email protected]>; [email protected]
Subject: Re: [PATCH] drm/amdgpu:remove debugfs file in amdgpu_device_finish

Hi Gary,

not sure what driver re-initialize feature you are talking about, but the last 
time I tried to re-initialize the driver it deadlocks in the modeset code 
because of some DC problem.

It's probably a good idea to fix that first, but in general please explain 
further what are you working on.

Regards,
Christian.

Am 07.11.2017 um 08:23 schrieb Sun, Gary:
> Hi Christian,
>
> The patch is for driver re- initialize  feature, not for driver exit or 
> rmmod.  When the driver initialize failed at some point, the re- initialize  
> feature will do some little clean and then try to initialize driver again, 
> then it will re-register some registered debugfs , so it will fail.
>
> Regards,
> Gary
>
>
> -----Original Message-----
> From: Koenig, Christian
> Sent: Monday, November 06, 2017 5:26 PM
> To: Sun, Gary <[email protected]>; [email protected]
> Subject: Re: [PATCH] drm/amdgpu:remove debugfs file in 
> amdgpu_device_finish
>
> Am 06.11.2017 um 10:20 schrieb Gary Sun:
>> remove debugfs file in amdgpu_device_finish
> NAK, the debugfs files are removed automatically by drm_debugfs_cleanup().
>
> So that patch is unnecessary.
>
> Regards,
> Christian.
>
>> Signed-off-by: Gary Sun <[email protected]>
>> ---
>>    drivers/gpu/drm/amd/amdgpu/amdgpu.h        |    1 +
>>    drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |   18 ++++++++++++++++++
>>    2 files changed, 19 insertions(+), 0 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> index 4f919d3..6cfcb5f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
>> @@ -1250,6 +1250,7 @@ struct amdgpu_debugfs {
>>    int amdgpu_debugfs_add_files(struct amdgpu_device *adev,
>>                           const struct drm_info_list *files,
>>                           unsigned nfiles);
>> +int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev);
>>    int amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
>>    
>>    #if defined(CONFIG_DEBUG_FS)
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> index 7b7439f..ee800ab 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
>> @@ -2520,6 +2520,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
>>      amdgpu_doorbell_fini(adev);
>>      amdgpu_pm_sysfs_fini(adev);
>>      amdgpu_debugfs_regs_cleanup(adev);
>> +    amdgpu_debugfs_cleanup_files(adev);
>>    }
>>    
>>    
>> @@ -3304,6 +3305,23 @@ int amdgpu_debugfs_add_files(struct amdgpu_device 
>> *adev,
>>      return 0;
>>    }
>>    
>> +int amdgpu_debugfs_cleanup_files(struct amdgpu_device *adev) {
>> +    unsigned int i;
>> +
>> +    for (i = 0; i < adev->debugfs_count; i++) { #if
>> +defined(CONFIG_DEBUG_FS)
>> +            drm_debugfs_remove_files(adev->debugfs[i].files,
>> +                            adev->debugfs[i].num_files,
>> +                            adev->ddev->primary);
>> +#endif
>> +            adev->debugfs[i].files = NUL;
>> +            adev->debugfs[i].num_files = 0;
>> +    }
>> +    adev->debugfs_count = 0;
>> +    return 0;
>> +}
>> +
>>    #if defined(CONFIG_DEBUG_FS)
>>    
>>    static ssize_t amdgpu_debugfs_regs_read(struct file *f, char 
>> __user *buf,
>

_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to