On 2017-10-23 06:03 AM, Pixel Ding wrote:
From: pding <pixel.d...@amd.com>

Exclusive mode has a real-time limit in practice, e.g. it must complete
within 300ms. Timeouts are easily observed when running many VFs/VMs on a
single host under heavy CPU workload.

If init fails because of an exclusive mode timeout, retry it.

Signed-off-by: pding <pixel.d...@amd.com>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 10 ++++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 15 +++++++++++++--
  2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3458d46..1935f5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2306,6 +2306,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
r = amdgpu_init(adev);
        if (r) {
+               /* failed in exclusive mode due to timeout */
+               if (amdgpu_sriov_vf(adev) &&
+                   !amdgpu_sriov_runtime(adev) &&
+                   amdgpu_virt_mmio_blocked(adev) &&
+                   !amdgpu_virt_wait_reset(adev)) {
+                       dev_err(adev->dev, "VF exclusive mode timeout\n");
+                       r = -EAGAIN;
+                       goto failed;
+               }
                dev_err(adev->dev, "amdgpu_init failed\n");
                amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 
0);
                amdgpu_fini(adev);
@@ -2393,6 +2402,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        amdgpu_vf_error_trans_all(adev);
        if (runtime)
                vga_switcheroo_fini_domain_pm_ops(adev->dev);
+
        return r;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index f2eb7ac..fdc240a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -86,7 +86,7 @@ void amdgpu_driver_unload_kms(struct drm_device *dev)
  int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
  {
        struct amdgpu_device *adev;
-       int r, acpi_status;
+       int r, acpi_status, retry = 0;
#ifdef CONFIG_DRM_AMDGPU_SI
        if (!amdgpu_si_support) {
@@ -122,6 +122,7 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
                }
        }
  #endif
+retry_init:
adev = kzalloc(sizeof(struct amdgpu_device), GFP_KERNEL);
        if (adev == NULL) {
@@ -144,7 +145,17 @@ int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags)
         * VRAM allocation
         */
        r = amdgpu_device_init(adev, dev, dev->pdev, flags);
-       if (r) {
+       if (++retry != 3 && r == -EAGAIN) {

Minor nitpick here. Might want to rewrite the condition so that it evaluates to false for most values of retry (currently it evaluates to false only for one value of retry).

E.g. if (++retry < 3 ...)

Or

int retry = 3;
...
if (--retry >= 0 ...)

+               adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
+               adev->virt.ops = NULL;
+               amdgpu_device_fini(adev);
+               kfree(adev);
+               dev->dev_private = NULL;
+               msleep(5000);
+               dev_err(&dev->pdev->dev, "retry init %d\n", retry);
+               amdgpu_init_log = 0;
+               goto retry_init;
+       } else if (r) {
                dev_err(&dev->pdev->dev, "Fatal error during GPU init\n");
                goto out;
        }

_______________________________________________
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to