On 2017年05月16日 16:13, Christian König wrote:
Am 16.05.2017 um 08:44 schrieb Chunming Zhou:
The ioctls below will return -ENODEV:
amdgpu_cs_ioctl
amdgpu_cs_wait_ioctl
amdgpu_cs_wait_fences_ioctl
amdgpu_gem_va_ioctl
amdgpu_info_ioctl

Change-Id: I8970cde3301b7cfeb4263cc0f0e54aece215c98e
Signed-off-by: Chunming Zhou <[email protected]>
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  4 ++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c     |  9 +++++++++
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 +++-
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c    |  2 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c    | 10 ++++++++++
  5 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index f9da215..dcd6203 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -855,6 +855,7 @@ struct amdgpu_fpriv {
      struct amdgpu_ctx_mgr    ctx_mgr;
      spinlock_t        sem_handles_lock;
      struct idr        sem_handles;
+    u32            vram_lost_counter;
  };
    /*
@@ -1607,6 +1608,7 @@ struct amdgpu_device {
      atomic64_t            num_bytes_moved;
      atomic64_t            num_evictions;
      atomic_t            gpu_reset_counter;
+    atomic_t            vram_lost_counter;
        /* data for buffer migration throttling */
      struct {
@@ -2005,6 +2007,8 @@ static inline void amdgpu_unregister_atpx_handler(void) {}
  extern const struct drm_ioctl_desc amdgpu_ioctls_kms[];
  extern const int amdgpu_max_kms_ioctl;
  +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
+              struct amdgpu_fpriv *fpriv);
int amdgpu_driver_load_kms(struct drm_device *dev, unsigned long flags);
  int amdgpu_driver_unload_kms(struct drm_device *dev);
  void amdgpu_driver_lastclose_kms(struct drm_device *dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index b803412..911aa02 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -1097,6 +1097,7 @@ static int amdgpu_cs_submit(struct amdgpu_cs_parser *p, int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
  {
      struct amdgpu_device *adev = dev->dev_private;
+    struct amdgpu_fpriv *fpriv = filp->driver_priv;
      union drm_amdgpu_cs *cs = data;
      struct amdgpu_cs_parser parser = {};
      bool reserved_buffers = false;
@@ -1104,6 +1105,8 @@ int amdgpu_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
        if (!adev->accel_working)
          return -EBUSY;
+    if (amdgpu_kms_vram_lost(adev, fpriv))
+        return -ENODEV;
        parser.adev = adev;
      parser.filp = filp;
@@ -1165,12 +1168,15 @@ int amdgpu_cs_wait_ioctl(struct drm_device *dev, void *data,
  {
      union drm_amdgpu_wait_cs *wait = data;
      struct amdgpu_device *adev = dev->dev_private;
+    struct amdgpu_fpriv *fpriv = filp->driver_priv;
      unsigned long timeout = amdgpu_gem_timeout(wait->in.timeout);
      struct amdgpu_ring *ring = NULL;
      struct amdgpu_ctx *ctx;
      struct fence *fence;
      long r;
  +    if (amdgpu_kms_vram_lost(adev, fpriv))
+        return -ENODEV;
r = amdgpu_cs_get_ring(adev, wait->in.ip_type, wait->in.ip_instance,
                     wait->in.ring, &ring);
      if (r)
@@ -1344,12 +1350,15 @@ int amdgpu_cs_wait_fences_ioctl(struct drm_device *dev, void *data,
                  struct drm_file *filp)
  {
      struct amdgpu_device *adev = dev->dev_private;
+    struct amdgpu_fpriv *fpriv = filp->driver_priv;
      union drm_amdgpu_wait_fences *wait = data;
      uint32_t fence_count = wait->in.fence_count;
      struct drm_amdgpu_fence *fences_user;
      struct drm_amdgpu_fence *fences;
      int r;
  +    if (amdgpu_kms_vram_lost(adev, fpriv))
+        return -ENODEV;
      /* Get the fences from userspace */
fences = kmalloc_array(fence_count, sizeof(struct drm_amdgpu_fence),
              GFP_KERNEL);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 58af9ea..417b8f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2919,8 +2919,10 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
              if (r)
                  goto out;
              vram_lost = amdgpu_check_vram_lost(adev);
-            if (vram_lost)
+            if (vram_lost) {
                  DRM_ERROR("VRAM is lost!\n");
+                atomic_inc(&adev->vram_lost_counter);
+            }
              r = amdgpu_ttm_recover_gart(adev);
              if (r)
                  goto out;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index d8275ef..0f0b736 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -776,6 +776,8 @@ int amdgpu_gem_va_ioctl(struct drm_device *dev, void *data,
        if (!adev->vm_manager.enabled)
          return -ENOTTY;
+    if (amdgpu_kms_vram_lost(adev, fpriv))
+        return -ENODEV;

We should only block AMDGPU_VA_OP_MAP and AMDGPU_VA_OP_REPLACE here and still allow AMDGPU_VA_OP_UNMAP and AMDGPU_VA_OP_CLEAR.

BTW: How should the UMD recover from that situation? Completely close the fd and recreate it?

That might be tricky for processes like X or the Compositor. Should we have an IOCTL to reset the vram_lost counter for an fd?
If there is no NAK, I can add it.

Regards,
David Zhou

Christian.

        if (args->va_address < AMDGPU_VA_RESERVED_SIZE) {
          dev_err(&dev->pdev->dev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 368829a..a231aa1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -235,6 +235,7 @@ static int amdgpu_firmware_info(struct drm_amdgpu_info_firmware *fw_info, static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
  {
      struct amdgpu_device *adev = dev->dev_private;
+    struct amdgpu_fpriv *fpriv = filp->driver_priv;
      struct drm_amdgpu_info *info = data;
      struct amdgpu_mode_info *minfo = &adev->mode_info;
      void __user *out = (void __user *)(uintptr_t)info->return_pointer;
@@ -247,6 +248,8 @@ static int amdgpu_info_ioctl(struct drm_device *dev, void *data, struct drm_file
        if (!info->return_size || !info->return_pointer)
          return -EINVAL;
+    if (amdgpu_kms_vram_lost(adev, fpriv))
+        return -ENODEV;
        switch (info->query) {
      case AMDGPU_INFO_VIRTUAL_RANGE: {
@@ -779,6 +782,12 @@ void amdgpu_driver_lastclose_kms(struct drm_device *dev)
      vga_switcheroo_process_delayed_switch();
  }
  +bool amdgpu_kms_vram_lost(struct amdgpu_device *adev,
+              struct amdgpu_fpriv *fpriv)
+{
+ return fpriv->vram_lost_counter != atomic_read(&adev->vram_lost_counter);
+}
+
  /**
   * amdgpu_driver_open_kms - drm callback for open
   *
@@ -833,6 +842,7 @@ int amdgpu_driver_open_kms(struct drm_device *dev, struct drm_file *file_priv)
        amdgpu_ctx_mgr_init(&fpriv->ctx_mgr);
  +    fpriv->vram_lost_counter = atomic_read(&adev->vram_lost_counter);
      file_priv->driver_priv = fpriv;
    out_suspend:



_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to