On Fri, Oct 24, 2025 at 5:45 AM Jesse.Zhang <[email protected]> wrote:
>
> This commit adds support for tracking and exposing the reset capabilities
> of user mode queues across different IP blocks (GFX, Compute, SDMA).
>
> These changes allow userspace to query the reset capabilities of user
> mode queues and ensure reset operations are only attempted when supported
> by the hardware and driver.
>
> Suggested-by: Alex Deucher <[email protected]>
> Signed-off-by: Jesse Zhang <[email protected]>
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h        |  3 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 17 ++++++++-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c    | 44 ++++++++++++++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c   | 21 +++++++++++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c  | 13 +++++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c     | 17 +++++++++
>  drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c     | 12 ++++++
>  drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c     | 34 ++++++++++-------
>  drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c     | 24 ++++++++----
>  9 files changed, 163 insertions(+), 22 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index d0fb4eb1d7c4..48b21863065e 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -1182,6 +1182,7 @@ struct amdgpu_device {
>          * Value: struct amdgpu_usermode_queue
>          */
>         struct xarray userq_doorbell_xa;
> +       u32 userq_supported_reset[AMDGPU_RING_TYPE_MAX];

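I don't think we need a separate userq_supported_reset array.  Just
use the existing per-IP reset masks (adev->gfx.gfx_supported_reset,
adev->gfx.compute_supported_reset and adev->sdma.supported_reset).  The
same reset functionality is used for both kernel queues and user queues,
so I don't see a reason to have a separate tracker.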

Alex

>
>         /* df */
>         struct amdgpu_df                df;
> @@ -1612,6 +1613,8 @@ struct dma_fence 
> *amdgpu_device_enforce_isolation(struct amdgpu_device *adev,
>                                                   struct amdgpu_ring *ring,
>                                                   struct amdgpu_job *job);
>  bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev);
> +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev,
> +                                   int ring_type);
>  ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring);
>  ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset);
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 8480b72258f2..a0064c5314df 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -7649,7 +7649,8 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct 
> amdgpu_ring *ring)
>         if (!ring || !ring->adev)
>                 return size;
>
> -       if (amdgpu_device_should_recover_gpu(ring->adev))
> +       if (amdgpu_device_should_recover_gpu(ring->adev) &&
> +           unlikely(!ring->adev->debug_disable_gpu_ring_reset))
>                 size |= AMDGPU_RESET_TYPE_FULL;
>
>         if (unlikely(!ring->adev->debug_disable_soft_recovery) &&
> @@ -7659,6 +7660,20 @@ ssize_t amdgpu_get_soft_full_reset_mask(struct 
> amdgpu_ring *ring)
>         return size;
>  }
>
> +ssize_t amdgpu_userq_get_full_reset_mask(struct amdgpu_device *adev, int 
> ring_type)
> +{
> +       ssize_t size = 0;
> +
> +       if (!adev || !adev->userq_funcs[ring_type])
> +               return size;
> +
> +       if (amdgpu_device_should_recover_gpu(adev) &&
> +           unlikely(!adev->debug_disable_gpu_ring_reset))
> +               size |= AMDGPU_RESET_TYPE_FULL;
> +
> +       return size;
> +}
> +
>  ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset)
>  {
>         ssize_t size = 0;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> index 3d24f9cd750a..5597753ec61a 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
> @@ -1826,6 +1826,32 @@ static ssize_t 
> amdgpu_gfx_get_compute_reset_mask(struct device *dev,
>         return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset);
>  }
>
> +static ssize_t amdgpu_userq_get_gfx_reset_mask(struct device *dev,
> +                                               struct device_attribute *attr,
> +                                               char *buf)
> +{
> +       struct drm_device *ddev = dev_get_drvdata(dev);
> +       struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> +       if (!adev)
> +               return -ENODEV;
> +
> +       return amdgpu_show_reset_mask(buf, 
> adev->userq_supported_reset[AMDGPU_HW_IP_GFX]);
> +}
> +
> +static ssize_t amdgpu_userq_get_compute_reset_mask(struct device *dev,
> +                                               struct device_attribute *attr,
> +                                               char *buf)
> +{
> +       struct drm_device *ddev = dev_get_drvdata(dev);
> +       struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> +       if (!adev)
> +               return -ENODEV;
> +
> +       return amdgpu_show_reset_mask(buf, 
> adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE]);
> +}
> +
>  static DEVICE_ATTR(run_cleaner_shader, 0200,
>                    NULL, amdgpu_gfx_set_run_cleaner_shader);
>
> @@ -1845,6 +1871,12 @@ static DEVICE_ATTR(gfx_reset_mask, 0444,
>  static DEVICE_ATTR(compute_reset_mask, 0444,
>                    amdgpu_gfx_get_compute_reset_mask, NULL);
>
> +static DEVICE_ATTR(gfx_userq_reset_mask, 0444,
> +                  amdgpu_userq_get_gfx_reset_mask, NULL);
> +
> +static DEVICE_ATTR(compute_userq_reset_mask, 0444,
> +                  amdgpu_userq_get_compute_reset_mask, NULL);
> +
>  static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev)
>  {
>         struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr;
> @@ -1928,6 +1960,18 @@ static int amdgpu_gfx_sysfs_reset_mask_init(struct 
> amdgpu_device *adev)
>                         return r;
>         }
>
> +       if (adev->userq_funcs[AMDGPU_HW_IP_GFX]) {
> +               r = device_create_file(adev->dev, 
> &dev_attr_gfx_userq_reset_mask);
> +               if (r)
> +                       return r;
> +       }
> +
> +       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]) {
> +               r = device_create_file(adev->dev, 
> &dev_attr_compute_userq_reset_mask);
> +               if (r)
> +                       return r;
> +       }
> +
>         return r;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 8b8a04138711..2fb288b2bfc4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -471,6 +471,21 @@ static ssize_t amdgpu_get_sdma_reset_mask(struct device 
> *dev,
>  static DEVICE_ATTR(sdma_reset_mask, 0444,
>                    amdgpu_get_sdma_reset_mask, NULL);
>
> +static ssize_t amdgpu_get_sdma_userq_reset_mask(struct device *dev,
> +                                               struct device_attribute *attr,
> +                                               char *buf)
> +{
> +       struct drm_device *ddev = dev_get_drvdata(dev);
> +       struct amdgpu_device *adev = drm_to_adev(ddev);
> +
> +       if (!adev)
> +               return -ENODEV;
> +
> +       return amdgpu_show_reset_mask(buf, 
> adev->userq_supported_reset[AMDGPU_HW_IP_DMA]);
> +}
> +static DEVICE_ATTR(sdma_userq_reset_mask, 0444,
> +                  amdgpu_get_sdma_userq_reset_mask, NULL);
> +
>  int amdgpu_sdma_sysfs_reset_mask_init(struct amdgpu_device *adev)
>  {
>         int r = 0;
> @@ -484,6 +499,12 @@ int amdgpu_sdma_sysfs_reset_mask_init(struct 
> amdgpu_device *adev)
>                         return r;
>         }
>
> +       if (adev->userq_funcs[AMDGPU_HW_IP_DMA]) {
> +               r = device_create_file(adev->dev, 
> &dev_attr_sdma_userq_reset_mask);
> +               if (r)
> +                       return r;
> +       }
> +
>         return r;
>  }
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> index 188de848c229..15ae72e2d679 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
> @@ -47,6 +47,16 @@ u32 amdgpu_userq_get_supported_ip_mask(struct 
> amdgpu_device *adev)
>         return userq_ip_mask;
>  }
>
> +bool amdgpu_userq_is_reset_type_supported(struct amdgpu_device *adev,
> +                                         int ring_type,
> +                                         int reset_type)
> +{
> +    if (ring_type < 0 || ring_type >= AMDGPU_RING_TYPE_MAX)
> +        return false;
> +
> +    return (adev->userq_supported_reset[ring_type] & reset_type) != 0;
> +}
> +
>  static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
>  {
>         if (amdgpu_device_should_recover_gpu(adev)) {
> @@ -94,6 +104,9 @@ amdgpu_userq_detect_and_reset_queues(struct 
> amdgpu_userq_mgr *uq_mgr)
>                 int ring_type = queue_types[i];
>                 const struct amdgpu_userq_funcs *funcs = 
> adev->userq_funcs[ring_type];
>
> +               if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, 
> AMDGPU_RESET_TYPE_PER_QUEUE))
> +                               continue;
> +
>                 if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
>                     funcs && funcs->detect_and_reset) {
>                         r = funcs->detect_and_reset(adev, ring_type);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> index 252517ce5d5a..82b7c365d720 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> @@ -1815,6 +1815,11 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block 
> *ip_block)
>                 amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>         adev->gfx.compute_supported_reset =
>                 amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> +                       amdgpu_userq_get_full_reset_mask(adev, 
> AMDGPU_HW_IP_GFX);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> +                       
> amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> +
>         switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
>         case IP_VERSION(11, 0, 0):
>         case IP_VERSION(11, 0, 2):
> @@ -1824,12 +1829,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block 
> *ip_block)
>                     !amdgpu_sriov_vf(adev)) {
>                         adev->gfx.compute_supported_reset |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
>                         adev->gfx.gfx_supported_reset |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> +                           
> adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
> +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] 
> |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> +                           
> adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
> +                               
> adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
>                 }
>                 break;
>         default:
>                 if (!amdgpu_sriov_vf(adev)) {
>                         adev->gfx.compute_supported_reset |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
>                         adev->gfx.gfx_supported_reset |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> +                           
> adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
> +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] 
> |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> +                           
> adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
> +                               
> adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
>                 }
>                 break;
>         }
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> index 35d5a7e99a7c..c5ac42a30789 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
> @@ -1543,6 +1543,11 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block 
> *ip_block)
>                 amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]);
>         adev->gfx.compute_supported_reset =
>                 amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_GFX] =
> +               amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_GFX);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] =
> +               amdgpu_userq_get_full_reset_mask(adev,AMDGPU_HW_IP_COMPUTE);
> +
>         switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
>         case IP_VERSION(12, 0, 0):
>         case IP_VERSION(12, 0, 1):
> @@ -1551,6 +1556,13 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block 
> *ip_block)
>                     !amdgpu_sriov_vf(adev)) {
>                         adev->gfx.compute_supported_reset |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
>                         adev->gfx.gfx_supported_reset |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_GFX] &&
> +                           
> adev->userq_funcs[AMDGPU_HW_IP_GFX]->detect_and_reset)
> +                               adev->userq_supported_reset[AMDGPU_HW_IP_GFX] 
> |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_COMPUTE] &&
> +                           
> adev->userq_funcs[AMDGPU_HW_IP_COMPUTE]->detect_and_reset)
> +                               
> adev->userq_supported_reset[AMDGPU_HW_IP_COMPUTE] |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
> +
>                 }
>                 break;
>         default:
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> index db6e41967f12..8850eaf8d2c4 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
> @@ -1349,19 +1349,6 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block 
> *ip_block)
>                         return r;
>         }
>
> -       adev->sdma.supported_reset =
> -               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> -       switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> -       case IP_VERSION(6, 0, 0):
> -       case IP_VERSION(6, 0, 2):
> -       case IP_VERSION(6, 0, 3):
> -               if ((adev->sdma.instance[0].fw_version >= 21) &&
> -                   !amdgpu_sriov_vf(adev))
> -                       adev->sdma.supported_reset |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
> -               break;
> -       default:
> -               break;
> -       }
>
>         if (amdgpu_sdma_ras_sw_init(adev)) {
>                 dev_err(adev->dev, "Failed to initialize sdma ras block!\n");
> @@ -1412,6 +1399,27 @@ static int sdma_v6_0_sw_init(struct amdgpu_ip_block 
> *ip_block)
>                 break;
>         }
>
> +       adev->sdma.supported_reset =
> +               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> +               amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA);
> +
> +       switch (amdgpu_ip_version(adev, SDMA0_HWIP, 0)) {
> +       case IP_VERSION(6, 0, 0):
> +       case IP_VERSION(6, 0, 2):
> +       case IP_VERSION(6, 0, 3):
> +               if ((adev->sdma.instance[0].fw_version >= 21) &&
> +                   !amdgpu_sriov_vf(adev)) {
> +                       adev->sdma.supported_reset |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
> +                       if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> +                           
> adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
> +                               adev->userq_supported_reset[AMDGPU_HW_IP_DMA] 
> |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +
> +               }
> +               break;
> +       default:
> +               break;
> +       }
>         r = amdgpu_sdma_sysfs_reset_mask_init(adev);
>         if (r)
>                 return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c 
> b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> index 326ecc8d37d2..9de46ac8b1db 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
> @@ -1335,14 +1335,6 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block 
> *ip_block)
>                         return r;
>         }
>
> -       adev->sdma.supported_reset =
> -               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> -       if (!amdgpu_sriov_vf(adev))
> -               adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> -
> -       r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> -       if (r)
> -               return r;
>         /* Allocate memory for SDMA IP Dump buffer */
>         ptr = kcalloc(adev->sdma.num_instances * reg_count, sizeof(uint32_t), 
> GFP_KERNEL);
>         if (ptr)
> @@ -1360,6 +1352,22 @@ static int sdma_v7_0_sw_init(struct amdgpu_ip_block 
> *ip_block)
>                 break;
>         }
>
> +       adev->sdma.supported_reset =
> +               amdgpu_get_soft_full_reset_mask(&adev->sdma.instance[0].ring);
> +       adev->userq_supported_reset[AMDGPU_HW_IP_DMA] =
> +               amdgpu_userq_get_full_reset_mask(adev, AMDGPU_HW_IP_DMA);
> +
> +       if (!amdgpu_sriov_vf(adev)) {
> +               adev->sdma.supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE;
> +               if (adev->userq_funcs[AMDGPU_HW_IP_DMA] &&
> +                   adev->userq_funcs[AMDGPU_HW_IP_DMA]->detect_and_reset)
> +                       adev->userq_supported_reset[AMDGPU_HW_IP_DMA] |= 
> AMDGPU_RESET_TYPE_PER_QUEUE;
> +
> +       }
> +       r = amdgpu_sdma_sysfs_reset_mask_init(adev);
> +       if (r)
> +               return r;
> +
>         return r;
>  }
>
> --
> 2.49.0
>

Reply via email to