Applied.  Thanks!

Alex

On Tue, Mar 24, 2026 at 10:58 PM Kuehling, Felix <[email protected]> wrote:
>
>
> On 2026-03-23 00:28, Donet Tom wrote:
> > For gfxv9, due to a hardware bug (see the comments in the code at
> > [1]), the control stack of a user-mode compute queue must be
> > allocated immediately after the page boundary of its regular MQD buffer.
> > To handle this, we allocate an enlarged MQD buffer where the first page
> > is used as the MQD and the remaining pages store the control stack.
> > Although these regions share the same BO, they require different memory
> > types: the MQD must be UC (uncached), while the control stack must be
> > NC (non-coherent), matching the behavior when the control stack is
> > allocated in user space.
> >
> > This logic works correctly on systems where the CPU page size matches
> > the GPU page size (4K). However, the current implementation aligns both
> > the MQD and the control stack to the CPU PAGE_SIZE. On systems with a
> > larger CPU page size, the entire first CPU page is marked UC—even though
> > that page may contain multiple GPU pages. The GPU treats the second 4K
> > GPU page inside that CPU page as part of the control stack, but it is
> > incorrectly mapped as UC.
> >
> > This patch fixes the issue by aligning both the MQD and control stack
> > sizes to the GPU page size (4K). The first 4K page is correctly marked
> > as UC for the MQD, and the remaining GPU pages are marked NC for the
> > control stack. This ensures proper memory type assignment on systems
> > with larger CPU page sizes.
> >
> > [1]: 
> > https://elixir.bootlin.com/linux/v6.18/source/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c#L118
> >
> > Signed-off-by: Donet Tom <[email protected]>
>
> Acked-by: Felix Kuehling <[email protected]>
>
>
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c      | 44 +++++++++++++++++++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h      |  2 +
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c       | 16 ++-----
> >   .../gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c   | 23 ++++++----
> >   4 files changed, 64 insertions(+), 21 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> > index ec911dce345f..4d884180cf61 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
> > @@ -403,6 +403,50 @@ void amdgpu_gart_map_vram_range(struct amdgpu_device 
> > *adev, uint64_t pa,
> >       drm_dev_exit(idx);
> >   }
> >
> > +/**
> > + * amdgpu_gart_map_gfx9_mqd - map mqd and ctrl_stack dma_addresses into 
> > GART entries
> > + *
> > + * @adev: amdgpu_device pointer
> > + * @offset: offset into the GPU's gart aperture
> > + * @pages: number of pages to bind
> > + * @dma_addr: DMA addresses of pages
> > + * @flags: page table entry flags
> > + *
> > + * Map the MQD and control stack addresses into GART entries with the 
> > correct
> > + * memory types on gfxv9. The MQD occupies the first 4KB and is followed by
> > + * the control stack. The MQD uses UC (uncached) memory, while the control 
> > stack
> > + * uses NC (non-coherent) memory.
> > + */
> > +void amdgpu_gart_map_gfx9_mqd(struct amdgpu_device *adev, uint64_t offset,
> > +                     int pages, dma_addr_t *dma_addr, uint64_t flags)
> > +{
> > +     uint64_t page_base;
> > +     unsigned int i, j, t;
> > +     int idx;
> > +     uint64_t ctrl_flags = AMDGPU_PTE_MTYPE_VG10(flags, AMDGPU_MTYPE_NC);
> > +     void *dst;
> > +
> > +     if (!adev->gart.ptr)
> > +             return;
> > +
> > +     if (!drm_dev_enter(adev_to_drm(adev), &idx))
> > +             return;
> > +
> > +     t = offset / AMDGPU_GPU_PAGE_SIZE;
> > +     dst = adev->gart.ptr;
> > +     for (i = 0; i < pages; i++) {
> > +             page_base = dma_addr[i];
> > +             for (j = 0; j < AMDGPU_GPU_PAGES_IN_CPU_PAGE; j++, t++) {
> > +                     if ((i == 0) && (j == 0))
> > +                             amdgpu_gmc_set_pte_pde(adev, dst, t, 
> > page_base, flags);
> > +                     else
> > +                             amdgpu_gmc_set_pte_pde(adev, dst, t, 
> > page_base, ctrl_flags);
> > +                     page_base += AMDGPU_GPU_PAGE_SIZE;
> > +             }
> > +     }
> > +     drm_dev_exit(idx);
> > +}
> > +
> >   /**
> >    * amdgpu_gart_bind - bind pages into the gart page table
> >    *
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> > index d3118275ddae..6ebd2da32ea6 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.h
> > @@ -62,6 +62,8 @@ void amdgpu_gart_unbind(struct amdgpu_device *adev, 
> > uint64_t offset,
> >   void amdgpu_gart_map(struct amdgpu_device *adev, uint64_t offset,
> >                    int pages, dma_addr_t *dma_addr, uint64_t flags,
> >                    void *dst);
> > +void amdgpu_gart_map_gfx9_mqd(struct amdgpu_device *adev, uint64_t offset,
> > +                     int pages, dma_addr_t *dma_addr, uint64_t flags);
> >   void amdgpu_gart_bind(struct amdgpu_device *adev, uint64_t offset,
> >                     int pages, dma_addr_t *dma_addr, uint64_t flags);
> >   void amdgpu_gart_map_vram_range(struct amdgpu_device *adev, uint64_t pa,
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > index 67983955a124..e086eb1d2b24 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > @@ -855,25 +855,15 @@ static void amdgpu_ttm_gart_bind_gfx9_mqd(struct 
> > amdgpu_device *adev,
> >       int num_xcc = max(1U, adev->gfx.num_xcc_per_xcp);
> >       uint64_t page_idx, pages_per_xcc;
> >       int i;
> > -     uint64_t ctrl_flags = AMDGPU_PTE_MTYPE_VG10(flags, AMDGPU_MTYPE_NC);
> >
> >       pages_per_xcc = total_pages;
> >       do_div(pages_per_xcc, num_xcc);
> >
> >       for (i = 0, page_idx = 0; i < num_xcc; i++, page_idx += 
> > pages_per_xcc) {
> > -             /* MQD page: use default flags */
> > -             amdgpu_gart_bind(adev,
> > +             amdgpu_gart_map_gfx9_mqd(adev,
> >                               gtt->offset + (page_idx << PAGE_SHIFT),
> > -                             1, &gtt->ttm.dma_address[page_idx], flags);
> > -             /*
> > -              * Ctrl pages - modify the memory type to NC (ctrl_flags) from
> > -              * the second page of the BO onward.
> > -              */
> > -             amdgpu_gart_bind(adev,
> > -                             gtt->offset + ((page_idx + 1) << PAGE_SHIFT),
> > -                             pages_per_xcc - 1,
> > -                             &gtt->ttm.dma_address[page_idx + 1],
> > -                             ctrl_flags);
> > +                             pages_per_xcc, 
> > &gtt->ttm.dma_address[page_idx],
> > +                             flags);
> >       }
> >   }
> >
> > diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c 
> > b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > index dcf4bbfa641b..ff0e483514da 100644
> > --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c
> > @@ -42,9 +42,16 @@ static uint64_t mqd_stride_v9(struct mqd_manager *mm,
> >                               struct queue_properties *q)
> >   {
> >       if (mm->dev->kfd->cwsr_enabled &&
> > -         q->type == KFD_QUEUE_TYPE_COMPUTE)
> > -             return ALIGN(q->ctl_stack_size, PAGE_SIZE) +
> > -                     ALIGN(sizeof(struct v9_mqd), PAGE_SIZE);
> > +         q->type == KFD_QUEUE_TYPE_COMPUTE) {
> > +
> > +             /* On gfxv9, the MQD resides in the first 4K page,
> > +              * followed by the control stack. Align both to
> > +              * AMDGPU_GPU_PAGE_SIZE to maintain the required 4K boundary.
> > +              */
> > +
> > +             return ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) +
> > +                     ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), 
> > PAGE_SIZE);
> > +     }
> >
> >       return mm->mqd_size;
> >   }
> > @@ -148,8 +155,8 @@ static struct kfd_mem_obj *allocate_mqd(struct 
> > mqd_manager *mm,
> >               if (!mqd_mem_obj)
> >                       return NULL;
> >               retval = amdgpu_amdkfd_alloc_kernel_mem(node->adev,
> > -                     (ALIGN(q->ctl_stack_size, PAGE_SIZE) +
> > -                     ALIGN(sizeof(struct v9_mqd), PAGE_SIZE)) *
> > +                     (ALIGN(ALIGN(q->ctl_stack_size, AMDGPU_GPU_PAGE_SIZE) 
> > +
> > +                     ALIGN(sizeof(struct v9_mqd), AMDGPU_GPU_PAGE_SIZE), 
> > PAGE_SIZE)) *
> >                       NUM_XCC(node->xcc_mask),
> >                       mqd_on_vram(node->adev) ? AMDGPU_GEM_DOMAIN_VRAM :
> >                                                 AMDGPU_GEM_DOMAIN_GTT,
> > @@ -357,7 +364,7 @@ static int get_wave_state(struct mqd_manager *mm, void 
> > *mqd,
> >       struct kfd_context_save_area_header header;
> >
> >       /* Control stack is located one page after MQD. */
> > -     void *mqd_ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
> > +     void *mqd_ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE);
> >
> >       m = get_mqd(mqd);
> >
> > @@ -394,7 +401,7 @@ static void checkpoint_mqd(struct mqd_manager *mm, void 
> > *mqd, void *mqd_dst, voi
> >   {
> >       struct v9_mqd *m;
> >       /* Control stack is located one page after MQD. */
> > -     void *ctl_stack = (void *)((uintptr_t)mqd + PAGE_SIZE);
> > +     void *ctl_stack = (void *)((uintptr_t)mqd + AMDGPU_GPU_PAGE_SIZE);
> >
> >       m = get_mqd(mqd);
> >
> > @@ -440,7 +447,7 @@ static void restore_mqd(struct mqd_manager *mm, void 
> > **mqd,
> >               *gart_addr = addr;
> >
> >       /* Control stack is located one page after MQD. */
> > -     ctl_stack = (void *)((uintptr_t)*mqd + PAGE_SIZE);
> > +     ctl_stack = (void *)((uintptr_t)*mqd + AMDGPU_GPU_PAGE_SIZE);
> >       memcpy(ctl_stack, ctl_stack_src, ctl_stack_size);
> >
> >       m->cp_hqd_pq_doorbell_control =

Reply via email to