Re: [PATCH v4] drm/nouveau: use tile_mode and pte_kind for VM_BIND bo allocations

2024-05-09 Thread Faith Ekstrand
On Thu, May 9, 2024 at 3:44 PM Mohamed Ahmed <
mohamedahmedegypt2...@gmail.com> wrote:

> Allows PTE kind and tile mode on BO create with VM_BIND,
> and adds a GETPARAM to indicate this change. This is needed to support
> modifiers in NVK and ensure correctness when dealing with the nouveau
> GL driver.
>
> The userspace modifiers implementation this is for can be found here:
> https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24795
>
> Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI")
> Signed-off-by: Mohamed Ahmed 
>

Reviewed-by: Faith Ekstrand 


> ---
>  drivers/gpu/drm/nouveau/nouveau_abi16.c |  3 ++
>  drivers/gpu/drm/nouveau/nouveau_bo.c| 44 +++--
>  include/uapi/drm/nouveau_drm.h  |  7 
>  3 files changed, 29 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> index 80f74ee0f..47e53e17b 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> @@ -272,6 +272,9 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
> getparam->value =
> (u64)ttm_resource_manager_usage(vram_mgr);
> break;
> }
> +   case NOUVEAU_GETPARAM_HAS_VMA_TILEMODE:
> +   getparam->value = 1;
> +   break;
> default:
> NV_PRINTK(dbg, cli, "unknown parameter %lld\n",
> getparam->param);
> return -EINVAL;
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c
> b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index db8cbf615..186add400 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -241,28 +241,28 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size,
> int *align, u32 domain,
> }
>
> nvbo->contig = !(tile_flags & NOUVEAU_GEM_TILE_NONCONTIG);
> -   if (!nouveau_cli_uvmm(cli) || internal) {
> -   /* for BO noVM allocs, don't assign kinds */
> -   if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) {
> -   nvbo->kind = (tile_flags & 0xff00) >> 8;
> -   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> -   kfree(nvbo);
> -   return ERR_PTR(-EINVAL);
> -   }
>
> -   nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind;
> -   } else if (cli->device.info.family >=
> NV_DEVICE_INFO_V0_TESLA) {
> -   nvbo->kind = (tile_flags & 0x7f00) >> 8;
> -   nvbo->comp = (tile_flags & 0x00030000) >> 16;
> -   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> -   kfree(nvbo);
> -   return ERR_PTR(-EINVAL);
> -   }
> -   } else {
> -   nvbo->zeta = (tile_flags & 0x0007);
> +   if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) {
> +   nvbo->kind = (tile_flags & 0xff00) >> 8;
> +   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> +   kfree(nvbo);
> +   return ERR_PTR(-EINVAL);
> +   }
> +
> +   nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind;
> +   } else if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) {
> +   nvbo->kind = (tile_flags & 0x7f00) >> 8;
> +   nvbo->comp = (tile_flags & 0x00030000) >> 16;
> +   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> +   kfree(nvbo);
> +   return ERR_PTR(-EINVAL);
> }
> -   nvbo->mode = tile_mode;
> +   } else {
> +   nvbo->zeta = (tile_flags & 0x0007);
> +   }
> +   nvbo->mode = tile_mode;
>
> +   if (!nouveau_cli_uvmm(cli) || internal) {
> /* Determine the desirable target GPU page size for the
> buffer. */
> for (i = 0; i < vmm->page_nr; i++) {
> /* Because we cannot currently allow VMM maps to
> fail
> @@ -304,12 +304,6 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size,
> int *align, u32 domain,
> }
> nvbo->page = vmm->page[pi].shift;
> } else {
> -   /* reject other tile flags when in VM mode. */
> -   if (tile_mode)
> -   
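For reference, a minimal userspace sketch (not part of the patch) of how a UMD
might probe the new getparam before relying on tile_mode/pte_kind at BO create
time. The include paths and the fallback behaviour on older kernels are
assumptions:

#include <stdbool.h>
#include <xf86drm.h>
#include <drm/nouveau_drm.h>

/* Sketch: returns true if the kernel honours tile_mode/kind on VM_BIND BOs.
 * On kernels without the parameter the GETPARAM ioctl is assumed to fail,
 * so the feature is treated as absent. */
static bool nouveau_has_vma_tilemode(int fd)
{
	struct drm_nouveau_getparam gp = {
		.param = NOUVEAU_GETPARAM_HAS_VMA_TILEMODE,
	};

	if (drmIoctl(fd, DRM_IOCTL_NOUVEAU_GETPARAM, &gp) != 0)
		return false;

	return gp.value != 0;
}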

Re: [PATCH v3] drm/nouveau: use tile_mode and pte_kind for VM_BIND bo allocations

2024-05-08 Thread Faith Ekstrand
On Wed, May 8, 2024 at 6:06 PM Mohamed Ahmed <
mohamedahmedegypt2...@gmail.com> wrote:

> Allows PTE kind and tile mode on BO create with VM_BIND,
> and adds a GETPARAM to indicate this change. This is needed to support
> modifiers in NVK and ensure correctness when dealing with the nouveau
> GL driver.
>
> The userspace modifiers implementation this is for can be found here:
> https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28843
>
> Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI")
> Signed-off-by: Mohamed Ahmed 
> ---
>  drivers/gpu/drm/nouveau/nouveau_abi16.c |  3 ++
>  drivers/gpu/drm/nouveau/nouveau_bo.c| 45 +++--
>  include/uapi/drm/nouveau_drm.h  |  7 
>  3 files changed, 30 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> index 80f74ee0f..47e53e17b 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> @@ -272,6 +272,9 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
> getparam->value =
> (u64)ttm_resource_manager_usage(vram_mgr);
> break;
> }
> +   case NOUVEAU_GETPARAM_HAS_VMA_TILEMODE:
> +   getparam->value = 1;
> +   break;
> default:
> NV_PRINTK(dbg, cli, "unknown parameter %lld\n",
> getparam->param);
> return -EINVAL;
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c
> b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index db8cbf615..583c962ef 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -241,28 +241,29 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size,
> int *align, u32 domain,
> }
>
> nvbo->contig = !(tile_flags & NOUVEAU_GEM_TILE_NONCONTIG);
> -   if (!nouveau_cli_uvmm(cli) || internal) {
> -   /* for BO noVM allocs, don't assign kinds */
> -   if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) {
> -   nvbo->kind = (tile_flags & 0xff00) >> 8;
> -   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> -   kfree(nvbo);
> -   return ERR_PTR(-EINVAL);
> -   }
>
> -   nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind;
> -   } else if (cli->device.info.family >=
> NV_DEVICE_INFO_V0_TESLA) {
> -   nvbo->kind = (tile_flags & 0x7f00) >> 8;
> -   nvbo->comp = (tile_flags & 0x00030000) >> 16;
> -   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> -   kfree(nvbo);
> -   return ERR_PTR(-EINVAL);
> -   }
> -   } else {
> -   nvbo->zeta = (tile_flags & 0x0007);
> +   /* for BO allocs, don't assign kinds */
>

I think this comment is stale. We're now assigning them in both cases.


> +   if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) {
> +   nvbo->kind = (tile_flags & 0xff00) >> 8;
> +   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> +   kfree(nvbo);
> +   return ERR_PTR(-EINVAL);
> +   }
> +
> +   nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind;
> +   } else if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) {
> +   nvbo->kind = (tile_flags & 0x7f00) >> 8;
> +   nvbo->comp = (tile_flags & 0x00030000) >> 16;
> +   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> +   kfree(nvbo);
> +   return ERR_PTR(-EINVAL);
> }
> -   nvbo->mode = tile_mode;
> +   } else {
> +   nvbo->zeta = (tile_flags & 0x0007);
> +   }
> +   nvbo->mode = tile_mode;
>
> +   if (!nouveau_cli_uvmm(cli) || internal) {
> /* Determine the desirable target GPU page size for the
> buffer. */
> for (i = 0; i < vmm->page_nr; i++) {
> /* Because we cannot currently allow VMM maps to
> fail
> @@ -304,12 +305,6 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size,
> int *align, u32 domain,
> }
> nvbo->page = vmm->page[pi].shift;
> } else {
> -   /* reject other tile flags when in VM mode. */
> -   if (tile_mode)
> -   return ERR_PTR(-EINVAL);
> -   if (tile_flags & ~NOUVEAU_GEM_TILE_NONCONTIG)
> -   return ERR_PTR(-EINVAL);
> -
> /* Determine the desirable target GPU page size for the
> buffer. */
> for (i = 0; i < vmm->page_nr; i++) {
> /* Because we cannot currently allow VMM maps to
> fail
> diff --git a/include/uapi/drm/nouveau_drm.h
> 

Re: [Nouveau] [PATCH 3/3] drm/nouveau: exec: report max pushs through getparam

2023-09-29 Thread Faith Ekstrand
On Thu, 2023-09-28 at 11:12 +1000, Dave Airlie wrote:
> On Thu, 28 Sept 2023 at 07:55, Faith Ekstrand
>  wrote:
> > 
> > On Wed, 2023-09-27 at 03:22 +0200, Danilo Krummrich wrote:
> > > Report the maximum number of IBs that can be pushed with a single
> > > DRM_IOCTL_NOUVEAU_EXEC through DRM_IOCTL_NOUVEAU_GETPARAM.
> > > 
> > > While the maximum number of IBs per ring might vary between
> > > chipsets,
> > > the kernel will make sure that userspace can only push a fraction
> > > of
> > > the
> > > maximum number of IBs per ring per job, such that we avoid a
> > > situation
> > > where there's only a single job occupying the ring, which could
> > > potentially lead to the ring run dry.
> > > 
> > > Using DRM_IOCTL_NOUVEAU_GETPARAM to report the maximum number of
> > > IBs
> > > that can be pushed with a single DRM_IOCTL_NOUVEAU_EXEC implies
> > > that
> > > all channels of a given device have the same ring size.
> > 
> > There's a bunch of nouveau kernel details I don't know here but the
> > interface looks good and I prefer it to a #define in the header.
> > 
> > Acked-by: Faith Ekstrand 
> 
> For the series
> 
> Reviewed-by: Dave Airlie 
> 
> we should probably land this in drm-misc-fixes, since it would be
> good
> to have in 6.6

Agreed.  My Mesa patch should handle both the case where we have the
getparam and the case where we don't.  However, I'd rather just make it part of
the new UAPI from the start and have a hard requirement on it since it
may reduce the current maximum in the header.

~Faith
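As a concrete illustration of handling both cases, a hedged userspace sketch
(the fallback constant is an assumption, not Mesa's actual value):

#include <stdint.h>
#include <xf86drm.h>
#include <drm/nouveau_drm.h>

/* Assumed conservative fallback for kernels that predate the getparam. */
#define ASSUMED_EXEC_PUSH_MAX 64

/* Sketch: query how many pushes a single DRM_IOCTL_NOUVEAU_EXEC may carry. */
static uint32_t nouveau_exec_push_max(int fd)
{
	struct drm_nouveau_getparam gp = {
		.param = NOUVEAU_GETPARAM_EXEC_PUSH_MAX,
	};

	if (drmIoctl(fd, DRM_IOCTL_NOUVEAU_GETPARAM, &gp) != 0)
		return ASSUMED_EXEC_PUSH_MAX;

	return (uint32_t)gp.value;
}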


> Dave.
> 
> > 
> > 
> > > Signed-off-by: Danilo Krummrich 
> > > ---
> > >  drivers/gpu/drm/nouveau/nouveau_abi16.c | 19 +++
> > >  drivers/gpu/drm/nouveau/nouveau_chan.c  |  2 +-
> > >  drivers/gpu/drm/nouveau/nouveau_dma.h   |  3 +++
> > >  drivers/gpu/drm/nouveau/nouveau_exec.c  |  7 ---
> > >  drivers/gpu/drm/nouveau/nouveau_exec.h  |  5 +
> > >  include/uapi/drm/nouveau_drm.h  | 10 ++
> > >  6 files changed, 42 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> > > b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> > > index 30afbec9e3b1..1a198689b391 100644
> > > --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> > > +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> > > @@ -31,6 +31,7 @@
> > > 
> > >  #include "nouveau_drv.h"
> > >  #include "nouveau_dma.h"
> > > +#include "nouveau_exec.h"
> > >  #include "nouveau_gem.h"
> > >  #include "nouveau_chan.h"
> > >  #include "nouveau_abi16.h"
> > > @@ -183,6 +184,20 @@ nouveau_abi16_fini(struct nouveau_abi16
> > > *abi16)
> > >     cli->abi16 = NULL;
> > >  }
> > > 
> > > +static inline unsigned int
> > > +getparam_dma_ib_max(struct nvif_device *device)
> > > +{
> > > +   const struct nvif_mclass dmas[] = {
> > > +   { NV03_CHANNEL_DMA, 0 },
> > > +   { NV10_CHANNEL_DMA, 0 },
> > > +   { NV17_CHANNEL_DMA, 0 },
> > > +   { NV40_CHANNEL_DMA, 0 },
> > > +   {}
> > > +   };
> > > +
> > > +   return nvif_mclass(&device->object, dmas) < 0 ?
> > > NV50_DMA_IB_MAX : 0;
> > > +}
> > > +
> > >  int
> > >  nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
> > >  {
> > > @@ -247,6 +262,10 @@
> > > nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
> > >     case NOUVEAU_GETPARAM_GRAPH_UNITS:
> > >     getparam->value = nvkm_gr_units(gr);
> > >     break;
> > > +   case NOUVEAU_GETPARAM_EXEC_PUSH_MAX:
> > > +   getparam->value = getparam_dma_ib_max(device) /
> > > + NOUVEAU_EXEC_PUSH_MAX_DIV;
> > > +   break;
> > >     default:
> > >     NV_PRINTK(dbg, cli, "unknown parameter %lld\n",
> > > getparam->param);
> > >     return -EINVAL;
> > > diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c
> > > b/drivers/gpu/drm/nouveau/nouveau_chan.c
> > > index ac56f4689ee3..c3c2ab887978 100644
> > > --- a/drivers/gpu/drm/nouveau/nouveau_chan.c
> > > +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c
> > > @@ -456,7 +456,7 @@ nouveau_channel

Re: [Nouveau] [PATCH 3/3] drm/nouveau: exec: report max pushs through getparam

2023-09-29 Thread Faith Ekstrand
On Wed, 2023-09-27 at 03:22 +0200, Danilo Krummrich wrote:
> Report the maximum number of IBs that can be pushed with a single
> DRM_IOCTL_NOUVEAU_EXEC through DRM_IOCTL_NOUVEAU_GETPARAM.
> 
> While the maximum number of IBs per ring might vary between chipsets,
> the kernel will make sure that userspace can only push a fraction of
> the
> maximum number of IBs per ring per job, such that we avoid a
> situation
> where there's only a single job occupying the ring, which could
> potentially lead to the ring run dry.
> 
> Using DRM_IOCTL_NOUVEAU_GETPARAM to report the maximum number of IBs
> that can be pushed with a single DRM_IOCTL_NOUVEAU_EXEC implies that
> all channels of a given device have the same ring size.

There's a bunch of nouveau kernel details I don't know here but the
interface looks good and I prefer it to a #define in the header.

Acked-by: Faith Ekstrand 


> Signed-off-by: Danilo Krummrich 
> ---
>  drivers/gpu/drm/nouveau/nouveau_abi16.c | 19 +++
>  drivers/gpu/drm/nouveau/nouveau_chan.c  |  2 +-
>  drivers/gpu/drm/nouveau/nouveau_dma.h   |  3 +++
>  drivers/gpu/drm/nouveau/nouveau_exec.c  |  7 ---
>  drivers/gpu/drm/nouveau/nouveau_exec.h  |  5 +
>  include/uapi/drm/nouveau_drm.h  | 10 ++
>  6 files changed, 42 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> index 30afbec9e3b1..1a198689b391 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> @@ -31,6 +31,7 @@
>  
>  #include "nouveau_drv.h"
>  #include "nouveau_dma.h"
> +#include "nouveau_exec.h"
>  #include "nouveau_gem.h"
>  #include "nouveau_chan.h"
>  #include "nouveau_abi16.h"
> @@ -183,6 +184,20 @@ nouveau_abi16_fini(struct nouveau_abi16 *abi16)
> cli->abi16 = NULL;
>  }
>  
> +static inline unsigned int
> +getparam_dma_ib_max(struct nvif_device *device)
> +{
> +   const struct nvif_mclass dmas[] = {
> +   { NV03_CHANNEL_DMA, 0 },
> +   { NV10_CHANNEL_DMA, 0 },
> +   { NV17_CHANNEL_DMA, 0 },
> +   { NV40_CHANNEL_DMA, 0 },
> +   {}
> +   };
> +
> +   return nvif_mclass(&device->object, dmas) < 0 ?
> NV50_DMA_IB_MAX : 0;
> +}
> +
>  int
>  nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
>  {
> @@ -247,6 +262,10 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
> case NOUVEAU_GETPARAM_GRAPH_UNITS:
> getparam->value = nvkm_gr_units(gr);
> break;
> +   case NOUVEAU_GETPARAM_EXEC_PUSH_MAX:
> +   getparam->value = getparam_dma_ib_max(device) /
> + NOUVEAU_EXEC_PUSH_MAX_DIV;
> +   break;
> default:
> NV_PRINTK(dbg, cli, "unknown parameter %lld\n",
> getparam->param);
> return -EINVAL;
> diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c
> b/drivers/gpu/drm/nouveau/nouveau_chan.c
> index ac56f4689ee3..c3c2ab887978 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_chan.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c
> @@ -456,7 +456,7 @@ nouveau_channel_init(struct nouveau_channel
> *chan, u32 vram, u32 gart)
> chan->user_get = 0x44;
> chan->user_get_hi = 0x60;
> chan->dma.ib_base =  0x10000 / 4;
> -   chan->dma.ib_max  = (0x02000 / 8) - 1;
> +   chan->dma.ib_max  = NV50_DMA_IB_MAX;
> chan->dma.ib_put  = 0;
> chan->dma.ib_free = chan->dma.ib_max - chan->dma.ib_put;
> chan->dma.max = chan->dma.ib_base;
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h
> b/drivers/gpu/drm/nouveau/nouveau_dma.h
> index 1744d95b233e..c52cda82353e 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dma.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
> @@ -49,6 +49,9 @@ void nv50_dma_push(struct nouveau_channel *, u64
> addr, u32 length,
>  /* Maximum push buffer size. */
>  #define NV50_DMA_PUSH_MAX_LENGTH 0x7fffff
>  
> +/* Maximum IBs per ring. */
> +#define NV50_DMA_IB_MAX ((0x02000 / 8) - 1)
> +
>  /* Object handles - for stuff that's doesn't use handle == oclass.
> */
>  enum {
> NvDmaFB = 0x8002,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c
> b/drivers/gpu/drm/nouveau/nouveau_exec.c
> index ba6913a3efb6..5b5c4a77b8e6 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_exec.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
> @@ -346,7 +346,7 @@ nouveau_exec_ioctl_exec(stru

Re: [Nouveau] [PATCH drm-misc-next v2] drm/nouveau: uapi: don't pass NO_PREFETCH flag implicitly

2023-08-23 Thread Faith Ekstrand
On Wed, Aug 23, 2023 at 1:17 PM Danilo Krummrich  wrote:

> Currently, NO_PREFETCH is passed implicitly through
> drm_nouveau_gem_pushbuf_push::length and drm_nouveau_exec_push::va_len.
>
> Since this is a direct representation of how the HW is programmed it
> isn't really future proof for a uAPI. Hence, fix this up for the new
> uAPI and split up the va_len field of struct drm_nouveau_exec_push,
> such that we keep 32bit for va_len and 32bit for flags.
>
> For drm_nouveau_gem_pushbuf_push::length at least provide
> NOUVEAU_GEM_PUSHBUF_NO_PREFETCH to indicate the bit shift.
>
> While at it, fix up nv50_dma_push() as well, such that the caller
> doesn't need to encode the NO_PREFETCH flag into the length parameter.
>
> Signed-off-by: Danilo Krummrich 
>

Still

Reviewed-by: Faith Ekstrand 


> ---
> Changes in v2:
>   - dma: rename prefetch to no_prefetch in nv50_dma_push() (Faith)
>   - exec: print error message when pushbuf size larger max pushbuf size
> (Faith)
> ---
>  drivers/gpu/drm/nouveau/nouveau_dma.c  |  7 +--
>  drivers/gpu/drm/nouveau/nouveau_dma.h  |  8 ++--
>  drivers/gpu/drm/nouveau/nouveau_exec.c | 19 ---
>  drivers/gpu/drm/nouveau/nouveau_gem.c  |  6 --
>  include/uapi/drm/nouveau_drm.h |  8 +++-
>  5 files changed, 38 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.c
> b/drivers/gpu/drm/nouveau/nouveau_dma.c
> index b90cac6d5772..b01c029f3a90 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dma.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_dma.c
> @@ -69,16 +69,19 @@ READ_GET(struct nouveau_channel *chan, uint64_t
> *prev_get, int *timeout)
>  }
>
>  void
> -nv50_dma_push(struct nouveau_channel *chan, u64 offset, int length)
> +nv50_dma_push(struct nouveau_channel *chan, u64 offset, u32 length,
> + bool no_prefetch)
>  {
> struct nvif_user *user = &chan->drm->client.device.user;
> struct nouveau_bo *pb = chan->push.buffer;
> int ip = (chan->dma.ib_put * 2) + chan->dma.ib_base;
>
> BUG_ON(chan->dma.ib_free < 1);
> +   WARN_ON(length > NV50_DMA_PUSH_MAX_LENGTH);
>
> nouveau_bo_wr32(pb, ip++, lower_32_bits(offset));
> -   nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8);
> +   nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8 |
> +   (no_prefetch ? (1 << 31) : 0));
>
> chan->dma.ib_put = (chan->dma.ib_put + 1) & chan->dma.ib_max;
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h
> b/drivers/gpu/drm/nouveau/nouveau_dma.h
> index 035a709c7be1..1744d95b233e 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dma.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
> @@ -31,7 +31,8 @@
>  #include "nouveau_chan.h"
>
>  int nouveau_dma_wait(struct nouveau_channel *, int slots, int size);
> -void nv50_dma_push(struct nouveau_channel *, u64 addr, int length);
> +void nv50_dma_push(struct nouveau_channel *, u64 addr, u32 length,
> +  bool no_prefetch);
>
>  /*
>   * There's a hw race condition where you can't jump to your PUT offset,
> @@ -45,6 +46,9 @@ void nv50_dma_push(struct nouveau_channel *, u64 addr,
> int length);
>   */
>  #define NOUVEAU_DMA_SKIPS (128 / 4)
>
> +/* Maximum push buffer size. */
> +#define NV50_DMA_PUSH_MAX_LENGTH 0x7fffff
> +
>  /* Object handles - for stuff that's doesn't use handle == oclass. */
>  enum {
> NvDmaFB = 0x8002,
> @@ -89,7 +93,7 @@ FIRE_RING(struct nouveau_channel *chan)
>
> if (chan->dma.ib_max) {
> nv50_dma_push(chan, chan->push.addr + (chan->dma.put << 2),
> - (chan->dma.cur - chan->dma.put) << 2);
> + (chan->dma.cur - chan->dma.put) << 2, false);
> } else {
> WRITE_PUT(chan->dma.cur);
> }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c
> b/drivers/gpu/drm/nouveau/nouveau_exec.c
> index 0f927adda4ed..a90c4cd8cbb2 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_exec.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
> @@ -164,8 +164,10 @@ nouveau_exec_job_run(struct nouveau_job *job)
> }
>
> for (i = 0; i < exec_job->push.count; i++) {
> -   nv50_dma_push(chan, exec_job->push.s[i].va,
> - exec_job->push.s[i].va_len);
> +   struct drm_nouveau_exec_push *p = &exec_job->push.s[i];
> +   bool no_prefetch = p->flags &
> DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH;
> +
> +   nv50_dma_push(chan, p->va, p->va_len,

Re: [Nouveau] [PATCH drm-misc-next] drm/nouveau: uapi: don't pass NO_PREFETCH flag implicitly

2023-08-22 Thread Faith Ekstrand
>  {
> struct nouveau_exec_job *job;
> struct nouveau_job_args args = {};
> -   int ret;
> +   int i, ret;
> +
> +   for (i = 0; i < __args->push.count; i++) {
> +   struct drm_nouveau_exec_push *p = &__args->push.s[i];
> +
> +   if (p->va_len > NV50_DMA_PUSH_MAX_LENGTH)
> +   return -EINVAL;
>

This can probably be wrapped in unlikely().  Also, it'd be nice if we
printed an error message like we do if you try to push too many things.
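A rough sketch of what that could look like (this is not the committed patch;
in particular, how the nouveau_cli pointer is obtained here is an assumption):

	for (i = 0; i < __args->push.count; i++) {
		struct drm_nouveau_exec_push *p = &__args->push.s[i];

		/* Sketch: bounds-check each push and log before rejecting. */
		if (unlikely(p->va_len > NV50_DMA_PUSH_MAX_LENGTH)) {
			NV_PRINTK(err, cli,
				  "pushbuf size exceeds limit: 0x%x max 0x%x\n",
				  p->va_len, NV50_DMA_PUSH_MAX_LENGTH);
			return -EINVAL;
		}
	}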

Looks good. Thanks!

Reviewed-by: Faith Ekstrand 


> +   }
>
> job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> if (!job)
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c
> b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index f39360870c70..2f3dc4d71657 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -856,9 +856,11 @@ nouveau_gem_ioctl_pushbuf(struct drm_device *dev,
> void *data,
> for (i = 0; i < req->nr_push; i++) {
> struct nouveau_vma *vma = (void *)(unsigned long)
> bo[push[i].bo_index].user_priv;
> +   u64 addr = vma->addr + push[i].offset;
> +   u32 length = push[i].length &
> ~NOUVEAU_GEM_PUSHBUF_NO_PREFETCH;
> +   bool prefetch = !(push[i].length &
> NOUVEAU_GEM_PUSHBUF_NO_PREFETCH);
>
> -   nv50_dma_push(chan, vma->addr + push[i].offset,
> - push[i].length);
> +   nv50_dma_push(chan, addr, length, prefetch);
> }
> } else
> if (drm->client.device.info.chipset >= 0x25) {
> diff --git a/include/uapi/drm/nouveau_drm.h
> b/include/uapi/drm/nouveau_drm.h
> index b1ad9d5ffce8..8f16724b5d05 100644
> --- a/include/uapi/drm/nouveau_drm.h
> +++ b/include/uapi/drm/nouveau_drm.h
> @@ -138,6 +138,7 @@ struct drm_nouveau_gem_pushbuf_push {
> __u32 pad;
> __u64 offset;
> __u64 length;
> +#define NOUVEAU_GEM_PUSHBUF_NO_PREFETCH (1 << 23)
>  };
>
>  struct drm_nouveau_gem_pushbuf {
> @@ -338,7 +339,12 @@ struct drm_nouveau_exec_push {
> /**
>  * @va_len: the length of the push buffer mapping
>  */
> -   __u64 va_len;
> +   __u32 va_len;
> +   /**
> +* flags: the flags for this push buffer mapping
> +*/
> +   __u32 flags;
> +#define DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH 0x1
>  };
>
>  /**
>
> base-commit: ad1367f831f8743746a1f49705c28e36a7c95525
> --
> 2.41.0
>
>


Re: [Nouveau] [PATCH] nouveau: find the smallest page allocation to cover a buffer alloc.

2023-08-12 Thread Faith Ekstrand
On Thu, Aug 10, 2023 at 10:15 PM Dave Airlie  wrote:

> From: Dave Airlie 
>
> With the new uapi we don't have the comp flags on the allocation,
> so we shouldn't be using the first size that works, we should be
> iterating until we get the correct one.
>
> This reduces allocations from 2MB to 64k in lots of places.
>
> Fixes dEQP-VK.memory.allocation.basic.size_8KiB.forward.count_4000
> on my ampere/gsp system.
>
> Signed-off-by: Dave Airlie 
>

Reviewed-by: Faith Ekstrand 


> ---
>  drivers/gpu/drm/nouveau/nouveau_bo.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c
> b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index 949195d5d782..a6993c7807b6 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -318,8 +318,9 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size,
> int *align, u32 domain,
> (!vmm->page[i].host || vmm->page[i].shift >
> PAGE_SHIFT))
> continue;
>
> -   if (pi < 0)
> -   pi = i;
> +   /* pick the last one as it will be smallest. */
> +   pi = i;
> +
> /* Stop once the buffer is larger than the current
> page size. */
> if (*size >= 1ULL << vmm->page[i].shift)
> break;
> --
> 2.41.0
>
>


Re: [Nouveau] [PATCH drm-misc-next] drm/nouveau: sched: avoid job races between entities

2023-08-11 Thread Faith Ekstrand
On Thu, Aug 10, 2023 at 8:06 PM Danilo Krummrich  wrote:

> If a sched job depends on a dma-fence from a job from the same GPU
> scheduler instance, but a different scheduler entity, the GPU scheduler
> does only wait for the particular job to be scheduled, rather than for
> the job to fully complete. This is due to the GPU scheduler assuming
> that there is a scheduler instance per ring. However, the current
> implementation, in order to avoid arbitrary amounts of kthreads, has a
> single scheduler instance while scheduler entities represent rings.
>
> As a workaround, set the DRM_SCHED_FENCE_DONT_PIPELINE for all
> out-fences in order to force the scheduler to wait for full job
> completion for dependent jobs from different entities and same scheduler
> instance.
>
> There is some work in progress [1] to address the issues of firmware
> schedulers; once it is in-tree the scheduler topology in Nouveau should
> be re-worked accordingly.
>
> [1]
> https://lore.kernel.org/dri-devel/20230801205103.627779-1-matthew.br...@intel.com/
>
> Signed-off-by: Danilo Krummrich 
> ---
>  drivers/gpu/drm/nouveau/nouveau_sched.c | 22 ++
>  1 file changed, 22 insertions(+)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c
> b/drivers/gpu/drm/nouveau/nouveau_sched.c
> index 3424a1bf6af3..88217185e0f3 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_sched.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
> @@ -292,6 +292,28 @@ nouveau_job_submit(struct nouveau_job *job)
> if (job->sync)
> done_fence = dma_fence_get(job->done_fence);
>
> +   /* If a sched job depends on a dma-fence from a job from the same
> GPU
> +* scheduler instance, but a different scheduler entity, the GPU
> +    * scheduler does only wait for the particular job to be scheduled,
>

s/does only wait/only waits/

Reviewed-by: Faith Ekstrand 

> +* rather than for the job to fully complete. This is due to the GPU
> +* scheduler assuming that there is a scheduler instance per ring.
> +* However, the current implementation, in order to avoid arbitrary
> +* amounts of kthreads, has a single scheduler instance while
> scheduler
> +* entities represent rings.
> +*
> +* As a workaround, set the DRM_SCHED_FENCE_DONT_PIPELINE for all
> +* out-fences in order to force the scheduler to wait for full job
> +* completion for dependent jobs from different entities and same
> +* scheduler instance.
> +*
> +* There is some work in progress [1] to address the issues of
> firmware
> +* schedulers; once it is in-tree the scheduler topology in Nouveau
> +* should be re-worked accordingly.
> +*
> +* [1]
> https://lore.kernel.org/dri-devel/20230801205103.627779-1-matthew.br...@intel.com/
> +*/
> +   set_bit(DRM_SCHED_FENCE_DONT_PIPELINE, &job->done_fence->flags);
> +
> if (job->ops->armed_submit)
> job->ops->armed_submit(job);
>
>
> base-commit: 68132cc6d1bcbc78ade524c6c6c226de42139f0e
> --
> 2.41.0
>
>


[Nouveau] [PATCH] drm/nouveau/sched: Don't pass user flags to drm_syncobj_find_fence()

2023-08-07 Thread Faith Ekstrand
The flags field in drm_syncobj_find_fence() takes SYNCOBJ_WAIT flags
from the syncobj UAPI whereas sync->flags is from the nouveau UAPI. What
we actually want is 0 flags which tells it to just try to find the
fence and then return without waiting.

Signed-off-by: Faith Ekstrand 
Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI")
Cc: Danilo Krummrich 
Cc: Dave Airlie 
---
 drivers/gpu/drm/nouveau/nouveau_sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c 
b/drivers/gpu/drm/nouveau/nouveau_sched.c
index b3b59fbec291..3424a1bf6af3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sched.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
@@ -142,7 +142,7 @@ sync_find_fence(struct nouveau_job *job,
 
ret = drm_syncobj_find_fence(job->file_priv,
 sync->handle, point,
-sync->flags, fence);
+0 /* flags */, fence);
if (ret)
return ret;
 
-- 
2.41.0



Re: [Nouveau] [PATCH drm-misc-next v9 00/11] Nouveau VM_BIND UAPI & DRM GPUVA Manager (merged)

2023-08-03 Thread Faith Ekstrand
On Thu, Aug 3, 2023 at 4:45 PM Dave Airlie  wrote:

> On Fri, 4 Aug 2023 at 02:52, Danilo Krummrich  wrote:
> >
> > This patch series provides a new UAPI for the Nouveau driver in order to
> > support Vulkan features, such as sparse bindings and sparse residency.
> >
>
> Now that Faith has reviewed the uAPI and userspace work, I think we
> should try and steer this in.
>
> I think the only thing I see is the SPDX + MIT header in some places,
> I think we can drop the MIT bits where SPDX is there, and leave
> copyright and authorship (if you like), personally I've been leaving
> authorship up to git, as it saves trouble with people randomly
> emailing you about things you wrote 10 years ago.
>
> Otherwise for the series:
> Reviewed-by: Dave Airlie 
>

FYI: There's a small, easily resolved conflict with the uapi fixup patch. I
don't care too much which goes in first but I'd like both to land before I
merge NVK so I don't have to deal with a nouveau_deprecated.h.

~Faith


> Dave.
>


Re: [Nouveau] [PATCH] drm/nouveau: fixup the uapi header file.

2023-08-03 Thread Faith Ekstrand
On Thu, Aug 3, 2023 at 2:33 PM Dave Airlie  wrote:

> From: Dave Airlie 
>
> nouveau > 10 years ago had a plan for new multiplexer inside a multiplexer
> API using nvif. It never fully reached fruition, fast forward 10 years,
> and the new vulkan driver is avoiding libdrm and calling ioctls, and
> these 3 ioctls, getparam, channel alloc + free don't seem to be things
> we'd want to use nvif for.
>
> Undeprecate and put them into the uapi header so we can just copy it
> into mesa later.
>
> v2: use uapi types.
>
> Signed-off-by: Dave Airlie 
>

Reviewed-by: Faith Ekstrand 


> ---
>  drivers/gpu/drm/nouveau/nouveau_abi16.h | 41 -
>  include/uapi/drm/nouveau_drm.h  | 48 +++--
>  2 files changed, 45 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.h
> b/drivers/gpu/drm/nouveau/nouveau_abi16.h
> index 27eae85f33e6..d5d80d0d9011 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_abi16.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.h
> @@ -43,28 +43,6 @@ int  nouveau_abi16_usif(struct drm_file *, void *data,
> u32 size);
>  #define NOUVEAU_GEM_DOMAIN_VRAM  (1 << 1)
>  #define NOUVEAU_GEM_DOMAIN_GART  (1 << 2)
>
> -struct drm_nouveau_channel_alloc {
> -   uint32_t fb_ctxdma_handle;
> -   uint32_t tt_ctxdma_handle;
> -
> -   int  channel;
> -   uint32_t pushbuf_domains;
> -
> -   /* Notifier memory */
> -   uint32_t notifier_handle;
> -
> -   /* DRM-enforced subchannel assignments */
> -   struct {
> -   uint32_t handle;
> -   uint32_t grclass;
> -   } subchan[8];
> -   uint32_t nr_subchan;
> -};
> -
> -struct drm_nouveau_channel_free {
> -   int channel;
> -};
> -
>  struct drm_nouveau_grobj_alloc {
> int  channel;
> uint32_t handle;
> @@ -83,31 +61,12 @@ struct drm_nouveau_gpuobj_free {
> uint32_t handle;
>  };
>
> -#define NOUVEAU_GETPARAM_PCI_VENDOR  3
> -#define NOUVEAU_GETPARAM_PCI_DEVICE  4
> -#define NOUVEAU_GETPARAM_BUS_TYPE5
> -#define NOUVEAU_GETPARAM_FB_SIZE 8
> -#define NOUVEAU_GETPARAM_AGP_SIZE9
> -#define NOUVEAU_GETPARAM_CHIPSET_ID  11
> -#define NOUVEAU_GETPARAM_VM_VRAM_BASE12
> -#define NOUVEAU_GETPARAM_GRAPH_UNITS 13
> -#define NOUVEAU_GETPARAM_PTIMER_TIME 14
> -#define NOUVEAU_GETPARAM_HAS_BO_USAGE15
> -#define NOUVEAU_GETPARAM_HAS_PAGEFLIP16
> -struct drm_nouveau_getparam {
> -   uint64_t param;
> -   uint64_t value;
> -};
> -
>  struct drm_nouveau_setparam {
> uint64_t param;
> uint64_t value;
>  };
>
> -#define DRM_IOCTL_NOUVEAU_GETPARAM   DRM_IOWR(DRM_COMMAND_BASE +
> DRM_NOUVEAU_GETPARAM, struct drm_nouveau_getparam)
>  #define DRM_IOCTL_NOUVEAU_SETPARAM   DRM_IOWR(DRM_COMMAND_BASE +
> DRM_NOUVEAU_SETPARAM, struct drm_nouveau_setparam)
> -#define DRM_IOCTL_NOUVEAU_CHANNEL_ALLOC  DRM_IOWR(DRM_COMMAND_BASE +
> DRM_NOUVEAU_CHANNEL_ALLOC, struct drm_nouveau_channel_alloc)
> -#define DRM_IOCTL_NOUVEAU_CHANNEL_FREE   DRM_IOW (DRM_COMMAND_BASE +
> DRM_NOUVEAU_CHANNEL_FREE, struct drm_nouveau_channel_free)
>  #define DRM_IOCTL_NOUVEAU_GROBJ_ALLOCDRM_IOW (DRM_COMMAND_BASE +
> DRM_NOUVEAU_GROBJ_ALLOC, struct drm_nouveau_grobj_alloc)
>  #define DRM_IOCTL_NOUVEAU_NOTIFIEROBJ_ALLOC  DRM_IOWR(DRM_COMMAND_BASE +
> DRM_NOUVEAU_NOTIFIEROBJ_ALLOC, struct drm_nouveau_notifierobj_alloc)
>  #define DRM_IOCTL_NOUVEAU_GPUOBJ_FREEDRM_IOW (DRM_COMMAND_BASE +
> DRM_NOUVEAU_GPUOBJ_FREE, struct drm_nouveau_gpuobj_free)
> diff --git a/include/uapi/drm/nouveau_drm.h
> b/include/uapi/drm/nouveau_drm.h
> index 853a327433d3..ca917e55b38f 100644
> --- a/include/uapi/drm/nouveau_drm.h
> +++ b/include/uapi/drm/nouveau_drm.h
> @@ -33,6 +33,44 @@
>  extern "C" {
>  #endif
>
> +#define NOUVEAU_GETPARAM_PCI_VENDOR  3
> +#define NOUVEAU_GETPARAM_PCI_DEVICE  4
> +#define NOUVEAU_GETPARAM_BUS_TYPE5
> +#define NOUVEAU_GETPARAM_FB_SIZE 8
> +#define NOUVEAU_GETPARAM_AGP_SIZE9
> +#define NOUVEAU_GETPARAM_CHIPSET_ID  11
> +#define NOUVEAU_GETPARAM_VM_VRAM_BASE12
> +#define NOUVEAU_GETPARAM_GRAPH_UNITS 13
> +#define NOUVEAU_GETPARAM_PTIMER_TIME 14
> +#define NOUVEAU_GETPARAM_HAS_BO_USAGE15
> +#define NOUVEAU_GETPARAM_HAS_PAGEFLIP16
> +struct drm_nouveau_getparam {
> +   __u64 param;
> +   __u64 value;
> +};
> +
> +struct drm_nouveau_channel_alloc {
> +   __u32 fb_ctxdma_handle;
> +   __u32 tt_ctxdma_handle;
> +
> +   __s32 channe

Re: [Nouveau] [PATCH drm-misc-next v9 02/11] drm/nouveau: new VM_BIND uapi interfaces

2023-08-03 Thread Faith Ekstrand
On Thu, Aug 3, 2023 at 11:53 AM Danilo Krummrich  wrote:

> This commit provides the interfaces for the new UAPI motivated by the
> Vulkan API. It allows user mode drivers (UMDs) to:
>
> 1) Initialize a GPU virtual address (VA) space via the new
>DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
>VA area.
>
> 2) Bind and unbind GPU VA space mappings via the new
>DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>
> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
>
> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
> asynchronous processing with DRM syncobjs as synchronization mechanism.
>
> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
>
> Co-authored-by: Dave Airlie 
> Signed-off-by: Danilo Krummrich 
>

Reviewed-by: Faith Ekstrand 

Userspace is also reviewed and sitting here:
https://gitlab.freedesktop.org/nouveau/mesa/-/merge_requests/150

I'll rev it all on top of the latest header here in a few minutes but I
think the only header change was the rename for VM_INIT which shouldn't be
a real problem.

~Faith
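As a usage note (not part of the patch), the VA-space split is expected to be
set up once, before any BOs or channels are created. A hedged sketch, assuming
the DRM_IOCTL_NOUVEAU_VM_INIT ioctl define from this header and an arbitrary
placeholder split:

#include <xf86drm.h>
#include <drm/nouveau_drm.h>

/* Sketch: reserve a low region for kernel-managed mappings and let the UMD
 * manage the rest. A real driver picks a region its allocator never uses. */
static int nouveau_init_vm(int fd)
{
	struct drm_nouveau_vm_init init = {
		.kernel_managed_addr = 0,
		.kernel_managed_size = 1ull << 32,
	};

	/* Per the kernel-doc in the quoted header: must run before any BO or
	 * channel is created, otherwise VM_INIT itself fails with -ENOSYS. */
	return drmIoctl(fd, DRM_IOCTL_NOUVEAU_VM_INIT, &init);
}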


> ---
>  Documentation/gpu/driver-uapi.rst |   8 ++
>  include/uapi/drm/nouveau_drm.h| 217 ++
>  2 files changed, 225 insertions(+)
>
> diff --git a/Documentation/gpu/driver-uapi.rst
> b/Documentation/gpu/driver-uapi.rst
> index 4411e6919a3d..9c7ca6e33a68 100644
> --- a/Documentation/gpu/driver-uapi.rst
> +++ b/Documentation/gpu/driver-uapi.rst
> @@ -6,3 +6,11 @@ drm/i915 uAPI
>  =
>
>  .. kernel-doc:: include/uapi/drm/i915_drm.h
> +
> +drm/nouveau uAPI
> +
> +
> +VM_BIND / EXEC uAPI
> +---
> +
> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
> diff --git a/include/uapi/drm/nouveau_drm.h
> b/include/uapi/drm/nouveau_drm.h
> index 853a327433d3..b567892c128d 100644
> --- a/include/uapi/drm/nouveau_drm.h
> +++ b/include/uapi/drm/nouveau_drm.h
> @@ -38,6 +38,8 @@ extern "C" {
>  #define NOUVEAU_GEM_DOMAIN_GART  (1 << 2)
>  #define NOUVEAU_GEM_DOMAIN_MAPPABLE  (1 << 3)
>  #define NOUVEAU_GEM_DOMAIN_COHERENT  (1 << 4)
> +/* The BO will never be shared via import or export. */
> +#define NOUVEAU_GEM_DOMAIN_NO_SHARE  (1 << 5)
>
>  #define NOUVEAU_GEM_TILE_COMP 0x00030000 /* nv50-only */
>  #define NOUVEAU_GEM_TILE_LAYOUT_MASK 0xff00
> @@ -126,6 +128,215 @@ struct drm_nouveau_gem_cpu_fini {
> __u32 handle;
>  };
>
> +/**
> + * struct drm_nouveau_sync - sync object
> + *
> + * This structure serves as synchronization mechanism for (potentially)
> + * asynchronous operations such as EXEC or VM_BIND.
> + */
> +struct drm_nouveau_sync {
> +   /**
> +* @flags: the flags for a sync object
> +*
> +* The first 8 bits are used to determine the type of the sync
> object.
> +*/
> +   __u32 flags;
> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
> +   /**
> +* @handle: the handle of the sync object
> +*/
> +   __u32 handle;
> +   /**
> +* @timeline_value:
> +*
> +* The timeline point of the sync object in case the syncobj is of
> +* type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
> +*/
> +   __u64 timeline_value;
> +};
> +
> +/**
> + * struct drm_nouveau_vm_init - GPU VA space init structure
> + *
> + * Used to initialize the GPU's VA space for a user client, telling the
> kernel
> + * which portion of the VA space is managed by the UMD and kernel
> respectively.
> + *
> + * For the UMD to use the VM_BIND uAPI, this must be called before any
> BOs or
> + * channels are created; if called afterwards DRM_IOCTL_NOUVEAU_VM_INIT
> fails
> + * with -ENOSYS.
> + */
> +struct drm_nouveau_vm_init {
> +   /**
> +* @kernel_managed_addr: start address of the kernel managed VA
> space
> +* region
> +*/
> +   __u64 kernel_managed_addr;
> +   /**
> +* @kernel_managed_size: size of the kernel managed VA space
> region in
> +* bytes
> +*/
> +   __u64 kernel_managed_size;
> +};
> +
> +/**
> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
> + *
> + * This structure represents a single VM_BIND operation. UMDs should pass
> + * an array of this structure via struct drm_nouveau_vm_bind's _ptr
> field.
> + */
> +struct drm_nouveau_vm_bind_op {
> +   /**
> +* @op: the operation type
> +*/
> +   __u32 op;
> 

Re: [Nouveau] [PATCH] drm/nouveau: fixup the uapi header file.

2023-08-01 Thread Faith Ekstrand
On Tue, Aug 1, 2023 at 4:37 AM Karol Herbst  wrote:

> On Mon, Jul 31, 2023 at 9:16 PM Dave Airlie  wrote:
> >
> > From: Dave Airlie 
> >
> > nouveau > 10 years ago had a plan for new multiplexer inside a
> multiplexer
> > API using nvif. It never fully reached fruition, fast forward 10 years,
> > and the new vulkan driver is avoiding libdrm and calling ioctls, and
> > these 3 ioctls, getparam, channel alloc + free don't seem to be things
> > we'd want to use nvif for.
> >
> > Undeprecate and put them into the uapi header so we can just copy it
> > into mesa later.
> >
> > Signed-off-by: Dave Airlie 
> > ---
> >  drivers/gpu/drm/nouveau/nouveau_abi16.h | 41 -
> >  include/uapi/drm/nouveau_drm.h  | 48 +++--
> >  2 files changed, 45 insertions(+), 44 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.h
> b/drivers/gpu/drm/nouveau/nouveau_abi16.h
> > index 27eae85f33e6..d5d80d0d9011 100644
> > --- a/drivers/gpu/drm/nouveau/nouveau_abi16.h
> > +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.h
> > @@ -43,28 +43,6 @@ int  nouveau_abi16_usif(struct drm_file *, void
> *data, u32 size);
> >  #define NOUVEAU_GEM_DOMAIN_VRAM  (1 << 1)
> >  #define NOUVEAU_GEM_DOMAIN_GART  (1 << 2)
> >
> > -struct drm_nouveau_channel_alloc {
> > -   uint32_t fb_ctxdma_handle;
> > -   uint32_t tt_ctxdma_handle;
> > -
> > -   int  channel;
> > -   uint32_t pushbuf_domains;
> > -
> > -   /* Notifier memory */
> > -   uint32_t notifier_handle;
> > -
> > -   /* DRM-enforced subchannel assignments */
> > -   struct {
> > -   uint32_t handle;
> > -   uint32_t grclass;
> > -   } subchan[8];
> > -   uint32_t nr_subchan;
> > -};
> > -
> > -struct drm_nouveau_channel_free {
> > -   int channel;
> > -};
> > -
> >  struct drm_nouveau_grobj_alloc {
> > int  channel;
> > uint32_t handle;
> > @@ -83,31 +61,12 @@ struct drm_nouveau_gpuobj_free {
> > uint32_t handle;
> >  };
> >
> > -#define NOUVEAU_GETPARAM_PCI_VENDOR  3
> > -#define NOUVEAU_GETPARAM_PCI_DEVICE  4
> > -#define NOUVEAU_GETPARAM_BUS_TYPE5
> > -#define NOUVEAU_GETPARAM_FB_SIZE 8
> > -#define NOUVEAU_GETPARAM_AGP_SIZE9
> > -#define NOUVEAU_GETPARAM_CHIPSET_ID  11
> > -#define NOUVEAU_GETPARAM_VM_VRAM_BASE12
> > -#define NOUVEAU_GETPARAM_GRAPH_UNITS 13
> > -#define NOUVEAU_GETPARAM_PTIMER_TIME 14
> > -#define NOUVEAU_GETPARAM_HAS_BO_USAGE15
> > -#define NOUVEAU_GETPARAM_HAS_PAGEFLIP16
> > -struct drm_nouveau_getparam {
> > -   uint64_t param;
> > -   uint64_t value;
> > -};
> > -
> >  struct drm_nouveau_setparam {
> > uint64_t param;
> > uint64_t value;
> >  };
> >
> > -#define DRM_IOCTL_NOUVEAU_GETPARAM   DRM_IOWR(DRM_COMMAND_BASE
> + DRM_NOUVEAU_GETPARAM, struct drm_nouveau_getparam)
> >  #define DRM_IOCTL_NOUVEAU_SETPARAM   DRM_IOWR(DRM_COMMAND_BASE
> + DRM_NOUVEAU_SETPARAM, struct drm_nouveau_setparam)
> > -#define DRM_IOCTL_NOUVEAU_CHANNEL_ALLOC  DRM_IOWR(DRM_COMMAND_BASE
> + DRM_NOUVEAU_CHANNEL_ALLOC, struct drm_nouveau_channel_alloc)
> > -#define DRM_IOCTL_NOUVEAU_CHANNEL_FREE   DRM_IOW (DRM_COMMAND_BASE
> + DRM_NOUVEAU_CHANNEL_FREE, struct drm_nouveau_channel_free)
> >  #define DRM_IOCTL_NOUVEAU_GROBJ_ALLOCDRM_IOW (DRM_COMMAND_BASE
> + DRM_NOUVEAU_GROBJ_ALLOC, struct drm_nouveau_grobj_alloc)
> >  #define DRM_IOCTL_NOUVEAU_NOTIFIEROBJ_ALLOC  DRM_IOWR(DRM_COMMAND_BASE
> + DRM_NOUVEAU_NOTIFIEROBJ_ALLOC, struct drm_nouveau_notifierobj_alloc)
> >  #define DRM_IOCTL_NOUVEAU_GPUOBJ_FREEDRM_IOW (DRM_COMMAND_BASE
> + DRM_NOUVEAU_GPUOBJ_FREE, struct drm_nouveau_gpuobj_free)
> > diff --git a/include/uapi/drm/nouveau_drm.h
> b/include/uapi/drm/nouveau_drm.h
> > index 853a327433d3..1cd97b3d8eda 100644
> > --- a/include/uapi/drm/nouveau_drm.h
> > +++ b/include/uapi/drm/nouveau_drm.h
> > @@ -33,6 +33,44 @@
> >  extern "C" {
> >  #endif
> >
> > +#define NOUVEAU_GETPARAM_PCI_VENDOR  3
> > +#define NOUVEAU_GETPARAM_PCI_DEVICE  4
> > +#define NOUVEAU_GETPARAM_BUS_TYPE5
> > +#define NOUVEAU_GETPARAM_FB_SIZE 8
> > +#define NOUVEAU_GETPARAM_AGP_SIZE9
> > +#define NOUVEAU_GETPARAM_CHIPSET_ID  11
> > +#define NOUVEAU_GETPARAM_VM_VRAM_BASE12
> > +#define NOUVEAU_GETPARAM_GRAPH_UNITS 13
> > +#define NOUVEAU_GETPARAM_PTIMER_TIME 14
> > +#define NOUVEAU_GETPARAM_HAS_BO_USAGE15
> > +#define NOUVEAU_GETPARAM_HAS_PAGEFLIP16
> > +struct drm_nouveau_getparam {
> > +   uint64_t param;
> > +   uint64_t value;
> > +};
> > +
> > +struct drm_nouveau_channel_alloc {
> > +   uint32_t fb_ctxdma_handle;
>

Do we want to use `uint32_t` or `__u32` here? It looks like the original
headers used `uint32_t` and then it got switched to `__u32` after the
deprecation happened.  We probably want `__u32` given that this is a uapi
header.


> > +   

Re: [Nouveau] [PATCH drm-misc-next v8 11/12] drm/nouveau: implement new VM_BIND uAPI

2023-07-31 Thread Faith Ekstrand
On Sun, Jul 30, 2023 at 10:30 PM Faith Ekstrand  wrote:

>
> On Tue, Jul 25, 2023 at 5:48 PM Danilo Krummrich  wrote:
>
>> On 7/25/23 18:43, Danilo Krummrich wrote:
>> > On 7/25/23 18:16, Faith Ekstrand wrote:
>> >> Thanks for the detailed write-up! That would definitely explain it. If
>> >> I remember, I'll try to do a single-threaded run or two. If your
>> >> theory is correct, there should be no real perf difference when
>> >> running single-threaded. Those runs will take a long time, though, so
>> >> I'll have to run them over night. I'll let you know in a few days once
>> >> I have the results.
>> >
>> > I can also push a separate branch where I just print out a warning
>> > whenever we run into such a condition including the time we were
>> waiting
>> > for things to complete. I can probably push something later today.
>>
>>
>> https://gitlab.freedesktop.org/nouvelles/kernel/-/tree/new-uapi-drm-next-track-stalls
>>
>> It prints out the duration of every wait as well as the total wait time
>> since boot.
>>
>> - Danilo
>>
>> >
>> >>
>> >> If this theory holds, then I'm not concerned about the performance of
>> >> the API itself. It would still be good to see if we can find a way to
>> >> reduce the cross-process drag in the implementation but that's a perf
>> >> optimization we can do later.
>> >
>> >  From the kernel side I think the only thing we could really do is to
>> > temporarily run a secondary drm_gpu_scheduler instance, one for
>> VM_BINDs
>> > and one for EXECs until we got the new page table handling in place.
>> >
>> > However, the UMD could avoid such conditions more effectively, since it
>> > controls the address space. Namely, avoid re-using the same region of
>> > the address space right away in certain cases. For instance, instead of
>> > replacing a sparse region A[0x0, 0x400] with a larger sparse region
>> > B[0x0, 0x800], replace it with B'[0x400, 0xC00] if possible.
>> >
>> > However, just mentioning this for completeness. The UMD surely
>> shouldn't
>> > probably even temporarily work around such a kernel limitation.
>> >
>> > Anyway, before doing any of those, let's see if the theory holds and
>> > we're actually running into such cases.
>> >
>> >>
>> >> Does it actually matter? Yes, it kinda does. No, it probably doesn't
>> >> matter for games because you're typically only running one game at a
>> >> time. From a development PoV, however, if it makes CI take longer then
>> >> that slows down development and that's not good for the users, either.
>>
>
> I've run a bunch of tests over the weekend and it's starting to look like
> the added CTS time might be from the newly enabled synchronization tests
> themselves.  We enabled timeline semaphores as well as semaphore and fence
> sharing along with the new uAPI and I did not expect those to be nearly
> that heavy-weight so I didn't think of that as a likely factor. I'm doing a
> couple more runs to confirm but what I'm seeing right now seems to indicate
> that the new API with the old feature set has about the same run time now
> that the submit overhead issue has been fixed.
>

My last two runs have come back and confirmed this theory. With the BO
fixes, all remaining slow-downs are coming from newly added tests which
turn out to be very slow tests.

~Faith


> I think this means that we can proceed under the assumption that there are
> no more serious perf issues with the new API.  I'm happy to RB/ACK the next
> version of the API and implementation patches, as long as it has the new
> NO_SHARE BO create flag and properly rejects exports of NO_SHARE BOs, even
> if not all of the dma_resv plumbing is fully baked.
>
> ~Faith
>


Re: [Nouveau] [PATCH drm-misc-next v8 11/12] drm/nouveau: implement new VM_BIND uAPI

2023-07-30 Thread Faith Ekstrand
On Tue, Jul 25, 2023 at 5:48 PM Danilo Krummrich  wrote:

> On 7/25/23 18:43, Danilo Krummrich wrote:
> > On 7/25/23 18:16, Faith Ekstrand wrote:
> >> Thanks for the detailed write-up! That would definitely explain it. If
> >> I remember, I'll try to do a single-threaded run or two. If your
> >> theory is correct, there should be no real perf difference when
> >> running single-threaded. Those runs will take a long time, though, so
> >> I'll have to run them over night. I'll let you know in a few days once
> >> I have the results.
> >
> > I can also push a separate branch where I just print out a warning
> > whenever we run into such a condition including the time we were waiting
> > for things to complete. I can probably push something later today.
>
>
> https://gitlab.freedesktop.org/nouvelles/kernel/-/tree/new-uapi-drm-next-track-stalls
>
> It prints out the duration of every wait as well as the total wait time
> since boot.
>
> - Danilo
>
> >
> >>
> >> If this theory holds, then I'm not concerned about the performance of
> >> the API itself. It would still be good to see if we can find a way to
> >> reduce the cross-process drag in the implementation but that's a perf
> >> optimization we can do later.
> >
> >  From the kernel side I think the only thing we could really do is to
> > temporarily run a secondary drm_gpu_scheduler instance, one for VM_BINDs
> > and one for EXECs until we got the new page table handling in place.
> >
> > However, the UMD could avoid such conditions more effectively, since it
> > controls the address space. Namely, avoid re-using the same region of
> > the address space right away in certain cases. For instance, instead of
> > replacing a sparse region A[0x0, 0x400] with a larger sparse region
> > B[0x0, 0x800], replace it with B'[0x400, 0xC00] if possible.
> >
> > However, just mentioning this for completeness. The UMD surely shouldn't
> > probably even temporarily work around such a kernel limitation.
> >
> > Anyway, before doing any of those, let's see if the theory holds and
> > we're actually running into such cases.
> >
> >>
> >> Does it actually matter? Yes, it kinda does. No, it probably doesn't
> >> matter for games because you're typically only running one game at a
> >> time. From a development PoV, however, if it makes CI take longer then
> >> that slows down development and that's not good for the users, either.
>

I've run a bunch of tests over the weekend and it's starting to look like
the added CTS time might be from the newly enabled synchronization tests
themselves.  We enabled timeline semaphores as well as semaphore and fence
sharing along with the new uAPI and I did not expect those to be nearly
that heavy-weight so I didn't think of that as a likely factor. I'm doing a
couple more runs to confirm but what I'm seeing right now seems to indicate
that the new API with the old feature set has about the same run time now
that the submit overhead issue has been fixed.

I think this means that we can proceed under the assumption that there are
no more serious perf issues with the new API.  I'm happy to RB/ACK the next
version of the API and implementation patches, as long as it has the new
NO_SHARE BO create flag and properly rejects exports of NO_SHARE BOs, even
if not all of the dma_resv plumbing is fully baked.

~Faith
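For reference, a hedged sketch of what the NO_SHARE flag looks like from the
UMD side, using the existing GEM_NEW ioctl (treat the exact field usage as an
assumption rather than NVK's actual code):

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>
#include <drm/nouveau_drm.h>

/* Sketch: allocate a VRAM BO that can never be exported, so the kernel can
 * track it on the VM-wide reservation object instead of per-BO lists. */
static int alloc_no_share_bo(int fd, uint64_t size, uint32_t *handle)
{
	struct drm_nouveau_gem_new req;
	int ret;

	memset(&req, 0, sizeof(req));
	req.info.domain = NOUVEAU_GEM_DOMAIN_VRAM | NOUVEAU_GEM_DOMAIN_NO_SHARE;
	req.info.size = size;

	ret = drmIoctl(fd, DRM_IOCTL_NOUVEAU_GEM_NEW, &req);
	if (ret == 0)
		*handle = req.info.handle;
	return ret;
}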


Re: [Nouveau] [PATCH drm-misc-next v8 11/12] drm/nouveau: implement new VM_BIND uAPI

2023-07-25 Thread Faith Ekstrand
On Mon, Jul 24, 2023 at 9:04 PM Danilo Krummrich  wrote:

> On 7/22/23 17:12, Faith Ekstrand wrote:
> > On Wed, Jul 19, 2023 at 7:15 PM Danilo Krummrich <d...@redhat.com> wrote:
> >
> > This commit provides the implementation for the new uapi motivated
> > by the
> > Vulkan API. It allows user mode drivers (UMDs) to:
> >
> > 1) Initialize a GPU virtual address (VA) space via the new
> > DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion
> > of VA
> > space managed by the kernel and userspace, respectively.
> >
> > 2) Allocate and free a VA space region as well as bind and unbind
> memory
> > to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> > UMDs can request the named operations to be processed either
> > synchronously or asynchronously. It supports DRM syncobjs
> > (incl. timelines) as synchronization mechanism. The management
> > of the
> > GPU VA mappings is implemented with the DRM GPU VA manager.
> >
> > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
> The
> > execution happens asynchronously. It supports DRM syncobj (incl.
> > timelines) as synchronization mechanism. DRM GEM object locking
> is
> > handled with drm_exec.
> >
> > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the
> DRM
> > GPU scheduler for the asynchronous paths.
> >
> >
> > IDK where the best place to talk about this is but this seems as good as
> > any.
> >
> > I've been looking into why the Vulkan CTS runs about 2x slower for me on
> > the new UAPI and I created a little benchmark to facilitate testing:
> >
> > https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/141
> >
> > The test, roughly, does the following:
> >   1. Allocates and binds 1000 BOs
> >   2. Constructs a pushbuf that executes a no-op compute shader.
> >   3. Does a single EXEC/wait combo to warm up the kernel
> >   4. Loops 10,000 times, doing SYNCOBJ_RESET (fast), EXEC, and then
> > SYNCOBJ_WAIT and times the loop
> >
> > Of course, there's a bit of userspace driver overhead but that's
> > negledgable.
> >
> > If you drop the top patch which allocates 1k buffers, the submit time on
> > the old uAPI is 54 us/exec vs. 66 us/exec on the new UAPI. This includes
> > the time to do a SYNCOBJ_RESET (fast), EXEC, and SYNCOBJ_WAIT. The Intel
> > driver, by comparison, is 33us/exec so it's not syncobj overhead. This
> > is a bit concerning (you'd think the new thing would be faster) but what
> > really has me concerned is the 1k buffer case.
> >
> > If you include the top patch in the crucible MR, it allocates 1000 BOs
> > and VM_BINDs them. All the binding is done before the warmup EXEC.
> > Suddenly, the submit time jumps to 257 us/exec with the new UAPI. The
> > old UAPI is much worse (1134 us/exec) but that's not the point. Once
> > we've done the first EXEC and created our VM bindings, the cost per EXEC
> > shouldn't change at all based on the number of BOs bound.  Part of the
> > point of VM_BIND is to get all that binding logic and BO walking off the
> > EXEC path.
> >
> > Normally, I wouldn't be too worried about a little performance problem
> > like this. This is the first implementation and we can improve it later.
> > I get that. However, I suspect the solution to this problem involves
> > more UAPI and I want to make sure we have it all before we call this all
> > done and dusted and land it.
> >
> > The way AMD solves this problem as well as the new Xe driver for Intel
> > is to have a concept of internal vs. external BOs. Basically, there's an
> > INTERNAL bit specified somewhere in BO creation that has a few userspace
> > implications:
> >   1. In the Xe world where VMs are objects, INTERNAL BOs are assigned a
> > VM on creation and can never be bound to any other VM.
> >   2. Any attempt to export an INTERNAL BO via prime or a similar
> > mechanism will fail with -EINVAL (I think?).
> >
> > Inside the kernel driver, all the internal BOs on a VM (or DRM file in
> > the case of nouveau/AMD since they don't have VM objects) share a single
> > dma_resv which allows you to avoid having to walk lists of BOs and take
> > locks on every exec. Instead, you can just look at the fences on the
> > dma_resv for the VM. There's still a BO list associated with the VM f

Re: [Nouveau] [PATCH drm-misc-next v8 03/12] drm/nouveau: new VM_BIND uapi interfaces

2023-07-25 Thread Faith Ekstrand
On Mon, Jul 24, 2023 at 9:04 PM Danilo Krummrich  wrote:

>
>
> On 7/22/23 00:58, Faith Ekstrand wrote:
> >
> > On Wed, Jul 19, 2023 at 7:15 PM Danilo Krummrich <d...@redhat.com> wrote:
> >
> > This commit provides the interfaces for the new UAPI motivated by the
> > Vulkan API. It allows user mode drivers (UMDs) to:
> >
> > 1) Initialize a GPU virtual address (VA) space via the new
> > DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel
> reserved
> > VA area.
> >
> > 2) Bind and unbind GPU VA space mappings via the new
> > DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> >
> > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
> >
> > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
> > asynchronous processing with DRM syncobjs as synchronization
> mechanism.
> >
> > The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
> > DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
> >
> > Co-authored-by: Dave Airlie <airl...@redhat.com>
> > Signed-off-by: Danilo Krummrich <d...@redhat.com>
> > ---
> >   Documentation/gpu/driver-uapi.rst |   8 ++
> >   include/uapi/drm/nouveau_drm.h| 209
> ++
> >   2 files changed, 217 insertions(+)
> >
> > diff --git a/Documentation/gpu/driver-uapi.rst
> > b/Documentation/gpu/driver-uapi.rst
> > index 4411e6919a3d..9c7ca6e33a68 100644
> > --- a/Documentation/gpu/driver-uapi.rst
> > +++ b/Documentation/gpu/driver-uapi.rst
> > @@ -6,3 +6,11 @@ drm/i915 uAPI
> >   =
> >
> >   .. kernel-doc:: include/uapi/drm/i915_drm.h
> > +
> > +drm/nouveau uAPI
> > +
> > +
> > +VM_BIND / EXEC uAPI
> > +---
> > +
> > +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
> > diff --git a/include/uapi/drm/nouveau_drm.h
> > b/include/uapi/drm/nouveau_drm.h
> > index 853a327433d3..4d3a70529637 100644
> > --- a/include/uapi/drm/nouveau_drm.h
> > +++ b/include/uapi/drm/nouveau_drm.h
> > @@ -126,6 +126,209 @@ struct drm_nouveau_gem_cpu_fini {
> >  __u32 handle;
> >   };
> >
> > +/**
> > + * struct drm_nouveau_sync - sync object
> > + *
> > + * This structure serves as synchronization mechanism for
> (potentially)
> > + * asynchronous operations such as EXEC or VM_BIND.
> > + */
> > +struct drm_nouveau_sync {
> > +   /**
> > +* @flags: the flags for a sync object
> > +*
> > +* The first 8 bits are used to determine the type of the
> > sync object.
> > +*/
> > +   __u32 flags;
> > +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
> > +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
> > +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
> > +   /**
> > +* @handle: the handle of the sync object
> > +*/
> > +   __u32 handle;
> > +   /**
> > +* @timeline_value:
> > +*
> > +* The timeline point of the sync object in case the syncobj
> > is of
> > +* type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
> > +*/
> > +   __u64 timeline_value;
> > +};
> > +
> > +/**
> > + * struct drm_nouveau_vm_init - GPU VA space init structure
> > + *
> > + * Used to initialize the GPU's VA space for a user client, telling
> > the kernel
> > + * which portion of the VA space is managed by the UMD and kernel
> > respectively.
> >
> >
> > I assume this has to be called quite early. Like maybe before any BOs or
> > channels are created? In any case, it'd be nice to have that documented.
>
> Exactly, doing any of those will disable the new uAPI entirely if it
> wasn't yet initialized. I will add some documentation for this.
>

Thanks!


> >
> > + */
> > +struct drm_nouveau_vm_init {
> > +   /**
> > +* @unmanaged_addr: start address of the kernel managed VA
> > space region
> > +*/
> > +   __u64 unmanaged_addr;
> > +   /**
> > +* @unmanaged_size: size of the kernel managed VA space
> > region 

Re: [Nouveau] [PATCH drm-misc-next v8 03/12] drm/nouveau: new VM_BIND uapi interfaces

2023-07-23 Thread Faith Ekstrand
On Wed, Jul 19, 2023 at 7:15 PM Danilo Krummrich  wrote:

> This commit provides the interfaces for the new UAPI motivated by the
> Vulkan API. It allows user mode drivers (UMDs) to:
>
> 1) Initialize a GPU virtual address (VA) space via the new
>DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
>VA area.
>
> 2) Bind and unbind GPU VA space mappings via the new
>DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>
> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
>
> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
> asynchronous processing with DRM syncobjs as synchronization mechanism.
>
> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
>
> Co-authored-by: Dave Airlie 
> Signed-off-by: Danilo Krummrich 
> ---
>  Documentation/gpu/driver-uapi.rst |   8 ++
>  include/uapi/drm/nouveau_drm.h| 209 ++
>  2 files changed, 217 insertions(+)
>
> diff --git a/Documentation/gpu/driver-uapi.rst
> b/Documentation/gpu/driver-uapi.rst
> index 4411e6919a3d..9c7ca6e33a68 100644
> --- a/Documentation/gpu/driver-uapi.rst
> +++ b/Documentation/gpu/driver-uapi.rst
> @@ -6,3 +6,11 @@ drm/i915 uAPI
>  =
>
>  .. kernel-doc:: include/uapi/drm/i915_drm.h
> +
> +drm/nouveau uAPI
> +
> +
> +VM_BIND / EXEC uAPI
> +---
> +
> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
> diff --git a/include/uapi/drm/nouveau_drm.h
> b/include/uapi/drm/nouveau_drm.h
> index 853a327433d3..4d3a70529637 100644
> --- a/include/uapi/drm/nouveau_drm.h
> +++ b/include/uapi/drm/nouveau_drm.h
> @@ -126,6 +126,209 @@ struct drm_nouveau_gem_cpu_fini {
> __u32 handle;
>  };
>
> +/**
> + * struct drm_nouveau_sync - sync object
> + *
> + * This structure serves as synchronization mechanism for (potentially)
> + * asynchronous operations such as EXEC or VM_BIND.
> + */
> +struct drm_nouveau_sync {
> +   /**
> +* @flags: the flags for a sync object
> +*
> +* The first 8 bits are used to determine the type of the sync
> object.
> +*/
> +   __u32 flags;
> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
> +   /**
> +* @handle: the handle of the sync object
> +*/
> +   __u32 handle;
> +   /**
> +* @timeline_value:
> +*
> +* The timeline point of the sync object in case the syncobj is of
> +* type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
> +*/
> +   __u64 timeline_value;
> +};
> +
> +/**
> + * struct drm_nouveau_vm_init - GPU VA space init structure
> + *
> + * Used to initialize the GPU's VA space for a user client, telling the
> kernel
> + * which portion of the VA space is managed by the UMD and kernel
> respectively.
>

I assume this has to be called quite early. Like maybe before any BOs or
channels are created? In any case, it'd be nice to have that documented.


> + */
> +struct drm_nouveau_vm_init {
> +   /**
> +* @unmanaged_addr: start address of the kernel managed VA space
> region
> +*/
> +   __u64 unmanaged_addr;
> +   /**
> +* @unmanaged_size: size of the kernel managed VA space region in
> bytes
> +*/
> +   __u64 unmanaged_size;
>

Over-all, I think this is the right API. My only concern is with the word
"unmanaged". None of the VA space is unmanaged. Some is userspace-managed
and some is kernel-managed.  I guess "unmanaged" kinda makes sense because
this is coming from userspace and it's saying which bits it manages and
which bits it doesn't.  Still seems clunky to me.  Maybe kernel_managed?
IDK, that feels weird too. Since we're already using UMD in this file, we
could call it kmd_managed. IDK. 🤷‍♀️

Yeah, I know this is a total bikeshed color thing and I'm not going to
block anything based on it.  Just wanted to see if we can come up with
anything better.  It's documented and that's the important thing.


> +};
> +
> +/**
> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
> + *
> + * This structure represents a single VM_BIND operation. UMDs should pass
> + * an array of this structure via struct drm_nouveau_vm_bind's _ptr
> field.
> + */
> +struct drm_nouveau_vm_bind_op {
> +   /**
> +* @op: the operation type
> +*/
> +   __u32 op;
> +/**
> + * @DRM_NOUVEAU_VM_BIND_OP_MAP:
> + *
> + * Map a GEM object to the GPU's VA space. Optionally, the
> + * _NOUVEAU_VM_BIND_SPARSE flag can be passed to instruct the kernel
> to
> + * create sparse mappings for the given range.
> + */
> +#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x0
> +/**
> + * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
> + *
> + * Unmap an existing mapping in the GPU's VA space. If the region the
> mapping
> + * is located in is a sparse region, new sparse mappings are created
> where the
> + * unmapped (memory backed) mapping was mapped