Re: [PATCH v4] drm/nouveau: use tile_mode and pte_kind for VM_BIND bo allocations

2024-05-09 Thread Faith Ekstrand
On Thu, May 9, 2024 at 3:44 PM Mohamed Ahmed <
mohamedahmedegypt2...@gmail.com> wrote:

> Allows PTE kind and tile mode on BO create with VM_BIND,
> and adds a GETPARAM to indicate this change. This is needed to support
> modifiers in NVK and ensure correctness when dealing with the nouveau
> GL driver.
>
> The userspace modifiers implementation this is for can be found here:
> https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24795
>
> Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI")
> Signed-off-by: Mohamed Ahmed 
>

Reviewed-by: Faith Ekstrand 


> ---
>  drivers/gpu/drm/nouveau/nouveau_abi16.c |  3 ++
>  drivers/gpu/drm/nouveau/nouveau_bo.c| 44 +++--
>  include/uapi/drm/nouveau_drm.h  |  7 
>  3 files changed, 29 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> index 80f74ee0f..47e53e17b 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> @@ -272,6 +272,9 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
> getparam->value =
> (u64)ttm_resource_manager_usage(vram_mgr);
> break;
> }
> +   case NOUVEAU_GETPARAM_HAS_VMA_TILEMODE:
> +   getparam->value = 1;
> +   break;
> default:
> NV_PRINTK(dbg, cli, "unknown parameter %lld\n",
> getparam->param);
> return -EINVAL;
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c
> b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index db8cbf615..186add400 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -241,28 +241,28 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size,
> int *align, u32 domain,
> }
>
> nvbo->contig = !(tile_flags & NOUVEAU_GEM_TILE_NONCONTIG);
> -   if (!nouveau_cli_uvmm(cli) || internal) {
> -   /* for BO noVM allocs, don't assign kinds */
> -   if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) {
> -   nvbo->kind = (tile_flags & 0xff00) >> 8;
> -   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> -   kfree(nvbo);
> -   return ERR_PTR(-EINVAL);
> -   }
>
> -   nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind;
> -   } else if (cli->device.info.family >=
> NV_DEVICE_INFO_V0_TESLA) {
> -   nvbo->kind = (tile_flags & 0x7f00) >> 8;
> -   nvbo->comp = (tile_flags & 0x0003) >> 16;
> -   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> -   kfree(nvbo);
> -   return ERR_PTR(-EINVAL);
> -   }
> -   } else {
> -   nvbo->zeta = (tile_flags & 0x0007);
> +   if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) {
> +   nvbo->kind = (tile_flags & 0xff00) >> 8;
> +   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> +   kfree(nvbo);
> +   return ERR_PTR(-EINVAL);
> +   }
> +
> +   nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind;
> +   } else if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) {
> +   nvbo->kind = (tile_flags & 0x7f00) >> 8;
> +   nvbo->comp = (tile_flags & 0x0003) >> 16;
> +   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> +   kfree(nvbo);
> +   return ERR_PTR(-EINVAL);
> }
> -   nvbo->mode = tile_mode;
> +   } else {
> +   nvbo->zeta = (tile_flags & 0x0007);
> +   }
> +   nvbo->mode = tile_mode;
>
> +   if (!nouveau_cli_uvmm(cli) || internal) {
> /* Determine the desirable target GPU page size for the
> buffer. */
> for (i = 0; i < vmm->page_nr; i++) {
> /* Because we cannot currently allow VMM maps to
> fail
> @@ -304,12 +304,6 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size,
> int *align, u32 domain,
> }
> nvbo->page = vmm->page[pi].shift;
> } else {
> -   /* reject other tile flags when in VM mode. */
> -   if (tile_mode)
> -   

Re: [PATCH v3] drm/nouveau: use tile_mode and pte_kind for VM_BIND bo allocations

2024-05-08 Thread Faith Ekstrand
On Wed, May 8, 2024 at 6:06 PM Mohamed Ahmed <
mohamedahmedegypt2...@gmail.com> wrote:

> Allows PTE kind and tile mode on BO create with VM_BIND,
> and adds a GETPARAM to indicate this change. This is needed to support
> modifiers in NVK and ensure correctness when dealing with the nouveau
> GL driver.
>
> The userspace modifiers implementation this is for can be found here:
> https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28843
>
> Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI")
> Signed-off-by: Mohamed Ahmed 
> ---
>  drivers/gpu/drm/nouveau/nouveau_abi16.c |  3 ++
>  drivers/gpu/drm/nouveau/nouveau_bo.c| 45 +++--
>  include/uapi/drm/nouveau_drm.h  |  7 
>  3 files changed, 30 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> index 80f74ee0f..47e53e17b 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> @@ -272,6 +272,9 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
> getparam->value =
> (u64)ttm_resource_manager_usage(vram_mgr);
> break;
> }
> +   case NOUVEAU_GETPARAM_HAS_VMA_TILEMODE:
> +   getparam->value = 1;
> +   break;
> default:
> NV_PRINTK(dbg, cli, "unknown parameter %lld\n",
> getparam->param);
> return -EINVAL;
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c
> b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index db8cbf615..583c962ef 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -241,28 +241,29 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size,
> int *align, u32 domain,
> }
>
> nvbo->contig = !(tile_flags & NOUVEAU_GEM_TILE_NONCONTIG);
> -   if (!nouveau_cli_uvmm(cli) || internal) {
> -   /* for BO noVM allocs, don't assign kinds */
> -   if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) {
> -   nvbo->kind = (tile_flags & 0xff00) >> 8;
> -   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> -   kfree(nvbo);
> -   return ERR_PTR(-EINVAL);
> -   }
>
> -   nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind;
> -   } else if (cli->device.info.family >=
> NV_DEVICE_INFO_V0_TESLA) {
> -   nvbo->kind = (tile_flags & 0x7f00) >> 8;
> -   nvbo->comp = (tile_flags & 0x0003) >> 16;
> -   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> -   kfree(nvbo);
> -   return ERR_PTR(-EINVAL);
> -   }
> -   } else {
> -   nvbo->zeta = (tile_flags & 0x0007);
> +   /* for BO allocs, don't assign kinds */
>

I think this comment is stale. We're now assigning them in both cases.


> +   if (cli->device.info.family >= NV_DEVICE_INFO_V0_FERMI) {
> +   nvbo->kind = (tile_flags & 0xff00) >> 8;
> +   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> +   kfree(nvbo);
> +   return ERR_PTR(-EINVAL);
> +   }
> +
> +   nvbo->comp = mmu->kind[nvbo->kind] != nvbo->kind;
> +   } else if (cli->device.info.family >= NV_DEVICE_INFO_V0_TESLA) {
> +   nvbo->kind = (tile_flags & 0x7f00) >> 8;
> +   nvbo->comp = (tile_flags & 0x0003) >> 16;
> +   if (!nvif_mmu_kind_valid(mmu, nvbo->kind)) {
> +   kfree(nvbo);
> +   return ERR_PTR(-EINVAL);
> }
> -   nvbo->mode = tile_mode;
> +   } else {
> +   nvbo->zeta = (tile_flags & 0x0007);
> +   }
> +   nvbo->mode = tile_mode;
>
> +   if (!nouveau_cli_uvmm(cli) || internal) {
> /* Determine the desirable target GPU page size for the
> buffer. */
> for (i = 0; i < vmm->page_nr; i++) {
> /* Because we cannot currently allow VMM maps to
> fail
> @@ -304,12 +305,6 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size,
> int *align, u32 domain,
> }
> nvbo->page = vmm->page[pi].shift;
> } else {
> -   /* reject other tile flags when in VM mode. */
> -   if (tile_mode)
> -   return ERR_PTR(-EINVAL);
> -   if (tile_flags & ~NOUVEAU_GEM_TILE_NONCONTIG)
> -   return ERR_PTR(-EINVAL);
> -
> /* Determine the desirable target GPU page size for the
> buffer. */
> for (i = 0; i < vmm->page_nr; i++) {
> /* Because we cannot currently allow VMM maps to
> fail
> diff --git a/include/uapi/drm/nouveau_drm.h
> 

Re: [PATCH v3 00/14] drm: Add a driver for CSF-based Mali GPUs

2023-12-11 Thread Faith Ekstrand
On Mon, 2023-12-11 at 09:52 +0100, Boris Brezillon wrote:
> Hi,
> 
> On Sun, 10 Dec 2023 13:58:51 +0900
> Tatsuyuki Ishi  wrote:
> 
> > > On Dec 5, 2023, at 2:32, Boris Brezillon
> > >  wrote:
> > > 
> > > Hello,
> > > 
> > > This is the 3rd version of the kernel driver for Mali CSF-based
> > > GPUs.
> > > 
> > > With all the DRM dependencies being merged (drm-sched single
> > > entity and
> > > drm_gpuvm), I thought now was a good time to post a new version.
> > > Note
> > > that the iommu series we depend on [1] has been merged recently.
> > > The
> > > only remaining dependency that hasn't been merged yet is this
> > > rather
> > > trival drm_gpuvm [2] patch.
> > > 
> > > As for v2, I pushed a branch based on drm-misc-next and
> > > containing
> > > all the dependencies that are not yet available in drm-misc-next
> > > here[3], and another [4] containing extra patches to have things
> > > working on rk3588. The CSF firmware binary can be found here[5],
> > > and
> > > should be placed under
> > > /lib/firmware/arm/mali/arch10.8/mali_csffw.bin.
> > > 
> > > The mesa MR adding v10 support on top of panthor is available
> > > here [6].
> > > 
> > > Regarding the GPL2+MIT relicensing, Liviu did an audit and found
> > > two
> > > more people that I didn't spot initially: Clément Péron for the
> > > devfreq
> > > code, and Alexey Sheplyakov for some bits in panthor_gpu.c. Both
> > > are
> > > Cc-ed on the relevant patches. The rest of the code is either
> > > new, or
> > > covered by the Linaro, Arm and Collabora acks.
> > > 
> > > And here is a non-exhaustive changelog, check each commit for a
> > > detailed
> > > changelog.
> > > 
> > > v3;
> > > - Quite a few changes at the MMU/sched level to make the fix some
> > >  race conditions and deadlocks
> > > - Addition of the a sync-only VM_BIND operation (to support
> > >  vkQueueSparseBind with zero commands).  
> > 
> > Hi Boris,
> > 
> > Just wanted to point out that vkQueueBindSparse's semantics is
> > rather different
> > from vkQueueSubmit when it comes to synchronization.  In short,
> > vkQueueBindSparse does not operate on a particular timeline (aka
> > scheduling
> > queue / drm_sched_entity).  The property of following a timeline
> > order is known
> > as “submission order” [1] in Vulkan, and applies to vkQueueSubmit
> > only and not
> > vkQueueBindSparse.
> 
> Hm, okay. I really though the same ordering guarantees applied to
> sparse binding queues too, as the spec [1] says
> 
> "
> Batches begin execution in the order they appear in pBindInfo, but
> may complete out of order.
> "

Right. So this is something where the Vulkan spec isn't terribly clear
and I think I need to go file a spec bug.  I'm fairly sure that the
intent is that bind operations MAY complete out-of-order but are not
required to complete out-of-order.  In other words, I'm fairly sure
that implementations are allowed but not required to insert extra
dependencies that force some amount of ordering.  We take advantage of
this in Mesa today to properly implement vkQueueWaitIdle() on sparse
binding queues.  This is also a requirement of Windows WDDM2 as far as
I can tell so I'd be very surprised if we disallowed it in the spec.

That said, that's all very unclear and IDK where you'd get that from
the spec.  I'm going to go file a spec bug so we can get this sorted
out.

~Faith


> which means things are submitted in order inside a vkQueueSparseBind
> context, so I was expecting the submission ordering guarantee to
> apply
> across vkQueueSparseBind() calls on the same queue too. Just want to
> mention that all kernel implementations I have seen so far assume
> VM_BIND submissions on a given queue are serialized (that's how
> drm_sched works, and Xe, Nouveau and Panthor are basing their VM_BIND
> implemenation on drm_sched).
> 
> Maybe Faith, or anyone deeply involved in the Vulkan specification,
> can
> confirm that submission ordering guarantees are relaxed on sparse
> binding queues.
> 
> > 
> > Hence, an implementation that takes full advantage of Vulkan
> > semantics would
> > essentially have an infinite amount of VM_BIND contexts.
> 
> Uh, that's definitely not how I understood it initially...
> 
> > It would also not need
> > sync-only VM_BIND submissions, assuming that drmSyncobjTransfer
> > works.
> 
> Sure, if each vkQueueSparseBind() has its own timeline, an internal
> timeline-syncobj with a bunch of drmSyncobjTransfer() calls would do
> the
> trick (might require several ioctls() to merge input syncobjs, but
> that's probably not the end of the world).
> 
> > 
> > I’m not saying that the driver should always be implemented that
> > way; in
> > particular, app developers are also confused by the semantics and
> > native Vulkan
> > games can be terribly wrong [2].
> 
> Okay, thanks for the pointer. If I may, I find this semantics utterly
> confusing, to say the least. At the very least, the
> vkQueueBindSparse()
> doc could be update so it clearly reflects the facts 

Re: [PATCH v8 04/20] drm/imagination/uapi: Add PowerVR driver UAPI

2023-11-06 Thread Faith Ekstrand
On Tue, 2023-10-31 at 15:12 +, Sarah Walker wrote:
> Add the UAPI implementation for the PowerVR driver.
> 
> Changes from v7:
> - Remove prefixes from DRM_PVR_BO_* flags
> - Improve struct drm_pvr_ioctl_create_hwrt_dataset_args documentation
> - Remove references to static area carveouts
> - CREATE_BO ioctl now returns an error if provided size isn't page
> aligned
> - Clarify documentation for DRM_PVR_STATIC_DATA_AREA_EOT
> 
> Changes from v6:
> - Add padding to struct drm_pvr_dev_query_gpu_info
> - Improve BYPASS_CACHE flag documentation
> - Add SUBMIT_JOB_FRAG_CMD_DISABLE_PIXELMERGE flag
> 
> Changes from v4:
> - Remove CREATE_ZEROED flag for BO creation (all buffers are now
> zeroed)

Latest API is

Reviewed-by: Faith Ekstrand 

I think we're good to begin the upstreaming process.

~Faith


> Co-developed-by: Frank Binns 
> Signed-off-by: Frank Binns 
> Co-developed-by: Boris Brezillon 
> Signed-off-by: Boris Brezillon 
> Co-developed-by: Matt Coster 
> Signed-off-by: Matt Coster 
> Co-developed-by: Donald Robson 
> Signed-off-by: Donald Robson 
> Signed-off-by: Sarah Walker 
> ---
>  MAINTAINERS    |    1 +
>  include/uapi/drm/pvr_drm.h | 1297
> 
>  2 files changed, 1298 insertions(+)
>  create mode 100644 include/uapi/drm/pvr_drm.h
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 5b2018ff3e0f..bef1ac7ed647 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -10265,6 +10265,7 @@ M:Frank Binns 
>  M:   Donald Robson 
>  S:   Supported
>  F:   Documentation/devicetree/bindings/gpu/img,powervr.yaml
> +F:   include/uapi/drm/pvr_drm.h
>  
>  IMON SOUNDGRAPH USB IR RECEIVER
>  M:   Sean Young 
> diff --git a/include/uapi/drm/pvr_drm.h b/include/uapi/drm/pvr_drm.h
> new file mode 100644
> index ..d8e82f2a072a
> --- /dev/null
> +++ b/include/uapi/drm/pvr_drm.h
> @@ -0,0 +1,1297 @@
> +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT
> */
> +/* Copyright (c) 2023 Imagination Technologies Ltd. */
> +
> +#ifndef PVR_DRM_UAPI_H
> +#define PVR_DRM_UAPI_H
> +
> +#include "drm.h"
> +
> +#include 
> +#include 
> +
> +#if defined(__cplusplus)
> +extern "C" {
> +#endif
> +
> +/**
> + * DOC: PowerVR UAPI
> + *
> + * The PowerVR IOCTL argument structs have a few limitations in
> place, in
> + * addition to the standard kernel restrictions:
> + *
> + *  - All members must be type-aligned.
> + *  - The overall struct must be padded to 64-bit alignment.
> + *  - Explicit padding is almost always required. This takes the
> form of
> + *    ``_padding_[x]`` members of sufficient size to pad to the next
> power-of-two
> + *    alignment, where [x] is the offset into the struct in
> hexadecimal. Arrays
> + *    are never used for alignment. Padding fields must be zeroed;
> this is
> + *    always checked.
> + *  - Unions may only appear as the last member of a struct.
> + *  - Individual union members may grow in the future. The space
> between the
> + *    end of a union member and the end of its containing union is
> considered
> + *    "implicit padding" and must be zeroed. This is always checked.
> + *
> + * In addition to the IOCTL argument structs, the PowerVR UAPI makes
> use of
> + * DEV_QUERY argument structs. These are used to fetch information
> about the
> + * device and runtime. These structs are subject to the same rules
> set out
> + * above.
> + */
> +
> +/**
> + * struct drm_pvr_obj_array - Container used to pass arrays of
> objects
> + *
> + * It is not unusual to have to extend objects to pass new
> parameters, and the DRM
> + * ioctl infrastructure is supporting that by padding ioctl
> arguments with zeros
> + * when the data passed by userspace is smaller than the struct
> defined in the
> + * drm_ioctl_desc, thus keeping things backward compatible. This
> type is just
> + * applying the same concepts to indirect objects passed through
> arrays referenced
> + * from the main ioctl arguments structure: the stride basically
> defines the size
> + * of the object passed by userspace, which allows the kernel driver
> to pad with
> + * zeros when it's smaller than the size of the object it expects.
> + *
> + * Use ``DRM_PVR_OBJ_ARRAY()`` to fill object array fields, unless
> you
> + * have a very good reason not to.
> + */
> +struct drm_pvr_obj_array {
> + /** @stride: Stride of object struct. Used for versioning.
> */
> + __u32 stride;
> +
> + /** @count: Number of objects in the array. */
> + __u32 count;
> +
> + /** @array: User pointer to an array of objects. */
&g

Re: [PATCH 3/3] drm/nouveau: exec: report max pushs through getparam

2023-09-27 Thread Faith Ekstrand
On Thu, 2023-09-28 at 11:12 +1000, Dave Airlie wrote:
> On Thu, 28 Sept 2023 at 07:55, Faith Ekstrand
>  wrote:
> > 
> > On Wed, 2023-09-27 at 03:22 +0200, Danilo Krummrich wrote:
> > > Report the maximum number of IBs that can be pushed with a single
> > > DRM_IOCTL_NOUVEAU_EXEC through DRM_IOCTL_NOUVEAU_GETPARAM.
> > > 
> > > While the maximum number of IBs per ring might vary between
> > > chipsets,
> > > the kernel will make sure that userspace can only push a fraction
> > > of
> > > the
> > > maximum number of IBs per ring per job, such that we avoid a
> > > situation
> > > where there's only a single job occupying the ring, which could
> > > potentially lead to the ring run dry.
> > > 
> > > Using DRM_IOCTL_NOUVEAU_GETPARAM to report the maximum number of
> > > IBs
> > > that can be pushed with a single DRM_IOCTL_NOUVEAU_EXEC implies
> > > that
> > > all channels of a given device have the same ring size.
> > 
> > There's a bunch of nouveau kernel details I don't know here but the
> > interface looks good and I prefer it to a #define in the header.
> > 
> > Acked-by: Faith Ekstrand 
> 
> For the series
> 
> Reviewed-by: Dave Airlie 
> 
> we should probably land this in drm-misc-fixes, since it would be
> good
> to have in 6.6

Agreed.  My Mesa patch should handle both the case where we have the
getparam and when we don't.  However, I'd rather just make it part of
the new UAPI from the start and have a hard requirement on it since it
may reduce the current maximum in the header.

~Faith


> Dave.
> 
> > 
> > 
> > > Signed-off-by: Danilo Krummrich 
> > > ---
> > >  drivers/gpu/drm/nouveau/nouveau_abi16.c | 19 +++
> > >  drivers/gpu/drm/nouveau/nouveau_chan.c  |  2 +-
> > >  drivers/gpu/drm/nouveau/nouveau_dma.h   |  3 +++
> > >  drivers/gpu/drm/nouveau/nouveau_exec.c  |  7 ---
> > >  drivers/gpu/drm/nouveau/nouveau_exec.h  |  5 +
> > >  include/uapi/drm/nouveau_drm.h  | 10 ++
> > >  6 files changed, 42 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> > > b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> > > index 30afbec9e3b1..1a198689b391 100644
> > > --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> > > +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> > > @@ -31,6 +31,7 @@
> > > 
> > >  #include "nouveau_drv.h"
> > >  #include "nouveau_dma.h"
> > > +#include "nouveau_exec.h"
> > >  #include "nouveau_gem.h"
> > >  #include "nouveau_chan.h"
> > >  #include "nouveau_abi16.h"
> > > @@ -183,6 +184,20 @@ nouveau_abi16_fini(struct nouveau_abi16
> > > *abi16)
> > >     cli->abi16 = NULL;
> > >  }
> > > 
> > > +static inline unsigned int
> > > +getparam_dma_ib_max(struct nvif_device *device)
> > > +{
> > > +   const struct nvif_mclass dmas[] = {
> > > +   { NV03_CHANNEL_DMA, 0 },
> > > +   { NV10_CHANNEL_DMA, 0 },
> > > +   { NV17_CHANNEL_DMA, 0 },
> > > +   { NV40_CHANNEL_DMA, 0 },
> > > +   {}
> > > +   };
> > > +
> > > +   return nvif_mclass(>object, dmas) < 0 ?
> > > NV50_DMA_IB_MAX : 0;
> > > +}
> > > +
> > >  int
> > >  nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
> > >  {
> > > @@ -247,6 +262,10 @@
> > > nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
> > >     case NOUVEAU_GETPARAM_GRAPH_UNITS:
> > >     getparam->value = nvkm_gr_units(gr);
> > >     break;
> > > +   case NOUVEAU_GETPARAM_EXEC_PUSH_MAX:
> > > +   getparam->value = getparam_dma_ib_max(device) /
> > > + NOUVEAU_EXEC_PUSH_MAX_DIV;
> > > +   break;
> > >     default:
> > >     NV_PRINTK(dbg, cli, "unknown parameter %lld\n",
> > > getparam->param);
> > >     return -EINVAL;
> > > diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c
> > > b/drivers/gpu/drm/nouveau/nouveau_chan.c
> > > index ac56f4689ee3..c3c2ab887978 100644
> > > --- a/drivers/gpu/drm/nouveau/nouveau_chan.c
> > > +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c
> > > @@ -456,7 +456,7 @@ nouveau_channel

Re: [PATCH 3/3] drm/nouveau: exec: report max pushs through getparam

2023-09-27 Thread Faith Ekstrand
On Wed, 2023-09-27 at 03:22 +0200, Danilo Krummrich wrote:
> Report the maximum number of IBs that can be pushed with a single
> DRM_IOCTL_NOUVEAU_EXEC through DRM_IOCTL_NOUVEAU_GETPARAM.
> 
> While the maximum number of IBs per ring might vary between chipsets,
> the kernel will make sure that userspace can only push a fraction of
> the
> maximum number of IBs per ring per job, such that we avoid a
> situation
> where there's only a single job occupying the ring, which could
> potentially lead to the ring run dry.
> 
> Using DRM_IOCTL_NOUVEAU_GETPARAM to report the maximum number of IBs
> that can be pushed with a single DRM_IOCTL_NOUVEAU_EXEC implies that
> all channels of a given device have the same ring size.

There's a bunch of nouveau kernel details I don't know here but the
interface looks good and I prefer it to a #define in the header.

Acked-by: Faith Ekstrand 


> Signed-off-by: Danilo Krummrich 
> ---
>  drivers/gpu/drm/nouveau/nouveau_abi16.c | 19 +++
>  drivers/gpu/drm/nouveau/nouveau_chan.c  |  2 +-
>  drivers/gpu/drm/nouveau/nouveau_dma.h   |  3 +++
>  drivers/gpu/drm/nouveau/nouveau_exec.c  |  7 ---
>  drivers/gpu/drm/nouveau/nouveau_exec.h  |  5 +
>  include/uapi/drm/nouveau_drm.h  | 10 ++
>  6 files changed, 42 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> index 30afbec9e3b1..1a198689b391 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_abi16.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.c
> @@ -31,6 +31,7 @@
>  
>  #include "nouveau_drv.h"
>  #include "nouveau_dma.h"
> +#include "nouveau_exec.h"
>  #include "nouveau_gem.h"
>  #include "nouveau_chan.h"
>  #include "nouveau_abi16.h"
> @@ -183,6 +184,20 @@ nouveau_abi16_fini(struct nouveau_abi16 *abi16)
> cli->abi16 = NULL;
>  }
>  
> +static inline unsigned int
> +getparam_dma_ib_max(struct nvif_device *device)
> +{
> +   const struct nvif_mclass dmas[] = {
> +   { NV03_CHANNEL_DMA, 0 },
> +   { NV10_CHANNEL_DMA, 0 },
> +   { NV17_CHANNEL_DMA, 0 },
> +   { NV40_CHANNEL_DMA, 0 },
> +   {}
> +   };
> +
> +   return nvif_mclass(>object, dmas) < 0 ?
> NV50_DMA_IB_MAX : 0;
> +}
> +
>  int
>  nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
>  {
> @@ -247,6 +262,10 @@ nouveau_abi16_ioctl_getparam(ABI16_IOCTL_ARGS)
> case NOUVEAU_GETPARAM_GRAPH_UNITS:
> getparam->value = nvkm_gr_units(gr);
> break;
> +   case NOUVEAU_GETPARAM_EXEC_PUSH_MAX:
> +   getparam->value = getparam_dma_ib_max(device) /
> + NOUVEAU_EXEC_PUSH_MAX_DIV;
> +   break;
> default:
> NV_PRINTK(dbg, cli, "unknown parameter %lld\n",
> getparam->param);
> return -EINVAL;
> diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c
> b/drivers/gpu/drm/nouveau/nouveau_chan.c
> index ac56f4689ee3..c3c2ab887978 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_chan.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c
> @@ -456,7 +456,7 @@ nouveau_channel_init(struct nouveau_channel
> *chan, u32 vram, u32 gart)
> chan->user_get = 0x44;
> chan->user_get_hi = 0x60;
> chan->dma.ib_base =  0x1 / 4;
> -   chan->dma.ib_max  = (0x02000 / 8) - 1;
> +   chan->dma.ib_max  = NV50_DMA_IB_MAX;
> chan->dma.ib_put  = 0;
> chan->dma.ib_free = chan->dma.ib_max - chan-
> >dma.ib_put;
> chan->dma.max = chan->dma.ib_base;
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h
> b/drivers/gpu/drm/nouveau/nouveau_dma.h
> index 1744d95b233e..c52cda82353e 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dma.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
> @@ -49,6 +49,9 @@ void nv50_dma_push(struct nouveau_channel *, u64
> addr, u32 length,
>  /* Maximum push buffer size. */
>  #define NV50_DMA_PUSH_MAX_LENGTH 0x7f
>  
> +/* Maximum IBs per ring. */
> +#define NV50_DMA_IB_MAX ((0x02000 / 8) - 1)
> +
>  /* Object handles - for stuff that's doesn't use handle == oclass.
> */
>  enum {
> NvDmaFB = 0x8002,
> diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c
> b/drivers/gpu/drm/nouveau/nouveau_exec.c
> index ba6913a3efb6..5b5c4a77b8e6 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_exec.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
> @@ -346,7 +346,7 @@ nouveau_exec_ioctl_exec(stru

Re: [PATCH drm-misc-next v2] drm/nouveau: uapi: don't pass NO_PREFETCH flag implicitly

2023-08-23 Thread Faith Ekstrand
On Wed, Aug 23, 2023 at 1:17 PM Danilo Krummrich  wrote:

> Currently, NO_PREFETCH is passed implicitly through
> drm_nouveau_gem_pushbuf_push::length and drm_nouveau_exec_push::va_len.
>
> Since this is a direct representation of how the HW is programmed it
> isn't really future proof for a uAPI. Hence, fix this up for the new
> uAPI and split up the va_len field of struct drm_nouveau_exec_push,
> such that we keep 32bit for va_len and 32bit for flags.
>
> For drm_nouveau_gem_pushbuf_push::length at least provide
> NOUVEAU_GEM_PUSHBUF_NO_PREFETCH to indicate the bit shift.
>
> While at it, fix up nv50_dma_push() as well, such that the caller
> doesn't need to encode the NO_PREFETCH flag into the length parameter.
>
> Signed-off-by: Danilo Krummrich 
>

Still

Reviewed-by: Faith Ekstrand 


> ---
> Changes in v2:
>   - dma: rename prefetch to no_prefetch in nv50_dma_push() (Faith)
>   - exec: print error message when pushbuf size larger max pushbuf size
> (Faith)
> ---
>  drivers/gpu/drm/nouveau/nouveau_dma.c  |  7 +--
>  drivers/gpu/drm/nouveau/nouveau_dma.h  |  8 ++--
>  drivers/gpu/drm/nouveau/nouveau_exec.c | 19 ---
>  drivers/gpu/drm/nouveau/nouveau_gem.c  |  6 --
>  include/uapi/drm/nouveau_drm.h |  8 +++-
>  5 files changed, 38 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.c
> b/drivers/gpu/drm/nouveau/nouveau_dma.c
> index b90cac6d5772..b01c029f3a90 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dma.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_dma.c
> @@ -69,16 +69,19 @@ READ_GET(struct nouveau_channel *chan, uint64_t
> *prev_get, int *timeout)
>  }
>
>  void
> -nv50_dma_push(struct nouveau_channel *chan, u64 offset, int length)
> +nv50_dma_push(struct nouveau_channel *chan, u64 offset, u32 length,
> + bool no_prefetch)
>  {
> struct nvif_user *user = >drm->client.device.user;
> struct nouveau_bo *pb = chan->push.buffer;
> int ip = (chan->dma.ib_put * 2) + chan->dma.ib_base;
>
> BUG_ON(chan->dma.ib_free < 1);
> +   WARN_ON(length > NV50_DMA_PUSH_MAX_LENGTH);
>
> nouveau_bo_wr32(pb, ip++, lower_32_bits(offset));
> -   nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8);
> +   nouveau_bo_wr32(pb, ip++, upper_32_bits(offset) | length << 8 |
> +   (no_prefetch ? (1 << 31) : 0));
>
> chan->dma.ib_put = (chan->dma.ib_put + 1) & chan->dma.ib_max;
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_dma.h
> b/drivers/gpu/drm/nouveau/nouveau_dma.h
> index 035a709c7be1..1744d95b233e 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_dma.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_dma.h
> @@ -31,7 +31,8 @@
>  #include "nouveau_chan.h"
>
>  int nouveau_dma_wait(struct nouveau_channel *, int slots, int size);
> -void nv50_dma_push(struct nouveau_channel *, u64 addr, int length);
> +void nv50_dma_push(struct nouveau_channel *, u64 addr, u32 length,
> +  bool no_prefetch);
>
>  /*
>   * There's a hw race condition where you can't jump to your PUT offset,
> @@ -45,6 +46,9 @@ void nv50_dma_push(struct nouveau_channel *, u64 addr,
> int length);
>   */
>  #define NOUVEAU_DMA_SKIPS (128 / 4)
>
> +/* Maximum push buffer size. */
> +#define NV50_DMA_PUSH_MAX_LENGTH 0x7f
> +
>  /* Object handles - for stuff that's doesn't use handle == oclass. */
>  enum {
> NvDmaFB = 0x8002,
> @@ -89,7 +93,7 @@ FIRE_RING(struct nouveau_channel *chan)
>
> if (chan->dma.ib_max) {
> nv50_dma_push(chan, chan->push.addr + (chan->dma.put << 2),
> - (chan->dma.cur - chan->dma.put) << 2);
> + (chan->dma.cur - chan->dma.put) << 2, false);
> } else {
> WRITE_PUT(chan->dma.cur);
> }
> diff --git a/drivers/gpu/drm/nouveau/nouveau_exec.c
> b/drivers/gpu/drm/nouveau/nouveau_exec.c
> index 0f927adda4ed..a90c4cd8cbb2 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_exec.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_exec.c
> @@ -164,8 +164,10 @@ nouveau_exec_job_run(struct nouveau_job *job)
> }
>
> for (i = 0; i < exec_job->push.count; i++) {
> -   nv50_dma_push(chan, exec_job->push.s[i].va,
> - exec_job->push.s[i].va_len);
> +   struct drm_nouveau_exec_push *p = _job->push.s[i];
> +   bool no_prefetch = p->flags &
> DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH;
> +
> +   nv50_dma_push(chan, p->va, p->va_len,

Re: [PATCH drm-misc-next] drm/nouveau: uapi: don't pass NO_PREFETCH flag implicitly

2023-08-22 Thread Faith Ekstrand
>  {
> struct nouveau_exec_job *job;
> struct nouveau_job_args args = {};
> -   int ret;
> +   int i, ret;
> +
> +   for (i = 0; i < __args->push.count; i++) {
> +   struct drm_nouveau_exec_push *p = &__args->push.s[i];
> +
> +   if (p->va_len > NV50_DMA_PUSH_MAX_LENGTH)
> +   return -EINVAL;
>

This can probably be wrapped in unlikely().  Also, it'd be nice if we
printed an error message like we do if you try to push too many things.

Looks good. Thanks!

Reviewed-by: Faith Ekstrand 


> +   }
>
> job = *pjob = kzalloc(sizeof(*job), GFP_KERNEL);
> if (!job)
> diff --git a/drivers/gpu/drm/nouveau/nouveau_gem.c
> b/drivers/gpu/drm/nouveau/nouveau_gem.c
> index f39360870c70..2f3dc4d71657 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_gem.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_gem.c
> @@ -856,9 +856,11 @@ nouveau_gem_ioctl_pushbuf(struct drm_device *dev,
> void *data,
> for (i = 0; i < req->nr_push; i++) {
> struct nouveau_vma *vma = (void *)(unsigned long)
> bo[push[i].bo_index].user_priv;
> +   u64 addr = vma->addr + push[i].offset;
> +   u32 length = push[i].length &
> ~NOUVEAU_GEM_PUSHBUF_NO_PREFETCH;
> +   bool prefetch = !(push[i].length &
> NOUVEAU_GEM_PUSHBUF_NO_PREFETCH);
>
> -   nv50_dma_push(chan, vma->addr + push[i].offset,
> - push[i].length);
> +   nv50_dma_push(chan, addr, length, prefetch);
> }
> } else
> if (drm->client.device.info.chipset >= 0x25) {
> diff --git a/include/uapi/drm/nouveau_drm.h
> b/include/uapi/drm/nouveau_drm.h
> index b1ad9d5ffce8..8f16724b5d05 100644
> --- a/include/uapi/drm/nouveau_drm.h
> +++ b/include/uapi/drm/nouveau_drm.h
> @@ -138,6 +138,7 @@ struct drm_nouveau_gem_pushbuf_push {
> __u32 pad;
> __u64 offset;
> __u64 length;
> +#define NOUVEAU_GEM_PUSHBUF_NO_PREFETCH (1 << 23)
>  };
>
>  struct drm_nouveau_gem_pushbuf {
> @@ -338,7 +339,12 @@ struct drm_nouveau_exec_push {
> /**
>  * @va_len: the length of the push buffer mapping
>  */
> -   __u64 va_len;
> +   __u32 va_len;
> +   /**
> +* flags: the flags for this push buffer mapping
> +*/
> +   __u32 flags;
> +#define DRM_NOUVEAU_EXEC_PUSH_NO_PREFETCH 0x1
>  };
>
>  /**
>
> base-commit: ad1367f831f8743746a1f49705c28e36a7c95525
> --
> 2.41.0
>
>


Re: [PATCH v2 1/9] drm/sched: Convert drm scheduler to use a work queue rather than kthread

2023-08-22 Thread Faith Ekstrand
On Tue, Aug 22, 2023 at 4:51 AM Christian König 
wrote:

> Am 21.08.23 um 21:46 schrieb Faith Ekstrand:
>
> On Mon, Aug 21, 2023 at 1:13 PM Christian König 
> wrote:
>
>> [SNIP]
>> So as long as nobody from userspace comes and says we absolutely need to
>> optimize this use case I would rather not do it.
>>
>
> This is a place where nouveau's needs are legitimately different from AMD
> or Intel, I think.  NVIDIA's command streamer model is very different from
> AMD and Intel.  On AMD and Intel, each EXEC turns into a single small
> packet (on the order of 16B) which kicks off a command buffer.  There may
> be a bit of cache management or something around it but that's it.  From
> there, it's userspace's job to make one command buffer chain to another
> until it's finally done and then do a "return", whatever that looks like.
>
> NVIDIA's model is much more static.  Each packet in the HW/FW ring is an
> address and a size and that much data is processed and then it grabs the
> next packet and processes. The result is that, if we use multiple buffers
> of commands, there's no way to chain them together.  We just have to pass
> the whole list of buffers to the kernel.
>
>
> So far that is actually completely identical to what AMD has.
>
> A single EXEC ioctl / job may have 500 such addr+size packets depending on
> how big the command buffer is.
>
>
> And that is what I don't understand. Why would you need 100dreds of such
> addr+size packets?
>

Well, we're not really in control of it.  We can control our base pushbuf
size and that's something we can tune but we're still limited by the
client.  We have to submit another pushbuf whenever:

 1. We run out of space (power-of-two growth is also possible but the size
is limited to a maximum of about 4MiB due to hardware limitations.)
 2. The client calls a secondary command buffer.
 3. Any usage of indirect draw or dispatch on pre-Turing hardware.

At some point we need to tune our BO size a bit to avoid (1) while also
avoiding piles of tiny BOs.  However, (2) and (3) are out of our control.

This is basically identical to what AMD has (well on newer hw there is an
> extension in the CP packets to JUMP/CALL subsequent IBs, but this isn't
> widely used as far as I know).
>

According to Bas, RADV chains on recent hardware.


> Previously the limit was something like 4 which we extended to because Bas
> came up with similar requirements for the AMD side from RADV.
>
> But essentially those approaches with 100dreds of IBs doesn't sound like a
> good idea to me.
>

No one's arguing that they like it.  Again, the hardware isn't designed to
have a kernel in the way. It's designed to be fed by userspace. But we're
going to have the kernel in the middle for a while so we need to make it
not suck too bad.

~Faith

It gets worse on pre-Turing hardware where we have to split the batch for
> every single DrawIndirect or DispatchIndirect.
>
> Lest you think NVIDIA is just crazy here, it's a perfectly reasonable
> model if you assume that userspace is feeding the firmware.  When that's
> happening, you just have a userspace thread that sits there and feeds the
> ringbuffer with whatever is next and you can marshal as much data through
> as you want. Sure, it'd be nice to have a 2nd level batch thing that gets
> launched from the FW ring and has all the individual launch commands but
> it's not at all necessary.
>
> What does that mean from a gpu_scheduler PoV? Basically, it means a
> variable packet size.
>
> What does this mean for implementation? IDK.  One option would be to teach
> the scheduler about actual job sizes. Another would be to virtualize it and
> have another layer underneath the scheduler that does the actual feeding of
> the ring. Another would be to decrease the job size somewhat and then have
> the front-end submit as many jobs as it needs to service userspace and only
> put the out-fences on the last job. All the options kinda suck.
>
>
> Yeah, agree. The job size Danilo suggested is still the least painful.
>
> Christian.
>
>
> ~Faith
>
>
>


Re: [PATCH v2 1/9] drm/sched: Convert drm scheduler to use a work queue rather than kthread

2023-08-21 Thread Faith Ekstrand
On Mon, Aug 21, 2023 at 1:13 PM Christian König 
wrote:

> Am 21.08.23 um 20:01 schrieb Danilo Krummrich:
> > On 8/21/23 16:07, Christian König wrote:
> >> Am 18.08.23 um 13:58 schrieb Danilo Krummrich:
> >>> [SNIP]
>  I only see two possible outcomes:
>  1. You return -EBUSY (or similar) error code indicating the the hw
>  can't receive more commands.
>  2. Wait on previously pushed commands to be executed.
>  (3. Your driver crash because you accidentally overwrite stuff in
>  the ring buffer which is still executed. I just assume that's
>  prevented).
> 
>  Resolution #1 with -EBUSY is actually something the UAPI should not
>  do, because your UAPI then depends on the specific timing of
>  submissions which is a really bad idea.
> 
>  Resolution #2 is usually bad because it forces the hw to run dry
>  between submission and so degrade performance.
> >>>
> >>> I agree, that is a good reason for at least limiting the maximum job
> >>> size to half of the ring size.
> >>>
> >>> However, there could still be cases where two subsequent jobs are
> >>> submitted with just a single IB, which as is would still block
> >>> subsequent jobs to be pushed to the ring although there is still
> >>> plenty of space. Depending on the (CPU) scheduler latency, such a
> >>> case can let the HW run dry as well.
> >>
> >> Yeah, that was intentionally not done as well. The crux here is that
> >> the more you push to the hw the worse the scheduling granularity
> >> becomes. It's just that neither Xe nor Nouveau relies that much on
> >> the scheduling granularity at all (because of hw queues).
> >>
> >> But Xe doesn't seem to need that feature and I would still try to
> >> avoid it because the more you have pushed to the hw the harder it is
> >> to get going again after a reset.
> >>
> >>>
> >>> Surely, we could just continue decrease the maximum job size even
> >>> further, but this would result in further overhead on user and
> >>> kernel for larger IB counts. Tracking the actual job size seems to
> >>> be the better solution for drivers where the job size can vary over
> >>> a rather huge range.
> >>
> >> I strongly disagree on that. A larger ring buffer is trivial to
> allocate
> >
> > That sounds like a workaround to me. The problem, in the case above,
> > isn't that the ring buffer does not have enough space, the problem is
> > that we account for the maximum job size although the actual job size
> > is much smaller. And enabling the scheduler to track the actual job
> > size is trivial as well.
>
> That's what I agree on, so far I just didn't see the reason for doing it
> but at least a few reason for not doing it.
>
> >
> >> and if userspace submissions are so small that the scheduler can't
> >> keep up submitting them then your ring buffer size is your smallest
> >> problem.
> >>
> >> In other words the submission overhead will completely kill your
> >> performance and you should probably consider stuffing more into a
> >> single submission.
> >
> > I fully agree and that is also the reason why I want to keep the
> > maximum job size as large as possible.
> >
> > However, afaik with Vulkan it's the applications themselves deciding
> > when and with how many command buffers a queue is submitted (@Faith:
> > please correct me if I'm wrong). Hence, why not optimize for this case
> > as well? It's not that it would make another case worse, right?
>
> As I said it does make both the scheduling granularity as well as the
> reset behavior worse.
>
> In general I think we should try to push just enough work to the
> hardware to keep it busy and not as much as possible.
>
> So as long as nobody from userspace comes and says we absolutely need to
> optimize this use case I would rather not do it.
>

This is a place where nouveau's needs are legitimately different from AMD
or Intel, I think.  NVIDIA's command streamer model is very different from
AMD and Intel.  On AMD and Intel, each EXEC turns into a single small
packet (on the order of 16B) which kicks off a command buffer.  There may
be a bit of cache management or something around it but that's it.  From
there, it's userspace's job to make one command buffer chain to another
until it's finally done and then do a "return", whatever that looks like.

NVIDIA's model is much more static.  Each packet in the HW/FW ring is an
address and a size and that much data is processed and then it grabs the
next packet and processes. The result is that, if we use multiple buffers
of commands, there's no way to chain them together.  We just have to pass
the whole list of buffers to the kernel.  A single EXEC ioctl / job may
have 500 such addr+size packets depending on how big the command buffer
is.  It gets worse on pre-Turing hardware where we have to split the batch
for every single DrawIndirect or DispatchIndirect.

Lest you think NVIDIA is just crazy here, it's a perfectly reasonable model
if you assume that userspace is feeding the 

Re: [PATCH v5 03/17] drm/imagination/uapi: Add PowerVR driver UAPI

2023-08-17 Thread Faith Ekstrand
On Wed, Aug 16, 2023 at 3:26 AM Sarah Walker 
wrote:

> Add the UAPI implementation for the PowerVR driver.
>
> Changes from v4 :
> - Remove CREATE_ZEROED flag for BO creation (all buffers are now zeroed)
>
> Co-developed-by: Frank Binns 
> Signed-off-by: Frank Binns 
> Co-developed-by: Boris Brezillon 
> Signed-off-by: Boris Brezillon 
> Co-developed-by: Matt Coster 
> Signed-off-by: Matt Coster 
> Co-developed-by: Donald Robson 
> Signed-off-by: Donald Robson 
> Signed-off-by: Sarah Walker 
> ---
>  MAINTAINERS|1 +
>  include/uapi/drm/pvr_drm.h | 1303 
>  2 files changed, 1304 insertions(+)
>  create mode 100644 include/uapi/drm/pvr_drm.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index f84390bb6cfe..55e17f8aea91 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -10144,6 +10144,7 @@ M:  Sarah Walker 
>  M: Donald Robson 
>  S: Supported
>  F: Documentation/devicetree/bindings/gpu/img,powervr.yaml
> +F: include/uapi/drm/pvr_drm.h
>
>  IMON SOUNDGRAPH USB IR RECEIVER
>  M: Sean Young 
> diff --git a/include/uapi/drm/pvr_drm.h b/include/uapi/drm/pvr_drm.h
> new file mode 100644
> index ..c0aac8b135ca
> --- /dev/null
> +++ b/include/uapi/drm/pvr_drm.h
> @@ -0,0 +1,1303 @@
> +/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
> +/* Copyright (c) 2023 Imagination Technologies Ltd. */
> +
> +#ifndef PVR_DRM_UAPI_H
> +#define PVR_DRM_UAPI_H
> +
> +#include "drm.h"
> +
> +#include 
> +#include 
> +
> +#if defined(__cplusplus)
> +extern "C" {
> +#endif
> +
> +/**
> + * DOC: PowerVR UAPI
> + *
> + * The PowerVR IOCTL argument structs have a few limitations in place, in
> + * addition to the standard kernel restrictions:
> + *
> + *  - All members must be type-aligned.
> + *  - The overall struct must be padded to 64-bit alignment.
> + *  - Explicit padding is almost always required. This takes the form of
> + *``_padding_[x]`` members of sufficient size to pad to the next
> power-of-two
> + *alignment, where [x] is the offset into the struct in hexadecimal.
> Arrays
> + *are never used for alignment. Padding fields must be zeroed; this is
> + *always checked.
> + *  - Unions may only appear as the last member of a struct.
> + *  - Individual union members may grow in the future. The space between
> the
> + *end of a union member and the end of its containing union is
> considered
> + *"implicit padding" and must be zeroed. This is always checked.
> + *
> + * In addition to the IOCTL argument structs, the PowerVR UAPI makes use
> of
> + * DEV_QUERY argument structs. These are used to fetch information about
> the
> + * device and runtime. These structs are subject to the same rules set out
> + * above.
> + */
> +
> +/**
> + * struct drm_pvr_obj_array - Container used to pass arrays of objects
> + *
> + * It is not unusual to have to extend objects to pass new parameters,
> and the DRM
> + * ioctl infrastructure is supporting that by padding ioctl arguments
> with zeros
> + * when the data passed by userspace is smaller than the struct defined
> in the
> + * drm_ioctl_desc, thus keeping things backward compatible. This type is
> just
> + * applying the same concepts to indirect objects passed through arrays
> referenced
> + * from the main ioctl arguments structure: the stride basically defines
> the size
> + * of the object passed by userspace, which allows the kernel driver to
> pad with
> + * zeros when it's smaller than the size of the object it expects.
> + *
> + * Use ``DRM_PVR_OBJ_ARRAY()`` to fill object array fields, unless you
> + * have a very good reason not to.
> + */
> +struct drm_pvr_obj_array {
> +   /** @stride: Stride of object struct. Used for versioning. */
> +   __u32 stride;
> +
> +   /** @count: Number of objects in the array. */
> +   __u32 count;
> +
> +   /** @array: User pointer to an array of objects. */
> +   __u64 array;
> +};
> +
> +/**
> + * DRM_PVR_OBJ_ARRAY() - Helper macro for filling 
> drm_pvr_obj_array.
> + * @cnt: Number of elements pointed to py @ptr.
> + * @ptr: Pointer to start of a C array.
> + *
> + * Return: Literal of type  drm_pvr_obj_array.
> + */
> +#define DRM_PVR_OBJ_ARRAY(cnt, ptr) \
> +   { .stride = sizeof((ptr)[0]), .count = (cnt), .array =
> (__u64)(uintptr_t)(ptr) }
> +
> +/**
> + * DOC: PowerVR IOCTL interface
> + */
> +
> +/**
> + * PVR_IOCTL() - Build a PowerVR IOCTL number
> + * @_ioctl: An incrementing id for this IOCTL. Added to %DRM_COMMAND_BASE.
> + * @_mode: Must be one of %DRM_IOR, %DRM_IOW or %DRM_IOWR.
> + * @_data: The type of the args struct passed by this IOCTL.
> + *
> + * The struct referred to by @_data must have a ``drm_pvr_ioctl_`` prefix
> and an
> + * ``_args suffix``. They are therefore omitted from @_data.
> + *
> + * This should only be used to build the constants described below; it
> should
> + * never be used to call an IOCTL directly.
> + *
> + * Return: An IOCTL 

Re: [PATCH] nouveau: find the smallest page allocation to cover a buffer alloc.

2023-08-12 Thread Faith Ekstrand
On Thu, Aug 10, 2023 at 10:15 PM Dave Airlie  wrote:

> From: Dave Airlie 
>
> With the new uapi we don't have the comp flags on the allocation,
> so we shouldn't be using the first size that works, we should be
> iterating until we get the correct one.
>
> This reduces allocations from 2MB to 64k in lots of places.
>
> Fixes dEQP-VK.memory.allocation.basic.size_8KiB.forward.count_4000
> on my ampere/gsp system.
>
> Signed-off-by: Dave Airlie 
>

Reviewed-by: Faith Ekstrand 


> ---
>  drivers/gpu/drm/nouveau/nouveau_bo.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_bo.c
> b/drivers/gpu/drm/nouveau/nouveau_bo.c
> index 949195d5d782..a6993c7807b6 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_bo.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_bo.c
> @@ -318,8 +318,9 @@ nouveau_bo_alloc(struct nouveau_cli *cli, u64 *size,
> int *align, u32 domain,
> (!vmm->page[i].host || vmm->page[i].shift >
> PAGE_SHIFT))
> continue;
>
> -   if (pi < 0)
> -   pi = i;
> +   /* pick the last one as it will be smallest. */
> +   pi = i;
> +
> /* Stop once the buffer is larger than the current
> page size. */
> if (*size >= 1ULL << vmm->page[i].shift)
> break;
> --
> 2.41.0
>
>


Re: [PATCH drm-misc-next] drm/nouveau: sched: avoid job races between entities

2023-08-11 Thread Faith Ekstrand
On Thu, Aug 10, 2023 at 8:06 PM Danilo Krummrich  wrote:

> If a sched job depends on a dma-fence from a job from the same GPU
> scheduler instance, but a different scheduler entity, the GPU scheduler
> does only wait for the particular job to be scheduled, rather than for
> the job to fully complete. This is due to the GPU scheduler assuming
> that there is a scheduler instance per ring. However, the current
> implementation, in order to avoid arbitrary amounts of kthreads, has a
> single scheduler instance while scheduler entities represent rings.
>
> As a workaround, set the DRM_SCHED_FENCE_DONT_PIPELINE for all
> out-fences in order to force the scheduler to wait for full job
> completion for dependent jobs from different entities and same scheduler
> instance.
>
> There is some work in progress [1] to address the issues of firmware
> schedulers; once it is in-tree the scheduler topology in Nouveau should
> be re-worked accordingly.
>
> [1]
> https://lore.kernel.org/dri-devel/20230801205103.627779-1-matthew.br...@intel.com/
>
> Signed-off-by: Danilo Krummrich 
> ---
>  drivers/gpu/drm/nouveau/nouveau_sched.c | 22 ++
>  1 file changed, 22 insertions(+)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c
> b/drivers/gpu/drm/nouveau/nouveau_sched.c
> index 3424a1bf6af3..88217185e0f3 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_sched.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
> @@ -292,6 +292,28 @@ nouveau_job_submit(struct nouveau_job *job)
> if (job->sync)
> done_fence = dma_fence_get(job->done_fence);
>
> +   /* If a sched job depends on a dma-fence from a job from the same
> GPU
> +* scheduler instance, but a different scheduler entity, the GPU
> +    * scheduler does only wait for the particular job to be scheduled,
>

s/does only wait/only waits/

Reviewed-by: Faith Ekstrand 

+* rather than for the job to fully complete. This is due to the GPU
> +* scheduler assuming that there is a scheduler instance per ring.
> +* However, the current implementation, in order to avoid arbitrary
> +* amounts of kthreads, has a single scheduler instance while
> scheduler
> +* entities represent rings.
> +*
> +* As a workaround, set the DRM_SCHED_FENCE_DONT_PIPELINE for all
> +* out-fences in order to force the scheduler to wait for full job
> +* completion for dependent jobs from different entities and same
> +* scheduler instance.
> +*
> +* There is some work in progress [1] to address the issues of
> firmware
> +* schedulers; once it is in-tree the scheduler topology in Nouveau
> +* should be re-worked accordingly.
> +*
> +* [1]
> https://lore.kernel.org/dri-devel/20230801205103.627779-1-matthew.br...@intel.com/
> +*/
> +   set_bit(DRM_SCHED_FENCE_DONT_PIPELINE, >done_fence->flags);
> +
> if (job->ops->armed_submit)
> job->ops->armed_submit(job);
>
>
> base-commit: 68132cc6d1bcbc78ade524c6c6c226de42139f0e
> --
> 2.41.0
>
>


[PATCH] drm/nouveau/sched: Don't pass user flags to drm_syncobj_find_fence()

2023-08-07 Thread Faith Ekstrand
The flags field in drm_syncobj_find_fence() takes SYNCOBJ_WAIT flags
from the syncobj UAPI whereas sync->flags is from the nouveau UAPI. What
we actually want is 0 flags which tells it to just try to find the
fence and then return without waiting.

Signed-off-by: Faith Ekstrand 
Fixes: b88baab82871 ("drm/nouveau: implement new VM_BIND uAPI")
Cc: Danilo Krummrich 
Cc: Dave Airlie 
---
 drivers/gpu/drm/nouveau/nouveau_sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/nouveau/nouveau_sched.c 
b/drivers/gpu/drm/nouveau/nouveau_sched.c
index b3b59fbec291..3424a1bf6af3 100644
--- a/drivers/gpu/drm/nouveau/nouveau_sched.c
+++ b/drivers/gpu/drm/nouveau/nouveau_sched.c
@@ -142,7 +142,7 @@ sync_find_fence(struct nouveau_job *job,
 
ret = drm_syncobj_find_fence(job->file_priv,
 sync->handle, point,
-sync->flags, fence);
+0 /* flags */, fence);
if (ret)
return ret;
 
-- 
2.41.0



Re: [PATCH drm-misc-next v9 00/11] Nouveau VM_BIND UAPI & DRM GPUVA Manager (merged)

2023-08-03 Thread Faith Ekstrand
On Thu, Aug 3, 2023 at 4:45 PM Dave Airlie  wrote:

> On Fri, 4 Aug 2023 at 02:52, Danilo Krummrich  wrote:
> >
> > This patch series provides a new UAPI for the Nouveau driver in order to
> > support Vulkan features, such as sparse bindings and sparse residency.
> >
>
> Now that Faith has reviewed the uAPI and userspace work, I think we
> should try and steer this in.
>
> I think the only thing I see is the SPDX + MIT header in some places,
> I think we can drop the MIT bits where SPDX is there, and leave
> copyright and authorship (if you like), personally I've been leaving
> authorship up to git, as it saves trouble with people randomly
> emailing you about things you wrote 10 years ago.
>
> Otherwise for the series:
> Reviewed-by: Dave Airlie 
>

FYI: There's a small, easily resolved conflict with the uapi fixup patch. I
don't care too much which goes in first but I'd like both to land before I
merge NVK so I don't have to deal with a nouveau_deprecated.h.

~Faith


> Dave.
>


Re: [PATCH] drm/nouveau: fixup the uapi header file.

2023-08-03 Thread Faith Ekstrand
On Thu, Aug 3, 2023 at 2:33 PM Dave Airlie  wrote:

> From: Dave Airlie 
>
> nouveau > 10 years ago had a plan for new multiplexer inside a multiplexer
> API using nvif. It never fully reached fruition, fast forward 10 years,
> and the new vulkan driver is avoiding libdrm and calling ioctls, and
> these 3 ioctls, getparam, channel alloc + free don't seem to be things
> we'd want to use nvif for.
>
> Undeprecate and put them into the uapi header so we can just copy it
> into mesa later.
>
> v2: use uapi types.
>
> Signed-off-by: Dave Airlie 
>

Reviewed-by: Faith Ekstrand 


> ---
>  drivers/gpu/drm/nouveau/nouveau_abi16.h | 41 -
>  include/uapi/drm/nouveau_drm.h  | 48 +++--
>  2 files changed, 45 insertions(+), 44 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.h
> b/drivers/gpu/drm/nouveau/nouveau_abi16.h
> index 27eae85f33e6..d5d80d0d9011 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_abi16.h
> +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.h
> @@ -43,28 +43,6 @@ int  nouveau_abi16_usif(struct drm_file *, void *data,
> u32 size);
>  #define NOUVEAU_GEM_DOMAIN_VRAM  (1 << 1)
>  #define NOUVEAU_GEM_DOMAIN_GART  (1 << 2)
>
> -struct drm_nouveau_channel_alloc {
> -   uint32_t fb_ctxdma_handle;
> -   uint32_t tt_ctxdma_handle;
> -
> -   int  channel;
> -   uint32_t pushbuf_domains;
> -
> -   /* Notifier memory */
> -   uint32_t notifier_handle;
> -
> -   /* DRM-enforced subchannel assignments */
> -   struct {
> -   uint32_t handle;
> -   uint32_t grclass;
> -   } subchan[8];
> -   uint32_t nr_subchan;
> -};
> -
> -struct drm_nouveau_channel_free {
> -   int channel;
> -};
> -
>  struct drm_nouveau_grobj_alloc {
> int  channel;
> uint32_t handle;
> @@ -83,31 +61,12 @@ struct drm_nouveau_gpuobj_free {
> uint32_t handle;
>  };
>
> -#define NOUVEAU_GETPARAM_PCI_VENDOR  3
> -#define NOUVEAU_GETPARAM_PCI_DEVICE  4
> -#define NOUVEAU_GETPARAM_BUS_TYPE5
> -#define NOUVEAU_GETPARAM_FB_SIZE 8
> -#define NOUVEAU_GETPARAM_AGP_SIZE9
> -#define NOUVEAU_GETPARAM_CHIPSET_ID  11
> -#define NOUVEAU_GETPARAM_VM_VRAM_BASE12
> -#define NOUVEAU_GETPARAM_GRAPH_UNITS 13
> -#define NOUVEAU_GETPARAM_PTIMER_TIME 14
> -#define NOUVEAU_GETPARAM_HAS_BO_USAGE15
> -#define NOUVEAU_GETPARAM_HAS_PAGEFLIP16
> -struct drm_nouveau_getparam {
> -   uint64_t param;
> -   uint64_t value;
> -};
> -
>  struct drm_nouveau_setparam {
> uint64_t param;
> uint64_t value;
>  };
>
> -#define DRM_IOCTL_NOUVEAU_GETPARAM   DRM_IOWR(DRM_COMMAND_BASE +
> DRM_NOUVEAU_GETPARAM, struct drm_nouveau_getparam)
>  #define DRM_IOCTL_NOUVEAU_SETPARAM   DRM_IOWR(DRM_COMMAND_BASE +
> DRM_NOUVEAU_SETPARAM, struct drm_nouveau_setparam)
> -#define DRM_IOCTL_NOUVEAU_CHANNEL_ALLOC  DRM_IOWR(DRM_COMMAND_BASE +
> DRM_NOUVEAU_CHANNEL_ALLOC, struct drm_nouveau_channel_alloc)
> -#define DRM_IOCTL_NOUVEAU_CHANNEL_FREE   DRM_IOW (DRM_COMMAND_BASE +
> DRM_NOUVEAU_CHANNEL_FREE, struct drm_nouveau_channel_free)
>  #define DRM_IOCTL_NOUVEAU_GROBJ_ALLOCDRM_IOW (DRM_COMMAND_BASE +
> DRM_NOUVEAU_GROBJ_ALLOC, struct drm_nouveau_grobj_alloc)
>  #define DRM_IOCTL_NOUVEAU_NOTIFIEROBJ_ALLOC  DRM_IOWR(DRM_COMMAND_BASE +
> DRM_NOUVEAU_NOTIFIEROBJ_ALLOC, struct drm_nouveau_notifierobj_alloc)
>  #define DRM_IOCTL_NOUVEAU_GPUOBJ_FREEDRM_IOW (DRM_COMMAND_BASE +
> DRM_NOUVEAU_GPUOBJ_FREE, struct drm_nouveau_gpuobj_free)
> diff --git a/include/uapi/drm/nouveau_drm.h
> b/include/uapi/drm/nouveau_drm.h
> index 853a327433d3..ca917e55b38f 100644
> --- a/include/uapi/drm/nouveau_drm.h
> +++ b/include/uapi/drm/nouveau_drm.h
> @@ -33,6 +33,44 @@
>  extern "C" {
>  #endif
>
> +#define NOUVEAU_GETPARAM_PCI_VENDOR  3
> +#define NOUVEAU_GETPARAM_PCI_DEVICE  4
> +#define NOUVEAU_GETPARAM_BUS_TYPE5
> +#define NOUVEAU_GETPARAM_FB_SIZE 8
> +#define NOUVEAU_GETPARAM_AGP_SIZE9
> +#define NOUVEAU_GETPARAM_CHIPSET_ID  11
> +#define NOUVEAU_GETPARAM_VM_VRAM_BASE12
> +#define NOUVEAU_GETPARAM_GRAPH_UNITS 13
> +#define NOUVEAU_GETPARAM_PTIMER_TIME 14
> +#define NOUVEAU_GETPARAM_HAS_BO_USAGE15
> +#define NOUVEAU_GETPARAM_HAS_PAGEFLIP16
> +struct drm_nouveau_getparam {
> +   __u64 param;
> +   __u64 value;
> +};
> +
> +struct drm_nouveau_channel_alloc {
> +   __u32 fb_ctxdma_handle;
> +   __u32 tt_ctxdma_handle;
> +
> +   __s32 channe

Re: [PATCH drm-misc-next v9 02/11] drm/nouveau: new VM_BIND uapi interfaces

2023-08-03 Thread Faith Ekstrand
On Thu, Aug 3, 2023 at 11:53 AM Danilo Krummrich  wrote:

> This commit provides the interfaces for the new UAPI motivated by the
> Vulkan API. It allows user mode drivers (UMDs) to:
>
> 1) Initialize a GPU virtual address (VA) space via the new
>DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
>VA area.
>
> 2) Bind and unbind GPU VA space mappings via the new
>DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>
> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
>
> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
> asynchronous processing with DRM syncobjs as synchronization mechanism.
>
> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
>
> Co-authored-by: Dave Airlie 
> Signed-off-by: Danilo Krummrich 
>

Reviewed-by: Faith Ekstrand 

Userspace is also reviewed and sitting here:
https://gitlab.freedesktop.org/nouveau/mesa/-/merge_requests/150

I'll rev it all on top of the latest header here in a few minutes but I
think the only header change was the rename for VM_INIT which shouldn't be
a real problem.

~Faith


> ---
>  Documentation/gpu/driver-uapi.rst |   8 ++
>  include/uapi/drm/nouveau_drm.h| 217 ++
>  2 files changed, 225 insertions(+)
>
> diff --git a/Documentation/gpu/driver-uapi.rst
> b/Documentation/gpu/driver-uapi.rst
> index 4411e6919a3d..9c7ca6e33a68 100644
> --- a/Documentation/gpu/driver-uapi.rst
> +++ b/Documentation/gpu/driver-uapi.rst
> @@ -6,3 +6,11 @@ drm/i915 uAPI
>  =
>
>  .. kernel-doc:: include/uapi/drm/i915_drm.h
> +
> +drm/nouveau uAPI
> +
> +
> +VM_BIND / EXEC uAPI
> +---
> +
> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
> diff --git a/include/uapi/drm/nouveau_drm.h
> b/include/uapi/drm/nouveau_drm.h
> index 853a327433d3..b567892c128d 100644
> --- a/include/uapi/drm/nouveau_drm.h
> +++ b/include/uapi/drm/nouveau_drm.h
> @@ -38,6 +38,8 @@ extern "C" {
>  #define NOUVEAU_GEM_DOMAIN_GART  (1 << 2)
>  #define NOUVEAU_GEM_DOMAIN_MAPPABLE  (1 << 3)
>  #define NOUVEAU_GEM_DOMAIN_COHERENT  (1 << 4)
> +/* The BO will never be shared via import or export. */
> +#define NOUVEAU_GEM_DOMAIN_NO_SHARE  (1 << 5)
>
>  #define NOUVEAU_GEM_TILE_COMP0x0003 /* nv50-only */
>  #define NOUVEAU_GEM_TILE_LAYOUT_MASK 0xff00
> @@ -126,6 +128,215 @@ struct drm_nouveau_gem_cpu_fini {
> __u32 handle;
>  };
>
> +/**
> + * struct drm_nouveau_sync - sync object
> + *
> + * This structure serves as synchronization mechanism for (potentially)
> + * asynchronous operations such as EXEC or VM_BIND.
> + */
> +struct drm_nouveau_sync {
> +   /**
> +* @flags: the flags for a sync object
> +*
> +* The first 8 bits are used to determine the type of the sync
> object.
> +*/
> +   __u32 flags;
> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
> +   /**
> +* @handle: the handle of the sync object
> +*/
> +   __u32 handle;
> +   /**
> +* @timeline_value:
> +*
> +* The timeline point of the sync object in case the syncobj is of
> +* type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
> +*/
> +   __u64 timeline_value;
> +};
> +
> +/**
> + * struct drm_nouveau_vm_init - GPU VA space init structure
> + *
> + * Used to initialize the GPU's VA space for a user client, telling the
> kernel
> + * which portion of the VA space is managed by the UMD and kernel
> respectively.
> + *
> + * For the UMD to use the VM_BIND uAPI, this must be called before any
> BOs or
> + * channels are created; if called afterwards DRM_IOCTL_NOUVEAU_VM_INIT
> fails
> + * with -ENOSYS.
> + */
> +struct drm_nouveau_vm_init {
> +   /**
> +* @kernel_managed_addr: start address of the kernel managed VA
> space
> +* region
> +*/
> +   __u64 kernel_managed_addr;
> +   /**
> +* @kernel_managed_size: size of the kernel managed VA space
> region in
> +* bytes
> +*/
> +   __u64 kernel_managed_size;
> +};
> +
> +/**
> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
> + *
> + * This structure represents a single VM_BIND operation. UMDs should pass
> + * an array of this structure via struct drm_nouveau_vm_bind's _ptr
> field.
> + */
> +struct drm_nouveau_vm_bind_op {
> +   /**
> +* @op: the operation type
> +*/
> +   __u32 op;
> 

Re: [PATCH] drm/nouveau: fixup the uapi header file.

2023-08-01 Thread Faith Ekstrand
On Tue, Aug 1, 2023 at 4:37 AM Karol Herbst  wrote:

> On Mon, Jul 31, 2023 at 9:16 PM Dave Airlie  wrote:
> >
> > From: Dave Airlie 
> >
> > nouveau > 10 years ago had a plan for new multiplexer inside a
> multiplexer
> > API using nvif. It never fully reached fruition, fast forward 10 years,
> > and the new vulkan driver is avoiding libdrm and calling ioctls, and
> > these 3 ioctls, getparam, channel alloc + free don't seem to be things
> > we'd want to use nvif for.
> >
> > Undeprecate and put them into the uapi header so we can just copy it
> > into mesa later.
> >
> > Signed-off-by: Dave Airlie 
> > ---
> >  drivers/gpu/drm/nouveau/nouveau_abi16.h | 41 -
> >  include/uapi/drm/nouveau_drm.h  | 48 +++--
> >  2 files changed, 45 insertions(+), 44 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/nouveau/nouveau_abi16.h
> b/drivers/gpu/drm/nouveau/nouveau_abi16.h
> > index 27eae85f33e6..d5d80d0d9011 100644
> > --- a/drivers/gpu/drm/nouveau/nouveau_abi16.h
> > +++ b/drivers/gpu/drm/nouveau/nouveau_abi16.h
> > @@ -43,28 +43,6 @@ int  nouveau_abi16_usif(struct drm_file *, void
> *data, u32 size);
> >  #define NOUVEAU_GEM_DOMAIN_VRAM  (1 << 1)
> >  #define NOUVEAU_GEM_DOMAIN_GART  (1 << 2)
> >
> > -struct drm_nouveau_channel_alloc {
> > -   uint32_t fb_ctxdma_handle;
> > -   uint32_t tt_ctxdma_handle;
> > -
> > -   int  channel;
> > -   uint32_t pushbuf_domains;
> > -
> > -   /* Notifier memory */
> > -   uint32_t notifier_handle;
> > -
> > -   /* DRM-enforced subchannel assignments */
> > -   struct {
> > -   uint32_t handle;
> > -   uint32_t grclass;
> > -   } subchan[8];
> > -   uint32_t nr_subchan;
> > -};
> > -
> > -struct drm_nouveau_channel_free {
> > -   int channel;
> > -};
> > -
> >  struct drm_nouveau_grobj_alloc {
> > int  channel;
> > uint32_t handle;
> > @@ -83,31 +61,12 @@ struct drm_nouveau_gpuobj_free {
> > uint32_t handle;
> >  };
> >
> > -#define NOUVEAU_GETPARAM_PCI_VENDOR  3
> > -#define NOUVEAU_GETPARAM_PCI_DEVICE  4
> > -#define NOUVEAU_GETPARAM_BUS_TYPE5
> > -#define NOUVEAU_GETPARAM_FB_SIZE 8
> > -#define NOUVEAU_GETPARAM_AGP_SIZE9
> > -#define NOUVEAU_GETPARAM_CHIPSET_ID  11
> > -#define NOUVEAU_GETPARAM_VM_VRAM_BASE12
> > -#define NOUVEAU_GETPARAM_GRAPH_UNITS 13
> > -#define NOUVEAU_GETPARAM_PTIMER_TIME 14
> > -#define NOUVEAU_GETPARAM_HAS_BO_USAGE15
> > -#define NOUVEAU_GETPARAM_HAS_PAGEFLIP16
> > -struct drm_nouveau_getparam {
> > -   uint64_t param;
> > -   uint64_t value;
> > -};
> > -
> >  struct drm_nouveau_setparam {
> > uint64_t param;
> > uint64_t value;
> >  };
> >
> > -#define DRM_IOCTL_NOUVEAU_GETPARAM   DRM_IOWR(DRM_COMMAND_BASE
> + DRM_NOUVEAU_GETPARAM, struct drm_nouveau_getparam)
> >  #define DRM_IOCTL_NOUVEAU_SETPARAM   DRM_IOWR(DRM_COMMAND_BASE
> + DRM_NOUVEAU_SETPARAM, struct drm_nouveau_setparam)
> > -#define DRM_IOCTL_NOUVEAU_CHANNEL_ALLOC  DRM_IOWR(DRM_COMMAND_BASE
> + DRM_NOUVEAU_CHANNEL_ALLOC, struct drm_nouveau_channel_alloc)
> > -#define DRM_IOCTL_NOUVEAU_CHANNEL_FREE   DRM_IOW (DRM_COMMAND_BASE
> + DRM_NOUVEAU_CHANNEL_FREE, struct drm_nouveau_channel_free)
> >  #define DRM_IOCTL_NOUVEAU_GROBJ_ALLOCDRM_IOW (DRM_COMMAND_BASE
> + DRM_NOUVEAU_GROBJ_ALLOC, struct drm_nouveau_grobj_alloc)
> >  #define DRM_IOCTL_NOUVEAU_NOTIFIEROBJ_ALLOC  DRM_IOWR(DRM_COMMAND_BASE
> + DRM_NOUVEAU_NOTIFIEROBJ_ALLOC, struct drm_nouveau_notifierobj_alloc)
> >  #define DRM_IOCTL_NOUVEAU_GPUOBJ_FREEDRM_IOW (DRM_COMMAND_BASE
> + DRM_NOUVEAU_GPUOBJ_FREE, struct drm_nouveau_gpuobj_free)
> > diff --git a/include/uapi/drm/nouveau_drm.h
> b/include/uapi/drm/nouveau_drm.h
> > index 853a327433d3..1cd97b3d8eda 100644
> > --- a/include/uapi/drm/nouveau_drm.h
> > +++ b/include/uapi/drm/nouveau_drm.h
> > @@ -33,6 +33,44 @@
> >  extern "C" {
> >  #endif
> >
> > +#define NOUVEAU_GETPARAM_PCI_VENDOR  3
> > +#define NOUVEAU_GETPARAM_PCI_DEVICE  4
> > +#define NOUVEAU_GETPARAM_BUS_TYPE5
> > +#define NOUVEAU_GETPARAM_FB_SIZE 8
> > +#define NOUVEAU_GETPARAM_AGP_SIZE9
> > +#define NOUVEAU_GETPARAM_CHIPSET_ID  11
> > +#define NOUVEAU_GETPARAM_VM_VRAM_BASE12
> > +#define NOUVEAU_GETPARAM_GRAPH_UNITS 13
> > +#define NOUVEAU_GETPARAM_PTIMER_TIME 14
> > +#define NOUVEAU_GETPARAM_HAS_BO_USAGE15
> > +#define NOUVEAU_GETPARAM_HAS_PAGEFLIP16
> > +struct drm_nouveau_getparam {
> > +   uint64_t param;
> > +   uint64_t value;
> > +};
> > +
> > +struct drm_nouveau_channel_alloc {
> > +   uint32_t fb_ctxdma_handle;
>

Do we want to use `uint32_t` or `__u32` here? It looks like the original
headers used `uint32_t` and then it got switched to `__u32` after the
deprecation happened.  We probably want `__u32` given that this is a uapi
header.


> > +   

Re: [PATCH drm-misc-next v8 11/12] drm/nouveau: implement new VM_BIND uAPI

2023-07-31 Thread Faith Ekstrand
On Sun, Jul 30, 2023 at 10:30 PM Faith Ekstrand  wrote:

>
> On Tue, Jul 25, 2023 at 5:48 PM Danilo Krummrich  wrote:
>
>> On 7/25/23 18:43, Danilo Krummrich wrote:
>> > On 7/25/23 18:16, Faith Ekstrand wrote:
>> >> Thanks for the detailed write-up! That would definitely explain it. If
>> >> I remember, I'll try to do a single-threaded run or two. If your
>> >> theory is correct, there should be no real perf difference when
>> >> running single-threaded. Those runs will take a long time, though, so
>> >> I'll have to run them over night. I'll let you know in a few days once
>> >> I have the results.
>> >
>> > I can also push a separate branch where I just print out a warning
>> > whenever we run into such a condition including the time we were
>> waiting
>> > for things to complete. I can probably push something later today.
>>
>>
>> https://gitlab.freedesktop.org/nouvelles/kernel/-/tree/new-uapi-drm-next-track-stalls
>>
>> It prints out the duration of every wait as well as the total wait time
>> since boot.
>>
>> - Danilo
>>
>> >
>> >>
>> >> If this theory holds, then I'm not concerned about the performance of
>> >> the API itself. It would still be good to see if we can find a way to
>> >> reduce the cross-process drag in the implementation but that's a perf
>> >> optimization we can do later.
>> >
>> >  From the kernel side I think the only thing we could really do is to
>> > temporarily run a secondary drm_gpu_scheduler instance, one for
>> VM_BINDs
>> > and one for EXECs until we got the new page table handling in place.
>> >
>> > However, the UMD could avoid such conditions more effectively, since it
>> > controls the address space. Namely, avoid re-using the same region of
>> > the address space right away in certain cases. For instance, instead of
>> > replacing a sparse region A[0x0, 0x400] with a larger sparse region
>> > B[0x0, 0x800], replace it with B'[0x400, 0xC00] if possible.
>> >
>> > However, just mentioning this for completeness. The UMD surely
>> shouldn't
>> > probably even temporarily work around such a kernel limitation.
>> >
>> > Anyway, before doing any of those, let's see if the theory holds and
>> > we're actually running into such cases.
>> >
>> >>
>> >> Does it actually matter? Yes, it kinda does. No, it probably doesn't
>> >> matter for games because you're typically only running one game at a
>> >> time. From a development PoV, however, if it makes CI take longer then
>> >> that slows down development and that's not good for the users, either.
>>
>
> I've run a bunch of tests over the weekend and It's starting to look like
> the added CTS time might be from the newly enabled synchronization tests
> themselves.  We enabled timeline semaphores as well as semaphore and fence
> sharing along with the new uAPI and I did not expect those to be nearly
> that heavy-weight so I didn't think of that as a likely factor. I'm doing a
> couple more runs to confirm but what I'm seeing right now seems to indicate
> that the new API with the old feature set has about the same run time now
> that the submit overhead issue has been fixed.
>

My last two runs have come back and confirmed this theory. With the BO
fixes, all remaining slow-downs are coming from newly added tests which
turn out to be very slow tests.

~Faith


> I think this means that we can proceed under the assumption that there are
> no more serious perf issues with the new API.  I'm happy to RB/ACK the next
> version of the API and implementation patches, as long as it has the new
> NO_SHARE BO create flag and properly rejects exports of NO_SHARE BOs, even
> if not all of the dma_resv plumbing is fully baked.
>
> ~Faith
>


Re: [PATCH drm-misc-next v8 11/12] drm/nouveau: implement new VM_BIND uAPI

2023-07-30 Thread Faith Ekstrand
On Tue, Jul 25, 2023 at 5:48 PM Danilo Krummrich  wrote:

> On 7/25/23 18:43, Danilo Krummrich wrote:
> > On 7/25/23 18:16, Faith Ekstrand wrote:
> >> Thanks for the detailed write-up! That would definitely explain it. If
> >> I remember, I'll try to do a single-threaded run or two. If your
> >> theory is correct, there should be no real perf difference when
> >> running single-threaded. Those runs will take a long time, though, so
> >> I'll have to run them over night. I'll let you know in a few days once
> >> I have the results.
> >
> > I can also push a separate branch where I just print out a warning
> > whenever we run into such a condition including the time we were waiting
> > for things to complete. I can probably push something later today.
>
>
> https://gitlab.freedesktop.org/nouvelles/kernel/-/tree/new-uapi-drm-next-track-stalls
>
> It prints out the duration of every wait as well as the total wait time
> since boot.
>
> - Danilo
>
> >
> >>
> >> If this theory holds, then I'm not concerned about the performance of
> >> the API itself. It would still be good to see if we can find a way to
> >> reduce the cross-process drag in the implementation but that's a perf
> >> optimization we can do later.
> >
> >  From the kernel side I think the only thing we could really do is to
> > temporarily run a secondary drm_gpu_scheduler instance, one for VM_BINDs
> > and one for EXECs until we got the new page table handling in place.
> >
> > However, the UMD could avoid such conditions more effectively, since it
> > controls the address space. Namely, avoid re-using the same region of
> > the address space right away in certain cases. For instance, instead of
> > replacing a sparse region A[0x0, 0x400] with a larger sparse region
> > B[0x0, 0x800], replace it with B'[0x400, 0xC00] if possible.
> >
> > However, just mentioning this for completeness. The UMD surely shouldn't
> > probably even temporarily work around such a kernel limitation.
> >
> > Anyway, before doing any of those, let's see if the theory holds and
> > we're actually running into such cases.
> >
> >>
> >> Does it actually matter? Yes, it kinda does. No, it probably doesn't
> >> matter for games because you're typically only running one game at a
> >> time. From a development PoV, however, if it makes CI take longer then
> >> that slows down development and that's not good for the users, either.
>

I've run a bunch of tests over the weekend and It's starting to look like
the added CTS time might be from the newly enabled synchronization tests
themselves.  We enabled timeline semaphores as well as semaphore and fence
sharing along with the new uAPI and I did not expect those to be nearly
that heavy-weight so I didn't think of that as a likely factor. I'm doing a
couple more runs to confirm but what I'm seeing right now seems to indicate
that the new API with the old feature set has about the same run time now
that the submit overhead issue has been fixed.

I think this means that we can proceed under the assumption that there are
no more serious perf issues with the new API.  I'm happy to RB/ACK the next
version of the API and implementation patches, as long as it has the new
NO_SHARE BO create flag and properly rejects exports of NO_SHARE BOs, even
if not all of the dma_resv plumbing is fully baked.

~Faith


Re: [PATCH drm-misc-next v8 11/12] drm/nouveau: implement new VM_BIND uAPI

2023-07-25 Thread Faith Ekstrand
On Mon, Jul 24, 2023 at 9:04 PM Danilo Krummrich  wrote:

> On 7/22/23 17:12, Faith Ekstrand wrote:
> > On Wed, Jul 19, 2023 at 7:15 PM Danilo Krummrich  > <mailto:d...@redhat.com>> wrote:
> >
> > This commit provides the implementation for the new uapi motivated
> > by the
> > Vulkan API. It allows user mode drivers (UMDs) to:
> >
> > 1) Initialize a GPU virtual address (VA) space via the new
> > DRM_IOCTL_NOUVEAU_VM_INIT ioctl for UMDs to specify the portion
> > of VA
> > space managed by the kernel and userspace, respectively.
> >
> > 2) Allocate and free a VA space region as well as bind and unbind
> memory
> > to the GPUs VA space via the new DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> > UMDs can request the named operations to be processed either
> > synchronously or asynchronously. It supports DRM syncobjs
> > (incl. timelines) as synchronization mechanism. The management
> > of the
> > GPU VA mappings is implemented with the DRM GPU VA manager.
> >
> > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
> The
> > execution happens asynchronously. It supports DRM syncobj (incl.
> > timelines) as synchronization mechanism. DRM GEM object locking
> is
> > handled with drm_exec.
> >
> > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC, use the
> DRM
> > GPU scheduler for the asynchronous paths.
> >
> >
> > IDK where the best place to talk about this is but this seems as good as
> > any.
> >
> > I've been looking into why the Vulkan CTS runs about 2x slower for me on
> > the new UAPI and I created a little benchmark to facilitate testing:
> >
> > https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/141
> > <https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/141>
> >
> > The test, roughly, does the following:
> >   1. Allocates and binds 1000 BOs
> >   2. Constructs a pushbuf that executes a no-op compute shader.
> >   3. Does a single EXEC/wait combo to warm up the kernel
> >   4. Loops 10,000 times, doing SYNCOBJ_RESET (fast), EXEC, and then
> > SYNCOBJ_WAIT and times the loop
> >
> > Of course, there's a bit of userspace driver overhead but that's
> > negledgable.
> >
> > If you drop the top patch which allocates 1k buffers, the submit time on
> > the old uAPI is 54 us/exec vs. 66 us/exec on the new UAPI. This includes
> > the time to do a SYNCOBJ_RESET (fast), EXEC, and SYNCOBJ_WAIT.The Intel
> > driver, by comparison, is 33us/exec so it's not syncobj overhead. This
> > is a bit concerning (you'd think the new thing would be faster) but what
> > really has me concerned is the 1k buffer case.
> >
> > If you include the top patch in the crucible MR, it allocates 1000 BOs
> > and VM_BINDs them. All the binding is done before the warmup EXEC.
> > Suddenly, the submit time jumps to 257 us/exec with the new UAPI. The
> > old UAPI is much worse (1134 us/exec) but that's not the point. Once
> > we've done the first EXEC and created our VM bindings, the cost per EXEC
> > shouldn't change at all based on the number of BOs bound.  Part of the
> > point of VM_BIND is to get all that binding logic and BO walking off the
> > EXEC path.
> >
> > Normally, I wouldn't be too worried about a little performance problem
> > like this. This is the first implementation and we can improve it later.
> > I get that. However, I suspect the solution to this problem involves
> > more UAPI and I want to make sure we have it all before we call this all
> > done and dusted and land it.
> >
> > The way AMD solves this problem as well as the new Xe driver for Intel
> > is to have a concept of internal vs. external BOs. Basically, there's an
> > INTERNAL bit specified somewhere in BO creation that has a few userspace
> > implications:
> >   1. In the Xe world where VMs are objects, INTERNAL BOs are assigned a
> > VM on creation and can never be bound to any other VM.
> >   2. Any attempt to export an INTERNAL BO via prime or a similar
> > mechanism will fail with -EINVAL (I think?).
> >
> > Inside the kernel driver, all the internal BOs on a VM (or DRM file in
> > the case of nouveau/AMD since they don't have VM objects) share a single
> > dma_resv which allows you to avoid having to walk lists of BOs and take
> > locks on every exec. Instead, you can just look at the fences on the
> > dma_resv for the VM. There's still a BO list associated with the VM f

Re: [PATCH drm-misc-next v8 03/12] drm/nouveau: new VM_BIND uapi interfaces

2023-07-25 Thread Faith Ekstrand
On Mon, Jul 24, 2023 at 9:04 PM Danilo Krummrich  wrote:

>
>
> On 7/22/23 00:58, Faith Ekstrand wrote:
> >
> > On Wed, Jul 19, 2023 at 7:15 PM Danilo Krummrich  > <mailto:d...@redhat.com>> wrote:
> >
> > This commit provides the interfaces for the new UAPI motivated by the
> > Vulkan API. It allows user mode drivers (UMDs) to:
> >
> > 1) Initialize a GPU virtual address (VA) space via the new
> > DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel
> reserved
> > VA area.
> >
> > 2) Bind and unbind GPU VA space mappings via the new
> > DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
> >
> > 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
> >
> > Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
> > asynchronous processing with DRM syncobjs as synchronization
> mechanism.
> >
> > The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
> > DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
> >
> > Co-authored-by: Dave Airlie  > <mailto:airl...@redhat.com>>
> > Signed-off-by: Danilo Krummrich  > <mailto:d...@redhat.com>>
> > ---
> >   Documentation/gpu/driver-uapi.rst |   8 ++
> >   include/uapi/drm/nouveau_drm.h| 209
> ++
> >   2 files changed, 217 insertions(+)
> >
> > diff --git a/Documentation/gpu/driver-uapi.rst
> > b/Documentation/gpu/driver-uapi.rst
> > index 4411e6919a3d..9c7ca6e33a68 100644
> > --- a/Documentation/gpu/driver-uapi.rst
> > +++ b/Documentation/gpu/driver-uapi.rst
> > @@ -6,3 +6,11 @@ drm/i915 uAPI
> >   =
> >
> >   .. kernel-doc:: include/uapi/drm/i915_drm.h
> > +
> > +drm/nouveau uAPI
> > +
> > +
> > +VM_BIND / EXEC uAPI
> > +---
> > +
> > +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
> > diff --git a/include/uapi/drm/nouveau_drm.h
> > b/include/uapi/drm/nouveau_drm.h
> > index 853a327433d3..4d3a70529637 100644
> > --- a/include/uapi/drm/nouveau_drm.h
> > +++ b/include/uapi/drm/nouveau_drm.h
> > @@ -126,6 +126,209 @@ struct drm_nouveau_gem_cpu_fini {
> >  __u32 handle;
> >   };
> >
> > +/**
> > + * struct drm_nouveau_sync - sync object
> > + *
> > + * This structure serves as synchronization mechanism for
> (potentially)
> > + * asynchronous operations such as EXEC or VM_BIND.
> > + */
> > +struct drm_nouveau_sync {
> > +   /**
> > +* @flags: the flags for a sync object
> > +*
> > +* The first 8 bits are used to determine the type of the
> > sync object.
> > +*/
> > +   __u32 flags;
> > +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
> > +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
> > +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
> > +   /**
> > +* @handle: the handle of the sync object
> > +*/
> > +   __u32 handle;
> > +   /**
> > +* @timeline_value:
> > +*
> > +* The timeline point of the sync object in case the syncobj
> > is of
> > +* type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
> > +*/
> > +   __u64 timeline_value;
> > +};
> > +
> > +/**
> > + * struct drm_nouveau_vm_init - GPU VA space init structure
> > + *
> > + * Used to initialize the GPU's VA space for a user client, telling
> > the kernel
> > + * which portion of the VA space is managed by the UMD and kernel
> > respectively.
> >
> >
> > I assume this has to be called quite early. Like maybe before any BOs or
> > channels are created? In any case, it'd be nice to have that documented.
>
> Exactly, doing any of those will disable the new uAPI entirely if it
> wasn't yet initialized. I will add some documentation for this.
>

Thanks!


> >
> > + */
> > +struct drm_nouveau_vm_init {
> > +   /**
> > +* @unmanaged_addr: start address of the kernel managed VA
> > space region
> > +*/
> > +   __u64 unmanaged_addr;
> > +   /**
> > +* @unmanaged_size: size of the kernel managed VA space
> > region 

Re: [PATCH drm-misc-next v8 03/12] drm/nouveau: new VM_BIND uapi interfaces

2023-07-21 Thread Faith Ekstrand
On Wed, Jul 19, 2023 at 7:15 PM Danilo Krummrich  wrote:

> This commit provides the interfaces for the new UAPI motivated by the
> Vulkan API. It allows user mode drivers (UMDs) to:
>
> 1) Initialize a GPU virtual address (VA) space via the new
>DRM_IOCTL_NOUVEAU_VM_INIT ioctl. UMDs can provide a kernel reserved
>VA area.
>
> 2) Bind and unbind GPU VA space mappings via the new
>DRM_IOCTL_NOUVEAU_VM_BIND ioctl.
>
> 3) Execute push buffers with the new DRM_IOCTL_NOUVEAU_EXEC ioctl.
>
> Both, DRM_IOCTL_NOUVEAU_VM_BIND and DRM_IOCTL_NOUVEAU_EXEC support
> asynchronous processing with DRM syncobjs as synchronization mechanism.
>
> The default DRM_IOCTL_NOUVEAU_VM_BIND is synchronous processing,
> DRM_IOCTL_NOUVEAU_EXEC supports asynchronous processing only.
>
> Co-authored-by: Dave Airlie 
> Signed-off-by: Danilo Krummrich 
> ---
>  Documentation/gpu/driver-uapi.rst |   8 ++
>  include/uapi/drm/nouveau_drm.h| 209 ++
>  2 files changed, 217 insertions(+)
>
> diff --git a/Documentation/gpu/driver-uapi.rst
> b/Documentation/gpu/driver-uapi.rst
> index 4411e6919a3d..9c7ca6e33a68 100644
> --- a/Documentation/gpu/driver-uapi.rst
> +++ b/Documentation/gpu/driver-uapi.rst
> @@ -6,3 +6,11 @@ drm/i915 uAPI
>  =
>
>  .. kernel-doc:: include/uapi/drm/i915_drm.h
> +
> +drm/nouveau uAPI
> +
> +
> +VM_BIND / EXEC uAPI
> +---
> +
> +.. kernel-doc:: include/uapi/drm/nouveau_drm.h
> diff --git a/include/uapi/drm/nouveau_drm.h
> b/include/uapi/drm/nouveau_drm.h
> index 853a327433d3..4d3a70529637 100644
> --- a/include/uapi/drm/nouveau_drm.h
> +++ b/include/uapi/drm/nouveau_drm.h
> @@ -126,6 +126,209 @@ struct drm_nouveau_gem_cpu_fini {
> __u32 handle;
>  };
>
> +/**
> + * struct drm_nouveau_sync - sync object
> + *
> + * This structure serves as synchronization mechanism for (potentially)
> + * asynchronous operations such as EXEC or VM_BIND.
> + */
> +struct drm_nouveau_sync {
> +   /**
> +* @flags: the flags for a sync object
> +*
> +* The first 8 bits are used to determine the type of the sync
> object.
> +*/
> +   __u32 flags;
> +#define DRM_NOUVEAU_SYNC_SYNCOBJ 0x0
> +#define DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ 0x1
> +#define DRM_NOUVEAU_SYNC_TYPE_MASK 0xf
> +   /**
> +* @handle: the handle of the sync object
> +*/
> +   __u32 handle;
> +   /**
> +* @timeline_value:
> +*
> +* The timeline point of the sync object in case the syncobj is of
> +* type DRM_NOUVEAU_SYNC_TIMELINE_SYNCOBJ.
> +*/
> +   __u64 timeline_value;
> +};
> +
> +/**
> + * struct drm_nouveau_vm_init - GPU VA space init structure
> + *
> + * Used to initialize the GPU's VA space for a user client, telling the
> kernel
> + * which portion of the VA space is managed by the UMD and kernel
> respectively.
>

I assume this has to be called quite early. Like maybe before any BOs or
channels are created? In any case, it'd be nice to have that documented.


> + */
> +struct drm_nouveau_vm_init {
> +   /**
> +* @unmanaged_addr: start address of the kernel managed VA space
> region
> +*/
> +   __u64 unmanaged_addr;
> +   /**
> +* @unmanaged_size: size of the kernel managed VA space region in
> bytes
> +*/
> +   __u64 unmanaged_size;
>

Over-all, I think this is the right API. My only concern is with the word
"unmanaged". None of the VA space is unmanaged. Some is userspace-managed
and some is kernel-managed.  I guess "unmanaged" kinda makes sense because
this is coming from userspace and it's saying which bits it manages and
which bits it doesn't.  Still seems clunky to me.  Maybe kernel_managed?
IDK, that feels weird too. Since we're already using UMD in this file, we
could call it kmd_managed. IDK. 路‍♀️

Yeah, I know this is a total bikeshed color thing and I'm not going to
block anything based on it.  Just wanted to see if we can come up with
anything better.  It's documented and that's the important thing.


> +};
> +
> +/**
> + * struct drm_nouveau_vm_bind_op - VM_BIND operation
> + *
> + * This structure represents a single VM_BIND operation. UMDs should pass
> + * an array of this structure via struct drm_nouveau_vm_bind's _ptr
> field.
> + */
> +struct drm_nouveau_vm_bind_op {
> +   /**
> +* @op: the operation type
> +*/
> +   __u32 op;
> +/**
> + * @DRM_NOUVEAU_VM_BIND_OP_MAP:
> + *
> + * Map a GEM object to the GPU's VA space. Optionally, the
> + * _NOUVEAU_VM_BIND_SPARSE flag can be passed to instruct the kernel
> to
> + * create sparse mappings for the given range.
> + */
> +#define DRM_NOUVEAU_VM_BIND_OP_MAP 0x0
> +/**
> + * @DRM_NOUVEAU_VM_BIND_OP_UNMAP:
> + *
> + * Unmap an existing mapping in the GPU's VA space. If the region the
> mapping
> + * is located in is a sparse region, new sparse mappings are created
> where the
> + * unmapped (memory backed) mapping was mapped 

Re: [Intel-gfx] [PATCH v10 09/15] drm/syncobj: Add deadline support for syncobj waits

2023-03-19 Thread Faith Ekstrand
On Wed, Mar 8, 2023 at 9:54 AM Rob Clark  wrote:

> From: Rob Clark 
>
> Add a new flag to let userspace provide a deadline as a hint for syncobj
> and timeline waits.  This gives a hint to the driver signaling the
> backing fences about how soon userspace needs it to compete work, so it
> can addjust GPU frequency accordingly.  An immediate deadline can be
> given to provide something equivalent to i915 "wait boost".
>
> v2: Use absolute u64 ns value for deadline hint, drop cap and driver
> feature flag in favor of allowing count_handles==0 as a way for
> userspace to probe kernel for support of new flag
> v3: More verbose comments about UAPI
>
> Signed-off-by: Rob Clark 
> ---
>  drivers/gpu/drm/drm_syncobj.c | 64 ---
>  include/uapi/drm/drm.h| 17 ++
>  2 files changed, 68 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index 0c2be8360525..a85e9464f07b 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -126,6 +126,11 @@
>   * synchronize between the two.
>   * This requirement is inherited from the Vulkan fence API.
>   *
> + * If _SYNCOBJ_WAIT_FLAGS_WAIT_DEADLINE is set, the ioctl will also
> set
> + * a fence deadline hint on the backing fences before waiting, to provide
> the
> + * fence signaler with an appropriate sense of urgency.  The deadline is
> + * specified as an absolute _MONOTONIC value in units of ns.
> + *
>   * Similarly, _IOCTL_SYNCOBJ_TIMELINE_WAIT takes an array of syncobj
>   * handles as well as an array of u64 points and does a host-side wait on
> all
>   * of syncobj fences at the given points simultaneously.
> @@ -973,7 +978,8 @@ static signed long
> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
>   uint32_t count,
>   uint32_t flags,
>   signed long timeout,
> - uint32_t *idx)
> + uint32_t *idx,
> + ktime_t *deadline)
>  {
> struct syncobj_wait_entry *entries;
> struct dma_fence *fence;
> @@ -1053,6 +1059,15 @@ static signed long
> drm_syncobj_array_wait_timeout(struct drm_syncobj **syncobjs,
> drm_syncobj_fence_add_wait(syncobjs[i],
> [i]);
> }
>
> +   if (deadline) {
> +   for (i = 0; i < count; ++i) {
> +   fence = entries[i].fence;
> +   if (!fence)
> +   continue;
> +   dma_fence_set_deadline(fence, *deadline);
> +   }
> +   }
> +
> do {
> set_current_state(TASK_INTERRUPTIBLE);
>
> @@ -1151,7 +1166,8 @@ static int drm_syncobj_array_wait(struct drm_device
> *dev,
>   struct drm_file *file_private,
>   struct drm_syncobj_wait *wait,
>   struct drm_syncobj_timeline_wait
> *timeline_wait,
> - struct drm_syncobj **syncobjs, bool
> timeline)
> + struct drm_syncobj **syncobjs, bool
> timeline,
> + ktime_t *deadline)
>  {
> signed long timeout = 0;
> uint32_t first = ~0;
> @@ -1162,7 +1178,8 @@ static int drm_syncobj_array_wait(struct drm_device
> *dev,
>  NULL,
>
>  wait->count_handles,
>  wait->flags,
> -timeout, );
> +timeout, ,
> +deadline);
> if (timeout < 0)
> return timeout;
> wait->first_signaled = first;
> @@ -1172,7 +1189,8 @@ static int drm_syncobj_array_wait(struct drm_device
> *dev,
>
>  u64_to_user_ptr(timeline_wait->points),
>
>  timeline_wait->count_handles,
>
>  timeline_wait->flags,
> -timeout, );
> +timeout, ,
> +deadline);
> if (timeout < 0)
> return timeout;
> timeline_wait->first_signaled = first;
> @@ -1243,17 +1261,22 @@ drm_syncobj_wait_ioctl(struct drm_device *dev,
> void *data,
>  {
> struct drm_syncobj_wait *args = data;
> struct drm_syncobj **syncobjs;
> +   unsigned possible_flags;
> +   ktime_t t, *tp = NULL;
> int ret = 0;
>
> if (!drm_core_check_feature(dev, DRIVER_SYNCOBJ))
> return -EOPNOTSUPP;
>
> 

Re: [PATCH RFC 11/18] drm/scheduler: Clean up jobs when the scheduler is torn down

2023-03-13 Thread Faith Ekstrand
On Fri, 2023-03-10 at 18:58 +0900, Asahi Lina wrote:
> On 10/03/2023 04.59, Faith Ekstrand wrote:
> > On Thu, 2023-03-09 at 18:43 +0900, Asahi Lina wrote:
> > > On 09/03/2023 17.42, Christian König wrote:
> > > > Long story short: Don't do this! This is what the Windows
> > > > drivers
> > > > have 
> > > > been doing and it creates tons of problems.
> > 
> > Yeah, we tried to do a bit of that in the GL days.  It was a bad
> > idea.
> 
> I think I should clarify: I was proposing re-queueing innocent jobs
> from
> innocent queues/VMs that were impacted by a fault. The reason is that
> we
> may be able to tweak firmware state to force it to do that safely,
> during the firmware recovery cycle, such that an aborted job restarts
> and then subsequent jobs/commands continue as normal. We can't leave
> it
> to userspace because if we do nothing, the affected job ends up
> incomplete but then everything after it that is already queued still
> runs, and that is definitely a recipe for a bigger mess if userspace
> wants to seamlessly recover. The firmware recovery cycle is a
> "stop-the-world" situation for the GPU (the firmware literally
> busy-loops waiting for the driver to set a continue flag in
> memory...),
> so that's the only real chance that the driver gets to make decisions
> about what is going to happen next.

Ok, that makes sense.  Yes, if you have other jobs on other queues and
are able to recover everything that isn't in the faulting VM, that's a
good thing.  I wasn't sure how hang/fault recovery worked on AGX.  In
tat case, I don't think there's a dma_fence problem.  As long as you
keep recovering and killing off any faulting contexts, eventually the
good contexts should make progress and those fences should signal.

Of course, the firmware recovery cycle may be complex and need (or at
least appear to) memory allocation or similar and that's where
everything gets hairy.  Hopefully, though, if you've already got the
resources from the old context, you can re-use them after a bit of
clean-up work and still get deterministic and reliable recovery cycles.

> Of course, that only works if individual possibly concurrently
> running
> commands are idempotent, but I think a lot of typical GPU work is?

No, that's not a valid assumption.  For a single 3D render pass which
doesn't do any image or SSBO access, it may be possible to re-run it. 
However, that won't be true of compute work and isn't necessarily true
of back-to-back passes. Lots of modern apps do temporal stuff where one
frame depends on the previous and a re-run might screw that up. Also,
with Vulkan's memory aliasing, it's hard to tell just from which
resources are accessed whether or not a command buffer leaves its input
memory undamaged.

> (E.g.
> any render pass without side effects other than the render targets
> and
> where the background shader does no loads, or even render passes that
> do
> loads but where all draws are opaque, which are all things the
> current
> Gallium driver is intimately familiar with since Crazy Tiler
> Optimizations™ need that info to be provided anyway). So I was
> wondering
> whether it'd make sense to have such an idempotency/restartable flag
> on
> job submission, and then the driver would do its best to recover and
> rerun it if it gets killed by an unrelated concurrent bad job.
> 
> Then again this all depends on an investigation into what we *can* do
> during firmware recovery that hasn't happened at all yet. It might be
> that it isn't safe to do anything really, or that doing things
> depends
> on touching even deeper firmware state structs that we treat as
> opaque
> right now and we really don't want to have to touch...
> 
> But maybe none of this is worth it in practice, it just sounded like
> it
> could be useful maybe?

Maybe? It's not clear to me that such a flag would be useful or even
practical to provide from the Mesa side.  Ideally, you'd be able to
figure out when a fault happens, what VM it happened in and exactly
what work was in-flight when it happened and only kill the one guilty
VM.  However, it sounds like your understanding of the firmware is
currently rough enough that doing so may not be practical.  In that
case, the best thing to do is to kill any VMs which were on the GPU at
the time and hope the individual apps are able to recover.

> Now that I look at it, we have a lovely "what is this flag doing
> anyway"
> bit already passed from Mesa through to the firmware we called
> ASAHI_RENDER_SET_WHEN_RELOADING_Z_OR_S which, now that I look at it,
> is
> actually getting set when any attachment (any color, Z, S) is not
> being
> cleared for that pass (so it's loaded). That could very well be an
> "is
> not

Re: [PATCH RFC 03/18] rust: drm: file: Add File abstraction

2023-03-13 Thread Faith Ekstrand
On Fri, 2023-03-10 at 07:16 +0900, Asahi Lina wrote:
> On 10/03/2023 06.16, Faith Ekstrand wrote:
> > On Tue, 2023-03-07 at 23:25 +0900, Asahi Lina wrote:
> > > A DRM File is the DRM counterpart to a kernel file structure,
> > > representing an open DRM file descriptor. Add a Rust abstraction
> > > to
> > > allow drivers to implement their own File types that implement
> > > the
> > > DriverFile trait.
> > > 
> > > Signed-off-by: Asahi Lina 
> > > ---
> > >  rust/bindings/bindings_helper.h |   1 +
> > >  rust/kernel/drm/drv.rs  |   7 ++-
> > >  rust/kernel/drm/file.rs | 113
> > > 
> > >  rust/kernel/drm/mod.rs  |   1 +
> > >  4 files changed, 120 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/rust/bindings/bindings_helper.h
> > > b/rust/bindings/bindings_helper.h
> > > index 2a999138c4ae..7d7828faf89c 100644
> > > --- a/rust/bindings/bindings_helper.h
> > > +++ b/rust/bindings/bindings_helper.h
> > > @@ -8,6 +8,7 @@
> > >  
> > >  #include 
> > >  #include 
> > > +#include 
> > >  #include 
> > >  #include 
> > >  #include 
> > > diff --git a/rust/kernel/drm/drv.rs b/rust/kernel/drm/drv.rs
> > > index 29a465515dc9..1dcb651e1417 100644
> > > --- a/rust/kernel/drm/drv.rs
> > > +++ b/rust/kernel/drm/drv.rs
> > > @@ -144,6 +144,9 @@ pub trait Driver {
> > >  /// Should be either `drm::gem::Object` or
> > > `drm::gem::shmem::Object`.
> > >  type Object: AllocImpl;
> > >  
> > > +    /// The type used to represent a DRM File (client)
> > > +    type File: drm::file::DriverFile;
> > > +
> > >  /// Driver metadata
> > >  const INFO: DriverInfo;
> > >  
> > > @@ -213,8 +216,8 @@ macro_rules! drm_device_register {
> > >  impl Registration {
> > >  const VTABLE: bindings::drm_driver = drm_legacy_fields! {
> > >  load: None,
> > > -    open: None, // TODO: File abstraction
> > > -    postclose: None, // TODO: File abstraction
> > > +    open: Some(drm::file::open_callback::),
> > > +    postclose:
> > > Some(drm::file::postclose_callback::),
> > >  lastclose: None,
> > >  unload: None,
> > >  release: None,
> > > diff --git a/rust/kernel/drm/file.rs b/rust/kernel/drm/file.rs
> > > new file mode 100644
> > > index ..48751e93c38a
> > > --- /dev/null
> > > +++ b/rust/kernel/drm/file.rs
> > > @@ -0,0 +1,113 @@
> > > +// SPDX-License-Identifier: GPL-2.0 OR MIT
> > > +
> > > +//! DRM File objects.
> > > +//!
> > > +//! C header:
> > > [`include/linux/drm/drm_file.h`](../../../../include/linux/drm/dr
> > > m_fi
> > > le.h)
> > > +
> > > +use crate::{bindings, drm, error::Result};
> > > +use alloc::boxed::Box;
> > > +use core::marker::PhantomData;
> > > +use core::ops::Deref;
> > > +
> > > +/// Trait that must be implemented by DRM drivers to represent a
> > > DRM
> > > File (a client instance).
> > > +pub trait DriverFile {
> > > +    /// The parent `Driver` implementation for this
> > > `DriverFile`.
> > > +    type Driver: drm::drv::Driver;
> > > +
> > > +    /// Open a new file (called when a client opens the DRM
> > > device).
> > > +    fn open(device: ::device::Device) ->
> > > Result>;
> > > +}
> > > +
> > > +/// An open DRM File.
> > > +///
> > > +/// # Invariants
> > > +/// `raw` is a valid pointer to a `drm_file` struct.
> > > +#[repr(transparent)]
> > > +pub struct File {
> > > +    raw: *mut bindings::drm_file,
> > > +    _p: PhantomData,
> > > +}
> > > +
> > > +pub(super) unsafe extern "C" fn open_callback(
> > > +    raw_dev: *mut bindings::drm_device,
> > > +    raw_file: *mut bindings::drm_file,
> > > +) -> core::ffi::c_int {
> > > +    let drm = core::mem::ManuallyDrop::new(unsafe {
> > > drm::device::Device::from_raw(raw_dev) });
> > 
> > Maybe you can help educate me a bit here... This feels like a
> > really
> > sketchy pattern.  We're creating a Device from a pointer, an
> > operation
> > which inherently consumes a reference but then marking it
> 

Re: [PATCH RFC 03/18] rust: drm: file: Add File abstraction

2023-03-09 Thread Faith Ekstrand
On Tue, 2023-03-07 at 23:25 +0900, Asahi Lina wrote:
> A DRM File is the DRM counterpart to a kernel file structure,
> representing an open DRM file descriptor. Add a Rust abstraction to
> allow drivers to implement their own File types that implement the
> DriverFile trait.
> 
> Signed-off-by: Asahi Lina 
> ---
>  rust/bindings/bindings_helper.h |   1 +
>  rust/kernel/drm/drv.rs  |   7 ++-
>  rust/kernel/drm/file.rs | 113
> 
>  rust/kernel/drm/mod.rs  |   1 +
>  4 files changed, 120 insertions(+), 2 deletions(-)
> 
> diff --git a/rust/bindings/bindings_helper.h
> b/rust/bindings/bindings_helper.h
> index 2a999138c4ae..7d7828faf89c 100644
> --- a/rust/bindings/bindings_helper.h
> +++ b/rust/bindings/bindings_helper.h
> @@ -8,6 +8,7 @@
>  
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> diff --git a/rust/kernel/drm/drv.rs b/rust/kernel/drm/drv.rs
> index 29a465515dc9..1dcb651e1417 100644
> --- a/rust/kernel/drm/drv.rs
> +++ b/rust/kernel/drm/drv.rs
> @@ -144,6 +144,9 @@ pub trait Driver {
>  /// Should be either `drm::gem::Object` or
> `drm::gem::shmem::Object`.
>  type Object: AllocImpl;
>  
> +    /// The type used to represent a DRM File (client)
> +    type File: drm::file::DriverFile;
> +
>  /// Driver metadata
>  const INFO: DriverInfo;
>  
> @@ -213,8 +216,8 @@ macro_rules! drm_device_register {
>  impl Registration {
>  const VTABLE: bindings::drm_driver = drm_legacy_fields! {
>  load: None,
> -    open: None, // TODO: File abstraction
> -    postclose: None, // TODO: File abstraction
> +    open: Some(drm::file::open_callback::),
> +    postclose: Some(drm::file::postclose_callback::),
>  lastclose: None,
>  unload: None,
>  release: None,
> diff --git a/rust/kernel/drm/file.rs b/rust/kernel/drm/file.rs
> new file mode 100644
> index ..48751e93c38a
> --- /dev/null
> +++ b/rust/kernel/drm/file.rs
> @@ -0,0 +1,113 @@
> +// SPDX-License-Identifier: GPL-2.0 OR MIT
> +
> +//! DRM File objects.
> +//!
> +//! C header:
> [`include/linux/drm/drm_file.h`](../../../../include/linux/drm/drm_fi
> le.h)
> +
> +use crate::{bindings, drm, error::Result};
> +use alloc::boxed::Box;
> +use core::marker::PhantomData;
> +use core::ops::Deref;
> +
> +/// Trait that must be implemented by DRM drivers to represent a DRM
> File (a client instance).
> +pub trait DriverFile {
> +    /// The parent `Driver` implementation for this `DriverFile`.
> +    type Driver: drm::drv::Driver;
> +
> +    /// Open a new file (called when a client opens the DRM device).
> +    fn open(device: ::device::Device) ->
> Result>;
> +}
> +
> +/// An open DRM File.
> +///
> +/// # Invariants
> +/// `raw` is a valid pointer to a `drm_file` struct.
> +#[repr(transparent)]
> +pub struct File {
> +    raw: *mut bindings::drm_file,
> +    _p: PhantomData,
> +}
> +
> +pub(super) unsafe extern "C" fn open_callback(
> +    raw_dev: *mut bindings::drm_device,
> +    raw_file: *mut bindings::drm_file,
> +) -> core::ffi::c_int {
> +    let drm = core::mem::ManuallyDrop::new(unsafe {
> drm::device::Device::from_raw(raw_dev) });

Maybe you can help educate me a bit here... This feels like a really
sketchy pattern.  We're creating a Device from a pointer, an operation
which inherently consumes a reference but then marking it ManuallyDrop
so drm_device_put() never gets called.  It took me a while but I think
I figured out what you're trying to do: Make it so all the Rust stuff
works with Device, not drm_device but it still feels really wrong.  It
works, it just feels like there's a lot of unsafe abstraction juggling
happening here and I expect this operation is going to be pretty common
in the Rust abstraction layer.

Am I missing something?

~Faith


> +    // SAFETY: This reference won't escape this function
> +    let file = unsafe {  *raw_file };
> +
> +    let inner = match T::open() {
> +    Err(e) => {
> +    return e.to_kernel_errno();
> +    }
> +    Ok(i) => i,
> +    };
> +
> +    file.driver_priv = Box::into_raw(inner) as *mut _;
> +
> +    0
> +}
> +
> +pub(super) unsafe extern "C" fn postclose_callback(
> +    _dev: *mut bindings::drm_device,
> +    raw_file: *mut bindings::drm_file,
> +) {
> +    // SAFETY: This reference won't escape this function
> +    let file = unsafe { &*raw_file };
> +
> +    // Drop the DriverFile
> +    unsafe { Box::from_raw(file.driver_priv as *mut T) };
> +}
> +
> +impl File {
> +    // Not intended to be called externally, except via
> declare_drm_ioctls!()
> +    #[doc(hidden)]
> +    pub unsafe fn from_raw(raw_file: *mut bindings::drm_file) ->
> File {
> +    File {
> +    raw: raw_file,
> +    _p: PhantomData,
> +    }
> +    }
> +
> +    #[allow(dead_code)]
> +    /// Return the raw pointer to the underlying `drm_file`.
> +    pub(super) fn raw() -> *const bindings::drm_file {
> +    self.raw

Re: [PATCH RFC 01/18] rust: drm: ioctl: Add DRM ioctl abstraction

2023-03-09 Thread Faith Ekstrand
On Thu, 2023-03-09 at 15:04 +0900, Asahi Lina wrote:
> On 08/03/2023 02.34, Björn Roy Baron wrote:
> > > +    // SAFETY: This is just the ioctl
> > > argument, which hopefully has the right type
> > > +    // (we've done our best checking the
> > > size).
> > 
> > In the rust tree there is the ReadableFromBytes [1] trait which
> > indicates that it is safe to read arbitrary bytes into the type.
> > Maybe you could add it as bound on the argument type when it lands
> > in rust-next? This way you can't end up with for example a struct
> > containing a bool with the byte value 2, which is UB.
> 
> There's actually a much bigger story here, because that trait isn't
> really very useful without a way to auto-derive it. I need the same
> kind
> of guarantee for all the GPU firmware structs...
> 
> There's one using only declarative macros [1] and one using proc
> macros
> [2]. And then, since ioctl arguments are declared in C UAPI header
> files, we need a way to be able to derive those traits for them...
> which
> I guess means bindgen changes?

It'd be cool to be able to auto-verify that uAPI structs are all
tightly packed and use the right subset of types.  Maybe not possible
this iteration but it'd be cool to see in future.  I'd like to see it
for C as well, ideally.

~Faith


Re: [PATCH RFC 11/18] drm/scheduler: Clean up jobs when the scheduler is torn down

2023-03-09 Thread Faith Ekstrand
On Thu, 2023-03-09 at 18:43 +0900, Asahi Lina wrote:
> On 09/03/2023 17.42, Christian König wrote:
> > Am 08.03.23 um 20:37 schrieb Asahi Lina:
> > > On 09/03/2023 03.12, Christian König wrote:
> > > > Am 08.03.23 um 18:32 schrieb Asahi Lina:
> > > > > [SNIP]
> > > > > Yes but... none of this cleans up jobs that are already
> > > > > submitted by the
> > > > > scheduler and in its pending list, with registered completion
> > > > > callbacks,
> > > > > which were already popped off of the entities.
> > > > > 
> > > > > *That* is the problem this patch fixes!
> > > > Ah! Yes that makes more sense now.
> > > > 
> > > > > > We could add a warning when users of this API doesn't do
> > > > > > this
> > > > > > correctly, but cleaning up incorrect API use is clearly
> > > > > > something we
> > > > > > don't want here.
> > > > > It is the job of the Rust abstractions to make incorrect API
> > > > > use that
> > > > > leads to memory unsafety impossible. So even if you don't
> > > > > want that in
> > > > > C, it's my job to do that for Rust... and right now, I just
> > > > > can't
> > > > > because drm_sched doesn't provide an API that can be safely
> > > > > wrapped
> > > > > without weird bits of babysitting functionality on top (like
> > > > > tracking
> > > > > jobs outside or awkwardly making jobs hold a reference to the
> > > > > scheduler
> > > > > and defer dropping it to another thread).
> > > > Yeah, that was discussed before but rejected.
> > > > 
> > > > The argument was that upper layer needs to wait for the hw to
> > > > become
> > > > idle before the scheduler can be destroyed anyway.
> > > Unfortunately, that's not a requirement you can encode in the
> > > Rust type
> > > system easily as far as I know, and Rust safety rules mean we
> > > need to
> > > make it safe even if the upper layer doesn't do this... (or else
> > > we have
> > > to mark the entire drm_sched abstraction unsafe, but that would
> > > be a pity).
> > 
> > Yeah, that should really not be something we should do.
> > 
> > But you could make the scheduler depend on your fw context object,
> > don't 
> > you?
> 
> Yes, and that would fix the problem for this driver, but it wouldn't
> make the abstraction safe. The thing is we have to make it
> *impossible*
> to misuse drm_sched in such a way that it crashes, at the Rust
> abstraction level. If we start depending on the driver following
> rules
> like that, that means the drm_sched abstraction has to be marked
> unsafe.
> 
> > Detaching the scheduler from the underlying hw fences is certainly 
> > possible, but we removed that functionality because some people
> > people 
> > tried to force push some Windows recovery module into Linux. We are
> > in 
> > the process of reverting that and cleaning things up once more, but
> > that 
> > will take a while.
> 
> Okay, but I don't see why that should block the Rust abstractions...
> I
> don't even need a new API to do that, all I need is to know that
> drm_sched_fini() will do it so it won't crash when the hw fences
> complete later, as this patch does.
> 
> > Instead of detaching you could also block for the hw to become
> > idle, but 
> > if you do that synchronous on process termination you run into
> > trouble 
> > as well.
> 
> Yes, but again this something that can only be done at the driver
> level
> so it doesn't solve the safe abstraction problem...
> 
> > > The firmware queue is itself reference counted and any firmware
> > > queue
> > > that has acquired an event notification resource (that is, which
> > > is busy
> > > with running or upcoming jobs) hands off a reference to itself
> > > into the
> > > event subsystem, so it can get notified of job completions by the
> > > firmware. Then once it becomes idle it unregisters itself, and at
> > > that
> > > point if it has no owning userspace queue, that would be the last
> > > reference and it gets dropped. So we don't tear down firmware
> > > queues
> > > until they are idle.
> > 
> > And could those fw queue not reference the scheduler?
> 
> Yes but again, that rule can't be encoded in the abstraction... so
> that
> makes it unsafe. The goal is to have a safe abstraction, which means
> that all the rules that you need to follow to avoid memory safety
> issues
> are checked by the Rust compiler.
> 
> > > I actually don't know of any way to actively abort jobs on the
> > > firmware,
> > > so this is pretty much the only option I have. I've even seen
> > > long-running compute jobs on macOS run to completion even if you
> > > kill
> > > the submitting process, so there might be no way to do this at
> > > all.
> > > Though in practice since we unmap everything from the VM anyway
> > > when the
> > > userspace stuff gets torn down, almost any normal GPU work is
> > > going to
> > > immediately fault at that point (macOS doesn't do this because
> > > macOS
> > > effectively does implicit sync with BO tracking at the kernel
> > > level...).
> > 
> > Oh, that is an interesting 

Re: [PATCH RFC 10/18] drm/scheduler: Add can_run_job callback

2023-03-09 Thread Faith Ekstrand
Jumping in here quick... (Sorry, I was out yesterday and was ignoring
my e-mail on Tuesday so I could finally type some compiler code.)

On Thu, 2023-03-09 at 18:14 +0900, Asahi Lina wrote:
> On 09/03/2023 17.05, Christian König wrote:
> > Am 09.03.23 um 07:30 schrieb Asahi Lina:
> > > On 09/03/2023 05.14, Christian König wrote:
> > > > > I think you mean wake_up_interruptible(). That would be
> > > > > drm_sched_job_done(), on the fence callback when a job
> > > > > completes, which
> > > > > as I keep saying is the same logic used for
> > > > > hw_rq_count/hw_submission_limit tracking.
> > > > As the documentation to wait_event says:
> > > > 
> > > >    * wake_up() has to be called after changing any variable
> > > > that could
> > > >    * change the result of the wait condition.
> > > > 
> > > > So what you essentially try to do here is to skip that and say
> > > > drm_sched_job_done() would call that anyway, but when you read
> > > > any
> > > > variable to determine that state then as far as I can see
> > > > nothing is
> > > > guarantying that order.
> > > The driver needs to guarantee that any changes to that state
> > > precede a
> > > job completion fence signal of course, that's the entire idea of
> > > the
> > > API. It's supposed to represent a check for per-scheduler (or
> > > more
> > > specific, but not more global) resources that are released on job
> > > completion. Of course if you misuse the API you could cause a
> > > problem,
> > > but what I'm trying to say is that the API as designed and when
> > > used as
> > > intended does work properly.
> > > 
> > > Put another way: job completions always need to cause the sched
> > > main
> > > loop to run an iteration anyway (otherwise we wouldn't make
> > > forward
> > > progress), and job completions are exactly the signal that the
> > > can_run_job() condition may have changed.
> > > 
> > > > The only other possibility how you could use the callback
> > > > correctly
> > > > would be to call drm_fence_is_signaled() to query the state of
> > > > your hw
> > > > submission from the same fence which is then signaled. But then
> > > > the
> > > > question is once more why you don't give that fence directly to
> > > > the
> > > > scheduler?
> > > But the driver is supposed to guarantee that the ordering is
> > > always 1.
> > > resources freed, 2. fence signaled. So you don't need to check
> > > for the
> > > fence, you can just check for the resource state.
> > 
> > Yeah, but this is exactly what the dma_fence framework tried to
> > prevent. 
> > We try very hard to avoid such side channel signaling :)
> 
> Right, and it's fine, I can use the fences directly easily enough.
> I'm
> just trying to explain why my original idea works too, even if it's
> not
> the best solution for other reasons!
> 
> Of course I don't have the context of what other drivers are doing or
> did historically and what the pitfalls are, so I can't know what the
> "right" solution for any of this is in that context. I did my best to
> understand the drm_sched code and come up with a solution that works
> (which it does) without any more info. When I saw the hw submission
> limit stuff, I thought "okay, I need the same thing but with slightly
> more complex logic, so let's add a callback so the driver can
> customize
> it and do its own inflight counting".

So, I think there's a difference here between "impossible to implement
correctly", "likely to be implemented correctly", and "impossible to
implement incorrectly".  It's obviously possible to implement
correctly.  You can just always return true or do exactly the same
check or do some simple thing where you can guarantee that it will only
ever return false when there's a bunch of other stuff in the queue. 
That doesn't mean that it's likely to be implemented correctly by some
other driver.  Some idiot will come along and try to take advantage of
it and cause themselves horrible problems.

And, to be clear, for the purposes of this discussion, we're ALL
idiots, myself included.  If there's one thing the DRM community has
learned over the years, it's that drivers are so complex that we all
turn into idiots at some point, relative to the complexity of the code
and hardware behavior.  That's why things like dma_fence are written so
incredibly defensively and why we're so harsh about the rules.  It's
the rules and not our individual smarts that keep us from making
mistakes.  (Kinda like Rust, in a way.)  So while I appreciate the
frustration of "I'm just trying to do something that's clearly correct
here", that doesn't mean that then next person to come by and fix a bug
by tweaking that callback isn't going to screw it up irreparably.  That
person may even be you in 6 to 12 months after this e-mail thread is a
distant memory.

So, yes, does the implementation you have today work without deadlocks
or starvation?  Maybe it does.  I've not verified.  Is the suggested
callback a giant foot-gun in the already treacherous