On Sat, Oct 25, 2025 at 02:04:09PM +0200, Thomas Hellström wrote:
> Use fds to represent pagemaps on foreign or local devices.
> The underlying files are opened at madvise() time and remain open
> as long as there are remaining madvises pointing to the
> foreign pagemap.
> 
> Extend the madvise preferred_location UAPI to support the region
> instance to identify the foreign placement.
> 
> Signed-off-by: Thomas Hellström <[email protected]>
> ---
>  drivers/gpu/drm/xe/xe_device.c     | 14 ++++++
>  drivers/gpu/drm/xe/xe_device.h     |  2 +
>  drivers/gpu/drm/xe/xe_svm.c        | 73 ++++++++++++++++++++++++++++++
>  drivers/gpu/drm/xe/xe_svm.h        |  7 +++
>  drivers/gpu/drm/xe/xe_vm_madvise.c | 72 ++++++++++++++++++++++++-----
>  include/uapi/drm/xe_drm.h          |  4 +-
>  6 files changed, 159 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/gpu/drm/xe/xe_device.c b/drivers/gpu/drm/xe/xe_device.c
> index ad004aab67ce..1a7502e4fc3e 100644
> --- a/drivers/gpu/drm/xe/xe_device.c
> +++ b/drivers/gpu/drm/xe/xe_device.c
> @@ -372,6 +372,20 @@ static const struct file_operations xe_driver_fops = {
>       .fop_flags = FOP_UNSIGNED_OFFSET,
>  };
>  
> +/**
> + * xe_is_xe_file() - Is the file an xe device file?
> + * @file: The file.
> + *
> + * Checks whether the file is opened against
> + * an xe device.
> + *
> + * Return: %true if an xe file, %false if not.
> + */
> +bool xe_is_xe_file(const struct file *file)
> +{
> +     return file->f_op == &xe_driver_fops;
> +}
> +
>  static struct drm_driver driver = {
>       /* Don't use MTRRs here; the Xserver or userspace app should
>        * deal with them for Intel hardware.
> diff --git a/drivers/gpu/drm/xe/xe_device.h b/drivers/gpu/drm/xe/xe_device.h
> index 32cc6323b7f6..475e2245c955 100644
> --- a/drivers/gpu/drm/xe/xe_device.h
> +++ b/drivers/gpu/drm/xe/xe_device.h
> @@ -195,6 +195,8 @@ void xe_file_put(struct xe_file *xef);
>  
>  int xe_is_injection_active(void);
>  
> +bool xe_is_xe_file(const struct file *file);
> +
>  /*
>   * Occasionally it is seen that the G2H worker starts running after a delay 
> of more than
>   * a second even after being queued and activated by the Linux workqueue 
> subsystem. This
> diff --git a/drivers/gpu/drm/xe/xe_svm.c b/drivers/gpu/drm/xe/xe_svm.c
> index 36a6ac293e71..9dd96dad2cca 100644
> --- a/drivers/gpu/drm/xe/xe_svm.c
> +++ b/drivers/gpu/drm/xe/xe_svm.c
> @@ -1763,6 +1763,73 @@ int xe_pagemap_cache_create(struct xe_tile *tile)
>       return 0;
>  }
>  
> +static struct drm_pagemap *xe_devmem_open(struct xe_device *xe, u32 
> region_instance)
> +{
> +     u32 tile_id = region_instance - 1;
> +     struct xe_pagemap *xpagemap;
> +     struct xe_vram_region *vr;
> +
> +     if (tile_id >= xe->info.tile_count)
> +             return ERR_PTR(-ENOENT);
> +
> +     if (!((BIT(tile_id) << 1) & xe->info.mem_region_mask))
> +             return ERR_PTR(-ENOENT);
> +
> +     vr = xe_tile_to_vr(&xe->tiles[tile_id]);
> +     xpagemap = xe_pagemap_find_or_create(xe, vr->dpagemap_cache, vr);

This is from a different patch, but I was trying to trace where the
reference drop to the drm_pagemap in xe_madvise_details_fini comes from.
I figured out it was from the function above, but I didn’t see anything
in the kernel documentation for xe_pagemap_find_or_create indicating
that it takes a reference to the drm_pagemap.

I’d suggest adding that for completeness.


> +     if (IS_ERR(xpagemap))
> +             return ERR_CAST(xpagemap);
> +
> +     return &xpagemap->dpagemap;
> +}
> +
> +/**
> + * xe_drm_pagemap_from_fd() - Return a drm_pagemap pointer from a
> + * (file_descriptor, region_instance) pair.
> + * @fd: An fd opened against an xe device.
> + * @region_instance: The region instance representing the device memory
> + * on the opened xe device.
> + *
> + * Opens a struct drm_pagemap pointer on the
> + * indicated device and region_instance.
> + *
> + * Return: A reference-counted struct drm_pagemap pointer on success,
> + * negative error pointer on failure.
> + */
> +struct drm_pagemap *xe_drm_pagemap_from_fd(int fd, u32 region_instance)
> +{
> +     struct drm_pagemap *dpagemap;
> +     struct file *file;
> +     struct drm_file *fpriv;
> +     struct drm_device *drm;
> +     int idx;
> +
> +     if (fd <= 0)
> +             return ERR_PTR(-EINVAL);
> +
> +     file = fget(fd);
> +     if (!file)
> +             return ERR_PTR(-ENOENT);
> +
> +     if (!xe_is_xe_file(file)) {
> +             dpagemap = ERR_PTR(-ENOENT);
> +             goto out;
> +     }
> +
> +     fpriv = file->private_data;
> +     drm = fpriv->minor->dev;
> +     if (!drm_dev_enter(drm, &idx)) {
> +             dpagemap = ERR_PTR(-ENODEV);
> +             goto out;
> +     }
> +
> +     dpagemap = xe_devmem_open(to_xe_device(drm), region_instance);
> +     drm_dev_exit(idx);
> +out:
> +     fput(file);
> +     return dpagemap;
> +}
> +
>  #else
>  
>  int xe_pagemap_shrinker_create(struct xe_device *xe)
> @@ -1786,6 +1853,12 @@ struct drm_pagemap *xe_vma_resolve_pagemap(struct 
> xe_vma *vma, struct xe_tile *t
>  {
>       return NULL;
>  }
> +
> +struct drm_pagemap *xe_drm_pagemap_from_fd(int fd, u32 region_instance)
> +{
> +     return ERR_PTR(-ENOENT);
> +}
> +
>  #endif
>  
>  /**
> diff --git a/drivers/gpu/drm/xe/xe_svm.h b/drivers/gpu/drm/xe/xe_svm.h
> index c7027facf6e9..7cd7932f56c8 100644
> --- a/drivers/gpu/drm/xe/xe_svm.h
> +++ b/drivers/gpu/drm/xe/xe_svm.h
> @@ -187,6 +187,8 @@ int xe_pagemap_shrinker_create(struct xe_device *xe);
>  
>  int xe_pagemap_cache_create(struct xe_tile *tile);
>  
> +struct drm_pagemap *xe_drm_pagemap_from_fd(int fd, u32 region_instance);
> +
>  #else
>  #include <linux/interval_tree.h>
>  #include "xe_vm.h"
> @@ -378,6 +380,11 @@ static inline int xe_pagemap_cache_create(struct xe_tile 
> *tile)
>       return 0;
>  }
>  
> +static inline struct drm_pagemap *xe_drm_pagemap_from_fd(int fd, u32 
> region_instance)
> +{
> +     return ERR_PTR(-ENOENT);
> +}
> +
>  #define xe_svm_range_has_dma_mapping(...) false
>  #endif /* CONFIG_DRM_XE_GPUSVM */
>  
> diff --git a/drivers/gpu/drm/xe/xe_vm_madvise.c 
> b/drivers/gpu/drm/xe/xe_vm_madvise.c
> index d6f47c8e146d..d03d052fcc44 100644
> --- a/drivers/gpu/drm/xe/xe_vm_madvise.c
> +++ b/drivers/gpu/drm/xe/xe_vm_madvise.c
> @@ -22,6 +22,19 @@ struct xe_vmas_in_madvise_range {
>       bool has_svm_userptr_vmas;
>  };
>  
> +/**
> + * struct xe_madvise_details - Argument to madvise_funcs
> + * @dpagemap: Reference-counted pointer to a struct drm_pagemap.
> + *
> + * The madvise IOCTL handler may, in addition to the user-space
> + * args, have additional info to pass into the madvise_func that
> + * handles the madvise type. Use a struct_xe_madvise_details
> + * for that and extend the struct as necessary.
> + */
> +struct xe_madvise_details {
> +     struct drm_pagemap *dpagemap;
> +};
> +
>  static int get_vmas(struct xe_vm *vm, struct xe_vmas_in_madvise_range 
> *madvise_range)
>  {
>       u64 addr = madvise_range->addr;
> @@ -74,7 +87,8 @@ static int get_vmas(struct xe_vm *vm, struct 
> xe_vmas_in_madvise_range *madvise_r
>  
>  static void madvise_preferred_mem_loc(struct xe_device *xe, struct xe_vm *vm,
>                                     struct xe_vma **vmas, int num_vmas,
> -                                   struct drm_xe_madvise *op)
> +                                   struct drm_xe_madvise *op,
> +                                   struct xe_madvise_details *details)
>  {
>       int i;
>  
> @@ -96,14 +110,18 @@ static void madvise_preferred_mem_loc(struct xe_device 
> *xe, struct xe_vm *vm,
>                        * is of no use and can be ignored.
>                        */
>                       loc->migration_policy = 
> op->preferred_mem_loc.migration_policy;
> +                     drm_pagemap_put(loc->dpagemap);
>                       loc->dpagemap = NULL;
> +                     if (details->dpagemap)
> +                             loc->dpagemap = 
> drm_pagemap_get(details->dpagemap);
>               }
>       }
>  }
>  
>  static void madvise_atomic(struct xe_device *xe, struct xe_vm *vm,
>                          struct xe_vma **vmas, int num_vmas,
> -                        struct drm_xe_madvise *op)
> +                        struct drm_xe_madvise *op,
> +                        struct xe_madvise_details *details)
>  {
>       struct xe_bo *bo;
>       int i;
> @@ -144,7 +162,8 @@ static void madvise_atomic(struct xe_device *xe, struct 
> xe_vm *vm,
>  
>  static void madvise_pat_index(struct xe_device *xe, struct xe_vm *vm,
>                             struct xe_vma **vmas, int num_vmas,
> -                           struct drm_xe_madvise *op)
> +                           struct drm_xe_madvise *op,
> +                           struct xe_madvise_details *details)
>  {
>       int i;
>  
> @@ -162,7 +181,8 @@ static void madvise_pat_index(struct xe_device *xe, 
> struct xe_vm *vm,
>  
>  typedef void (*madvise_func)(struct xe_device *xe, struct xe_vm *vm,
>                            struct xe_vma **vmas, int num_vmas,
> -                          struct drm_xe_madvise *op);
> +                          struct drm_xe_madvise *op,
> +                          struct xe_madvise_details *details);
>  
>  static const madvise_func madvise_funcs[] = {
>       [DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC] = madvise_preferred_mem_loc,
> @@ -250,9 +270,6 @@ static bool madvise_args_are_sane(struct xe_device *xe, 
> const struct drm_xe_madv
>                                    DRM_XE_MIGRATE_ONLY_SYSTEM_PAGES))
>                       return false;
>  
> -             if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.pad))
> -                     return false;
> -

Should we still reject region_instance if fd <=0 ?

>               if (XE_IOCTL_DBG(xe, args->preferred_mem_loc.reserved))
>                       return false;
>               break;
> @@ -296,6 +313,31 @@ static bool madvise_args_are_sane(struct xe_device *xe, 
> const struct drm_xe_madv
>       return true;
>  }
>  
> +static int xe_madvise_details_init(struct xe_device *xe, const struct 
> drm_xe_madvise *args,
> +                                struct xe_madvise_details *details)
> +{
> +     memset(details, 0, sizeof(*details));
> +
> +     if (args->type == DRM_XE_MEM_RANGE_ATTR_PREFERRED_LOC) {
> +             int fd = args->preferred_mem_loc.devmem_fd;
> +
> +             if (fd <= 0)
> +                     return 0;
> +

I think you need to santize 'args->preferred_mem_loc.region_instance'
somewhere and reject 0 (system memory) or xe_devmem_open is blow up as
tile_id will be -1 in that function.

> +             details->dpagemap = 
> xe_drm_pagemap_from_fd(args->preferred_mem_loc.devmem_fd,
> +                                                        
> args->preferred_mem_loc.region_instance);

You have local fd varibale here, but don't use it. Should we also have
local region_instance to avoid bigs wraps?

> +             if (XE_IOCTL_DBG(xe, IS_ERR(details->dpagemap)))
> +                     return PTR_ERR(details->dpagemap);
> +     }
> +
> +     return 0;
> +}
> +
> +static void xe_madvise_details_fini(struct xe_madvise_details *details)
> +{
> +     drm_pagemap_put(details->dpagemap);
> +}
> +
>  static bool check_bo_args_are_sane(struct xe_vm *vm, struct xe_vma **vmas,
>                                  int num_vmas, u32 atomic_val)
>  {
> @@ -349,6 +391,7 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void 
> *data, struct drm_file *fil
>       struct drm_xe_madvise *args = data;
>       struct xe_vmas_in_madvise_range madvise_range = {.addr = args->start,
>                                                        .range =  args->range, 
> };
> +     struct xe_madvise_details details;
>       struct xe_vm *vm;
>       struct drm_exec exec;
>       int err, attr_type;
> @@ -373,13 +416,17 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void 
> *data, struct drm_file *fil
>               goto unlock_vm;
>       }
>  
> -     err = xe_vm_alloc_madvise_vma(vm, args->start, args->range);
> +     err = xe_madvise_details_init(xe, args, &details);
>       if (err)
>               goto unlock_vm;
>  
> +     err = xe_vm_alloc_madvise_vma(vm, args->start, args->range);
> +     if (err)
> +             goto madv_fini;
> +
>       err = get_vmas(vm, &madvise_range);
>       if (err || !madvise_range.num_vmas)
> -             goto unlock_vm;
> +             goto madv_fini;
>  
>       if (madvise_range.has_bo_vmas) {
>               if (args->type == DRM_XE_MEM_RANGE_ATTR_ATOMIC) {
> @@ -387,7 +434,7 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void 
> *data, struct drm_file *fil
>                                                   madvise_range.num_vmas,
>                                                   args->atomic.val)) {
>                               err = -EINVAL;
> -                             goto unlock_vm;
> +                             goto madv_fini;
>                       }
>               }
>  
> @@ -413,7 +460,8 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void 
> *data, struct drm_file *fil
>       }
>  
>       attr_type = array_index_nospec(args->type, ARRAY_SIZE(madvise_funcs));
> -     madvise_funcs[attr_type](xe, vm, madvise_range.vmas, 
> madvise_range.num_vmas, args);
> +     madvise_funcs[attr_type](xe, vm, madvise_range.vmas, 
> madvise_range.num_vmas, args,
> +                              &details);
>  
>       err = xe_vm_invalidate_madvise_range(vm, args->start, args->start + 
> args->range);
>  
> @@ -425,6 +473,8 @@ int xe_vm_madvise_ioctl(struct drm_device *dev, void 
> *data, struct drm_file *fil
>               drm_exec_fini(&exec);
>       kfree(madvise_range.vmas);
>       madvise_range.vmas = NULL;
> +madv_fini:
> +     xe_madvise_details_fini(&details);
>  unlock_vm:
>       up_write(&vm->lock);
>  put_vm:
> diff --git a/include/uapi/drm/xe_drm.h b/include/uapi/drm/xe_drm.h
> index 47853659a705..c79de1019816 100644
> --- a/include/uapi/drm/xe_drm.h
> +++ b/include/uapi/drm/xe_drm.h
> @@ -2079,8 +2079,8 @@ struct drm_xe_madvise {
>                       /** @preferred_mem_loc.migration_policy: Page migration 
> policy */
>                       __u16 migration_policy;
>  
> -                     /** @preferred_mem_loc.pad : MBZ */
> -                     __u16 pad;
> +                     /** @preferred_mem_loc.region_instance : Region 
> instance */
> +                     __u16 region_instance;

I'd mention here this field is only relavent if devmem_fd > 0, perhaps a
little more for its usage. Also perhaps system_memory regions are not
allowed (assuming we land on that).

Matt

>  
>                       /** @preferred_mem_loc.reserved : Reserved */
>                       __u64 reserved;
> -- 
> 2.51.0
> 

Reply via email to