Re: [PATCH v2 4/4] eventfd: make eventfd_signal{_mask}() void

2023-11-22 Thread Jan Kara
On Wed 22-11-23 13:48:25, Christian Brauner wrote:
> No caller cares about the return value.
> 
> Signed-off-by: Christian Brauner 

Yup. Feel free to add:

Reviewed-by: Jan Kara 

Honza

> ---
>  fs/eventfd.c| 40 +++-
>  include/linux/eventfd.h | 16 +++-
>  2 files changed, 22 insertions(+), 34 deletions(-)
> 
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index a9a6de920fb4..13be2fb7fc96 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -43,10 +43,19 @@ struct eventfd_ctx {
>   int id;
>  };
>  
> -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
> +/**
> + * eventfd_signal_mask - Adds 1 to the eventfd counter.
> + * @ctx: [in] Pointer to the eventfd context.
> + * @mask: [in] poll mask
> + *
> + * This function is supposed to be called by the kernel in paths that do not
> + * allow sleeping. In this function we allow the counter to reach the
> + * ULLONG_MAX value, and we signal this as an overflow condition by returning
> + * EPOLLERR to poll(2).
> + */
> +void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
>  {
>   unsigned long flags;
> - __u64 n = 1;
>  
>   /*
>* Deadlock or stack overflow issues can happen if we recurse here
> @@ -57,37 +66,18 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
>* safe context.
>*/
>   if (WARN_ON_ONCE(current->in_eventfd))
> - return 0;
> + return;
>  
>   spin_lock_irqsave(&ctx->wqh.lock, flags);
>   current->in_eventfd = 1;
> - if (ULLONG_MAX - ctx->count < n)
> - n = ULLONG_MAX - ctx->count;
> - ctx->count += n;
> + if (ctx->count < ULLONG_MAX)
> + ctx->count++;
>   if (waitqueue_active(&ctx->wqh))
>   wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
>   current->in_eventfd = 0;
>   spin_unlock_irqrestore(&ctx->wqh.lock, flags);
> -
> - return n == 1;
> -}
> -
> -/**
> - * eventfd_signal - Adds @n to the eventfd counter.
> - * @ctx: [in] Pointer to the eventfd context.
> - *
> - * This function is supposed to be called by the kernel in paths that do not
> - * allow sleeping. In this function we allow the counter to reach the 
> ULLONG_MAX
> - * value, and we signal this as overflow condition by returning a EPOLLERR
> - * to poll(2).
> - *
> - * Returns the amount by which the counter was incremented.
> - */
> -__u64 eventfd_signal(struct eventfd_ctx *ctx)
> -{
> - return eventfd_signal_mask(ctx, 0);
>  }
> -EXPORT_SYMBOL_GPL(eventfd_signal);
> +EXPORT_SYMBOL_GPL(eventfd_signal_mask);
>  
>  static void eventfd_free_ctx(struct eventfd_ctx *ctx)
>  {
> diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
> index 4f8aac7eb62a..fea7c4eb01d6 100644
> --- a/include/linux/eventfd.h
> +++ b/include/linux/eventfd.h
> @@ -35,8 +35,7 @@ void eventfd_ctx_put(struct eventfd_ctx *ctx);
>  struct file *eventfd_fget(int fd);
>  struct eventfd_ctx *eventfd_ctx_fdget(int fd);
>  struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
> -__u64 eventfd_signal(struct eventfd_ctx *ctx);
> -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask);
> +void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask);
>  int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
> __u64 *cnt);
>  void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
> @@ -58,14 +57,8 @@ static inline struct eventfd_ctx *eventfd_ctx_fdget(int fd)
>   return ERR_PTR(-ENOSYS);
>  }
>  
> -static inline int eventfd_signal(struct eventfd_ctx *ctx)
> +static inline void eventfd_signal_mask(struct eventfd_ctx *ctx, unsigned mask)
>  {
> - return -ENOSYS;
> -}
> -
> -static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, unsigned mask)
> -{
> - return -ENOSYS;
>  }
>  
>  static inline void eventfd_ctx_put(struct eventfd_ctx *ctx)
> @@ -91,5 +84,10 @@ static inline void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
>  
>  #endif
>  
> +static inline void eventfd_signal(struct eventfd_ctx *ctx)
> +{
> + eventfd_signal_mask(ctx, 0);
> +}
> +
>  #endif /* _LINUX_EVENTFD_H */
>  
> 
> -- 
> 2.42.0
> 
-- 
Jan Kara 
SUSE Labs, CR
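
For illustration (a minimal userspace sketch, not from the thread): the
counter semantics the now-void eventfd_signal() provides are visible from
userspace, where write(2) adds to the counter just as eventfd_signal() adds
1, and read(2) returns and resets the accumulated value:

    #include <stdint.h>
    #include <stdio.h>
    #include <sys/eventfd.h>
    #include <unistd.h>

    int main(void)
    {
            int efd = eventfd(0, 0);        /* counter starts at 0 */
            uint64_t one = 1, out;

            /* Each write adds to the counter, like eventfd_signal() does
             * kernel-side; per the comment in the patch above, the kernel
             * lets the counter saturate at ULLONG_MAX and then reports
             * EPOLLERR from poll(2). */
            write(efd, &one, sizeof(one));
            write(efd, &one, sizeof(one));

            read(efd, &out, sizeof(out));   /* out == 2, counter resets */
            printf("counter was %llu\n", (unsigned long long)out);
            return 0;
    }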


Re: [PATCH v2 3/4] eventfd: simplify eventfd_signal_mask()

2023-11-22 Thread Jan Kara
On Wed 22-11-23 13:48:24, Christian Brauner wrote:
> The eventfd_signal_mask() helper was introduced for io_uring and, similar
> to eventfd_signal(), it always passed 1 for @n. So don't bother with that
> argument at all.
> 
> Signed-off-by: Christian Brauner 

Nice. Feel free to add:

Reviewed-by: Jan Kara 

Honza

> ---
>  fs/eventfd.c| 7 ---
>  include/linux/eventfd.h | 5 ++---
>  io_uring/io_uring.c | 4 ++--
>  3 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/fs/eventfd.c b/fs/eventfd.c
> index dc9e01053235..a9a6de920fb4 100644
> --- a/fs/eventfd.c
> +++ b/fs/eventfd.c
> @@ -43,9 +43,10 @@ struct eventfd_ctx {
>   int id;
>  };
>  
> -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
> +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
>  {
>   unsigned long flags;
> + __u64 n = 1;
>  
>   /*
>* Deadlock or stack overflow issues can happen if we recurse here
> @@ -68,7 +69,7 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
>   current->in_eventfd = 0;
>   spin_unlock_irqrestore(>wqh.lock, flags);
>  
> - return n;
> + return n == 1;
>  }
>  
>  /**
> @@ -84,7 +85,7 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
>   */
>  __u64 eventfd_signal(struct eventfd_ctx *ctx)
>  {
> - return eventfd_signal_mask(ctx, 1, 0);
> + return eventfd_signal_mask(ctx, 0);
>  }
>  EXPORT_SYMBOL_GPL(eventfd_signal);
>  
> diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
> index 562089431551..4f8aac7eb62a 100644
> --- a/include/linux/eventfd.h
> +++ b/include/linux/eventfd.h
> @@ -36,7 +36,7 @@ struct file *eventfd_fget(int fd);
>  struct eventfd_ctx *eventfd_ctx_fdget(int fd);
>  struct eventfd_ctx *eventfd_ctx_fileget(struct file *file);
>  __u64 eventfd_signal(struct eventfd_ctx *ctx);
> -__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask);
> +__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask);
>  int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
> __u64 *cnt);
>  void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt);
> @@ -63,8 +63,7 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx)
>   return -ENOSYS;
>  }
>  
> -static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n,
> -   unsigned mask)
> +static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, unsigned mask)
>  {
>   return -ENOSYS;
>  }
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index ed254076c723..70170a41eac4 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -558,7 +558,7 @@ static void io_eventfd_ops(struct rcu_head *rcu)
>   int ops = atomic_xchg(&ev_fd->ops, 0);
>  
>   if (ops & BIT(IO_EVENTFD_OP_SIGNAL_BIT))
> - eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
> + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
>  
>   /* IO_EVENTFD_OP_FREE_BIT may not be set here depending on callback
>* ordering in a race but if references are 0 we know we have to free
> @@ -594,7 +594,7 @@ static void io_eventfd_signal(struct io_ring_ctx *ctx)
>   goto out;
>  
>   if (likely(eventfd_signal_allowed())) {
> - eventfd_signal_mask(ev_fd->cq_ev_fd, 1, EPOLL_URING_WAKE);
> + eventfd_signal_mask(ev_fd->cq_ev_fd, EPOLL_URING_WAKE);
>   } else {
>   atomic_inc(&ev_fd->refs);
>   if (!atomic_fetch_or(BIT(IO_EVENTFD_OP_SIGNAL_BIT), &ev_fd->ops))
> 
> -- 
> 2.42.0
> 
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v2 2/4] eventfd: simplify eventfd_signal()

2023-11-22 Thread Jan Kara
On Wed 22-11-23 13:48:23, Christian Brauner wrote:
> Ever since the eventfd type was introduced back in 2007 in commit
> e1ad7468c77d ("signal/timer/event: eventfd core") the eventfd_signal()
> function only ever passed 1 as a value for @n. There's no point in
> keeping that additional argument.
> 
> Signed-off-by: Christian Brauner 

Looks good. Feel free to add:

Reviewed-by: Jan Kara 
Honza

> ---
>  arch/x86/kvm/hyperv.c |  2 +-
>  arch/x86/kvm/xen.c|  2 +-
>  drivers/accel/habanalabs/common/device.c  |  2 +-
>  drivers/fpga/dfl.c|  2 +-
>  drivers/gpu/drm/drm_syncobj.c |  6 +++---
>  drivers/gpu/drm/i915/gvt/interrupt.c  |  2 +-
>  drivers/infiniband/hw/mlx5/devx.c |  2 +-
>  drivers/misc/ocxl/file.c  |  2 +-
>  drivers/s390/cio/vfio_ccw_chp.c   |  2 +-
>  drivers/s390/cio/vfio_ccw_drv.c   |  4 ++--
>  drivers/s390/cio/vfio_ccw_ops.c   |  6 +++---
>  drivers/s390/crypto/vfio_ap_ops.c |  2 +-
>  drivers/usb/gadget/function/f_fs.c|  4 ++--
>  drivers/vdpa/vdpa_user/vduse_dev.c|  6 +++---
>  drivers/vfio/fsl-mc/vfio_fsl_mc_intr.c|  2 +-
>  drivers/vfio/pci/vfio_pci_core.c  |  6 +++---
>  drivers/vfio/pci/vfio_pci_intrs.c | 12 ++--
>  drivers/vfio/platform/vfio_platform_irq.c |  4 ++--
>  drivers/vhost/vdpa.c  |  4 ++--
>  drivers/vhost/vhost.c | 10 +-
>  drivers/vhost/vhost.h |  2 +-
>  drivers/virt/acrn/ioeventfd.c |  2 +-
>  drivers/xen/privcmd.c |  2 +-
>  fs/aio.c  |  2 +-
>  fs/eventfd.c  |  9 +++--
>  include/linux/eventfd.h   |  4 ++--
>  mm/memcontrol.c   | 10 +-
>  mm/vmpressure.c   |  2 +-
>  samples/vfio-mdev/mtty.c  |  4 ++--
>  virt/kvm/eventfd.c|  4 ++--
>  30 files changed, 60 insertions(+), 63 deletions(-)
> 
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index 238afd7335e4..4943f6b2bbee 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -2388,7 +2388,7 @@ static u16 kvm_hvcall_signal_event(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
>   if (!eventfd)
>   return HV_STATUS_INVALID_PORT_ID;
>  
> - eventfd_signal(eventfd, 1);
> + eventfd_signal(eventfd);
>   return HV_STATUS_SUCCESS;
>  }
>  
> diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c
> index e53fad915a62..523bb6df5ac9 100644
> --- a/arch/x86/kvm/xen.c
> +++ b/arch/x86/kvm/xen.c
> @@ -2088,7 +2088,7 @@ static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r)
>   if (ret < 0 && ret != -ENOTCONN)
>   return false;
>   } else {
> - eventfd_signal(evtchnfd->deliver.eventfd.ctx, 1);
> + eventfd_signal(evtchnfd->deliver.eventfd.ctx);
>   }
>  
>   *r = 0;
> diff --git a/drivers/accel/habanalabs/common/device.c b/drivers/accel/habanalabs/common/device.c
> index 9711e8fc979d..3a89644f087c 100644
> --- a/drivers/accel/habanalabs/common/device.c
> +++ b/drivers/accel/habanalabs/common/device.c
> @@ -2044,7 +2044,7 @@ static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask)
>   notifier_event->events_mask |= event_mask;
>  
>   if (notifier_event->eventfd)
> - eventfd_signal(notifier_event->eventfd, 1);
> + eventfd_signal(notifier_event->eventfd);
>  
>   mutex_unlock(&notifier_event->lock);
>  }
> diff --git a/drivers/fpga/dfl.c b/drivers/fpga/dfl.c
> index dd7a783d53b5..e73f88050f08 100644
> --- a/drivers/fpga/dfl.c
> +++ b/drivers/fpga/dfl.c
> @@ -1872,7 +1872,7 @@ static irqreturn_t dfl_irq_handler(int irq, void *arg)
>  {
>   struct eventfd_ctx *trigger = arg;
>  
> - eventfd_signal(trigger, 1);
> + eventfd_signal(trigger);
>   return IRQ_HANDLED;
>  }
>  
> diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c
> index 01da6789d044..b9cc62982196 100644
> --- a/drivers/gpu/drm/drm_syncobj.c
> +++ b/drivers/gpu/drm/drm_syncobj.c
> @@ -1365,7 +1365,7 @@ static void syncobj_eventfd_entry_fence_func(struct dma_fence *fence,
>   struct syncobj_eventfd_entry *entry =
>   container_of(cb, struct syncobj_eventfd_entry, fence_cb);
>  
> - eventfd_signal(entry->ev_fd_ctx, 1);
> + eventfd_signal(entry->ev_fd_ctx);

Re: [PATCH v2 1/4] i915: make inject_virtual_interrupt() void

2023-11-22 Thread Jan Kara
On Wed 22-11-23 13:48:22, Christian Brauner wrote:
> The single caller of inject_virtual_interrupt() ignores the return value
> anyway. This allows us to simplify eventfd_signal() in follow-up
> patches.
> 
> Signed-off-by: Christian Brauner 

Looks good. Feel free to add:

Reviewed-by: Jan Kara 

Honza

> ---
>  drivers/gpu/drm/i915/gvt/interrupt.c | 14 +++---
>  1 file changed, 7 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/gpu/drm/i915/gvt/interrupt.c b/drivers/gpu/drm/i915/gvt/interrupt.c
> index de3f5903d1a7..9665876b4b13 100644
> --- a/drivers/gpu/drm/i915/gvt/interrupt.c
> +++ b/drivers/gpu/drm/i915/gvt/interrupt.c
> @@ -422,7 +422,7 @@ static void init_irq_map(struct intel_gvt_irq *irq)
>  #define MSI_CAP_DATA(offset) (offset + 8)
>  #define MSI_CAP_EN 0x1
>  
> -static int inject_virtual_interrupt(struct intel_vgpu *vgpu)
> +static void inject_virtual_interrupt(struct intel_vgpu *vgpu)
>  {
>   unsigned long offset = vgpu->gvt->device_info.msi_cap_offset;
>   u16 control, data;
> @@ -434,10 +434,10 @@ static int inject_virtual_interrupt(struct intel_vgpu *vgpu)
>  
>   /* Do not generate MSI if MSIEN is disabled */
>   if (!(control & MSI_CAP_EN))
> - return 0;
> + return;
>  
>   if (WARN(control & GENMASK(15, 1), "only support one MSI format\n"))
> - return -EINVAL;
> + return;
>  
>   trace_inject_msi(vgpu->id, addr, data);
>  
> @@ -451,10 +451,10 @@ static int inject_virtual_interrupt(struct intel_vgpu *vgpu)
>* returned and don't inject interrupt into guest.
>*/
>   if (!test_bit(INTEL_VGPU_STATUS_ATTACHED, vgpu->status))
> - return -ESRCH;
> - if (vgpu->msi_trigger && eventfd_signal(vgpu->msi_trigger, 1) != 1)
> - return -EFAULT;
> - return 0;
> + return;
> + if (!vgpu->msi_trigger)
> + return;
> + eventfd_signal(vgpu->msi_trigger, 1);
>  }
>  
>  static void propagate_event(struct intel_gvt_irq *irq,
> 
> -- 
> 2.42.0
> 
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v2] fs: clean up usage of noop_dirty_folio

2023-08-28 Thread Jan Kara
On Mon 28-08-23 15:54:49, Xueshi Hu wrote:
> In folio_mark_dirty(), it can automatically fall back to
> noop_dirty_folio() if a_ops->dirty_folio is not registered.
> 
> As anon_aops, dev_dax_aops and fb_deferred_io_aops become empty, remove
> them too.
> 
> Signed-off-by: Xueshi Hu 

Looks good to me. Feel free to add:

Reviewed-by: Jan Kara 

Honza

> ---
> Changes in v2:
> - make noop_dirty_folio() inline as suggested by Matthew
> - v1: 
> https://lore.kernel.org/linux-mm/zoxafrz9etovu...@infradead.org/T/#m073d45909b1df03ff09f382557dc4e84d0607c49
> 
>  drivers/dax/device.c|  5 -
>  drivers/video/fbdev/core/fb_defio.c |  5 -
>  fs/aio.c|  1 -
>  fs/ext2/inode.c |  1 -
>  fs/ext4/inode.c |  1 -
>  fs/fuse/dax.c   |  1 -
>  fs/hugetlbfs/inode.c|  1 -
>  fs/libfs.c  |  5 -
>  fs/xfs/xfs_aops.c   |  1 -
>  include/linux/pagemap.h |  1 -
>  mm/page-writeback.c | 18 +-
>  mm/secretmem.c  |  1 -
>  mm/shmem.c  |  1 -
>  mm/swap_state.c |  1 -
>  14 files changed, 5 insertions(+), 38 deletions(-)
> 
> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index 30665a3ff6ea..018aa9f88ec7 100644
> --- a/drivers/dax/device.c
> +++ b/drivers/dax/device.c
> @@ -345,10 +345,6 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
>   return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
>  }
>  
> -static const struct address_space_operations dev_dax_aops = {
> - .dirty_folio= noop_dirty_folio,
> -};
> -
>  static int dax_open(struct inode *inode, struct file *filp)
>  {
>   struct dax_device *dax_dev = inode_dax(inode);
> @@ -358,7 +354,6 @@ static int dax_open(struct inode *inode, struct file *filp)
>   dev_dbg(&dev_dax->dev, "trace\n");
>   inode->i_mapping = __dax_inode->i_mapping;
>   inode->i_mapping->host = __dax_inode;
> - inode->i_mapping->a_ops = &dev_dax_aops;
>   filp->f_mapping = inode->i_mapping;
>   filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
>   filp->f_sb_err = file_sample_sb_err(filp);
> diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c
> index 274f5d0fa247..08be3592281f 100644
> --- a/drivers/video/fbdev/core/fb_defio.c
> +++ b/drivers/video/fbdev/core/fb_defio.c
> @@ -221,10 +221,6 @@ static const struct vm_operations_struct fb_deferred_io_vm_ops = {
>   .page_mkwrite   = fb_deferred_io_mkwrite,
>  };
>  
> -static const struct address_space_operations fb_deferred_io_aops = {
> - .dirty_folio= noop_dirty_folio,
> -};
> -
>  int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
>  {
>   vma->vm_ops = &fb_deferred_io_vm_ops;
> @@ -307,7 +303,6 @@ void fb_deferred_io_open(struct fb_info *info,
>  {
>   struct fb_deferred_io *fbdefio = info->fbdefio;
>  
> - file->f_mapping->a_ops = &fb_deferred_io_aops;
>   fbdefio->open_count++;
>  }
>  EXPORT_SYMBOL_GPL(fb_deferred_io_open);
> diff --git a/fs/aio.c b/fs/aio.c
> index 77e33619de40..4cf386f9cb1c 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -484,7 +484,6 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
>  #endif
>  
>  static const struct address_space_operations aio_ctx_aops = {
> - .dirty_folio= noop_dirty_folio,
>   .migrate_folio  = aio_migrate_folio,
>  };
>  
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index 75983215c7a1..ce191bdf1c78 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -971,7 +971,6 @@ const struct address_space_operations ext2_aops = {
>  static const struct address_space_operations ext2_dax_aops = {
>   .writepages = ext2_dax_writepages,
>   .direct_IO  = noop_direct_IO,
> - .dirty_folio= noop_dirty_folio,
>  };
>  
>  /*
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 43775a6ca505..67c1710c01b0 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3561,7 +3561,6 @@ static const struct address_space_operations ext4_da_aops = {
>  static const struct address_space_operations ext4_dax_aops = {
>   .writepages = ext4_dax_writepages,
>   .direct_IO  = noop_direct_IO,
> - .dirty_folio= noop_dirty_folio,
>   .bmap 
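
For context, a minimal sketch (not from the thread, and not the verbatim mm
code) of the fallback the changelog relies on: when no ->dirty_folio is
registered, folio_mark_dirty() degrades to just setting the dirty flag,
which is essentially all noop_dirty_folio() ever did:

    /* Sketch: noop_dirty_folio() is essentially !folio_test_set_dirty(). */
    static bool folio_mark_dirty_model(struct address_space *mapping,
                                       struct folio *folio)
    {
            if (mapping && mapping->a_ops->dirty_folio)
                    return mapping->a_ops->dirty_folio(mapping, folio);
            return !folio_test_set_dirty(folio);
    }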

Re: [PATCH] fs: clean up usage of noop_dirty_folio

2023-08-21 Thread Jan Kara
On Sat 19-08-23 20:42:25, Xueshi Hu wrote:
> In folio_mark_dirty(), it will automatically fall back to
> noop_dirty_folio() if a_ops->dirty_folio is not registered.
> 
> As anon_aops, dev_dax_aops and fb_deferred_io_aops become empty, remove
> them too.
> 
> Signed-off-by: Xueshi Hu 

Yeah, looks sensible to me but for some callbacks we are oscillating between
all users having to provide some callback and providing some default
behavior for NULL callback. I don't have a strong opinion either way so
feel free to add:

Reviewed-by: Jan Kara 

But I guess let's see what Matthew thinks about this and what plans he has
so that we don't switch back again in the near future. Matthew?

Honza

> ---
>  drivers/dax/device.c| 5 -
>  drivers/video/fbdev/core/fb_defio.c | 5 -
>  fs/aio.c| 1 -
>  fs/ext2/inode.c | 1 -
>  fs/ext4/inode.c | 1 -
>  fs/fuse/dax.c   | 1 -
>  fs/hugetlbfs/inode.c| 1 -
>  fs/libfs.c  | 5 -
>  fs/xfs/xfs_aops.c   | 1 -
>  include/linux/pagemap.h | 1 -
>  mm/page-writeback.c | 6 +++---
>  mm/secretmem.c  | 1 -
>  mm/shmem.c  | 1 -
>  mm/swap_state.c | 1 -
>  14 files changed, 3 insertions(+), 28 deletions(-)
> 
> diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> index 30665a3ff6ea..018aa9f88ec7 100644
> --- a/drivers/dax/device.c
> +++ b/drivers/dax/device.c
> @@ -345,10 +345,6 @@ static unsigned long dax_get_unmapped_area(struct file *filp,
>   return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
>  }
>  
> -static const struct address_space_operations dev_dax_aops = {
> - .dirty_folio= noop_dirty_folio,
> -};
> -
>  static int dax_open(struct inode *inode, struct file *filp)
>  {
>   struct dax_device *dax_dev = inode_dax(inode);
> @@ -358,7 +354,6 @@ static int dax_open(struct inode *inode, struct file *filp)
>   dev_dbg(&dev_dax->dev, "trace\n");
>   inode->i_mapping = __dax_inode->i_mapping;
>   inode->i_mapping->host = __dax_inode;
> - inode->i_mapping->a_ops = &dev_dax_aops;
>   filp->f_mapping = inode->i_mapping;
>   filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
>   filp->f_sb_err = file_sample_sb_err(filp);
> diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c
> index 274f5d0fa247..08be3592281f 100644
> --- a/drivers/video/fbdev/core/fb_defio.c
> +++ b/drivers/video/fbdev/core/fb_defio.c
> @@ -221,10 +221,6 @@ static const struct vm_operations_struct fb_deferred_io_vm_ops = {
>   .page_mkwrite   = fb_deferred_io_mkwrite,
>  };
>  
> -static const struct address_space_operations fb_deferred_io_aops = {
> - .dirty_folio= noop_dirty_folio,
> -};
> -
>  int fb_deferred_io_mmap(struct fb_info *info, struct vm_area_struct *vma)
>  {
>   vma->vm_ops = &fb_deferred_io_vm_ops;
> @@ -307,7 +303,6 @@ void fb_deferred_io_open(struct fb_info *info,
>  {
>   struct fb_deferred_io *fbdefio = info->fbdefio;
>  
> - file->f_mapping->a_ops = &fb_deferred_io_aops;
>   fbdefio->open_count++;
>  }
>  EXPORT_SYMBOL_GPL(fb_deferred_io_open);
> diff --git a/fs/aio.c b/fs/aio.c
> index 77e33619de40..4cf386f9cb1c 100644
> --- a/fs/aio.c
> +++ b/fs/aio.c
> @@ -484,7 +484,6 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
>  #endif
>  
>  static const struct address_space_operations aio_ctx_aops = {
> - .dirty_folio= noop_dirty_folio,
>   .migrate_folio  = aio_migrate_folio,
>  };
>  
> diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
> index 75983215c7a1..ce191bdf1c78 100644
> --- a/fs/ext2/inode.c
> +++ b/fs/ext2/inode.c
> @@ -971,7 +971,6 @@ const struct address_space_operations ext2_aops = {
>  static const struct address_space_operations ext2_dax_aops = {
>   .writepages = ext2_dax_writepages,
>   .direct_IO  = noop_direct_IO,
> - .dirty_folio= noop_dirty_folio,
>  };
>  
>  /*
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 43775a6ca505..67c1710c01b0 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3561,7 +3561,6 @@ static const struct address_space_operations ext4_da_aops = {
>  static const struct address_space_operations ext4_dax_aops = {
>   .writepages = ext4_dax_writepages,
>   .direct_IO  = noop_direct_IO

Re: [PATCH 09/10] udf: Replace license notice with SPDX identifier

2023-05-12 Thread Jan Kara
> -/* Copyright (C) 1993, 1994, 1995, 1996, 1997 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -   Contributed by Paul Eggert (eggert@twinsun.com).
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Library General Public License as
> -   published by the Free Software Foundation; either version 2 of the
> -   License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Library General Public License for more details.
> -
> -   You should have received a copy of the GNU Library General Public
> -   License along with the GNU C Library; see the file COPYING.LIB.  If not,
> -   write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
> -   Boston, MA 02111-1307, USA.  */
> +/* SPDX-License-Identifier: GPL-2.0-only */
>  
>  /*
>   * dgb 10/02/98: ripped this from glibc source to help convert timestamps
> diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
> index 622569007b530b..5d6b66e15fcded 100644
> --- a/fs/udf/unicode.c
> +++ b/fs/udf/unicode.c
> @@ -1,3 +1,4 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
>  /*
>   * unicode.c
>   *
> @@ -11,11 +12,6 @@
>   *   UTF-8 is explained in the IETF RFC .
>   *   ftp://ftp.internic.net/rfc/rfc.txt
>   *
> - * COPYRIGHT
> - *   This file is distributed under the terms of the GNU General Public
> - *   License (GPL). Copies of the GPL can be obtained from:
> - *   ftp://prep.ai.mit.edu/pub/gnu/GPL
> - *   Each contributing author retains all rights to their own work.
>   */
>  
>  #include "udfdecl.h"
> -- 
> An old man doll... just what I always wanted! - Clara
> 
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v2 3/5] treewide: use get_random_u32() when possible

2022-10-06 Thread Jan Kara
On Thu 06-10-22 07:25:08, Jason A. Donenfeld wrote:
> The prandom_u32() function has been a deprecated inline wrapper around
> get_random_u32() for several releases now, and compiles down to the
> exact same code. Replace the deprecated wrapper with a direct call to
> the real function.
> 
> Reviewed-by: Kees Cook 
> Signed-off-by: Jason A. Donenfeld 

Looks good. Feel free to add:

Reviewed-by: Jan Kara 

for the ext4 bits.

        Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v2 1/5] treewide: use prandom_u32_max() when possible

2022-10-06 Thread Jan Kara
On Thu 06-10-22 07:25:06, Jason A. Donenfeld wrote:
> Rather than incurring a division or requesting too many random bytes for
> the given range, use the prandom_u32_max() function, which only takes
> the minimum required bytes from the RNG and avoids divisions.
> 
> Reviewed-by: Kees Cook 
> Reviewed-by: KP Singh 
> Reviewed-by: Christoph Böhmwalder 
> Signed-off-by: Jason A. Donenfeld 

Feel free to add:

Reviewed-by: Jan Kara 

for the ext2, ext4, and lib/sbitmap.c bits.

        Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v1 3/5] treewide: use get_random_u32() when possible

2022-10-06 Thread Jan Kara
On Wed 05-10-22 23:48:42, Jason A. Donenfeld wrote:
> The prandom_u32() function has been a deprecated inline wrapper around
> get_random_u32() for several releases now, and compiles down to the
> exact same code. Replace the deprecated wrapper with a direct call to
> the real function.
> 
> Signed-off-by: Jason A. Donenfeld 

...

> diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
> index 998dd2ac8008..e439a872c398 100644
> --- a/fs/ext2/ialloc.c
> +++ b/fs/ext2/ialloc.c
> @@ -277,7 +277,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent)
>   int best_ndir = inodes_per_group;
>   int best_group = -1;
>  
> - group = prandom_u32();
> + group = get_random_u32();
>   parent_group = (unsigned)group % ngroups;
>   for (i = 0; i < ngroups; i++) {
>   group = (parent_group + i) % ngroups;

The code here is effectively doing the

parent_group = prandom_u32_max(ngroups);

> diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
> index f73e5eb43eae..954ec9736a8d 100644
> --- a/fs/ext4/ialloc.c
> +++ b/fs/ext4/ialloc.c
> @@ -465,7 +465,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
>   ext4fs_dirhash(parent, qstr->name, qstr->len, &hinfo);
>   grp = hinfo.hash;
>   } else
> - grp = prandom_u32();
> + grp = get_random_u32();

Similarly here we can use prandom_u32_max(ngroups) like:

if (qstr) {
...
parent_group = hinfo.hash % ngroups;
} else
parent_group = prandom_u32_max(ngroups);

> diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
> index 9af68a7ecdcf..588cb09c5291 100644
> --- a/fs/ext4/mmp.c
> +++ b/fs/ext4/mmp.c
> @@ -265,7 +265,7 @@ static unsigned int mmp_new_seq(void)
>   u32 new_seq;
>  
>   do {
> - new_seq = prandom_u32();
> + new_seq = get_random_u32();
>   } while (new_seq > EXT4_MMP_SEQ_MAX);

OK, here we again effectively implement prandom_u32_max(EXT4_MMP_SEQ_MAX + 1).
Just presumably we didn't want to use modulo here because EXT4_MMP_SEQ_MAX
is rather big and so the resulting 'new_seq' would be seriously
non-uniform.
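
To make the bias concrete, a minimal sketch (the bound is hypothetical and
merely stands in for EXT4_MMP_SEQ_MAX; arc4random() stands in for
get_random_u32()): reducing a uniform u32 modulo a bound M close to 2^32
makes the lowest 2^32 mod M values twice as likely, which is why the
rejection loop is the right tool here:

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define SEQ_MAX 0xE0000000u     /* hypothetical, near UINT32_MAX */

    int main(void)
    {
            /* u % SEQ_MAX folds the top 2^32 - SEQ_MAX outputs back onto
             * [0, 2^32 % SEQ_MAX), so those values occur twice as often. */
            printf("doubled range: [0, 0x%x)\n",
                   (unsigned)(0x100000000ull % SEQ_MAX));

            /* The mmp_new_seq()-style rejection loop has no such bias: */
            uint32_t seq;
            do
                    seq = arc4random();     /* glibc >= 2.36 / BSD */
            while (seq > SEQ_MAX);
            printf("seq = 0x%x\n", seq);
            return 0;
    }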

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: Report 2 in ext4 and journal based on v5.17-rc1

2022-03-03 Thread Jan Kara
On Thu 03-03-22 10:00:33, Byungchul Park wrote:
> On Mon, Feb 28, 2022 at 11:14:44AM +0100, Jan Kara wrote:
> > On Mon 28-02-22 18:28:26, Byungchul Park wrote:
> > > case 1. Code with an actual circular dependency, but not deadlock.
> > > 
> > >A circular dependency can be broken by a rescue wakeup source e.g.
> > >timeout. It's not a deadlock. If it's okay that the contexts
> > >participating in the circular dependency and others waiting for the
> > >events in the circle are stuck until it gets broken. Otherwise, say,
> > >if it's not meant, then it's anyway problematic.
> > > 
> > >1-1. What if we judge this code is problematic?
> > >1-2. What if we judge this code is good?
> > > 
> > > case 2. Code with an actual circular dependency, and deadlock.
> > > 
> > >There's no other wakeup source than those within the circular
> > >dependency. Literally deadlock. It's problematic and critical.
> > > 
> > >2-1. What if we judge this code is problematic?
> > >2-2. What if we judge this code is good?
> > > 
> > > case 3. Code with no actual circular dependency, and not deadlock.
> > > 
> > >Must be good.
> > > 
> > >3-1. What if we judge this code is problematic?
> > >3-2. What if we judge this code is good?
> > > 
> > > ---
> > > 
> > > I call only 3-1 "false positive" circular dependency. And you call 1-1
> > > and 3-1 "false positive" deadlock.
> > > 
> > > I've been wondering if the kernel guys esp. Linus considers code with
> > > any circular dependency is problematic or not, even if it won't lead to
> > > a deadlock, say, case 1. Even though I designed Dept based on what I
> > > believe is right, of course, I'm willing to change the design according
> > > to the majority opinion.
> > > 
> > > However, I would never allow case 1 if I were the owner of the kernel
> > > for better stability, even though the code works anyway okay for now.
> > 
> > So yes, I call a report for the situation "There is circular dependency but
> > deadlock is not possible." a false positive. And that is because in my
> > opinion your definition of circular dependency includes schemes that are
> > useful and used in the kernel.
> > 
> > Your example in case 1 is kind of borderline (I personally would consider
> > that bug as well) but there are other more valid schemes with multiple
> > wakeup sources like:
> > 
> > We have a queue of work to do Q protected by lock L. Consumer process has
> > code like:
> > 
> > while (1) {
> > lock L
> > prepare_to_wait(work_queued);
> > if (no work) {
> > unlock L
> > sleep
> > } else {
> > unlock L
> > do work
> > wake_up(work_done)
> > }
> > }
> > 
> > AFAIU Dept will create dependency here that 'wakeup work_done' is after
> > 'wait for work_queued'. Producer has code like:
> 
> First of all, thank you for this good example.
> 
> > while (1) {
> > lock L
> > prepare_to_wait(work_done)
> > if (too much work queued) {
> > unlock L
> > sleep
> > } else {
> > queue work
> > unlock L
> > wake_up(work_queued)
> > }
> > }
> > 
> > And Dept will create dependency here that 'wakeup work_queued' is after
> > 'wait for work_done'. And thus we have a trivial cycle in the dependencies
> > despite the code being perfectly valid and safe.
> 
> Unfortunately, it's neither perfect nor safe without another wakeup
> source - rescue wakeup source.
> 
>    consumer                        producer
> 
>                                    lock L
>                                    (too much work queued == true)
>                                    unlock L
>                                    --- preempted
>    lock L
>    unlock L
>    do work
>    lock L
>    unlock L
>    do work
>    ...
>    (no work == true)
>    sleep
>                                    --- scheduled in
>                                    sleep
> 
> This code leads a deadlock without another wakeup source, say, not safe.

So the scenario you describe above is indeed possible. But the trick is
that the wakeup from 'consumer' as it is doing work will remove 'producer'
from the wait queue and change the 'producer' proc
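
The scheme Jan describes above can also be modeled in userspace (a sketch
with pthread condvars standing in for the kernel waitqueues and a mutex for
"lock L"; not kernel code). Each side waits on one source and wakes the
other, so the wait graph has a cycle, yet the lost wakeup in the preemption
scenario cannot happen: the condition is re-checked under the lock before
sleeping, just as prepare_to_wait() publishes the waiter before the final
check:

    #include <pthread.h>

    #define QMAX 4
    static pthread_mutex_t L = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t work_queued = PTHREAD_COND_INITIALIZER;
    static pthread_cond_t work_done = PTHREAD_COND_INITIALIZER;
    static int queued;

    static void *consumer(void *arg)
    {
            (void)arg;
            for (;;) {
                    pthread_mutex_lock(&L);
                    while (queued == 0)             /* "no work": sleep */
                            pthread_cond_wait(&work_queued, &L);
                    queued--;                       /* do work */
                    pthread_mutex_unlock(&L);
                    pthread_cond_signal(&work_done);
            }
            return NULL;
    }

    static void *producer(void *arg)
    {
            (void)arg;
            for (;;) {
                    pthread_mutex_lock(&L);
                    while (queued >= QMAX)          /* "too much work": sleep */
                            pthread_cond_wait(&work_done, &L);
                    queued++;                       /* queue work */
                    pthread_mutex_unlock(&L);
                    pthread_cond_signal(&work_queued);
            }
            return NULL;
    }

    int main(void)
    {
            pthread_t c, p;
            pthread_create(&c, NULL, consumer, NULL);
            pthread_create(&p, NULL, producer, NULL);
            pthread_join(c, NULL);                  /* runs until interrupted */
            return 0;
    }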

Re: Report 2 in ext4 and journal based on v5.17-rc1

2022-02-28 Thread Jan Kara
On Mon 28-02-22 18:28:26, Byungchul Park wrote:
> On Thu, Feb 24, 2022 at 11:22:39AM +0100, Jan Kara wrote:
> > On Thu 24-02-22 10:11:02, Byungchul Park wrote:
> > > On Wed, Feb 23, 2022 at 03:48:59PM +0100, Jan Kara wrote:
> > > > > KJOURNALD2(kthread)     TASK1(ksys_write)     TASK2(ksys_write)
> > > > > 
> > > > > wait A
> > > > > --- stuck
> > > > >                         wait B
> > > > >                         --- stuck
> > > > >                                               wait C
> > > > >                                               --- stuck
> > > > > 
> > > > > wake up B               wake up C             wake up A
> > > > > 
> > > > > where:
> > > > > A is a wait_queue, j_wait_commit
> > > > > B is a wait_queue, j_wait_transaction_locked
> > > > > C is a rwsem, mapping.invalidate_lock
> > > > 
> > > > I see. But a situation like this is not necessarily a guarantee of a
> > > > deadlock, is it? I mean there can be task D that will eventually call 
> > > > say
> > > > 'wake up B' and unblock everything and this is how things were designed 
> > > > to
> > > > work? Multiple sources of wakeups are quite common I'd say... What does
> > > 
> > > Yes. At the very beginning when I designed Dept, I was thinking whether
> > > to support multiple wakeup sources or not for a quite long time.
> > > Supporting it would be a better option to avoid non-critical reports.
> > > However, I thought anyway we'd better fix it - not urgent tho - if
> > > there's any single circle dependency. That's why I decided not to
> > > support it for now and wanted to gather the kernel guys' opinions. Thing
> > > is which policy we should go with.
> > 
> > I see. So supporting only a single wakeup source is fine for locks I guess.
> > But for general wait queues or other synchronization mechanisms, I'm afraid
> > it will lead to quite some false positive reports. Just my 2c.
> 
> Thank you for your feedback.
> 
> I realized we've been using "false positive" differently. There exist
> three types of code in terms of dependency and deadlock. It's worth
> noting that dependencies are built from between waits and events in Dept.
> 
> ---
> 
> case 1. Code with an actual circular dependency, but not deadlock.
> 
>A circular dependency can be broken by a rescue wakeup source e.g.
>timeout. It's not a deadlock. If it's okay that the contexts
>participating in the circular dependency and others waiting for the
>events in the circle are stuck until it gets broken. Otherwise, say,
>if it's not meant, then it's anyway problematic.
> 
>1-1. What if we judge this code is problematic?
>1-2. What if we judge this code is good?
> 
> case 2. Code with an actual circular dependency, and deadlock.
> 
>There's no other wakeup source than those within the circular
>dependency. Literally deadlock. It's problematic and critical.
> 
>2-1. What if we judge this code is problematic?
>2-2. What if we judge this code is good?
> 
> case 3. Code with no actual circular dependency, and not deadlock.
> 
>Must be good.
> 
>3-1. What if we judge this code is problematic?
>3-2. What if we judge this code is good?
> 
> ---
> 
> I call only 3-1 "false positive" circular dependency. And you call 1-1
> and 3-1 "false positive" deadlock.
> 
> I've been wondering if the kernel guys esp. Linus considers code with
> any circular dependency is problematic or not, even if it won't lead to
> a deadlock, say, case 1. Even though I designed Dept based on what I
> believe is right, of course, I'm willing to change the design according
> to the majority opinion.
> 
> However, I would never allow case 1 if I were the owner of the kernel
> for better stability, even though the code works anyway okay for now.

So yes, I call a report for the situation "There is circular dependency but
deadlock is not possible." a false positive. And that is because in my
opinion your definition of circular dependency includes schemes that are
useful and used in the kernel.

Your example in case 1 is kind of borderline (I personally would consider
that bug as well) but there are other more valid schemes with multiple
wakeup sources like:

We have a queue of work to do Q protected by lock L. Consumer process has
code like:

while (1) {
lock L
prepare_to_wait(work_queued);
if (no work) {
unlock L
 

Re: Report 2 in ext4 and journal based on v5.17-rc1

2022-02-24 Thread Jan Kara
On Thu 24-02-22 10:11:02, Byungchul Park wrote:
> On Wed, Feb 23, 2022 at 03:48:59PM +0100, Jan Kara wrote:
> > > KJOURNALD2(kthread)     TASK1(ksys_write)     TASK2(ksys_write)
> > > 
> > > wait A
> > > --- stuck
> > >                         wait B
> > >                         --- stuck
> > >                                               wait C
> > >                                               --- stuck
> > > 
> > > wake up B               wake up C             wake up A
> > > 
> > > where:
> > > A is a wait_queue, j_wait_commit
> > > B is a wait_queue, j_wait_transaction_locked
> > > C is a rwsem, mapping.invalidate_lock
> > 
> > I see. But a situation like this is not necessarily a guarantee of a
> > deadlock, is it? I mean there can be task D that will eventually call say
> > 'wake up B' and unblock everything and this is how things were designed to
> > work? Multiple sources of wakeups are quite common I'd say... What does
> 
> Yes. At the very beginning when I designed Dept, I was thinking whether
> to support multiple wakeup sources or not for a quite long time.
> Supporting it would be a better option to avoid non-critical reports.
> However, I thought anyway we'd better fix it - not urgent tho - if
> there's any single circle dependency. That's why I decided not to
> support it for now and wanted to gather the kernel guys' opinions. Thing
> is which policy we should go with.

I see. So supporting only a single wakeup source is fine for locks I guess.
But for general wait queues or other synchronization mechanisms, I'm afraid
it will lead to quite some false positive reports. Just my 2c.

> > Dept do to prevent false reports in cases like this?
> > 
> > > The above is the simplest form. And it's worth noting that Dept focuses
> > > on wait and event itself rather than grabbing and releasing things like
> > > lock. The following is the more descriptive form of it.
> > > 
> > > KJOURNALD2(kthread)   TASK1(ksys_write)   TASK2(ksys_write)
> > > 
> > > wait @j_wait_commit
> > >   ext4_truncate_failed_write()
> > >  down_write(mapping.invalidate_lock)
> > > 
> > >  ext4_truncate()
> > > ...
> > > wait @j_wait_transaction_locked
> > > 
> > >   ext4_truncate_failed_write()
> > >  
> > > down_write(mapping.invalidate_lock)
> > > 
> > >   ext4_should_retry_alloc()
> > >  ...
> > >  __jbd2_log_start_commit()
> > > wake_up(j_wait_commit)
> > > jbd2_journal_commit_transaction()
> > >wake_up(j_wait_transaction_locked)
> > >  up_write(mapping.invalidate_lock)
> > > 
> > > I hope this would help you understand the report.
> > 
> > I see, thanks for explanation! So the above scenario is impossible because
> 
> My pleasure.
> 
> > for anyone to block on @j_wait_transaction_locked the transaction must be
> > committing, which is done only by kjournald2 kthread and so that thread
> > cannot be waiting at @j_wait_commit. Essentially blocking on
> > @j_wait_transaction_locked means @j_wait_commit wakeup was already done.
> 
> kjournald2 repeatedly does the wait and the wake_up so the above scenario
> looks possible to me even based on what you explained. Maybe I should
> understand how the journal things work more for further discussion. Your
> explanation is so helpful. Thank you really.

OK, let me provide you with more details for better understanding :) In
jbd2 we have an object called 'transaction'. This object can go through
many states but for our case is important that transaction is moved to
T_LOCKED state and out of it only while jbd2_journal_commit_transaction()
function is executing and waiting on j_wait_transaction_locked waitqueue is
exactly waiting for a transaction to get out of T_LOCKED state. Function
jbd2_journal_commit_transaction() is executed only by kjournald. Hence
anyone can see transaction in T_LOCKED state only if kjournald is running
inside jbd2_journal_commit_transaction() and thus kjournald cannot be
sleeping on j_wait_commit at the same time. Does this explain things?
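
A userspace model of that invariant (a sketch; pthreads stand in for jbd2's
j_state_lock and waitqueues, and the names are illustrative, not the jbd2
API). Only the committer ever sets the locked state, and it never waits for
a commit request while the state is set - so a writer blocked in
wait_transaction_unlocked() implies the committer is currently awake:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t j_state_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t j_wait_transaction_locked = PTHREAD_COND_INITIALIZER;
    static bool t_locked;

    static void commit_transaction(void)    /* committer (kjournald2) only */
    {
            pthread_mutex_lock(&j_state_lock);
            t_locked = true;
            /* ... drain running handles; never sleeps waiting for a
             * commit request (j_wait_commit) while t_locked is set ... */
            t_locked = false;
            pthread_cond_broadcast(&j_wait_transaction_locked);
            pthread_mutex_unlock(&j_state_lock);
    }

    static void wait_transaction_unlocked(void)     /* any writer */
    {
            pthread_mutex_lock(&j_state_lock);
            while (t_locked)
                    pthread_cond_wait(&j_wait_transaction_locked,
                                      &j_state_lock);
            pthread_mutex_unlock(&j_state_lock);
    }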

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: Report 2 in ext4 and journal based on v5.17-rc1

2022-02-23 Thread Jan Kara
On Wed 23-02-22 09:35:34, Byungchul Park wrote:
> On Mon, Feb 21, 2022 at 08:02:04PM +0100, Jan Kara wrote:
> > On Thu 17-02-22 20:10:04, Byungchul Park wrote:
> > > [9.008161] ===
> > > [9.008163] DEPT: Circular dependency has been detected.
> > > [9.008164] 5.17.0-rc1-00015-gb94f67143867-dirty #2 Tainted: GW
> > > [9.008166] ---
> > > [9.008167] summary
> > > [9.008167] ---
> > > [9.008168] *** DEADLOCK ***
> > > [9.008168]
> > > [9.008168] context A
> > > [9.008169] [S] (unknown)(&(&journal->j_wait_transaction_locked)->dmap:0)
> > > [9.008171] [W] wait(&(&journal->j_wait_commit)->dmap:0)
> > > [9.008172] [E] event(&(&journal->j_wait_transaction_locked)->dmap:0)
> > > [9.008173]
> > > [9.008173] context B
> > > [9.008174] [S] down_write(mapping.invalidate_lock:0)
> > > [9.008175] [W] wait(&(&journal->j_wait_transaction_locked)->dmap:0)
> > > [9.008176] [E] up_write(mapping.invalidate_lock:0)
> > > [9.008177]
> > > [9.008178] context C
> > > [9.008179] [S] (unknown)(&(&journal->j_wait_commit)->dmap:0)
> > > [9.008180] [W] down_write(mapping.invalidate_lock:0)
> > > [9.008181] [E] event(&(&journal->j_wait_commit)->dmap:0)
> > > [9.008181]
> > > [9.008182] [S]: start of the event context
> > > [9.008183] [W]: the wait blocked
> > > [9.008183] [E]: the event not reachable
> > 
> > So what situation is your tool complaining about here? Can you perhaps show
> > it here in more common visualization like:
> 
> Sure.
> 
> > TASK1   TASK2
> > does foo, grabs Z
> > does X, grabs lock Y
> > blocks on Z
> > blocks on Y
> > 
> > or something like that? Because I was not able to decipher this from the
> > report even after trying for some time...
> 
> KJOURNALD2(kthread)     TASK1(ksys_write)     TASK2(ksys_write)
> 
> wait A
> --- stuck
>                         wait B
>                         --- stuck
>                                               wait C
>                                               --- stuck
> 
> wake up B               wake up C             wake up A
> 
> where:
> A is a wait_queue, j_wait_commit
> B is a wait_queue, j_wait_transaction_locked
> C is a rwsem, mapping.invalidate_lock

I see. But a situation like this is not necessarily a guarantee of a
deadlock, is it? I mean there can be task D that will eventually call say
'wake up B' and unblock everything and this is how things were designed to
work? Multiple sources of wakeups are quite common I'd say... What does
Dept do to prevent false reports in cases like this?

> The above is the simplest form. And it's worth noting that Dept focuses
> on wait and event itself rather than grabbing and releasing things like
> lock. The following is the more descriptive form of it.
> 
> KJOURNALD2(kthread)   TASK1(ksys_write)   TASK2(ksys_write)
> 
> wait @j_wait_commit
>   ext4_truncate_failed_write()
>  down_write(mapping.invalidate_lock)
> 
>  ext4_truncate()
> ...
> wait @j_wait_transaction_locked
> 
>   ext4_truncate_failed_write()
>  
> down_write(mapping.invalidate_lock)
> 
>   ext4_should_retry_alloc()
>  ...
>  __jbd2_log_start_commit()
> wake_up(j_wait_commit)
> jbd2_journal_commit_transaction()
>wake_up(j_wait_transaction_locked)
>  up_write(mapping.invalidate_lock)
> 
> I hope this would help you understand the report.

I see, thanks for explanation! So the above scenario is impossible because
for anyone to block on @j_wait_transaction_locked the transaction must be
committing, which is done only by kjournald2 kthread and so that thread
cannot be waiting at @j_wait_commit. Essentially blocking on
@j_wait_transaction_locked means @j_wait_commit wakeup was already done.

I guess this shows there can be non-trivial dependencies between wait
queues which are difficult to track in an automated way and without such
tracking we are going to see false positives...

Honza

-- 
Jan Kara 
SUSE Labs, CR


Re: Report 1 in ext4 and journal based on v5.17-rc1

2022-02-22 Thread Jan Kara
On Thu 17-02-22 20:10:03, Byungchul Park wrote:
> [7.009608] ===
> [7.009613] DEPT: Circular dependency has been detected.
> [7.009614] 5.17.0-rc1-00014-g8a599299c0cb-dirty #30 Tainted: GW
> [7.009616] ---
> [7.009617] summary
> [7.009618] ---
> [7.009618] *** DEADLOCK ***
> [7.009618]
> [7.009619] context A
> [7.009619] [S] (unknown)(&(bit_wait_table + i)->dmap:0)
> [7.009621] [W] down_write(&ei->i_data_sem:0)
> [7.009623] [E] event(&(bit_wait_table + i)->dmap:0)
> [7.009624]
> [7.009625] context B
> [7.009625] [S] down_read(&ei->i_data_sem:0)
> [7.009626] [W] wait(&(bit_wait_table + i)->dmap:0)
> [7.009627] [E] up_read(&ei->i_data_sem:0)
> [7.009628]

Looking into this I have noticed that Dept here tracks bitlocks (buffer
locks in particular) but it apparently treats locks on all buffers as one
locking class so it conflates lock on superblock buffer with a lock on
extent tree block buffer. These are vastly different locks with different
locking constraints. So to avoid false positives in filesystems we will
need to add annotations to differentiate locks on different buffers (based
on what the block is used for), similarly to how we e.g. annotate i_rwsem for
different inodes.
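
A generic sketch of that kind of annotation (hypothetical driver-side code,
not a proposed patch; lockdep_set_class() is the real API, while the
structure and field names are made up): two locks of the same type get
distinct lockdep classes so the validator stops conflating them:

    #include <linux/lockdep.h>
    #include <linux/spinlock.h>

    struct my_dev {
            spinlock_t sb_lock;             /* superblock buffer state */
            spinlock_t extent_lock;         /* extent tree buffer state */
    };

    static struct lock_class_key sb_buf_key, extent_buf_key;

    static void my_init_locks(struct my_dev *dev)
    {
            spin_lock_init(&dev->sb_lock);
            lockdep_set_class(&dev->sb_lock, &sb_buf_key);

            spin_lock_init(&dev->extent_lock);
            lockdep_set_class(&dev->extent_lock, &extent_buf_key);
    }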

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: Report 2 in ext4 and journal based on v5.17-rc1

2022-02-21 Thread Jan Kara
d: GW 
> 5.17.0-rc1-00015-gb94f67143867-dirty #2
> [9.008352] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
> Bochs 01/01/2011
> [9.008353] Call Trace:
> [9.008354]  
> [9.008355] dump_stack_lvl (lib/dump_stack.c:107) 
> [9.008358] print_circle (./arch/x86/include/asm/atomic.h:108 
> ./include/linux/atomic/atomic-instrumented.h:258 kernel/dependency/dept.c:157 
> kernel/dependency/dept.c:762) 
> [9.008360] ? print_circle (kernel/dependency/dept.c:1086) 
> [9.008362] cb_check_dl (kernel/dependency/dept.c:1104) 
> [9.008364] bfs (kernel/dependency/dept.c:860) 
> [9.008366] add_dep (kernel/dependency/dept.c:1423) 
> [9.008368] do_event.isra.25 (kernel/dependency/dept.c:1651) 
> [9.008370] ? __wake_up_common (kernel/sched/wait.c:108) 
> [9.008372] dept_event (kernel/dependency/dept.c:2337) 
> [9.008374] __wake_up_common (kernel/sched/wait.c:109) 
> [9.008376] __wake_up_common_lock (./include/linux/spinlock.h:428 
> (discriminator 1) kernel/sched/wait.c:141 (discriminator 1)) 
> [9.008379] jbd2_journal_commit_transaction (fs/jbd2/commit.c:583) 
> [9.008381] ? arch_stack_walk (arch/x86/kernel/stacktrace.c:24) 
> [9.008385] ? ret_from_fork (arch/x86/entry/entry_64.S:301) 
> [9.008387] ? dept_enable_hardirq (./arch/x86/include/asm/current.h:15 
> kernel/dependency/dept.c:241 kernel/dependency/dept.c:999 
> kernel/dependency/dept.c:1043 kernel/dependency/dept.c:1843) 
> [9.008389] ? _raw_spin_unlock_irqrestore 
> (./arch/x86/include/asm/irqflags.h:45 ./arch/x86/include/asm/irqflags.h:80 
> ./arch/x86/include/asm/irqflags.h:138 ./include/linux/spinlock_api_smp.h:151 
> kernel/locking/spinlock.c:194) 
> [9.008392] ? _raw_spin_unlock_irqrestore 
> (./arch/x86/include/asm/preempt.h:103 ./include/linux/spinlock_api_smp.h:152 
> kernel/locking/spinlock.c:194) 
> [9.008394] ? try_to_del_timer_sync (kernel/time/timer.c:1239) 
> [9.008396] kjournald2 (fs/jbd2/journal.c:214 (discriminator 3)) 
> [9.008398] ? prepare_to_wait_exclusive (kernel/sched/wait.c:431) 
> [9.008400] ? commit_timeout (fs/jbd2/journal.c:173) 
> [9.008402] kthread (kernel/kthread.c:377) 
> [9.008404] ? kthread_complete_and_exit (kernel/kthread.c:332) 
> [9.008407] ret_from_fork (arch/x86/entry/entry_64.S:301) 
> [9.008410]  
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v2 4/8] ocfs2: simplify subdirectory registration with register_sysctl()

2021-11-24 Thread Jan Kara
On Tue 23-11-21 12:24:18, Luis Chamberlain wrote:
> There is no need to use boilerplate code to specify a set of base
> directories we're going to stuff sysctls under. Simplify this by using
> register_sysctl() and specifying the directory path directly.
> 
> // pycocci sysctl-subdir-register-sysctl-simplify.cocci PATH

Heh, nice example of using Coccinelle. The result looks good. Feel free to
add:

Reviewed-by: Jan Kara 

Honza


> 
> @c1@
> expression E1;
> identifier subdir, sysctls;
> @@
> 
> static struct ctl_table subdir[] = {
>   {
>   .procname = E1,
>   .maxlen = 0,
>   .mode = 0555,
>   .child = sysctls,
>   },
>   { }
> };
> 
> @c2@
> identifier c1.subdir;
> 
> expression E2;
> identifier base;
> @@
> 
> static struct ctl_table base[] = {
>   {
>   .procname = E2,
>   .maxlen = 0,
>   .mode = 0555,
>   .child = subdir,
>   },
>   { }
> };
> 
> @c3@
> identifier c2.base;
> identifier header;
> @@
> 
> header = register_sysctl_table(base);
> 
> @r1 depends on c1 && c2 && c3@
> expression c1.E1;
> identifier c1.subdir, c1.sysctls;
> @@
> 
> -static struct ctl_table subdir[] = {
> - {
> - .procname = E1,
> - .maxlen = 0,
> - .mode = 0555,
> - .child = sysctls,
> - },
> - { }
> -};
> 
> @r2 depends on c1 && c2 && c3@
> identifier c1.subdir;
> 
> expression c2.E2;
> identifier c2.base;
> @@
> -static struct ctl_table base[] = {
> - {
> - .procname = E2,
> - .maxlen = 0,
> - .mode = 0555,
> - .child = subdir,
> - },
> - { }
> -};
> 
> @initialize:python@
> @@
> 
> def make_my_fresh_expression(s1, s2):
>   return '"' + s1.strip('"') + "/" + s2.strip('"') + '"'
> 
> @r3 depends on c1 && c2 && c3@
> expression c1.E1;
> identifier c1.sysctls;
> expression c2.E2;
> identifier c2.base;
> identifier c3.header;
> fresh identifier E3 = script:python(E2, E1) { make_my_fresh_expression(E2, E1) };
> @@
> 
> header =
> -register_sysctl_table(base);
> +register_sysctl(E3, sysctls);
> 
> Generated-by: Coccinelle SmPL
> Signed-off-by: Luis Chamberlain 
> ---
>  fs/ocfs2/stackglue.c | 25 +
>  1 file changed, 1 insertion(+), 24 deletions(-)
> 
> diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
> index 16f1bfc407f2..731558a6f27d 100644
> --- a/fs/ocfs2/stackglue.c
> +++ b/fs/ocfs2/stackglue.c
> @@ -672,31 +672,8 @@ static struct ctl_table ocfs2_mod_table[] = {
>   { }
>  };
>  
> -static struct ctl_table ocfs2_kern_table[] = {
> - {
> - .procname   = "ocfs2",
> - .data   = NULL,
> - .maxlen = 0,
> - .mode   = 0555,
> - .child  = ocfs2_mod_table
> - },
> - { }
> -};
> -
> -static struct ctl_table ocfs2_root_table[] = {
> - {
> - .procname   = "fs",
> - .data   = NULL,
> - .maxlen = 0,
> - .mode   = 0555,
> - .child  = ocfs2_kern_table
> - },
> - { }
> -};
> -
>  static struct ctl_table_header *ocfs2_table_header;
>  
> -
>  /*
>   * Initialization
>   */
> @@ -705,7 +682,7 @@ static int __init ocfs2_stack_glue_init(void)
>  {
>   strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
>  
> - ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
> + ocfs2_table_header = register_sysctl("fs/ocfs2", ocfs2_mod_table);
>   if (!ocfs2_table_header) {
>   printk(KERN_ERR
>  "ocfs2 stack glue: unable to register sysctl\n");
> -- 
> 2.33.0
> 
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v2 6/8] inotify: simplify subdirectory registration with register_sysctl()

2021-11-24 Thread Jan Kara
On Tue 23-11-21 12:24:20, Luis Chamberlain wrote:
> From: Xiaoming Ni 
> 
> There is no need to use boilerplate code to specify a set of base
> directories we're going to stuff sysctls under. Simplify this by using
> register_sysctl() and specifying the directory path directly.
> 
> Move inotify_user sysctl to inotify_user.c while at it to remove clutter
> from kernel/sysctl.c.
> 
> Signed-off-by: Xiaoming Ni 
> [mcgrof: update commit log to reflect new path we decided to take]
> Signed-off-by: Luis Chamberlain 

This looks fishy. You register inotify_table but not fanotify_table and
remove both...

Honza

> ---
>  fs/notify/inotify/inotify_user.c | 11 ++-
>  include/linux/inotify.h  |  3 ---
>  kernel/sysctl.c  | 21 -
>  3 files changed, 10 insertions(+), 25 deletions(-)
> 
> diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
> index 29fca3284bb5..54583f62dc44 100644
> --- a/fs/notify/inotify/inotify_user.c
> +++ b/fs/notify/inotify/inotify_user.c
> @@ -58,7 +58,7 @@ struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
>  static long it_zero = 0;
>  static long it_int_max = INT_MAX;
>  
> -struct ctl_table inotify_table[] = {
> +static struct ctl_table inotify_table[] = {
>   {
>   .procname   = "max_user_instances",
>   .data   = &init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES],
> @@ -87,6 +87,14 @@ struct ctl_table inotify_table[] = {
>   },
>   { }
>  };
> +
> +static void __init inotify_sysctls_init(void)
> +{
> + register_sysctl("fs/inotify", inotify_table);
> +}
> +
> +#else
> +#define inotify_sysctls_init() do { } while (0)
>  #endif /* CONFIG_SYSCTL */
>  
>  static inline __u32 inotify_arg_to_mask(struct inode *inode, u32 arg)
> @@ -849,6 +857,7 @@ static int __init inotify_user_setup(void)
>   inotify_max_queued_events = 16384;
>   init_user_ns.ucount_max[UCOUNT_INOTIFY_INSTANCES] = 128;
>   init_user_ns.ucount_max[UCOUNT_INOTIFY_WATCHES] = watches_max;
> + inotify_sysctls_init();
>  
>   return 0;
>  }
> diff --git a/include/linux/inotify.h b/include/linux/inotify.h
> index 6a24905f6e1e..8d20caa1b268 100644
> --- a/include/linux/inotify.h
> +++ b/include/linux/inotify.h
> @@ -7,11 +7,8 @@
>  #ifndef _LINUX_INOTIFY_H
>  #define _LINUX_INOTIFY_H
>  
> -#include 
>  #include 
>  
> -extern struct ctl_table inotify_table[]; /* for sysctl */
> -
>  #define ALL_INOTIFY_BITS (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \
> IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | \
> IN_MOVED_TO | IN_CREATE | IN_DELETE | \
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 7a90a12b9ea4..6aa67c737e4e 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -125,13 +125,6 @@ static const int maxolduid = 65535;
>  static const int ngroups_max = NGROUPS_MAX;
>  static const int cap_last_cap = CAP_LAST_CAP;
>  
> -#ifdef CONFIG_INOTIFY_USER
> -#include 
> -#endif
> -#ifdef CONFIG_FANOTIFY
> -#include 
> -#endif
> -
>  #ifdef CONFIG_PROC_SYSCTL
>  
>  /**
> @@ -3099,20 +3092,6 @@ static struct ctl_table fs_table[] = {
>   .proc_handler   = proc_dointvec,
>   },
>  #endif
> -#ifdef CONFIG_INOTIFY_USER
> - {
> - .procname   = "inotify",
> - .mode   = 0555,
> - .child  = inotify_table,
> - },
> -#endif
> -#ifdef CONFIG_FANOTIFY
> - {
> - .procname   = "fanotify",
> - .mode   = 0555,
> - .child  = fanotify_table,
> - },
> -#endif
>  #ifdef CONFIG_EPOLL
>   {
>   .procname   = "epoll",
> -- 
> 2.33.0
> 
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH 2/2] mm/frame-vec: use FOLL_LONGTERM

2020-10-05 Thread Jan Kara
On Mon 05-10-20 14:38:54, Jason Gunthorpe wrote:
> When get_vaddr_frames() does its hacky follow_pfn() loop it should never
> be allowed to extract a struct page from a normal VMA. This could allow a
> serious use-after-free problem on any kernel memory.
> 
> Restrict this to only work on VMA's with one of VM_IO | VM_PFNMAP
> set. This limits the use-after-free problem to only IO memory, which while
> still serious, is an improvement.
> 
> Cc: sta...@vger.kernel.org
> Fixes: 8025e5ddf9c1 ("[media] mm: Provide new get_vaddr_frames() helper")
> Signed-off-by: Jason Gunthorpe 
> ---
>  mm/frame_vector.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/mm/frame_vector.c b/mm/frame_vector.c
> index 10f82d5643b6de..26cb20544b6c37 100644
> --- a/mm/frame_vector.c
> +++ b/mm/frame_vector.c
> @@ -99,6 +99,10 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
>   if (ret >= nr_frames || start < vma->vm_end)
>   break;
>   vma = find_vma_intersection(mm, start, start + 1);
> + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
> + ret = -EINVAL;
> + goto out;
> + }
>   } while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));

Hum, I fail to see how this helps. If vma has no VM_IO or VM_PFNMAP flag,
we'd exit the loop (to out: label) anyway due to the loop termination
condition and why not return the frames we already have? Furthermore
find_vma_intersection() can return NULL which would oops in your check
then. What am I missing?
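
Spelling the two problems out in code (a hypothetical fragment, not a tested
fix): the NULL return has to be checked before the flags are dereferenced,
and overwriting ret throws away frames already gathered, which the plain
loop-termination path would have returned:

    vma = find_vma_intersection(mm, start, start + 1);
    if (!vma)
            break;          /* keep the frames gathered so far */
    if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
            break;          /* ditto; -EINVAL would discard them */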

    Honza

>  out:
>   if (locked)
> -- 
> 2.28.0
> 
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH 2/2] mm/frame-vec: use FOLL_LONGTERM

2020-10-05 Thread Jan Kara
Hi!

On Fri 02-10-20 15:06:03, Jason Gunthorpe wrote:
> This get_vaddr_frames() thing looks impossible to use properly. How on
> earth does a driver guarentee
> 
>  "If @start belongs to VM_IO | VM_PFNMAP vma, we don't touch page
>  structures and the caller must make sure pfns aren't reused for
>  anything else while he is using them."
> 
> The only possible way to do that is if the driver restricts the VMAs
> to ones it owns and interacts with the vm_private data to refcount
> something.
> 
> Since every driver does this wrong anything that uses this is creating
> terrifying security issues.
> 
> IMHO this whole API should be deleted :(

So I'm the one guilty for introducing this API. The API was created to
factor out code in several (mostly V4L AFAIR) drivers, thus reducing the number
of drivers poking into MM internals and getting things wrong in various
cases. It may well be that the API is still broken from "can driver ensure
this" POV - I tried to keep things things as they were before in this
regard as I have very little knowledge in how these drivers are supposed to
work.

Anyway, if you can make this go away, sure go ahead :)

        Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v11 00/25] mm/gup: track dma-pinned pages: FOLL_PIN

2020-01-06 Thread Jan Kara
On Sat 28-12-19 20:33:32, John Hubbard wrote:
> On 12/27/19 1:56 PM, John Hubbard wrote:
> ...
> >> It is an ancient verification test (~10y), and it is not an easy task to
> >> make it understandable and standalone :).
> >>
> > 
> > Is this the only test that fails, btw? No other test failures or hints of
> > problems?
> > 
> > (Also, maybe hopeless, but can *anyone* on the RDMA list provide some
> > characterization of the test, such as how many pins per page, what page
> > sizes are used? I'm still hoping to write a test to trigger something
> > close to this...)
> > 
> > I do have a couple more ideas for test runs:
> > 
> > 1. Reduce GUP_PIN_COUNTING_BIAS to 1. That would turn the whole override of
> > page->_refcount into a no-op, and so if all is well (it may not be!) with 
> > the
> > rest of the patch, then we'd expect this problem to not reappear.
> > 
> > 2. Activate /proc/vmstat *foll_pin* statistics unconditionally (just for these
> > tests, of course), so we can see if there is a get/put mismatch. However, 
> > that
> > will change the timing, and so it must be attempted independently of (1), in
> > order to see if it ends up hiding the repro.
> > 
> > I've updated this branch to implement (1), but not (2), hoping you can give
> > this one a spin?
> > 
> >     g...@github.com:johnhubbard/linux.git  
> > pin_user_pages_tracking_v11_with_diags
> > 
> > 
> 
> Also, looking ahead:
> 
> a) if the problem disappears with the latest above test, then we likely have
>a huge page refcount overflow, and there are a couple of different ways to
>fix it. 
> 
> b) if it still reproduces with the above, then it's some other random mistake,
>and in that case I'd be inclined to do a sort of guided (or classic, 
> unguided)
>git bisect of the series. Because it could be any of several patches.
> 
>If that's too much trouble, then I'd have to fall back to submitting a few
>patches at a time and working my way up to the tracking patch...

It could also be that an ordinary page reference is dropped with 'unpin'
thus underflowing the page refcount...
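For illustration, such a mismatch would look like this (a hypothetical call
sequence, not taken from the test):

	get_page(page);			/* ordinary reference: refcount +1     */
	unpin_user_page(page);		/* pin release: -GUP_PIN_COUNTING_BIAS */
	/* net effect: refcount underflows by GUP_PIN_COUNTING_BIAS - 1 */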

Honza

-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v11 00/25] mm/gup: track dma-pinned pages: FOLL_PIN

2019-12-20 Thread Jan Kara
On Thu 19-12-19 12:30:31, John Hubbard wrote:
> On 12/19/19 5:26 AM, Leon Romanovsky wrote:
> > On Mon, Dec 16, 2019 at 02:25:12PM -0800, John Hubbard wrote:
> > > Hi,
> > > 
> > > This implements an API naming change (put_user_page*() -->
> > > unpin_user_page*()), and also implements tracking of FOLL_PIN pages. It
> > > extends that tracking to a few select subsystems. More subsystems will
> > > be added in follow up work.
> > 
> > Hi John,
> > 
> > The patchset generates kernel panics in our IB testing. In our tests, we
> > allocated single memory block and registered multiple MRs using the single
> > block.
> > 
> > The possible bad flow is:
> >   ib_umem_get() ->
> >pin_user_pages_fast(FOLL_WRITE) ->
> > internal_get_user_pages_fast(FOLL_WRITE) ->
> >  gup_pgd_range() ->
> >   gup_huge_pd() ->
> >gup_hugepte() ->
> > try_grab_compound_head() ->
> 
> Hi Leon,
> 
> Thanks very much for the detailed report! So we're overflowing...
> 
> At first look, this seems likely to be hitting a weak point in the
> GUP_PIN_COUNTING_BIAS-based design, one that I believed could be deferred
> (there's a writeup in Documentation/core-api/pin_user_page.rst, lines
> 99-121). Basically it's pretty easy to overflow the page->_refcount
> with huge pages if the pages have a *lot* of subpages.
> 
> We can only do about 7 pins on 1GB huge pages that use 4KB subpages.
> Do you have any idea how many pins (repeated pins on the same page, which
> it sounds like you have) might be involved in your test case,
> and the huge page and system page sizes? That would allow calculating
> if we're likely overflowing for that reason.
> 
> So, ideas and next steps:
> 
> 1. Assuming that you *are* hitting this, I think I may have to fall back to
> implementing the "deferred" part of this design, as part of this series, after
> all. That means:
> 
>   For the pin/unpin calls at least, stop treating all pages as if they are
>   a cluster of PAGE_SIZE pages; instead, retrieve a huge page as one page.
>   That's not how it works now, and the need to hand back a huge array of
>   subpages is part of the problem. This affects the callers too, so it's not
>   a super quick change to make. (I was really hoping not to have to do this
>   yet.)

Does that mean that you would need to make all GUP users huge page aware?
Otherwise I don't see how what you suggest would work... And I don't think
making all GUP users huge page aware is realistic (effort-wise) or even
wanted (maintenance overhead in all those places).

I believe there might also be a different solution for this: for
transparent huge pages, we could find space in the 'struct page' of the
second page in the huge page for a proper pin counter and just account pins
there, so we'd have the full 32-bit width for it.
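To put numbers on the overflow, assuming this series' GUP_PIN_COUNTING_BIAS
of 1024: a 1GB huge page has 262144 4KB subpages, so one full-page pin adds
1024 * 262144 = 2^28 references, and only about 2^31 / 2^28 = 8 pins fit in
the signed 32-bit refcount - matching the "about 7 pins" estimate above. A
sketch of the dedicated-counter idea (the field name is purely hypothetical):

	static inline atomic_t *compound_pin_count_ptr(struct page *page)
	{
		/* hypothetical field stashed in the second page's struct page */
		return &page[1].hpage_pin_count;
	}

	/* pin:   atomic_add(refs, compound_pin_count_ptr(head)); */
	/* unpin: atomic_sub(refs, compound_pin_count_ptr(head)); */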

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v11 00/25] mm/gup: track dma-pinned pages: FOLL_PIN

2019-12-16 Thread Jan Kara
Hi!

On Mon 16-12-19 14:25:12, John Hubbard wrote:
> Hi,
> 
> This implements an API naming change (put_user_page*() -->
> unpin_user_page*()), and also implements tracking of FOLL_PIN pages. It
> extends that tracking to a few select subsystems. More subsystems will
> be added in follow up work.

Just a note for Andrew and others watching this series: At this point I'm fine
with the series so if someone still has some review feedback or wants to
check the series, now is the right time. Otherwise I think Andrew can push
the series to MM tree so that it will get wider testing exposure and is
prepared for the next merge window.

        Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v11 23/25] mm/gup: track FOLL_PIN pages

2019-12-16 Thread Jan Kara
On Mon 16-12-19 14:18:59, John Hubbard wrote:
> On 12/16/19 4:53 AM, Jan Kara wrote:
> > With this fixed, the patch looks good to me so you can then add:
> > 
> > Reviewed-by: Jan Kara 
> > 
> > Honza
> > 
> 
> btw, thanks for the thorough review of this critical patch (and for your
> patience with my mistakes). I really appreciate it, and this patchset would
> not have made it this far without your detailed help and explanations.

You're welcome! I'd also like to thank you for persistently driving this
series :)

        Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v11 23/25] mm/gup: track FOLL_PIN pages

2019-12-16 Thread Jan Kara
On Fri 13-12-19 19:26:17, John Hubbard wrote:
> Add tracking of pages that were pinned via FOLL_PIN.
> 
> As mentioned in the FOLL_PIN documentation, callers who effectively set
> FOLL_PIN are required to ultimately free such pages via unpin_user_page().
> The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
> for DIO and/or RDMA use".
> 
> Pages that have been pinned via FOLL_PIN are identifiable via a
> new function call:
> 
>bool page_dma_pinned(struct page *page);
> 
> What to do in response to encountering such a page, is left to later
> patchsets. There is discussion about this in [1], [2], and [3].
> 
> This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask().
> 
> [1] Some slow progress on get_user_pages() (Apr 2, 2019):
> https://lwn.net/Articles/784574/
> [2] DMA and get_user_pages() (LPC: Dec 12, 2018):
> https://lwn.net/Articles/774411/
> [3] The trouble with get_user_pages() (Apr 30, 2018):
> https://lwn.net/Articles/753027/
> 
> Suggested-by: Jan Kara 
> Suggested-by: Jérôme Glisse 
> Cc: Kirill A. Shutemov 
> Signed-off-by: John Hubbard 
> ---
> 
> Hi Jan,
> 
> This should address all of your comments for patch 23!

Thanks. One comment below:

> @@ -1486,6 +1500,10 @@ struct page *follow_trans_huge_pmd(struct 
> vm_area_struct *vma,
>   VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
>   if (flags & FOLL_TOUCH)
>   touch_pmd(vma, addr, pmd, flags);
> +
> + if (!try_grab_page(page, flags))
> + return ERR_PTR(-ENOMEM);
> +
>   if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
>   /*
>* We don't mlock() pte-mapped THPs. This way we can avoid

I'd move this still a bit higher - just after the VM_BUG_ON_PAGE() and
before the if (flags & FOLL_TOUCH) test, because touch_pmd() can update
page tables and we don't want that if we're going to fail the fault.
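I.e., roughly this ordering (a sketch of the suggestion, not the final hunk):

	VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);

	/* take the reference before any side effects... */
	if (!try_grab_page(page, flags))
		return ERR_PTR(-ENOMEM);

	/* ...so a failed grab leaves the page tables untouched */
	if (flags & FOLL_TOUCH)
		touch_pmd(vma, addr, pmd, flags);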

With this fixed, the patch looks good to me so you can then add:

Reviewed-by: Jan Kara 

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v10 23/25] mm/gup: track FOLL_PIN pages

2019-12-12 Thread Jan Kara
On Thu 12-12-19 00:19:15, John Hubbard wrote:
> Add tracking of pages that were pinned via FOLL_PIN.
> 
> As mentioned in the FOLL_PIN documentation, callers who effectively set
> FOLL_PIN are required to ultimately free such pages via unpin_user_page().
> The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
> for DIO and/or RDMA use".
> 
> Pages that have been pinned via FOLL_PIN are identifiable via a
> new function call:
> 
>bool page_dma_pinned(struct page *page);
> 
> What to do in response to encountering such a page, is left to later
> patchsets. There is discussion about this in [1], [2], and [3].
> 
> This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask().
> 
> [1] Some slow progress on get_user_pages() (Apr 2, 2019):
> https://lwn.net/Articles/784574/
> [2] DMA and get_user_pages() (LPC: Dec 12, 2018):
> https://lwn.net/Articles/774411/
> [3] The trouble with get_user_pages() (Apr 30, 2018):
> https://lwn.net/Articles/753027/
> 
> Suggested-by: Jan Kara 
> Suggested-by: Jérôme Glisse 
> Cc: Kirill A. Shutemov 
> Signed-off-by: John Hubbard 

Thanks for the patch. As a side note, given this series is rather big, it
may be better to send just individual updated patches (as replies to the
review comments) instead of resending the whole series every time. And then
you can resend the whole series once enough changes accumulate or we reach
seemingly final state.  That way people don't have to crawl through lots of
uninteresting emails... Just something to keep in mind for the future.

I've spotted just one issue in this patch (see below), the rest are just
small style nits.

> +#define page_ref_zero_or_close_to_bias_overflow(page) \
> + ((unsigned int) page_ref_count(page) + \
> + GUP_PIN_COUNTING_BIAS <= GUP_PIN_COUNTING_BIAS)
> +

...

> +/**
> + * page_dma_pinned() - report if a page is pinned for DMA.
> + *
> + * This function checks if a page has been pinned via a call to
> + * pin_user_pages*().
> + *
> + * The return value is partially fuzzy: false is not fuzzy, because it means
> + * "definitely not pinned for DMA", but true means "probably pinned for DMA, 
> but
> + * possibly a false positive due to having at least GUP_PIN_COUNTING_BIAS 
> worth
> + * of normal page references".
> + *
> + * False positives are OK, because: a) it's unlikely for a page to get that 
> many
> + * refcounts, and b) all the callers of this routine are expected to be able 
> to
> + * deal gracefully with a false positive.
> + *
> + * For more information, please see Documentation/vm/pin_user_pages.rst.
> + *
> + * @page:pointer to page to be queried.
> + * @Return:  True, if it is likely that the page has been "dma-pinned".
> + *   False, if the page is definitely not dma-pinned.
> + */
> +static inline bool page_dma_pinned(struct page *page)
> +{
> + return (page_ref_count(compound_head(page))) >= GUP_PIN_COUNTING_BIAS;
> +}
> +

I realized one thing WRT handling of page refcount overflow: the page
refcount is signed and e.g. try_get_page() fails once the refcount is
negative. That means that:

a) page_ref_zero_or_close_to_bias_overflow() is not necessary - the special
checks in all the places that use pinning (i.e., advance the refcount by
GUP_PIN_COUNTING_BIAS) are not necessary either; we should just rely on the
check for a negative value, for consistency.

b) page_dma_pinned() has to be careful and cast page_ref_count() to an
unsigned type for the comparison, as otherwise an overflowed refcount would
suddenly appear as not-pinned.
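A sketch of (b), keeping the helper's current shape:

	static inline bool page_dma_pinned(struct page *page)
	{
		/*
		 * Cast to unsigned: an overflowed (negative) refcount must
		 * still compare as >= GUP_PIN_COUNTING_BIAS, i.e. "pinned".
		 */
		return (unsigned int)page_ref_count(compound_head(page)) >=
			GUP_PIN_COUNTING_BIAS;
	}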

> +/**
> + * try_pin_compound_head() - mark a compound page as being used by
> + * pin_user_pages*().
> + *
> + * This is the FOLL_PIN counterpart to try_get_compound_head().
> + *
> + * @page:pointer to page to be marked
> + * @Return:  the compound head page, with ref appropriately incremented,
> + * or NULL upon failure.
> + */
> +__must_check struct page *try_pin_compound_head(struct page *page, int refs)
> +{
> + struct page *head = try_get_compound_head(page,
> +   GUP_PIN_COUNTING_BIAS * refs);
> + if (!head)
> + return NULL;
> +
> + __update_proc_vmstat(page, NR_FOLL_PIN_REQUESTED, refs);
> + return head;
> +}
> +
> +/*
> + * try_grab_compound_head() - attempt to elevate a page's refcount, by a
> + * flags-dependent amount.
> + *
> + * "grab" names in this file mean, "look at flags to decide whether to use
> + * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
> + *
> + * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
> + * same time. (That's true throughout the get_user_pages*() and
> + * 

Re: [PATCH v9 23/25] mm/gup: track FOLL_PIN pages

2019-12-11 Thread Jan Kara
On Tue 10-12-19 18:53:16, John Hubbard wrote:
> Add tracking of pages that were pinned via FOLL_PIN.
> 
> As mentioned in the FOLL_PIN documentation, callers who effectively set
> FOLL_PIN are required to ultimately free such pages via unpin_user_page().
> The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
> for DIO and/or RDMA use".
> 
> Pages that have been pinned via FOLL_PIN are identifiable via a
> new function call:
> 
>bool page_dma_pinned(struct page *page);
> 
> What to do in response to encountering such a page, is left to later
> patchsets. There is discussion about this in [1], [2], and [3].
> 
> This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask().
> 
> [1] Some slow progress on get_user_pages() (Apr 2, 2019):
> https://lwn.net/Articles/784574/
> [2] DMA and get_user_pages() (LPC: Dec 12, 2018):
> https://lwn.net/Articles/774411/
> [3] The trouble with get_user_pages() (Apr 30, 2018):
> https://lwn.net/Articles/753027/

The patch looks mostly good to me now. Just a few smaller comments below.

> Suggested-by: Jan Kara 
> Suggested-by: Jérôme Glisse 
> Reviewed-by: Jan Kara 
> Reviewed-by: Jérôme Glisse 
> Reviewed-by: Ira Weiny 

I think you inherited the Reviewed-by tags here from the "add flags" patch
you've merged into this one, but that's not really fair since this patch
does much more... In particular, I didn't give my Reviewed-by tag for this
patch yet.

> +/*
> + * try_grab_compound_head() - attempt to elevate a page's refcount, by a
> + * flags-dependent amount.
> + *
> + * This has a default assumption of "use FOLL_GET behavior, if FOLL_PIN is 
> not
> + * set".
> + *
> + * "grab" names in this file mean, "look at flags to decide whether to use
> + * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
> + */
> +static __maybe_unused struct page *try_grab_compound_head(struct page *page,
> +   int refs,
> +   unsigned int flags)
> +{
> + if (flags & FOLL_PIN)
> + return try_pin_compound_head(page, refs);
> +
> + return try_get_compound_head(page, refs);
> +}

I somewhat wonder about the asymmetry of try_grab_compound_head() vs
try_grab_page() in the treatment of 'flags'. How costly would it be to make
them symmetric (i.e., either set FOLL_GET for try_grab_compound_head()
callers or make sure one of FOLL_GET, FOLL_PIN is set for try_grab_page())?

Because this difference looks like a subtle catch in the long run...

> +
> +/**
> + * try_grab_page() - elevate a page's refcount by a flag-dependent amount
> + *
> + * This might not do anything at all, depending on the flags argument.
> + *
> + * "grab" names in this file mean, "look at flags to decide whether to use
> + * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
> + *
> + * @page:pointer to page to be grabbed
> + * @flags:   gup flags: these are the FOLL_* flag values.
> + *
> + * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the 
> same
> + * time. (That's true throughout the get_user_pages*() and pin_user_pages*()
> + * APIs.) Cases:
> + *
> + *   FOLL_GET: page's refcount will be incremented by 1.
> + *  FOLL_PIN: page's refcount will be incremented by 
> GUP_PIN_COUNTING_BIAS.
> + *
> + * Return: true for success, or if no action was required (if neither 
> FOLL_PIN
> + * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
> + * FOLL_PIN was set, but the page could not be grabbed.
> + */
> +bool __must_check try_grab_page(struct page *page, unsigned int flags)
> +{
> + if (flags & FOLL_GET)
> + return try_get_page(page);
> + else if (flags & FOLL_PIN) {
> + page = compound_head(page);
> + WARN_ON_ONCE(flags & FOLL_GET);
> +
> + if (WARN_ON_ONCE(page_ref_zero_or_close_to_bias_overflow(page)))
> + return false;
> +
> + page_ref_add(page, GUP_PIN_COUNTING_BIAS);
> + __update_proc_vmstat(page, NR_FOLL_PIN_REQUESTED, 1);
> + }
> +
> + return true;
> +}

...

> @@ -1522,8 +1536,8 @@ struct page *follow_trans_huge_pmd(struct 
> vm_area_struct *vma,
>  skip_mlock:
>   page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
>   VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
> - if (flags & FOLL_GET)
> - get_page(page);
> + if (!try_grab_page(page, flags))
> + page = ERR_PTR(-EFAULT);

I think you need to also mo

Re: [PATCH v9 20/25] powerpc: book3s64: convert to pin_user_pages() and put_user_page()

2019-12-11 Thread Jan Kara
On Tue 10-12-19 18:53:13, John Hubbard wrote:
> 1. Convert from get_user_pages() to pin_user_pages().
> 
> 2. As required by pin_user_pages(), release these pages via
> put_user_page().
> 
> Cc: Jan Kara 
> Signed-off-by: John Hubbard 

The patch looks good to me. You can add:

Reviewed-by: Jan Kara 

I'd just note that mm_iommu_do_alloc() has a pre-existing bug: the last
jump to 'free_exit' (at line 157) already happens after converting page
pointers to physical addresses, so the put_page() calls there will just
crash. But that's completely unrelated to your change. I'll send a fix
separately.

Honza

> ---
>  arch/powerpc/mm/book3s64/iommu_api.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/iommu_api.c 
> b/arch/powerpc/mm/book3s64/iommu_api.c
> index 56cc84520577..a86547822034 100644
> --- a/arch/powerpc/mm/book3s64/iommu_api.c
> +++ b/arch/powerpc/mm/book3s64/iommu_api.c
> @@ -103,7 +103,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
> unsigned long ua,
>   for (entry = 0; entry < entries; entry += chunk) {
>   unsigned long n = min(entries - entry, chunk);
>  
> - ret = get_user_pages(ua + (entry << PAGE_SHIFT), n,
> + ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n,
>   FOLL_WRITE | FOLL_LONGTERM,
>   mem->hpages + entry, NULL);
>   if (ret == n) {
> @@ -167,9 +167,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
> unsigned long ua,
>   return 0;
>  
>  free_exit:
> - /* free the reference taken */
> - for (i = 0; i < pinned; i++)
> - put_page(mem->hpages[i]);
> + /* free the references taken */
> + put_user_pages(mem->hpages, pinned);
>  
>   vfree(mem->hpas);
>   kfree(mem);
> @@ -215,7 +214,8 @@ static void mm_iommu_unpin(struct 
> mm_iommu_table_group_mem_t *mem)
>   if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
>   SetPageDirty(page);
>  
> - put_page(page);
> + put_user_page(page);
> +
>   mem->hpas[i] = 0;
>   }
>  }
> -- 
> 2.24.0
> 
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v8 24/26] mm/gup: track FOLL_PIN pages

2019-12-10 Thread Jan Kara
On Mon 09-12-19 14:53:42, John Hubbard wrote:
> Add tracking of pages that were pinned via FOLL_PIN.
> 
> As mentioned in the FOLL_PIN documentation, callers who effectively set
> FOLL_PIN are required to ultimately free such pages via unpin_user_page().
> The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
> for DIO and/or RDMA use".
> 
> Pages that have been pinned via FOLL_PIN are identifiable via a
> new function call:
> 
>bool page_dma_pinned(struct page *page);
> 
> What to do in response to encountering such a page, is left to later
> patchsets. There is discussion about this in [1], [2], and [3].
> 
> This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask().
> 
> [1] Some slow progress on get_user_pages() (Apr 2, 2019):
> https://lwn.net/Articles/784574/
> [2] DMA and get_user_pages() (LPC: Dec 12, 2018):
> https://lwn.net/Articles/774411/
> [3] The trouble with get_user_pages() (Apr 30, 2018):
> https://lwn.net/Articles/753027/
> 
> Suggested-by: Jan Kara 
> Suggested-by: Jérôme Glisse 
> Signed-off-by: John Hubbard 

Looks nice, some comments below...

> +/*
> + * try_grab_compound_head() - attempt to elevate a page's refcount, by a
> + * flags-dependent amount.
> + *
> + * This has a default assumption of "use FOLL_GET behavior, if FOLL_PIN is 
> not
> + * set".
> + *
> + * "grab" names in this file mean, "look at flags to decide with to use 
> FOLL_PIN
> + * or FOLL_GET behavior, when incrementing the page's refcount.
> + */
> +static struct page *try_grab_compound_head(struct page *page, int refs,
> +unsigned int flags)
> +{
> + if (flags & FOLL_PIN)
> + return try_pin_compound_head(page, refs);
> +
> + return try_get_compound_head(page, refs);
> +}
> +
> +/**
> + * grab_page() - elevate a page's refcount by a flag-dependent amount
> + *
> + * This might not do anything at all, depending on the flags argument.
> + *
> + * "grab" names in this file mean, "look at flags to decide with to use 
> FOLL_PIN
   ^^^ whether

> + * or FOLL_GET behavior, when incrementing the page's refcount.
> + *
> + * @page:pointer to page to be grabbed
> + * @flags:   gup flags: these are the FOLL_* flag values.
> + *
> + * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the 
> same
> + * time. (That's true throughout the get_user_pages*() and pin_user_pages*()
> + * APIs.) Cases:
> + *
> + *   FOLL_GET: page's refcount will be incremented by 1.
> + *   FOLL_PIN: page's refcount will be incremented by GUP_PIN_COUNTING_BIAS.
> + */
> +void grab_page(struct page *page, unsigned int flags)
> +{
> + if (flags & FOLL_GET)
> + get_page(page);
> + else if (flags & FOLL_PIN) {
> + get_page(page);
> + WARN_ON_ONCE(flags & FOLL_GET);
> + /*
> +  * Use get_page(), above, to do the refcount error
> +  * checking. Then just add in the remaining references:
> +  */
> + page_ref_add(page, GUP_PIN_COUNTING_BIAS - 1);

This is wrong for two reasons:

1) You miss compound_head() indirection from get_page() for this
page_ref_add().

2) page_ref_add() could overflow the counter without noticing.

Especially with GUP_PIN_COUNTING_BIAS being non-trivial, it is realistic
that an attacker might try to overflow the page refcount and we have to
protect the kernel against that. So I think that all the places that would
use grab_page() actually need to use try_grab_page() and then gracefully
deal with the failure.
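A hedged sketch of what that could look like (vmstat accounting elided;
an illustration rather than the eventual patch):

	bool __must_check try_grab_page(struct page *page, unsigned int flags)
	{
		if (flags & FOLL_GET)
			return try_get_page(page);

		if (flags & FOLL_PIN) {
			/* (1) do the compound_head() indirection ourselves */
			page = compound_head(page);
			/*
			 * (2) refuse, rather than silently overflow, when
			 * the refcount has already gone negative
			 */
			if (WARN_ON_ONCE(page_ref_count(page) < 0))
				return false;
			page_ref_add(page, GUP_PIN_COUNTING_BIAS);
		}
		return true;
	}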

> @@ -278,11 +425,23 @@ static struct page *follow_page_pte(struct 
> vm_area_struct *vma,
>   goto retry;
>   }
>  
> - if (flags & FOLL_GET) {
> + if (flags & (FOLL_PIN | FOLL_GET)) {
> + /*
> +  * Allow try_get_page() to take care of error handling, for
> +  * both cases: FOLL_GET or FOLL_PIN:
> +  */
>   if (unlikely(!try_get_page(page))) {
>   page = ERR_PTR(-ENOMEM);
>   goto out;
>   }
> +
> + if (flags & FOLL_PIN) {
> + WARN_ON_ONCE(flags & FOLL_GET);
> +
> + /* We got a +1 refcount from try_get_page(), above. */
> + page_ref_add(page, GUP_PIN_COUNTING_BIAS - 1);
> + __update_proc_vmstat(page, NR_FOLL_PIN_REQUESTED, 1);
> + }
>   }

The same problem here as above, plus this place should use th

Re: [PATCH v8 23/26] mm/gup: pass flags arg to __gup_device_* functions

2019-12-10 Thread Jan Kara
On Mon 09-12-19 14:53:41, John Hubbard wrote:
> A subsequent patch requires access to gup flags, so pass the flags
> argument through to the __gup_device_* functions.
> 
> Also placate checkpatch.pl by shortening a nearby line.
> 
> TODO: Christoph Hellwig requested folding this into the patch the uses
> the gup flags arguments.

You should probably implement this TODO? :)

Honza

> 
> Reviewed-by: Jan Kara 
> Reviewed-by: Jérôme Glisse 
> Reviewed-by: Ira Weiny 
> Cc: Kirill A. Shutemov 
> Signed-off-by: John Hubbard 
> ---
>  mm/gup.c | 28 ++--
>  1 file changed, 18 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index 73aedcefa4bd..687d48506f04 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1957,7 +1957,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
> unsigned long end,
>  
>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && 
> defined(CONFIG_TRANSPARENT_HUGEPAGE)
>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
> - unsigned long end, struct page **pages, int *nr)
> +  unsigned long end, unsigned int flags,
> +  struct page **pages, int *nr)
>  {
>   int nr_start = *nr;
>   struct dev_pagemap *pgmap = NULL;
> @@ -1983,13 +1984,14 @@ static int __gup_device_huge(unsigned long pfn, 
> unsigned long addr,
>  }
>  
>  static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
> - unsigned long end, struct page **pages, int *nr)
> +  unsigned long end, unsigned int flags,
> +  struct page **pages, int *nr)
>  {
>   unsigned long fault_pfn;
>   int nr_start = *nr;
>  
>   fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> - if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
> + if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
>   return 0;
>  
>   if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
> @@ -2000,13 +2002,14 @@ static int __gup_device_huge_pmd(pmd_t orig, pmd_t 
> *pmdp, unsigned long addr,
>  }
>  
>  static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
> - unsigned long end, struct page **pages, int *nr)
> +  unsigned long end, unsigned int flags,
> +  struct page **pages, int *nr)
>  {
>   unsigned long fault_pfn;
>   int nr_start = *nr;
>  
>   fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> - if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
> + if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
>   return 0;
>  
>   if (unlikely(pud_val(orig) != pud_val(*pudp))) {
> @@ -2017,14 +2020,16 @@ static int __gup_device_huge_pud(pud_t orig, pud_t 
> *pudp, unsigned long addr,
>  }
>  #else
>  static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
> - unsigned long end, struct page **pages, int *nr)
> +  unsigned long end, unsigned int flags,
> +  struct page **pages, int *nr)
>  {
>   BUILD_BUG();
>   return 0;
>  }
>  
>  static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
> - unsigned long end, struct page **pages, int *nr)
> +  unsigned long end, unsigned int flags,
> +  struct page **pages, int *nr)
>  {
>   BUILD_BUG();
>   return 0;
> @@ -2136,7 +2141,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, 
> unsigned long addr,
>   if (pmd_devmap(orig)) {
>   if (unlikely(flags & FOLL_LONGTERM))
>   return 0;
> - return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
> + return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
> +  pages, nr);
>   }
>  
>   page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> @@ -2157,7 +2163,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, 
> unsigned long addr,
>  }
>  
>  static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
> - unsigned long end, unsigned int flags, struct page **pages, int 
> *nr)
> + unsigned long end, unsigned int flags,
> + struct page **pages, int *nr)
>  {
>   struct page *head, *page;
>   

Re: [PATCH v8 17/26] media/v4l2-core: set pages dirty upon releasing DMA buffers

2019-12-10 Thread Jan Kara
On Mon 09-12-19 16:56:27, Andrew Morton wrote:
> On Mon, 9 Dec 2019 14:53:35 -0800 John Hubbard  wrote:
> 
> > After DMA is complete, and the device and CPU caches are synchronized,
> > it's still required to mark the CPU pages as dirty, if the data was
> > coming from the device. However, this driver was just issuing a
> > bare put_page() call, without any set_page_dirty*() call.
> > 
> > Fix the problem, by calling set_page_dirty_lock() if the CPU pages
> > were potentially receiving data from the device.
> > 
> > Reviewed-by: Christoph Hellwig 
> > Acked-by: Hans Verkuil 
> > Cc: Mauro Carvalho Chehab 
> > Cc: 
> 
> What are the user-visible effects of this change?

Presumably loss of captured video data, if the page writeback hits at the
wrong moment (i.e., after the page was faulted in but before the video HW
stored data in the page) and the page then gets evicted from the page cache.
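For context, the shape of the fix under discussion is roughly this (a
sketch; the variable names are assumptions about the videobuf code, which
is not quoted here):

	for (i = 0; i < dma->nr_pages; i++) {
		if (dma->direction == DMA_FROM_DEVICE)
			set_page_dirty_lock(dma->pages[i]);	/* device wrote here */
		put_page(dma->pages[i]);
	}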

Honza

-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v8 08/26] mm/gup: allow FOLL_FORCE for get_user_pages_fast()

2019-12-10 Thread Jan Kara
On Mon 09-12-19 14:53:26, John Hubbard wrote:
> Commit 817be129e6f2 ("mm: validate get_user_pages_fast flags") allowed
> only FOLL_WRITE and FOLL_LONGTERM to be passed to get_user_pages_fast().
> This, combined with the fact that get_user_pages_fast() falls back to
> "slow gup", which *does* accept FOLL_FORCE, leads to an odd situation:
> if you need FOLL_FORCE, you cannot call get_user_pages_fast().
> 
> There does not appear to be any reason for filtering out FOLL_FORCE.
> There is nothing in the _fast() implementation that requires that we
> avoid writing to the pages. So it appears to have been an oversight.
> 
> Fix by allowing FOLL_FORCE to be set for get_user_pages_fast().
> 
> Fixes: 817be129e6f2 ("mm: validate get_user_pages_fast flags")
> Cc: Christoph Hellwig 
> Reviewed-by: Leon Romanovsky 
> Signed-off-by: John Hubbard 

Looks good to me. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  mm/gup.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index c0c56888e7cc..958ab0757389 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -2414,7 +2414,8 @@ int get_user_pages_fast(unsigned long start, int 
> nr_pages,
>   unsigned long addr, len, end;
>   int nr = 0, ret = 0;
>  
> - if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
> + if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
> +    FOLL_FORCE)))
>   return -EINVAL;
>  
>   start = untagged_addr(start) & PAGE_MASK;
> -- 
> 2.24.0
> 
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v2 17/19] powerpc: book3s64: convert to pin_user_pages() and put_user_page()

2019-11-29 Thread Jan Kara
On Mon 25-11-19 15:10:33, John Hubbard wrote:
> 1. Convert from get_user_pages() to pin_user_pages().
> 
> 2. As required by pin_user_pages(), release these pages via
> put_user_page(). In this case, do so via put_user_pages_dirty_lock().
> 
> That has the side effect of calling set_page_dirty_lock(), instead
> of set_page_dirty(). This is probably more accurate.

Maybe more accurate, but it doesn't work for mm_iommu_unpin(). As far as I
can see, mm_iommu_unpin() gets called from an RCU callback, which is
executed in interrupt context, and you cannot lock pages from such a
context. So you need to queue work from the RCU callback and then do the
real work from a workqueue...

Honza

> 
> As Christoph Hellwig put it, "set_page_dirty() is only safe if we are
> dealing with a file backed page where we have reference on the inode it
> hangs off." [1]
> 
> [1] https://lore.kernel.org/r/20190723153640.gb...@lst.de
> 
> Cc: Jan Kara 
> Signed-off-by: John Hubbard 
> ---
>  arch/powerpc/mm/book3s64/iommu_api.c | 12 +---
>  1 file changed, 5 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/iommu_api.c 
> b/arch/powerpc/mm/book3s64/iommu_api.c
> index 56cc84520577..fc1670a6fc3c 100644
> --- a/arch/powerpc/mm/book3s64/iommu_api.c
> +++ b/arch/powerpc/mm/book3s64/iommu_api.c
> @@ -103,7 +103,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
> unsigned long ua,
>   for (entry = 0; entry < entries; entry += chunk) {
>   unsigned long n = min(entries - entry, chunk);
>  
> - ret = get_user_pages(ua + (entry << PAGE_SHIFT), n,
> + ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n,
>   FOLL_WRITE | FOLL_LONGTERM,
>   mem->hpages + entry, NULL);
>   if (ret == n) {
> @@ -167,9 +167,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
> unsigned long ua,
>   return 0;
>  
>  free_exit:
> - /* free the reference taken */
> - for (i = 0; i < pinned; i++)
> - put_page(mem->hpages[i]);
> + /* free the references taken */
> + put_user_pages(mem->hpages, pinned);
>  
>   vfree(mem->hpas);
>   kfree(mem);
> @@ -212,10 +211,9 @@ static void mm_iommu_unpin(struct 
> mm_iommu_table_group_mem_t *mem)
>   if (!page)
>   continue;
>  
> - if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
> - SetPageDirty(page);
> + put_user_pages_dirty_lock(&page, 1,
> + mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY);
>  
> - put_page(page);
>   mem->hpas[i] = 0;
>   }
>  }
> -- 
> 2.24.0
> 
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH 17/19] powerpc: book3s64: convert to pin_user_pages() and put_user_page()

2019-11-25 Thread Jan Kara
On Sun 24-11-19 20:20:09, John Hubbard wrote:
> 1. Convert from get_user_pages() to pin_user_pages().
> 
> 2. As required by pin_user_pages(), release these pages via
> put_user_page(). In this case, do so via put_user_pages_dirty_lock().
> 
> That has the side effect of calling set_page_dirty_lock(), instead
> of set_page_dirty(). This is probably more accurate.
> 
> As Christoph Hellwig put it, "set_page_dirty() is only safe if we are
> dealing with a file backed page where we have reference on the inode it
> hangs off." [1]
> 
> 3. Release each page in mem->hpages[] (instead of mem->hpas[]), because
> that is the array that pin_longterm_pages() filled in. This is more
> accurate and should be a little safer from a maintenance point of
> view.

Except that this breaks the code. hpages is unioned with hpas...

> [1] https://lore.kernel.org/r/20190723153640.gb...@lst.de
> 
> Signed-off-by: John Hubbard 
> ---
>  arch/powerpc/mm/book3s64/iommu_api.c | 12 +---
>  1 file changed, 5 insertions(+), 7 deletions(-)
> 
> diff --git a/arch/powerpc/mm/book3s64/iommu_api.c 
> b/arch/powerpc/mm/book3s64/iommu_api.c
> index 56cc84520577..196383e8e5a9 100644
> --- a/arch/powerpc/mm/book3s64/iommu_api.c
> +++ b/arch/powerpc/mm/book3s64/iommu_api.c
> @@ -103,7 +103,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
> unsigned long ua,
>   for (entry = 0; entry < entries; entry += chunk) {
>   unsigned long n = min(entries - entry, chunk);
>  
> - ret = get_user_pages(ua + (entry << PAGE_SHIFT), n,
> + ret = pin_user_pages(ua + (entry << PAGE_SHIFT), n,
>   FOLL_WRITE | FOLL_LONGTERM,
>   mem->hpages + entry, NULL);
>   if (ret == n) {
> @@ -167,9 +167,8 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
> unsigned long ua,
>   return 0;
>  
>  free_exit:
> - /* free the reference taken */
> - for (i = 0; i < pinned; i++)
> - put_page(mem->hpages[i]);
> + /* free the references taken */
> + put_user_pages(mem->hpages, pinned);
>  
>   vfree(mem->hpas);
>   kfree(mem);
> @@ -212,10 +211,9 @@ static void mm_iommu_unpin(struct 
> mm_iommu_table_group_mem_t *mem)
>   if (!page)
>   continue;
>  
> - if (mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY)
> - SetPageDirty(page);
> + put_user_pages_dirty_lock(&mem->hpages[i], 1,
> +   MM_IOMMU_TABLE_GROUP_PAGE_DIRTY);

And the dirtying condition is wrong here as well. Currently it is always
true.
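The intended call presumably tests the per-entry dirty bit, as the v2 of
this patch (quoted earlier in this digest) does:

	put_user_pages_dirty_lock(&mem->hpages[i], 1,
				  mem->hpas[i] & MM_IOMMU_TABLE_GROUP_PAGE_DIRTY);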

Honza
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH v7 02/24] mm/gup: factor out duplicate code from four routines

2019-11-22 Thread Jan Kara
On Thu 21-11-19 18:54:02, John Hubbard wrote:
> On 11/21/19 1:54 AM, Jan Kara wrote:
> > On Thu 21-11-19 00:29:59, John Hubbard wrote:
> > > > 
> > > > Otherwise this looks fine and might be a worthwhile cleanup to feed
> > > > Andrew for 5.5 independent of the gut of the changes.
> > > > 
> > > > Reviewed-by: Christoph Hellwig 
> > > > 
> > > 
> > > Thanks for the reviews! Say, it sounds like your view here is that this
> > > series should be targeted at 5.6 (not 5.5), is that what you have in mind?
> > > And get the preparatory patches (1-9, and maybe even 10-16) into 5.5?
> > 
> > One more note :) If you are going to push pin_user_pages() interfaces
> > (which I'm fine with), it would probably make sense to push also the
> > put_user_pages() -> unpin_user_pages() renaming so that that inconsistency
> > in naming does not exist in the released upstream kernel.
> > 
> > Honza
> 
> Yes, that's what this patch series does. But I'm not sure if "push" here
> means, "push out: defer to 5.6", "push (now) into 5.5", or "advocate for"?

I meant to include the patch in the "for 5.5" batch.

> I will note that it's not going to be easy to rename in one step, now
> that this is being split up. Because various put_user_pages()-based items
> are going into 5.5 via different maintainer trees now. Probably I'd need
> to introduce unpin_user_page() alongside put_user_page()...thoughts?

Yes, I understand that moving that patch from the end of the series would
cause a fair amount of conflicts. I was hoping that you could generate the
patch with sed/Coccinelle, and then rebasing what remains for 5.6 on top of
that patch should not be that painful, so overall it should not be that
much work. But I may be wrong, so if it proves to be too tedious, let's
just postpone the renaming to 5.6. I don't find having both
unpin_user_page() and put_user_page() a better alternative to the current
state. Thanks!

Honza
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH v7 02/24] mm/gup: factor out duplicate code from four routines

2019-11-21 Thread Jan Kara
On Thu 21-11-19 00:29:59, John Hubbard wrote:
> > 
> > Otherwise this looks fine and might be a worthwhile cleanup to feed
> > Andrew for 5.5 independent of the gut of the changes.
> > 
> > Reviewed-by: Christoph Hellwig 
> > 
> 
> Thanks for the reviews! Say, it sounds like your view here is that this
> series should be targeted at 5.6 (not 5.5), is that what you have in mind?
> And get the preparatory patches (1-9, and maybe even 10-16) into 5.5?

One more note :) If you are going to push pin_user_pages() interfaces
(which I'm fine with), it would probably make sense to push also the
put_user_pages() -> unpin_user_pages() renaming so that that inconsistency
in naming does not exist in the released upstream kernel.

        Honza
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH v7 02/24] mm/gup: factor out duplicate code from four routines

2019-11-21 Thread Jan Kara
On Thu 21-11-19 00:29:59, John Hubbard wrote:
> On 11/21/19 12:03 AM, Christoph Hellwig wrote:
> > Otherwise this looks fine and might be a worthwhile cleanup to feed
> > Andrew for 5.5 independent of the gut of the changes.
> > 
> > Reviewed-by: Christoph Hellwig 
> > 
> 
> Thanks for the reviews! Say, it sounds like your view here is that this
> series should be targeted at 5.6 (not 5.5), is that what you have in mind?
> And get the preparatory patches (1-9, and maybe even 10-16) into 5.5?

Yeah, actually I feel the same. The merge window is going to open on Sunday
and the series still isn't fully baked and happily sitting in linux-next
(and larger changes should really sit in linux-next for at least a week,
preferably two, before the merge window opens to get some reasonable test
coverage). So I'd take out the independent easy patches that are already
reviewed and get them merged into Andrew's (or whatever other appropriate)
tree now, so that they get at least a week of testing in linux-next before
going upstream. And the more involved bits will have to wait for 5.6 -
which means let's just continue working on them as we do now, because
ideally in 4 weeks we should have them ready with all the reviews so that
they can be picked up and integrated into linux-next.

        Honza
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH v7 17/24] mm/gup: track FOLL_PIN pages

2019-11-21 Thread Jan Kara
On Wed 20-11-19 23:13:47, John Hubbard wrote:
> Add tracking of pages that were pinned via FOLL_PIN.
> 
> As mentioned in the FOLL_PIN documentation, callers who effectively set
> FOLL_PIN are required to ultimately free such pages via put_user_page().
> The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
> for DIO and/or RDMA use".
> 
> Pages that have been pinned via FOLL_PIN are identifiable via a
> new function call:
> 
>bool page_dma_pinned(struct page *page);
> 
> What to do in response to encountering such a page, is left to later
> patchsets. There is discussion about this in [1], [2], and [3].
> 
> This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask().
> 
> [1] Some slow progress on get_user_pages() (Apr 2, 2019):
> https://lwn.net/Articles/784574/
> [2] DMA and get_user_pages() (LPC: Dec 12, 2018):
> https://lwn.net/Articles/774411/
> [3] The trouble with get_user_pages() (Apr 30, 2018):
> https://lwn.net/Articles/753027/
> 
> Suggested-by: Jan Kara 
> Suggested-by: Jérôme Glisse 
> Signed-off-by: John Hubbard 

Thanks for the patch! We are mostly getting there. Some smaller comments
below.

> +/**
> + * try_pin_compound_head() - mark a compound page as being used by
> + * pin_user_pages*().
> + *
> + * This is the FOLL_PIN counterpart to try_get_compound_head().
> + *
> + * @page:pointer to page to be marked
> + * @Return:  true for success, false for failure
> + */
> +__must_check bool try_pin_compound_head(struct page *page, int refs)
> +{
> + page = try_get_compound_head(page, GUP_PIN_COUNTING_BIAS * refs);
> + if (!page)
> + return false;
> +
> + __update_proc_vmstat(page, NR_FOLL_PIN_REQUESTED, refs);
> + return true;
> +}
> +
> +#ifdef CONFIG_DEV_PAGEMAP_OPS
> +static bool __put_devmap_managed_user_page(struct page *page)

Probably call this __unpin_devmap_managed_user_page()? To match the later
conversion of put_user_page() to unpin_user_page()?

> +{
> + bool is_devmap = page_is_devmap_managed(page);
> +
> + if (is_devmap) {
> + int count = page_ref_sub_return(page, GUP_PIN_COUNTING_BIAS);
> +
> + __update_proc_vmstat(page, NR_FOLL_PIN_RETURNED, 1);
> + /*
> +  * devmap page refcounts are 1-based, rather than 0-based: if
> +  * refcount is 1, then the page is free and the refcount is
> +  * stable because nobody holds a reference on the page.
> +  */
> + if (count == 1)
> + free_devmap_managed_page(page);
> + else if (!count)
> + __put_page(page);
> + }
> +
> + return is_devmap;
> +}
> +#else
> +static bool __put_devmap_managed_user_page(struct page *page)
> +{
> + return false;
> +}
> +#endif /* CONFIG_DEV_PAGEMAP_OPS */
> +
> +/**
> + * put_user_page() - release a dma-pinned page
> + * @page:pointer to page to be released
> + *
> + * Pages that were pinned via pin_user_pages*() must be released via either
> + * put_user_page(), or one of the put_user_pages*() routines. This is so that
> + * such pages can be separately tracked and uniquely handled. In particular,
> + * interactions with RDMA and filesystems need special handling.
> + */
> +void put_user_page(struct page *page)
> +{
> + page = compound_head(page);
> +
> + /*
> +  * For devmap managed pages we need to catch refcount transition from
> +  * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the
> +  * page is free and we need to inform the device driver through
> +  * callback. See include/linux/memremap.h and HMM for details.
> +  */
> + if (__put_devmap_managed_user_page(page))
> + return;
> +
> + if (page_ref_sub_and_test(page, GUP_PIN_COUNTING_BIAS))
> + __put_page(page);
> +
> + __update_proc_vmstat(page, NR_FOLL_PIN_RETURNED, 1);
> +}
> +EXPORT_SYMBOL(put_user_page);
> +
>  /**
>   * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned 
> pages
>   * @pages:  array of pages to be maybe marked dirty, and definitely released.
> @@ -237,10 +327,11 @@ static struct page *follow_page_pte(struct 
> vm_area_struct *vma,
>   }
>  
>   page = vm_normal_page(vma, address, pte);
> - if (!page && pte_devmap(pte) && (flags & FOLL_GET)) {
> + if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
>   /*
> -  * Only return device mapping pages in the FOLL_GET case since
> -  * they are only valid while holding t

Re: [PATCH v6 24/24] mm, tree-wide: rename put_user_page*() to unpin_user_page*()

2019-11-19 Thread Jan Kara
On Tue 19-11-19 00:16:43, John Hubbard wrote:
> In order to provide a clearer, more symmetric API for pinning
> and unpinning DMA pages. This way, pin_user_pages*() calls
> match up with unpin_user_pages*() calls, and the API is a lot
> closer to being self-explanatory.
> 
> Signed-off-by: John Hubbard 

Looks good to me. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  Documentation/core-api/pin_user_pages.rst   |  2 +-
>  arch/powerpc/mm/book3s64/iommu_api.c|  6 +--
>  drivers/gpu/drm/via/via_dmablit.c   |  4 +-
>  drivers/infiniband/core/umem.c  |  2 +-
>  drivers/infiniband/hw/hfi1/user_pages.c |  2 +-
>  drivers/infiniband/hw/mthca/mthca_memfree.c |  6 +--
>  drivers/infiniband/hw/qib/qib_user_pages.c  |  2 +-
>  drivers/infiniband/hw/qib/qib_user_sdma.c   |  6 +--
>  drivers/infiniband/hw/usnic/usnic_uiom.c|  2 +-
>  drivers/infiniband/sw/siw/siw_mem.c |  2 +-
>  drivers/media/v4l2-core/videobuf-dma-sg.c   |  4 +-
>  drivers/platform/goldfish/goldfish_pipe.c   |  4 +-
>  drivers/vfio/vfio_iommu_type1.c |  2 +-
>  fs/io_uring.c   |  4 +-
>  include/linux/mm.h  | 30 +++---
>  include/linux/mmzone.h  |  2 +-
>  mm/gup.c| 46 ++---
>  mm/gup_benchmark.c  |  2 +-
>  mm/process_vm_access.c  |  4 +-
>  net/xdp/xdp_umem.c  |  2 +-
>  20 files changed, 67 insertions(+), 67 deletions(-)
> 
> diff --git a/Documentation/core-api/pin_user_pages.rst 
> b/Documentation/core-api/pin_user_pages.rst
> index baa288a44a77..6d93ef203561 100644
> --- a/Documentation/core-api/pin_user_pages.rst
> +++ b/Documentation/core-api/pin_user_pages.rst
> @@ -220,7 +220,7 @@ since the system was booted, via two new /proc/vmstat 
> entries: ::
>  /proc/vmstat/nr_foll_pin_requested
>  
>  Those are both going to show zero, unless CONFIG_DEBUG_VM is set. This is
> -because there is a noticeable performance drop in put_user_page(), when they
> +because there is a noticeable performance drop in unpin_user_page(), when 
> they
>  are activated.
>  
>  References
> diff --git a/arch/powerpc/mm/book3s64/iommu_api.c 
> b/arch/powerpc/mm/book3s64/iommu_api.c
> index 196383e8e5a9..dd7aa5a4f33c 100644
> --- a/arch/powerpc/mm/book3s64/iommu_api.c
> +++ b/arch/powerpc/mm/book3s64/iommu_api.c
> @@ -168,7 +168,7 @@ static long mm_iommu_do_alloc(struct mm_struct *mm, 
> unsigned long ua,
>  
>  free_exit:
>   /* free the references taken */
> - put_user_pages(mem->hpages, pinned);
> + unpin_user_pages(mem->hpages, pinned);
>  
>   vfree(mem->hpas);
>   kfree(mem);
> @@ -211,8 +211,8 @@ static void mm_iommu_unpin(struct 
> mm_iommu_table_group_mem_t *mem)
>   if (!page)
>   continue;
>  
> - put_user_pages_dirty_lock(&mem->hpages[i], 1,
> -   MM_IOMMU_TABLE_GROUP_PAGE_DIRTY);
> + unpin_user_pages_dirty_lock(&mem->hpages[i], 1,
> + MM_IOMMU_TABLE_GROUP_PAGE_DIRTY);
>  
>   mem->hpas[i] = 0;
>   }
> diff --git a/drivers/gpu/drm/via/via_dmablit.c 
> b/drivers/gpu/drm/via/via_dmablit.c
> index 37c5e572993a..719d036c9384 100644
> --- a/drivers/gpu/drm/via/via_dmablit.c
> +++ b/drivers/gpu/drm/via/via_dmablit.c
> @@ -188,8 +188,8 @@ via_free_sg_info(struct pci_dev *pdev, drm_via_sg_info_t 
> *vsg)
>   kfree(vsg->desc_pages);
>   /* fall through */
>   case dr_via_pages_locked:
> - put_user_pages_dirty_lock(vsg->pages, vsg->num_pages,
> -   (vsg->direction == DMA_FROM_DEVICE));
> + unpin_user_pages_dirty_lock(vsg->pages, vsg->num_pages,
> +(vsg->direction == DMA_FROM_DEVICE));
>   /* fall through */
>   case dr_via_pages_alloc:
>   vfree(vsg->pages);
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index 2c287ced3439..119a147da904 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -54,7 +54,7 @@ static void __ib_umem_release(struct ib_device *dev, struct 
> ib_umem *umem, int d
>  
>   for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
>   page = sg_page_iter_page(&sg_iter);
> - put_user_pages_dirty_lock(&page, 1, umem->writable && dirty);
> + unpin_user

Re: [PATCH v6 17/24] mm/gup: track FOLL_PIN pages

2019-11-19 Thread Jan Kara
On Tue 19-11-19 00:16:36, John Hubbard wrote:
> @@ -2025,6 +2149,20 @@ static int __record_subpages(struct page *page, 
> unsigned long addr,
>   return nr;
>  }
>  
> +static bool __pin_compound_head(struct page *head, int refs, unsigned int 
> flags)
> +{

I don't quite like the proliferation of names starting with __. I don't
think there's a good reason for that, particularly in this case. Also 'pin'
here is somewhat misleading, as we already use the term "pin" for the
particular way of pinning the page. We could have grab_compound_head() or
maybe nail_compound_head() :), but you're a native speaker so you may come
up with a better word.

> + if (flags & FOLL_PIN) {
> + if (unlikely(!try_pin_compound_head(head, refs)))
> + return false;
> + } else {
> + head = try_get_compound_head(head, refs);
> + if (!head)
> + return false;
> + }
> +
> + return true;
> +}
> +
>  static void put_compound_head(struct page *page, int refs)
>  {
>   /* Do a get_page() first, in case refs == page->_refcount */

put_compound_head() needs similar treatment as undo_dev_pagemap(), doesn't
it?
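Presumably something like this (a sketch that assumes the error paths can
see the gup flags):

	static void put_compound_head(struct page *page, int refs,
				      unsigned int flags)
	{
		/* undo a FOLL_PIN grab at the same scale it was taken */
		if (flags & FOLL_PIN)
			refs *= GUP_PIN_COUNTING_BIAS;

		/* Do a get_page() first, in case refs == page->_refcount */
		get_page(page);
		page_ref_sub(page, refs);
		put_page(page);
	}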

> @@ -968,7 +973,18 @@ struct page *follow_devmap_pmd(struct vm_area_struct 
> *vma, unsigned long addr,
>   if (!*pgmap)
>   return ERR_PTR(-EFAULT);
>   page = pfn_to_page(pfn);
> - get_page(page);
> +
> + if (flags & FOLL_GET)
> + get_page(page);
> + else if (flags & FOLL_PIN) {
> + /*
> +  * try_pin_page() is not actually expected to fail here because
> +  * we hold the pmd lock so no one can unmap the pmd and free the
> +  * page that it points to.
> +  */
> + if (unlikely(!try_pin_page(page)))
> + page = ERR_PTR(-EFAULT);
> + }

This pattern is rather common. So maybe I'd add a helper grab_page(page,
flags) doing:

	static inline bool grab_page(struct page *page, unsigned int flags)
	{
		if (flags & FOLL_GET)
			get_page(page);
		else if (flags & FOLL_PIN)
			return try_pin_page(page);
		return true;
	}

Otherwise the patch looks good to me now.

Honza
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH v6 02/24] mm/gup: factor out duplicate code from four routines

2019-11-19 Thread Jan Kara
On Tue 19-11-19 00:16:21, John Hubbard wrote:
> There are four locations in gup.c that have a fair amount of code
> duplication. This means that changing one requires making the same
> changes in four places, not to mention reading the same code four
> times, and wondering if there are subtle differences.
> 
> Factor out the common code into static functions, thus reducing the
> overall line count and the code's complexity.
> 
> Also, take the opportunity to slightly improve the efficiency of the
> error cases, by doing a mass subtraction of the refcount, surrounded
> by get_page()/put_page().
> 
> Also, further simplify (slightly), by waiting until the successful
> end of each routine, to increment *nr.
> 
> Reviewed-by: Jérôme Glisse 
> Cc: Jan Kara 
> Cc: Ira Weiny 
> Cc: Christoph Hellwig 
> Cc: Aneesh Kumar K.V 
> Signed-off-by: John Hubbard 

Looks good to me now! You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  mm/gup.c | 91 ++--
>  1 file changed, 36 insertions(+), 55 deletions(-)
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index 85caf76b3012..f3c7d6625817 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1969,6 +1969,25 @@ static int __gup_device_huge_pud(pud_t pud, pud_t 
> *pudp, unsigned long addr,
>  }
>  #endif
>  
> +static int __record_subpages(struct page *page, unsigned long addr,
> +  unsigned long end, struct page **pages)
> +{
> + int nr;
> +
> + for (nr = 0; addr != end; addr += PAGE_SIZE)
> + pages[nr++] = page++;
> +
> + return nr;
> +}
> +
> +static void put_compound_head(struct page *page, int refs)
> +{
> + /* Do a get_page() first, in case refs == page->_refcount */
> + get_page(page);
> + page_ref_sub(page, refs);
> + put_page(page);
> +}
> +
>  #ifdef CONFIG_ARCH_HAS_HUGEPD
>  static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
> unsigned long sz)
> @@ -1998,32 +2017,20 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, 
> unsigned long addr,
>   /* hugepages are never "special" */
>   VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
>  
> - refs = 0;
>   head = pte_page(pte);
> -
>   page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
> - do {
> - VM_BUG_ON(compound_head(page) != head);
> - pages[*nr] = page;
> - (*nr)++;
> - page++;
> - refs++;
> - } while (addr += PAGE_SIZE, addr != end);
> + refs = __record_subpages(page, addr, end, pages + *nr);
>  
>   head = try_get_compound_head(head, refs);
> - if (!head) {
> - *nr -= refs;
> + if (!head)
>   return 0;
> - }
>  
>   if (unlikely(pte_val(pte) != pte_val(*ptep))) {
> - /* Could be optimized better */
> - *nr -= refs;
> - while (refs--)
> - put_page(head);
> + put_compound_head(head, refs);
>   return 0;
>   }
>  
> + *nr += refs;
>   SetPageReferenced(head);
>   return 1;
>  }
> @@ -2071,28 +2078,19 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, 
> unsigned long addr,
>pages, nr);
>   }
>  
> - refs = 0;
>   page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> - do {
> - pages[*nr] = page;
> - (*nr)++;
> - page++;
> - refs++;
> - } while (addr += PAGE_SIZE, addr != end);
> + refs = __record_subpages(page, addr, end, pages + *nr);
>  
>   head = try_get_compound_head(pmd_page(orig), refs);
> - if (!head) {
> - *nr -= refs;
> + if (!head)
>   return 0;
> - }
>  
>   if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
> - *nr -= refs;
> - while (refs--)
> - put_page(head);
> + put_compound_head(head, refs);
>   return 0;
>   }
>  
> + *nr += refs;
>   SetPageReferenced(head);
>   return 1;
>  }
> @@ -2114,28 +2112,19 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, 
> unsigned long addr,
>pages, nr);
>   }
>  
> - refs = 0;
>   page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> - do {
> - pages[*nr] = page;
> - (*nr)++;
> - page++;
> 

Re: [PATCH v5 07/24] IB/umem: use get_user_pages_fast() to pin DMA pages

2019-11-18 Thread Jan Kara
On Thu 14-11-19 21:53:23, John Hubbard wrote:
> And get rid of the mmap_sem calls, as part of that. Note
> that get_user_pages_fast() will, if necessary, fall back to
> __gup_longterm_unlocked(), which takes the mmap_sem as needed.
> 
> Reviewed-by: Jason Gunthorpe 
> Reviewed-by: Ira Weiny 
> Signed-off-by: John Hubbard 

Looks good to me. You can add:

Reviewed-by: Jan Kara 

Honza


> ---
>  drivers/infiniband/core/umem.c | 17 ++---
>  1 file changed, 6 insertions(+), 11 deletions(-)
> 
> diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
> index 24244a2f68cc..3d664a2539eb 100644
> --- a/drivers/infiniband/core/umem.c
> +++ b/drivers/infiniband/core/umem.c
> @@ -271,16 +271,13 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, 
> unsigned long addr,
>   sg = umem->sg_head.sgl;
>  
>   while (npages) {
> - down_read(&mm->mmap_sem);
> - ret = get_user_pages(cur_base,
> -  min_t(unsigned long, npages,
> -PAGE_SIZE / sizeof (struct page *)),
> -  gup_flags | FOLL_LONGTERM,
> -  page_list, NULL);
> - if (ret < 0) {
> - up_read(&mm->mmap_sem);
> + ret = get_user_pages_fast(cur_base,
> +   min_t(unsigned long, npages,
> + PAGE_SIZE /
> + sizeof(struct page *)),
> +   gup_flags | FOLL_LONGTERM, page_list);
> + if (ret < 0)
>   goto umem_release;
> - }
>  
>   cur_base += ret * PAGE_SIZE;
>   npages   -= ret;
> @@ -288,8 +285,6 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, 
> unsigned long addr,
>   sg = ib_umem_add_sg_table(sg, page_list, ret,
>   dma_get_max_seg_size(context->device->dma_device),
> &umem->sg_nents);
> -
> - up_read(&mm->mmap_sem);
>   }
>  
>   sg_mark_end(sg);
> -- 
> 2.24.0
> 
-- 
Jan Kara 
SUSE Labs, CR
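A minimal sketch of the converted loop, for readers skimming the diff: the chunked pinning stays, but all mmap_sem handling moves inside the GUP call (assuming the 5.4-era get_user_pages_fast() signature, with page_list holding one page's worth of struct page pointers):

	while (npages) {
		ret = get_user_pages_fast(cur_base,
					  min_t(unsigned long, npages,
						PAGE_SIZE /
						sizeof(struct page *)),
					  gup_flags | FOLL_LONGTERM,
					  page_list);
		if (ret < 0)
			goto umem_release;	/* unwind pages pinned so far */

		cur_base += ret * PAGE_SIZE;
		npages   -= ret;
		/* ... append page_list[0..ret-1] to the umem scatterlist ... */
	}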

Re: [PATCH v5 06/24] goldish_pipe: rename local pin_user_pages() routine

2019-11-18 Thread Jan Kara
On Thu 14-11-19 21:53:22, John Hubbard wrote:
> 1. Avoid naming conflicts: rename local static function from
> "pin_user_pages()" to "pin_goldfish_pages()".
> 
> An upcoming patch will introduce a global pin_user_pages()
> function.
> 
> Reviewed-by: Jérôme Glisse 
> Reviewed-by: Ira Weiny 
> Signed-off-by: John Hubbard 

Looks good to me. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  drivers/platform/goldfish/goldfish_pipe.c | 18 +-
>  1 file changed, 9 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/platform/goldfish/goldfish_pipe.c 
> b/drivers/platform/goldfish/goldfish_pipe.c
> index cef0133aa47a..7ed2a21a0bac 100644
> --- a/drivers/platform/goldfish/goldfish_pipe.c
> +++ b/drivers/platform/goldfish/goldfish_pipe.c
> @@ -257,12 +257,12 @@ static int goldfish_pipe_error_convert(int status)
>   }
>  }
>  
> -static int pin_user_pages(unsigned long first_page,
> -   unsigned long last_page,
> -   unsigned int last_page_size,
> -   int is_write,
> -   struct page *pages[MAX_BUFFERS_PER_COMMAND],
> -   unsigned int *iter_last_page_size)
> +static int pin_goldfish_pages(unsigned long first_page,
> +   unsigned long last_page,
> +   unsigned int last_page_size,
> +   int is_write,
> +   struct page *pages[MAX_BUFFERS_PER_COMMAND],
> +   unsigned int *iter_last_page_size)
>  {
>   int ret;
>   int requested_pages = ((last_page - first_page) >> PAGE_SHIFT) + 1;
> @@ -354,9 +354,9 @@ static int transfer_max_buffers(struct goldfish_pipe 
> *pipe,
> if (mutex_lock_interruptible(&pipe->lock))
>   return -ERESTARTSYS;
>  
> - pages_count = pin_user_pages(first_page, last_page,
> -  last_page_size, is_write,
> -  pipe->pages, &iter_last_page_size);
> + pages_count = pin_goldfish_pages(first_page, last_page,
> +  last_page_size, is_write,
> +          pipe->pages, &iter_last_page_size);
>   if (pages_count < 0) {
>   mutex_unlock(>lock);
>   return pages_count;
> -- 
> 2.24.0
> 
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH v5 02/24] mm/gup: factor out duplicate code from four routines

2019-11-18 Thread Jan Kara
On Thu 14-11-19 21:53:18, John Hubbard wrote:
> There are four locations in gup.c that have a fair amount of code
> duplication. This means that changing one requires making the same
> changes in four places, not to mention reading the same code four
> times, and wondering if there are subtle differences.
> 
> Factor out the common code into static functions, thus reducing the
> overall line count and the code's complexity.
> 
> Also, take the opportunity to slightly improve the efficiency of the
> error cases, by doing a mass subtraction of the refcount, surrounded
> by get_page()/put_page().
> 
> Also, further simplify (slightly), by waiting until the successful
> end of each routine, to increment *nr.
> 
> Reviewed-by: Jérôme Glisse 
> Cc: Jan Kara 
> Cc: Ira Weiny 
> Cc: Christoph Hellwig 
> Cc: Aneesh Kumar K.V 
> Signed-off-by: John Hubbard 
> ---
>  mm/gup.c | 95 
>  1 file changed, 40 insertions(+), 55 deletions(-)
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index 85caf76b3012..858541ea30ce 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1969,6 +1969,29 @@ static int __gup_device_huge_pud(pud_t pud, pud_t 
> *pudp, unsigned long addr,
>  }
>  #endif
>  
> +static int __record_subpages(struct page *page, unsigned long addr,
> +  unsigned long end, struct page **pages)
> +{
> + int nr = 0;
> + int nr_recorded_pages = 0;
> +
> + do {
> + pages[nr] = page;
> + nr++;
> + page++;
> + nr_recorded_pages++;
> + } while (addr += PAGE_SIZE, addr != end);
> + return nr_recorded_pages;

nr == nr_recorded_pages, so no need for both... BTW, structuring this as a
for loop would probably be more logical and shorter now:

for (nr = 0; addr != end; addr += PAGE_SIZE)
pages[nr++] = page++;
    return nr;

The rest of the patch looks good to me.

Honza

-- 
Jan Kara 
SUSE Labs, CR
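Jan's suggestion, applied to the whole helper, yields roughly the following (a sketch with the same semantics as the quoted code, assuming addr < end on entry, as the gup.c callers guarantee):

	static int __record_subpages(struct page *page, unsigned long addr,
				     unsigned long end, struct page **pages)
	{
		int nr;

		/* One counter does the work of the previous two. */
		for (nr = 0; addr != end; addr += PAGE_SIZE)
			pages[nr++] = page++;
		return nr;
	}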

Re: [PATCH v5 11/24] goldish_pipe: convert to pin_user_pages() and put_user_page()

2019-11-18 Thread Jan Kara
On Thu 14-11-19 21:53:27, John Hubbard wrote:
> 1. Call the new global pin_user_pages_fast(), from pin_goldfish_pages().
> 
> 2. As required by pin_user_pages(), release these pages via
> put_user_page(). In this case, do so via put_user_pages_dirty_lock().
> 
> That has the side effect of calling set_page_dirty_lock(), instead
> of set_page_dirty(). This is probably more accurate.
> 
> As Christoph Hellwig put it, "set_page_dirty() is only safe if we are
> dealing with a file backed page where we have reference on the inode it
> hangs off." [1]
> 
> Another side effect is that the release code is simplified because
> the page[] loop is now in gup.c instead of here, so just delete the
> local release_user_pages() entirely, and call
> put_user_pages_dirty_lock() directly, instead.
> 
> [1] https://lore.kernel.org/r/20190723153640.gb...@lst.de
> 
> Reviewed-by: Ira Weiny 
> Signed-off-by: John Hubbard 

Looks good to me. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  drivers/platform/goldfish/goldfish_pipe.c | 17 +++--
>  1 file changed, 3 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/platform/goldfish/goldfish_pipe.c 
> b/drivers/platform/goldfish/goldfish_pipe.c
> index 7ed2a21a0bac..635a8bc1b480 100644
> --- a/drivers/platform/goldfish/goldfish_pipe.c
> +++ b/drivers/platform/goldfish/goldfish_pipe.c
> @@ -274,7 +274,7 @@ static int pin_goldfish_pages(unsigned long first_page,
>   *iter_last_page_size = last_page_size;
>   }
>  
> - ret = get_user_pages_fast(first_page, requested_pages,
> + ret = pin_user_pages_fast(first_page, requested_pages,
> !is_write ? FOLL_WRITE : 0,
> pages);
>   if (ret <= 0)
> @@ -285,18 +285,6 @@ static int pin_goldfish_pages(unsigned long first_page,
>   return ret;
>  }
>  
> -static void release_user_pages(struct page **pages, int pages_count,
> -int is_write, s32 consumed_size)
> -{
> - int i;
> -
> - for (i = 0; i < pages_count; i++) {
> - if (!is_write && consumed_size > 0)
> - set_page_dirty(pages[i]);
> - put_page(pages[i]);
> - }
> -}
> -
>  /* Populate the call parameters, merging adjacent pages together */
>  static void populate_rw_params(struct page **pages,
>  int pages_count,
> @@ -372,7 +360,8 @@ static int transfer_max_buffers(struct goldfish_pipe 
> *pipe,
>  
>   *consumed_size = pipe->command_buffer->rw_params.consumed_size;
>  
> - release_user_pages(pipe->pages, pages_count, is_write, *consumed_size);
> + put_user_pages_dirty_lock(pipe->pages, pages_count,
> +   !is_write && *consumed_size > 0);
>  
> mutex_unlock(&pipe->lock);
>   return 0;
> -- 
> 2.24.0
> 
-- 
Jan Kara 
SUSE Labs, CR
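For readers following the conversion: put_user_pages_dirty_lock() bundles the dirty-and-release loop drivers used to open-code. Roughly, and assuming the three-argument form used by this series, it behaves like this sketch (not the real gup.c implementation, which has additional handling for compound and already-dirty pages):

	static void release_pinned_pages(struct page **pages,
					 unsigned long npages, bool make_dirty)
	{
		unsigned long i;

		for (i = 0; i < npages; i++) {
			if (make_dirty)
				set_page_dirty_lock(pages[i]);
			put_user_page(pages[i]);
		}
	}

So the goldfish call above dirties pages only for reads that actually consumed data: !is_write && *consumed_size > 0.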

Re: [PATCH v5 13/24] mm/process_vm_access: set FOLL_PIN via pin_user_pages_remote()

2019-11-18 Thread Jan Kara
On Thu 14-11-19 21:53:29, John Hubbard wrote:
> Convert process_vm_access to use the new pin_user_pages_remote()
> call, which sets FOLL_PIN. Setting FOLL_PIN is now required for
> code that requires tracking of pinned pages.
> 
> Also, release the pages via put_user_page*().
> 
> Also, rename "pages" to "pinned_pages", as this makes for
> easier reading of process_vm_rw_single_vec().
> 
> Reviewed-by: Jérôme Glisse 
> Reviewed-by: Ira Weiny 
> Signed-off-by: John Hubbard 
> ---
>  mm/process_vm_access.c | 28 +++-
>  1 file changed, 15 insertions(+), 13 deletions(-)

The patch looks good to me. You can add:

Reviewed-by: Jan Kara 

        Honza

-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH v5 15/24] fs/io_uring: set FOLL_PIN via pin_user_pages()

2019-11-18 Thread Jan Kara
On Thu 14-11-19 21:53:31, John Hubbard wrote:
> Convert fs/io_uring to use the new pin_user_pages() call, which sets
> FOLL_PIN. Setting FOLL_PIN is now required for code that requires
> tracking of pinned pages, and therefore for any code that calls
> put_user_page().
> 
> In partial anticipation of this work, the io_uring code was already
> calling put_user_page() instead of put_page(). Therefore, in order to
> convert from the get_user_pages()/put_page() model, to the
> pin_user_pages()/put_user_page() model, the only change required
> here is to change get_user_pages() to pin_user_pages().
> 
> Signed-off-by: John Hubbard 

Looks good to me. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  fs/io_uring.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index f9a38998f2fc..cff64bd00db9 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -3433,7 +3433,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx 
> *ctx, void __user *arg,
>  
>   ret = 0;
> down_read(&current->mm->mmap_sem);
> - pret = get_user_pages(ubuf, nr_pages,
> + pret = pin_user_pages(ubuf, nr_pages,
> FOLL_WRITE | FOLL_LONGTERM,
>     pages, vmas);
>   if (pret == nr_pages) {
> -- 
> 2.24.0
> 
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH v5 10/24] mm/gup: introduce pin_user_pages*() and FOLL_PIN

2019-11-18 Thread Jan Kara
On Thu 14-11-19 21:53:26, John Hubbard wrote:
>  /*
> - * NOTE on FOLL_LONGTERM:
> + * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
> + * other. Here is what they mean, and how to use them:
>   *
>   * FOLL_LONGTERM indicates that the page will be held for an indefinite time
> - * period _often_ under userspace control.  This is contrasted with
> - * iov_iter_get_pages() where usages which are transient.
> + * period _often_ under userspace control.  This is in contrast to
> + * iov_iter_get_pages(), where usages which are transient.
  ^^^ when you touch this, please fix also the
second sentence. It doesn't quite make sense to me... I'd probably write
there "whose usages are transient" but maybe you can come up with something
even better.

Otherwise the patch looks good to me so feel free to add:

Reviewed-by: Jan Kara 

        Honza
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH v5 17/24] mm/gup: track FOLL_PIN pages

2019-11-18 Thread Jan Kara
On Thu 14-11-19 21:53:33, John Hubbard wrote:
> Add tracking of pages that were pinned via FOLL_PIN.
> 
> As mentioned in the FOLL_PIN documentation, callers who effectively set
> FOLL_PIN are required to ultimately free such pages via put_user_page().
> The effect is similar to FOLL_GET, and may be thought of as "FOLL_GET
> for DIO and/or RDMA use".
> 
> Pages that have been pinned via FOLL_PIN are identifiable via a
> new function call:
> 
>bool page_dma_pinned(struct page *page);
> 
> What to do in response to encountering such a page, is left to later
> patchsets. There is discussion about this in [1].
^^ missing this reference
in the changelog...

> This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask().
> 
> Suggested-by: Jan Kara 
> Suggested-by: Jérôme Glisse 
> Signed-off-by: John Hubbard 
> ---
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 6588d2e02628..db872766480f 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1054,6 +1054,8 @@ static inline __must_check bool try_get_page(struct 
> page *page)
>   return true;
>  }
>  
> +__must_check bool user_page_ref_inc(struct page *page);
> +
>  static inline void put_page(struct page *page)
>  {
>   page = compound_head(page);
> @@ -1071,29 +1073,70 @@ static inline void put_page(struct page *page)
>   __put_page(page);
>  }
>  
> -/**
> - * put_user_page() - release a gup-pinned page
> - * @page:pointer to page to be released
> +/*
> + * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload
> + * the page's refcount so that two separate items are tracked: the original 
> page
> + * reference count, and also a new count of how many get_user_pages() calls 
> were
^^ pin_user_pages()

> + * made against the page. ("gup-pinned" is another term for the latter).
> + *
> + * With this scheme, get_user_pages() becomes special: such pages are marked
^^^ pin_user_pages()

> + * as distinct from normal pages. As such, the put_user_page() call (and its
> + * variants) must be used in order to release gup-pinned pages.
> + *
> + * Choice of value:
>   *
> - * Pages that were pinned via pin_user_pages*() must be released via either
> - * put_user_page(), or one of the put_user_pages*() routines. This is so that
> - * eventually such pages can be separately tracked and uniquely handled. In
> - * particular, interactions with RDMA and filesystems need special handling.
> + * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page 
> reference
> + * counts with respect to get_user_pages() and put_user_page() becomes 
> simpler,
^^^ pin_user_pages()

> + * due to the fact that adding an even power of two to the page refcount has
> + * the effect of using only the upper N bits, for the code that counts up 
> using
> + * the bias value. This means that the lower bits are left for the exclusive
> + * use of the original code that increments and decrements by one (or at 
> least,
> + * by much smaller values than the bias value).
>   *
> - * put_user_page() and put_page() are not interchangeable, despite this early
> - * implementation that makes them look the same. put_user_page() calls must
> - * be perfectly matched up with pin*() calls.
> + * Of course, once the lower bits overflow into the upper bits (and this is
> + * OK, because subtraction recovers the original values), then visual 
> inspection
> + * no longer suffices to directly view the separate counts. However, for 
> normal
> + * applications that don't have huge page reference counts, this won't be an
> + * issue.
> + *
> + * Locking: the lockless algorithm described in page_cache_get_speculative()
> + * and page_cache_gup_pin_speculative() provides safe operation for
> + * get_user_pages and page_mkclean and other calls that race to set up page
> + * table entries.
>   */
...
> @@ -2070,9 +2191,16 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, 
> unsigned long addr,
>   page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
>   refs = __record_subpages(page, addr, end, pages + *nr);
>  
> - head = try_get_compound_head(head, refs);
> - if (!head)
> - return 0;
> + if (flags & FOLL_PIN) {
> + head = page;
> + if (unlikely(!user_page_ref_inc(head)))
> + return 0;
> + head = page;

Why do you assign 'head' twice? Also the refcounting logic is repeated
several times so perhaps you can factor it out in to a helper f

Re: [PATCH v4 02/23] mm/gup: factor out duplicate code from four routines

2019-11-13 Thread Jan Kara
On Tue 12-11-19 20:26:49, John Hubbard wrote:
> There are four locations in gup.c that have a fair amount of code
> duplication. This means that changing one requires making the same
> changes in four places, not to mention reading the same code four
> times, and wondering if there are subtle differences.
> 
> Factor out the common code into static functions, thus reducing the
> overall line count and the code's complexity.
> 
> Also, take the opportunity to slightly improve the efficiency of the
> error cases, by doing a mass subtraction of the refcount, surrounded
> by get_page()/put_page().
> 
> Also, further simplify (slightly), by waiting until the successful
> end of each routine, to increment *nr.
> 
> Reviewed-by: Jérôme Glisse 
> Cc: Ira Weiny 
> Cc: Christoph Hellwig 
> Cc: Aneesh Kumar K.V 
> Signed-off-by: John Hubbard 

> diff --git a/mm/gup.c b/mm/gup.c
> index 85caf76b3012..199da99e8ffc 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1969,6 +1969,34 @@ static int __gup_device_huge_pud(pud_t pud, pud_t 
> *pudp, unsigned long addr,
>  }
>  #endif
>  
> +static int __record_subpages(struct page *page, unsigned long addr,
> +  unsigned long end, struct page **pages, int nr)
> +{
> + int nr_recorded_pages = 0;
> +
> + do {
> + pages[nr] = page;
> + nr++;
> + page++;
> + nr_recorded_pages++;
> + } while (addr += PAGE_SIZE, addr != end);
> + return nr_recorded_pages;
> +}

Why don't you pass in already pages + nr?

> +
> +static void put_compound_head(struct page *page, int refs)
> +{
> + /* Do a get_page() first, in case refs == page->_refcount */
> + get_page(page);
> + page_ref_sub(page, refs);
> + put_page(page);
> +}
> +
> +static void __huge_pt_done(struct page *head, int nr_recorded_pages, int *nr)
> +{
> + *nr += nr_recorded_pages;
> + SetPageReferenced(head);
> +}

I don't find this last helper very useful. It seems to muddy water more
than necessary...

Other than that the cleanup looks nice to me.

Honza
-- 
Jan Kara 
SUSE Labs, CR
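Taken together, the two comments above point at the shape the v5 version of this patch (quoted earlier in this archive) ends up with: __record_subpages() receives pages + *nr directly, __huge_pt_done() disappears, and each call site reads (excerpt from the gup_hugepte() variant; the pmd/pud cases are analogous):

	refs = __record_subpages(page, addr, end, pages + *nr);

	head = try_get_compound_head(head, refs);
	if (!head)
		return 0;

	if (unlikely(pte_val(pte) != pte_val(*ptep))) {
		put_compound_head(head, refs);
		return 0;
	}

	*nr += refs;
	SetPageReferenced(head);
	return 1;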

Re: [PATCH v4 03/23] mm/gup: move try_get_compound_head() to top, fix minor issues

2019-11-13 Thread Jan Kara
On Tue 12-11-19 20:26:50, John Hubbard wrote:
> An upcoming patch uses try_get_compound_head() more widely,
> so move it to the top of gup.c.
> 
> Also fix a tiny spelling error and a checkpatch.pl warning.
> 
> Signed-off-by: John Hubbard 

Looks good. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  mm/gup.c | 29 +++--
>  1 file changed, 15 insertions(+), 14 deletions(-)
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index 199da99e8ffc..933524de6249 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -29,6 +29,21 @@ struct follow_page_context {
>   unsigned int page_mask;
>  };
>  
> +/*
> + * Return the compound head page with ref appropriately incremented,
> + * or NULL if that failed.
> + */
> +static inline struct page *try_get_compound_head(struct page *page, int refs)
> +{
> + struct page *head = compound_head(page);
> +
> + if (WARN_ON_ONCE(page_ref_count(head) < 0))
> + return NULL;
> + if (unlikely(!page_cache_add_speculative(head, refs)))
> + return NULL;
> + return head;
> +}
> +
>  /**
>   * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned 
> pages
>   * @pages:  array of pages to be maybe marked dirty, and definitely released.
> @@ -1793,20 +1808,6 @@ static void __maybe_unused undo_dev_pagemap(int *nr, 
> int nr_start,
>   }
>  }
>  
> -/*
> - * Return the compund head page with ref appropriately incremented,
> - * or NULL if that failed.
> - */
> -static inline struct page *try_get_compound_head(struct page *page, int refs)
> -{
> - struct page *head = compound_head(page);
> - if (WARN_ON_ONCE(page_ref_count(head) < 0))
> - return NULL;
> - if (unlikely(!page_cache_add_speculative(head, refs)))
> - return NULL;
> - return head;
> -}
> -
>  #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
>  static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
>unsigned int flags, struct page **pages, int *nr)
> -- 
> 2.24.0
> 
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH v4 01/23] mm/gup: pass flags arg to __gup_device_* functions

2019-11-13 Thread Jan Kara
On Tue 12-11-19 20:26:48, John Hubbard wrote:
> A subsequent patch requires access to gup flags, so
> pass the flags argument through to the __gup_device_*
> functions.
> 
> Also placate checkpatch.pl by shortening a nearby line.
> 
> Reviewed-by: Jérôme Glisse 
> Reviewed-by: Ira Weiny 
> Cc: Kirill A. Shutemov 
> Signed-off-by: John Hubbard 

Looks good! You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  mm/gup.c | 28 ++--
>  1 file changed, 18 insertions(+), 10 deletions(-)
> 
> diff --git a/mm/gup.c b/mm/gup.c
> index 8f236a335ae9..85caf76b3012 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -1890,7 +1890,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, 
> unsigned long end,
>  
>  #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && 
> defined(CONFIG_TRANSPARENT_HUGEPAGE)
>  static int __gup_device_huge(unsigned long pfn, unsigned long addr,
> - unsigned long end, struct page **pages, int *nr)
> +  unsigned long end, unsigned int flags,
> +  struct page **pages, int *nr)
>  {
>   int nr_start = *nr;
>   struct dev_pagemap *pgmap = NULL;
> @@ -1916,13 +1917,14 @@ static int __gup_device_huge(unsigned long pfn, 
> unsigned long addr,
>  }
>  
>  static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
> - unsigned long end, struct page **pages, int *nr)
> +  unsigned long end, unsigned int flags,
> +  struct page **pages, int *nr)
>  {
>   unsigned long fault_pfn;
>   int nr_start = *nr;
>  
>   fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
> - if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
> + if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
>   return 0;
>  
>   if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
> @@ -1933,13 +1935,14 @@ static int __gup_device_huge_pmd(pmd_t orig, pmd_t 
> *pmdp, unsigned long addr,
>  }
>  
>  static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
> - unsigned long end, struct page **pages, int *nr)
> +  unsigned long end, unsigned int flags,
> +  struct page **pages, int *nr)
>  {
>   unsigned long fault_pfn;
>   int nr_start = *nr;
>  
>   fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
> - if (!__gup_device_huge(fault_pfn, addr, end, pages, nr))
> + if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
>   return 0;
>  
>   if (unlikely(pud_val(orig) != pud_val(*pudp))) {
> @@ -1950,14 +1953,16 @@ static int __gup_device_huge_pud(pud_t orig, pud_t 
> *pudp, unsigned long addr,
>  }
>  #else
>  static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
> - unsigned long end, struct page **pages, int *nr)
> +  unsigned long end, unsigned int flags,
> +  struct page **pages, int *nr)
>  {
>   BUILD_BUG();
>   return 0;
>  }
>  
>  static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
> - unsigned long end, struct page **pages, int *nr)
> +  unsigned long end, unsigned int flags,
> +  struct page **pages, int *nr)
>  {
>   BUILD_BUG();
>   return 0;
> @@ -2062,7 +2067,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, 
> unsigned long addr,
>   if (pmd_devmap(orig)) {
>   if (unlikely(flags & FOLL_LONGTERM))
>   return 0;
> - return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
> + return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
> +  pages, nr);
>   }
>  
>   refs = 0;
> @@ -2092,7 +2098,8 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, 
> unsigned long addr,
>  }
>  
>  static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
> - unsigned long end, unsigned int flags, struct page **pages, int 
> *nr)
> + unsigned long end, unsigned int flags,
> + struct page **pages, int *nr)
>  {
>   struct page *head, *page;
>   int refs;
> @@ -2103,7 +2110,8 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, 
> unsigned long addr,
>   if (pud_devmap(orig)) {
>   if (unlikely(flags & FOLL_LONGTERM))
>   re

Re: [PATCH v4 04/23] mm: devmap: refactor 1-based refcounting for ZONE_DEVICE pages

2019-11-13 Thread Jan Kara
On Tue 12-11-19 20:26:51, John Hubbard wrote:
> An upcoming patch changes and complicates the refcounting and
> especially the "put page" aspects of it. In order to keep
> everything clean, refactor the devmap page release routines:
> 
> * Rename put_devmap_managed_page() to page_is_devmap_managed(),
>   and limit the functionality to "read only": return a bool,
>   with no side effects.
> 
> * Add a new routine, put_devmap_managed_page(), to handle checking
>   what kind of page it is, and what kind of refcount handling it
>   requires.
> 
> * Rename __put_devmap_managed_page() to free_devmap_managed_page(),
>   and limit the functionality to unconditionally freeing a devmap
>   page.
> 
> This is originally based on a separate patch by Ira Weiny, which
> applied to an early version of the put_user_page() experiments.
> Since then, Jérôme Glisse suggested the refactoring described above.
> 
> Suggested-by: Jérôme Glisse 
> Signed-off-by: Ira Weiny 
> Signed-off-by: John Hubbard 

Looks good to me. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  include/linux/mm.h | 27 ---
>  mm/memremap.c  | 67 --
>  2 files changed, 53 insertions(+), 41 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index a2adf95b3f9c..96228376139c 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -967,9 +967,10 @@ static inline bool is_zone_device_page(const struct page 
> *page)
>  #endif
>  
>  #ifdef CONFIG_DEV_PAGEMAP_OPS
> -void __put_devmap_managed_page(struct page *page);
> +void free_devmap_managed_page(struct page *page);
>  DECLARE_STATIC_KEY_FALSE(devmap_managed_key);
> -static inline bool put_devmap_managed_page(struct page *page)
> +
> +static inline bool page_is_devmap_managed(struct page *page)
>  {
> if (!static_branch_unlikely(&devmap_managed_key))
>   return false;
> @@ -978,7 +979,6 @@ static inline bool put_devmap_managed_page(struct page 
> *page)
>   switch (page->pgmap->type) {
>   case MEMORY_DEVICE_PRIVATE:
>   case MEMORY_DEVICE_FS_DAX:
> - __put_devmap_managed_page(page);
>   return true;
>   default:
>   break;
> @@ -986,6 +986,27 @@ static inline bool put_devmap_managed_page(struct page 
> *page)
>   return false;
>  }
>  
> +static inline bool put_devmap_managed_page(struct page *page)
> +{
> + bool is_devmap = page_is_devmap_managed(page);
> +
> + if (is_devmap) {
> + int count = page_ref_dec_return(page);
> +
> + /*
> +  * devmap page refcounts are 1-based, rather than 0-based: if
> +  * refcount is 1, then the page is free and the refcount is
> +  * stable because nobody holds a reference on the page.
> +  */
> + if (count == 1)
> + free_devmap_managed_page(page);
> + else if (!count)
> + __put_page(page);
> + }
> +
> + return is_devmap;
> +}
> +
>  #else /* CONFIG_DEV_PAGEMAP_OPS */
>  static inline bool put_devmap_managed_page(struct page *page)
>  {
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 03ccbdfeb697..bc7e2a27d025 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -410,48 +410,39 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
>  EXPORT_SYMBOL_GPL(get_dev_pagemap);
>  
>  #ifdef CONFIG_DEV_PAGEMAP_OPS
> -void __put_devmap_managed_page(struct page *page)
> +void free_devmap_managed_page(struct page *page)
>  {
> - int count = page_ref_dec_return(page);
> + /* Clear Active bit in case of parallel mark_page_accessed */
> + __ClearPageActive(page);
> + __ClearPageWaiters(page);
> +
> + mem_cgroup_uncharge(page);
>  
>   /*
> -  * If refcount is 1 then page is freed and refcount is stable as nobody
> -  * holds a reference on the page.
> +  * When a device_private page is freed, the page->mapping field
> +  * may still contain a (stale) mapping value. For example, the
> +  * lower bits of page->mapping may still identify the page as
> +  * an anonymous page. Ultimately, this entire field is just
> +  * stale and wrong, and it will cause errors if not cleared.
> +  * One example is:
> +  *
> +  *  migrate_vma_pages()
> +  *migrate_vma_insert_page()
> +  *  page_add_new_anon_rmap()
> +  *__page_set_anon_rmap()
> +  *  ...checks page->mapping, via PageAnon(page) call,
> +  *and inco

Re: [PATCH v4 09/23] mm/gup: introduce pin_user_pages*() and FOLL_PIN

2019-11-13 Thread Jan Kara
On Tue 12-11-19 20:26:56, John Hubbard wrote:
> Introduce pin_user_pages*() variations of get_user_pages*() calls,
> and also pin_longterm_pages*() variations.
> 
> These variants all set FOLL_PIN, which is also introduced, and
> thoroughly documented.
> 
> The pin_longterm*() variants also set FOLL_LONGTERM, in addition
> to FOLL_PIN:
> 
> pin_user_pages()
> pin_user_pages_remote()
> pin_user_pages_fast()
> 
> pin_longterm_pages()
> pin_longterm_pages_remote()
> pin_longterm_pages_fast()
> 
> All pages that are pinned via the above calls, must be unpinned via
> put_user_page().
> 
> The underlying rules are:
> 
> * These are gup-internal flags, so the call sites should not directly
> set FOLL_PIN nor FOLL_LONGTERM. That behavior is enforced with
> assertions, for the new FOLL_PIN flag. However, for the pre-existing
> FOLL_LONGTERM flag, which has some call sites that still directly
> set FOLL_LONGTERM, there is no assertion yet.
> 
> * Call sites that want to indicate that they are going to do DirectIO
>   ("DIO") or something with similar characteristics, should call a
>   get_user_pages()-like wrapper call that sets FOLL_PIN. These wrappers
>   will:
> * Start with "pin_user_pages" instead of "get_user_pages". That
>   makes it easy to find and audit the call sites.
>     * Set FOLL_PIN
> 
> * For pages that are received via FOLL_PIN, those pages must be returned
>   via put_user_page().
> 
> Thanks to Jan Kara and Vlastimil Babka for explaining the 4 cases
> in this documentation. (I've reworded it and expanded upon it.)
> 
> Reviewed-by: Mike Rapoport   # Documentation
> Reviewed-by: Jérôme Glisse 
> Cc: Jonathan Corbet 
> Cc: Ira Weiny 
> Signed-off-by: John Hubbard 

Thanks for the documentation. It looks great!

> diff --git a/mm/gup.c b/mm/gup.c
> index 83702b2e86c8..4409e84dff51 100644
> --- a/mm/gup.c
> +++ b/mm/gup.c
> @@ -201,6 +201,10 @@ static struct page *follow_page_pte(struct 
> vm_area_struct *vma,
>   spinlock_t *ptl;
>   pte_t *ptep, pte;
>  
> + /* FOLL_GET and FOLL_PIN are mutually exclusive. */
> + if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
> +  (FOLL_PIN | FOLL_GET)))
> + return ERR_PTR(-EINVAL);
>  retry:
>   if (unlikely(pmd_bad(*pmd)))
>   return no_page_table(vma, flags);

How does FOLL_PIN result in grabbing a (at least normal, for now) page
reference? I didn't find that anywhere in this patch, but it is a prerequisite
to converting any user to the pin_user_pages() interface, right?

> +/**
> + * pin_user_pages_fast() - pin user pages in memory without taking locks
> + *
> + * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
> + * get_user_pages_fast() for documentation on the function arguments, because
> + * the arguments here are identical.
> + *
> + * FOLL_PIN means that the pages must be released via put_user_page(). Please
> + * see Documentation/vm/pin_user_pages.rst for further details.
> + *
> + * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. 
> It
> + * is NOT intended for Case 2 (RDMA: long-term pins).
> + */
> +int pin_user_pages_fast(unsigned long start, int nr_pages,
> + unsigned int gup_flags, struct page **pages)
> +{
> + /* FOLL_GET and FOLL_PIN are mutually exclusive. */
> + if (WARN_ON_ONCE(gup_flags & FOLL_GET))
> + return -EINVAL;
> +
> + gup_flags |= FOLL_PIN;
> + return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
> +}
> +EXPORT_SYMBOL_GPL(pin_user_pages_fast);

I was somewhat wondering about the number of functions you add here. So we
have:

pin_user_pages()
pin_user_pages_fast()
pin_user_pages_remote()

and then longterm variants:

pin_longterm_pages()
pin_longterm_pages_fast()
pin_longterm_pages_remote()

and obviously we have gup like:
get_user_pages()
get_user_pages_fast()
get_user_pages_remote()
... and some other gup variants ...

I think we really should have pin_* vs get_* variants as they are very
different in terms of guarantees and after conversion, any use of get_*
variant in non-mm code should be closely scrutinized. OTOH pin_longterm_*
don't look *that* useful to me and just using pin_* instead with
FOLL_LONGTERM flag would look OK to me and somewhat reduce the number of
functions which is already large enough? What do people think? I don't feel
too strongly about this but wanted to bring this up.

Honza



-- 
Jan Kara 
SUSE Labs, CR
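Concretely, dropping the pin_longterm_*() family in favour of a flag would leave call sites looking like this sketch (hypothetical caller, not from the patch):

	/* Long-term pin under the flag-based scheme Jan suggests: */
	ret = pin_user_pages_fast(start, nr_pages,
				  FOLL_WRITE | FOLL_LONGTERM, pages);
	if (ret < 0)
		return ret;
	/* ... use pages[]; release each one via put_user_page() ... */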

Re: [PATCH v3 00/23] mm/gup: track dma-pinned pages: FOLL_PIN, FOLL_LONGTERM

2019-11-13 Thread Jan Kara
On Wed 13-11-19 01:02:02, John Hubbard wrote:
> On 11/13/19 12:22 AM, Daniel Vetter wrote:
> ...
> > > > Why are we doing this? I think things got confused here someplace, as
> > > 
> > > 
> > > Because:
> > > 
> > > a) These need put_page() calls,  and
> > > 
> > > b) there is no put_pages() call, but there is a release_pages() call that
> > > is, arguably, what put_pages() would be.
> > > 
> > > 
> > > > the comment still says:
> > > > 
> > > > /**
> > > >   * put_user_page() - release a gup-pinned page
> > > >   * @page:pointer to page to be released
> > > >   *
> > > >   * Pages that were pinned via get_user_pages*() must be released via
> > > >   * either put_user_page(), or one of the put_user_pages*() routines
> > > >   * below.
> > > 
> > > 
> > > Ohhh, I missed those comments. They need to all be changed over to
> > > say "pages that were pinned via pin_user_pages*() or
> > > pin_longterm_pages*() must be released via put_user_page*()."
> > > 
> > > The get_user_pages*() pages must still be released via put_page.
> > > 
> > > The churn is due to a fairly significant change in strategy, which
> > > is: instead of changing all get_user_pages*() sites to call
> > > put_user_page(), change selected sites to call pin_user_pages*() or
> > > pin_longterm_pages*(), plus put_user_page().
> > 
> > Can't we call this unpin_user_page then, for some symmetry? Or is that
> > even more churn?
> > 
> > Looking from afar the naming here seems really confusing.
> 
> 
> That look from afar is valuable, because I'm too close to the problem to see
> how the naming looks. :)
> 
> unpin_user_page() sounds symmetrical. It's true that it would cause more
> churn (which is why I started off with a proposal that avoids changing the
> names of put_user_page*() APIs). But OTOH, the amount of churn is proportional
> to the change in direction here, and it's really only 10 or 20 lines changed,
> in the end.
> 
> So I'm open to changing to that naming. It would be nice to hear what others
> prefer, too...

FWIW I'd find unpin_user_page() also better than put_user_page() as a
counterpart to pin_user_pages().

Honza
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH 00/34] put_user_pages(): miscellaneous call sites

2019-08-09 Thread Jan Kara
On Thu 08-08-19 16:25:04, Weiny, Ira wrote:
> > I thought I'd caught things early enough to get away with the
> > rename and deletion of that. You could either:
> > 
> > a) open code an implementation of vaddr_put_pages_dirty_lock() that
> > doesn't call any of the *put_user_pages_dirty*() variants, or
> > 
> > b) include my first patch ("") as part of your series, or
> > 
> > c) base this on Andrews's tree, which already has merged in my first patch.
> > 
> 
> Yep I can do this.  I did not realize that Andrew had accepted any of
> this work.  I'll check out his tree.  But I don't think he is going to
> accept this series through his tree.  So what is the ETA on that landing
> in Linus' tree?
> 
> To that point I'm still not sure who would take all this as I am now
> touching mm, procfs, rdma, ext4, and xfs.

MM tree would be one candidate for routing but there are other options that
would make sense as well - Dan's tree, VFS tree, or even I can pick up the
patches to my tree if needed. But let's worry about the routing after we
have working and reviewed patches...

Honza
-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH 00/34] put_user_pages(): miscellaneous call sites

2019-08-09 Thread Jan Kara
On Wed 07-08-19 19:36:37, Ira Weiny wrote:
> On Wed, Aug 07, 2019 at 10:46:49AM +0200, Michal Hocko wrote:
> > > So I think your debug option and my suggested renaming serve a bit
> > > different purposes (and thus both make sense). If you do the renaming, you
> > > can just grep to see unconverted sites. Also when someone merges new GUP
> > > user (unaware of the new rules) while you switch GUP to use pins instead 
> > > of
> > > ordinary references, you'll get compilation error in case of renaming
> > > instead of hard to debug refcount leak without the renaming. And such
> > > conflict is almost bound to happen given the size of GUP patch set... Also
> > > the renaming serves against the "coding inertia" - i.e., GUP is around for
> > > ages so people just use it without checking any documentation or comments.
> > > After switching how GUP works, what used to be correct isn't anymore so
> > > renaming the function serves as a warning that something has really
> > > changed.
> > 
> > Fully agreed!
> 
> OK. Prior to this I've been basing all my work for the RDMA/FS DAX stuff on
> John's put_user_pages()...  (including when I proposed failing truncate with a
> lease in June [1])
> 
> However, based on the suggestions in that thread it became clear that a new
> interface was going to need to be added to pass in the "RDMA file" information
> to GUP to associate file pins with the correct processes...
> 
> I have many drawings on my white board with "a whole lot of lines" on them to
> make sure that if a process opens a file, mmaps it, pins it with RDMA, 
> _closes_
> it, and unmaps it, the resulting file pin can still be traced back to the
> RDMA context and all the processes which may have access to it, no matter
> where the original context may have come from.  I believe I have accomplished
> that.
> 
> Before I go on, I would like to say that the "imbalance" of get_user_pages()
> and put_page() bothers me from a purist standpoint...  However, since this
> discussion cropped up I went ahead and ported my work to Linus' current master
> (5.3-rc3+) and in doing so I only had to steal a bit of John's code...  Sorry
> John...  :-(
> 
> I don't have the commit messages all cleaned up and I know there may be some
> discussion on these new interfaces but I wanted to throw this series out there
> because I think it may be what Jan and Michal are driving at (or at least a
> step in that direction).
> 
> Right now only RDMA and DAX FS's are supported.  Other users of GUP will still
> fail on a DAX file and regular files will still be at risk.[2]
> 
> I've pushed this work (based 5.3-rc3+ (33920f1ec5bf)) here[3]:
> 
> https://github.com/weiny2/linux-kernel/tree/linus-rdmafsdax-b0-v3
> 
> I think the most relevant patch to this conversation is:
> 
> https://github.com/weiny2/linux-kernel/commit/5d377653ba5cf11c3b716f904b057bee6641aaf6
> 
> I stole Jan's suggestion for a name, as the name I used while prototyping was
> pretty bad...  So thanks, Jan...  ;-)

For your function, I'd choose a name like vaddr_pin_leased_pages() so that the
association with a lease is clear from the name :) Also I'd choose the
counterpart to be vaddr_unpin_leased_page[s](). Especially having put_page in
the name looks confusing to me...

Honza

-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH 00/34] put_user_pages(): miscellaneous call sites

2019-08-07 Thread Jan Kara
On Fri 02-08-19 12:14:09, John Hubbard wrote:
> On 8/2/19 7:52 AM, Jan Kara wrote:
> > On Fri 02-08-19 07:24:43, Matthew Wilcox wrote:
> > > On Fri, Aug 02, 2019 at 02:41:46PM +0200, Jan Kara wrote:
> > > > On Fri 02-08-19 11:12:44, Michal Hocko wrote:
> > > > > On Thu 01-08-19 19:19:31, john.hubb...@gmail.com wrote:
> > > > > [...]
> > > > > > 2) Convert all of the call sites for get_user_pages*(), to
> > > > > > invoke put_user_page*(), instead of put_page(). This involves 
> > > > > > dozens of
> > > > > > call sites, and will take some time.
> > > > > 
> > > > > How do we make sure this is the case and it will remain the case in 
> > > > > the
> > > > > future? There must be some automagic to enforce/check that. It is 
> > > > > simply
> > > > > not manageable to do it every now and then because then 3) will simply
> > > > > be never safe.
> > > > > 
> > > > > Have you considered coccinelle or some other scripted way to do the
> > > > > transition? I have no idea how to deal with future changes that would
> > > > > break the balance though.
> 
> Hi Michal,
> 
> Yes, I've thought about it, and coccinelle falls a bit short (it's not smart
> enough to know which put_page()'s to convert). However, there is a debug
> option planned: a yet-to-be-posted commit [1] uses struct page extensions
> (obviously protected by CONFIG_DEBUG_GET_USER_PAGES_REFERENCES) to add
> a redundant counter. That allows:
> 
> void __put_page(struct page *page)
> {
>   ...
>   /* Someone called put_page() instead of put_user_page() */
> WARN_ON_ONCE(atomic_read(&page_ext->pin_count) > 0);
> 
> > > > 
> > > > Yeah, that's why I've been suggesting at LSF/MM that we may need to 
> > > > create
> > > > a gup wrapper - say vaddr_pin_pages() - and track which sites dropping
> > > > references got converted by using this wrapper instead of gup. The
> > > > counterpart would then be more logically named as unpin_page() or 
> > > > whatever
> > > > instead of put_user_page().  Sure this is not completely foolproof (you 
> > > > can
> > > > create new callsite using vaddr_pin_pages() and then just drop refs 
> > > > using
> > > > put_page()) but I suppose it would be a high enough barrier for missed
> > > > conversions... Thoughts?
> 
> The debug option above is still a bit simplistic in its implementation
> (and maybe not taking full advantage of the data it has), but I think
> it's preferable, because it monitors the "core" and WARNs.
> 
> Instead of the wrapper, I'm thinking: documentation and the passage of
> time, plus the debug option (perhaps enhanced--probably once I post it
> someone will notice opportunities), yes?

So I think your debug option and my suggested renaming serve a bit
different purposes (and thus both make sense). If you do the renaming, you
can just grep to see unconverted sites. Also when someone merges new GUP
user (unaware of the new rules) while you switch GUP to use pins instead of
ordinary references, you'll get compilation error in case of renaming
instead of hard to debug refcount leak without the renaming. And such
conflict is almost bound to happen given the size of GUP patch set... Also
the renaming serves against the "coding inertia" - i.e., GUP is around for
ages so people just use it without checking any documentation or comments.
After switching how GUP works, what used to be correct isn't anymore so
renaming the function serves as a warning that something has really
changed.

Your refcount debug patches are good to catch bugs in the conversions done
but that requires you to be able to exercise the code path in the first
place which may require particular HW or so, and you also have to enable
the debug option which means you already aim at verifying the GUP
references are treated properly.

Honza

-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH 00/34] put_user_pages(): miscellaneous call sites

2019-08-02 Thread Jan Kara
On Fri 02-08-19 07:24:43, Matthew Wilcox wrote:
> On Fri, Aug 02, 2019 at 02:41:46PM +0200, Jan Kara wrote:
> > On Fri 02-08-19 11:12:44, Michal Hocko wrote:
> > > On Thu 01-08-19 19:19:31, john.hubb...@gmail.com wrote:
> > > [...]
> > > > 2) Convert all of the call sites for get_user_pages*(), to
> > > > invoke put_user_page*(), instead of put_page(). This involves dozens of
> > > > call sites, and will take some time.
> > > 
> > > How do we make sure this is the case and it will remain the case in the
> > > future? There must be some automagic to enforce/check that. It is simply
> > > not manageable to do it every now and then because then 3) will simply
> > > be never safe.
> > > 
> > > Have you considered coccinelle or some other scripted way to do the
> > > transition? I have no idea how to deal with future changes that would
> > > break the balance though.
> > 
> > Yeah, that's why I've been suggesting at LSF/MM that we may need to create
> > a gup wrapper - say vaddr_pin_pages() - and track which sites dropping
> > references got converted by using this wrapper instead of gup. The
> > counterpart would then be more logically named as unpin_page() or whatever
> > instead of put_user_page().  Sure this is not completely foolproof (you can
> > create new callsite using vaddr_pin_pages() and then just drop refs using
> > put_page()) but I suppose it would be a high enough barrier for missed
> > conversions... Thoughts?
> 
> I think the API we really need is get_user_bvec() / put_user_bvec(),
> and I know Christoph has been putting some work into that.  That avoids
> doing refcount operations on hundreds of pages if the page in question is
> a huge page.  Once people are switched over to that, they won't be tempted
> to manually call put_page() on the individual constituent pages of a bvec.

Well, get_user_bvec() is certainly a good API for one class of users but
just looking at the above series, you'll see there are *many* places that
just don't work with bvecs at all and you need something for those.

Honza
-- 
Jan Kara 
SUSE Labs, CR
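Nothing like this exists in the tree at this point in the thread; purely as a hypothetical sketch of the interface Matthew describes (the names come from his message, the signatures are invented here):

	/* HYPOTHETICAL: not a real kernel API at the time of this thread. */
	int get_user_bvec(unsigned long start, size_t len,
			  unsigned int gup_flags, struct bio_vec *bv);
	void put_user_bvec(struct bio_vec *bv);

The attraction is that a huge page maps to a single bio_vec entry, so pinning and releasing it costs one refcount operation instead of one per constituent page.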

Re: [PATCH 00/34] put_user_pages(): miscellaneous call sites

2019-08-02 Thread Jan Kara
On Fri 02-08-19 11:12:44, Michal Hocko wrote:
> On Thu 01-08-19 19:19:31, john.hubb...@gmail.com wrote:
> [...]
> > 2) Convert all of the call sites for get_user_pages*(), to
> > invoke put_user_page*(), instead of put_page(). This involves dozens of
> > call sites, and will take some time.
> 
> How do we make sure this is the case and it will remain the case in the
> future? There must be some automagic to enforce/check that. It is simply
> not manageable to do it every now and then because then 3) will simply
> be never safe.
> 
> Have you considered coccinelle or some other scripted way to do the
> transition? I have no idea how to deal with future changes that would
> break the balance though.

Yeah, that's why I've been suggesting at LSF/MM that we may need to create
a gup wrapper - say vaddr_pin_pages() - and track which sites dropping
references got converted by using this wrapper instead of gup. The
counterpart would then be more logically named as unpin_page() or whatever
instead of put_user_page().  Sure this is not completely foolproof (you can
create new callsite using vaddr_pin_pages() and then just drop refs using
put_page()) but I suppose it would be a high enough barrier for missed
conversions... Thoughts?

    Honza

-- 
Jan Kara 
SUSE Labs, CR

Re: [PATCH v4 0/9] mmu notifier provide context informations

2019-02-01 Thread Jan Kara
On Thu 31-01-19 11:10:06, Jerome Glisse wrote:
> 
> Andrew, what is your plan for this? I had a discussion with Peter Xu
> and Andrea about change_pte() and kvm. Today the change_pte() kvm
> optimization is effectively disabled because of invalidate_range
> calls. With a minimal couple-line patch on top of this patchset
> we can bring back the kvm change_pte optimization, and we can also
> optimize some other cases, like for instance when write protecting
> after fork (but I am not sure this is something qemu does often, so
> it might not help for real kvm workloads).
> 
> I will be posting the extra patch as an RFC, but in the meantime
> I wanted to know what the status for this is.
> 
> Jan, Christian, does your previous ACK still hold for this?

Yes, I still think the approach makes sense. Dan's concern about in-tree
users is valid but it seems you have those just not merged yet, right?

        Honza

-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v2 1/3] mm/mmu_notifier: use structure for invalidate_range_start/end callback

2018-12-05 Thread Jan Kara
On Wed 05-12-18 11:40:52, Jerome Glisse wrote:
> On Wed, Dec 05, 2018 at 05:35:20PM +0100, Jan Kara wrote:
> > On Wed 05-12-18 00:36:26, jgli...@redhat.com wrote:
> > > diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> > > index 5119ff846769..5f6665ae3ee2 100644
> > > --- a/mm/mmu_notifier.c
> > > +++ b/mm/mmu_notifier.c
> > > @@ -178,14 +178,20 @@ int __mmu_notifier_invalidate_range_start(struct 
> > > mm_struct *mm,
> > > unsigned long start, unsigned long end,
> > > bool blockable)
> > >  {
> > > + struct mmu_notifier_range _range, *range = &_range;
> > 
> > Why these games with two variables?
> 
> This is a temporary step; I decided to do the conversion in 2 steps.
> First I convert the callbacks to use the structure, so that people
> having an mmu notifier callback only have to review this patch and do
> not get distracted by the second step, which updates all the mm call
> sites that trigger invalidation.
> 
> In the final result this code disappears. I did it that way to make
> the thing more reviewable. Sorry if that is a bit confusing.

Aha, right. Thanks for clarification. You can add:

Acked-by: Jan Kara 

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH v2 2/3] mm/mmu_notifier: use structure for invalidate_range_start/end calls v2

2018-12-05 Thread Jan Kara
On Wed 05-12-18 00:36:27, jgli...@redhat.com wrote:
> From: Jérôme Glisse 
> 
> To avoid having to change many call sites every time we want to add a
> parameter, use a structure to group all parameters for the mmu_notifier
> invalidate_range_start/end calls. No functional changes with this
> patch.
> 
> Changes since v1:
> - introduce mmu_notifier_range_init() as an helper to initialize
>   the range structure allowing to optimize out the case when mmu
>   notifier is not enabled
> - fix mm/migrate.c migrate_vma_collect()
> 
> Signed-off-by: Jérôme Glisse 
> Acked-by: Christian König 

The patch looks good to me. You can add:

Acked-by: Jan Kara 

Honza


> Cc: Andrew Morton 
> Cc: Matthew Wilcox 
> Cc: Ross Zwisler 
> Cc: Jan Kara 
> Cc: Dan Williams 
> Cc: Paolo Bonzini 
> Cc: Radim Krčmář 
> Cc: Michal Hocko 
> Cc: Felix Kuehling 
> Cc: Ralph Campbell 
> Cc: John Hubbard 
> Cc: k...@vger.kernel.org
> Cc: dri-devel@lists.freedesktop.org
> Cc: linux-r...@vger.kernel.org
> Cc: linux-fsde...@vger.kernel.org
> ---
>  fs/dax.c |  8 +--
>  fs/proc/task_mmu.c   |  7 ++-
>  include/linux/mm.h   |  4 +-
>  include/linux/mmu_notifier.h | 87 +---
>  kernel/events/uprobes.c  | 10 ++--
>  mm/huge_memory.c | 54 ++--
>  mm/hugetlb.c | 52 ++-
>  mm/khugepaged.c  | 10 ++--
>  mm/ksm.c | 21 
>  mm/madvise.c | 21 
>  mm/memory.c  | 97 ++--
>  mm/migrate.c | 25 +-
>  mm/mmu_notifier.c| 35 +++--
>  mm/mprotect.c| 15 +++---
>  mm/mremap.c  | 10 ++--
>  mm/oom_kill.c| 17 ---
>  mm/rmap.c| 30 ++-
>  17 files changed, 258 insertions(+), 245 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 9bcce89ea18e..874085bacaf5 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -758,7 +758,8 @@ static void dax_entry_mkclean(struct address_space 
> *mapping, pgoff_t index,
>  
>   i_mmap_lock_read(mapping);
> vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
> - unsigned long address, start, end;
> + struct mmu_notifier_range range;
> + unsigned long address;
>  
>   cond_resched();
>  
> @@ -772,7 +773,8 @@ static void dax_entry_mkclean(struct address_space 
> *mapping, pgoff_t index,
>* call mmu_notifier_invalidate_range_start() on our behalf
>* before taking any lock.
>*/
> - if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep,
> &pmdp, &ptl))
> + if (follow_pte_pmd(vma->vm_mm, address, &range,
> +&ptep, &pmdp, &ptl))
>   continue;
>  
>   /*
> @@ -814,7 +816,7 @@ static void dax_entry_mkclean(struct address_space 
> *mapping, pgoff_t index,
>   pte_unmap_unlock(ptep, ptl);
>   }
>  
> - mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
> + mmu_notifier_invalidate_range_end(&range);
>   }
>   i_mmap_unlock_read(mapping);
>  }
> diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
> index 47c3764c469b..b3ddceb003bc 100644
> --- a/fs/proc/task_mmu.c
> +++ b/fs/proc/task_mmu.c
> @@ -1096,6 +1096,7 @@ static ssize_t clear_refs_write(struct file *file, 
> const char __user *buf,
>   return -ESRCH;
>   mm = get_task_mm(task);
>   if (mm) {
> + struct mmu_notifier_range range;
>   struct clear_refs_private cp = {
>   .type = type,
>   };
> @@ -1139,11 +1140,13 @@ static ssize_t clear_refs_write(struct file *file, 
> const char __user *buf,
>   downgrade_write(&mm->mmap_sem);
>   break;
>   }
> - mmu_notifier_invalidate_range_start(mm, 0, -1);
> +
> + mmu_notifier_range_init(&range, mm, 0, -1UL);
> + mmu_notifier_invalidate_range_start(&range);
>   }
>   walk_page_range(0, mm->highest_vm_end, &clear_refs_walk);
>   if (type == CLEAR_REFS_SOFT_DIRTY)
> - mmu_notifier_invalidate_range_end(mm, 0, -1);
> + mmu_notifier_invalidate_range_end(&range);
>   tlb_finish_mmu(, 0, -1);
>   up_read(>mmap_sem);
>  out_mm:
> diff --git
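Distilled from the clear_refs_write() hunk above, every invalidation site under this scheme follows one pattern (a sketch, using the v2 four-argument init):

	struct mmu_notifier_range range;

	mmu_notifier_range_init(&range, mm, start, end);
	mmu_notifier_invalidate_range_start(&range);
	/* ... tear down page table entries in [start, end) ... */
	mmu_notifier_invalidate_range_end(&range);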

Re: [PATCH v2 1/3] mm/mmu_notifier: use structure for invalidate_range_start/end callback

2018-12-05 Thread Jan Kara
On Wed 05-12-18 00:36:26, jgli...@redhat.com wrote:
> diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
> index 5119ff846769..5f6665ae3ee2 100644
> --- a/mm/mmu_notifier.c
> +++ b/mm/mmu_notifier.c
> @@ -178,14 +178,20 @@ int __mmu_notifier_invalidate_range_start(struct 
> mm_struct *mm,
> unsigned long start, unsigned long end,
> bool blockable)
>  {
> + struct mmu_notifier_range _range, *range = &_range;

Why these games with two variables?

>   struct mmu_notifier *mn;
>   int ret = 0;
>   int id;
>  
> + range->blockable = blockable;
> + range->start = start;
> + range->end = end;
> + range->mm = mm;
> +

Use your init function for this?

>   id = srcu_read_lock(&srcu);
>   hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
>   if (mn->ops->invalidate_range_start) {
> - int _ret = mn->ops->invalidate_range_start(mn, mm, 
> start, end, blockable);
> + int _ret = mn->ops->invalidate_range_start(mn, range);
>   if (_ret) {
>   pr_info("%pS callback failed with %d in 
> %sblockable context.\n",
>   
> mn->ops->invalidate_range_start, _ret,
> @@ -205,9 +211,20 @@ void __mmu_notifier_invalidate_range_end(struct 
> mm_struct *mm,
>unsigned long end,
>bool only_end)
>  {
> + struct mmu_notifier_range _range, *range = &_range;
>   struct mmu_notifier *mn;
>   int id;
>  
> + /*
> +  * The end callback will never be called if the start callback
> +  * refused to go through because blockable was false, so here we
> +  * assume that we can block.
> +  */
> + range->blockable = true;
> + range->start = start;
> + range->end = end;
> + range->mm = mm;
> +

The same as above.

Otherwise the patch looks good to me.

Honza
-- 
Jan Kara 
SUSE Labs, CR
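For the two quoted hunks, "use your init function" would amount to something like this sketch (the v2 initializer takes mm/start/end, so blockable is still set by hand):

	struct mmu_notifier_range _range, *range = &_range;

	mmu_notifier_range_init(range, mm, start, end);
	range->blockable = blockable;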


Re: [PATCH 2/3] mm/mmu_notifier: use structure for invalidate_range_start/end calls

2018-12-05 Thread Jan Kara
On Wed 05-12-18 10:53:57, Jerome Glisse wrote:
> On Wed, Dec 05, 2018 at 12:04:16PM +0100, Jan Kara wrote:
> > Hi Jerome!
> > 
> > On Mon 03-12-18 15:18:16, jgli...@redhat.com wrote:
> > > From: Jérôme Glisse 
> > > 
> > > To avoid having to change many call sites every time we want to add a
> > > parameter, use a structure to group all parameters for the mmu_notifier
> > > invalidate_range_start/end calls. No functional changes with this
> > > patch.
> > 
> > Two suggestions for the patch below:
> > 
> > > @@ -772,7 +775,8 @@ static void dax_entry_mkclean(struct address_space 
> > > *mapping, pgoff_t index,
> > >* call mmu_notifier_invalidate_range_start() on our behalf
> > >* before taking any lock.
> > >*/
> > > - if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep,
> > > &pmdp, &ptl))
> > > + if (follow_pte_pmd(vma->vm_mm, address, &range,
> > > +&ptep, &pmdp, &ptl))
> > >   continue;
> > 
> > The change of follow_pte_pmd() arguments looks unexpected. Why should that
> > care about mmu notifier range? I see it may be convenient but it doesn't 
> > look
> > like a good API to me.
> 
> Sadly I do not see a way around that one. This is because fs/dax.c
> does the mmu_notifier_invalidate_range_end while follow_pte_pmd
> does the mmu_notifier_invalidate_range_start.

I see, so this is really a preexisting problem with follow_pte_pmd() having
an ugly interface. After some thought I think your patch actually slightly
improves the situation, so OK.

Honza
-- 
Jan Kara 
SUSE Labs, CR


Re: [PATCH 2/3] mm/mmu_notifier: use structure for invalidate_range_start/end calls

2018-12-05 Thread Jan Kara
Hi Jerome!

On Mon 03-12-18 15:18:16, jgli...@redhat.com wrote:
> From: Jérôme Glisse 
> 
> To avoid having to change many call sites every time we want to add a
> parameter, use a structure to group all parameters for the mmu_notifier
> invalidate_range_start/end calls. No functional changes with this
> patch.

Two suggestions for the patch below:

> @@ -772,7 +775,8 @@ static void dax_entry_mkclean(struct address_space 
> *mapping, pgoff_t index,
>* call mmu_notifier_invalidate_range_start() on our behalf
>* before taking any lock.
>*/
> - if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep,
> &pmdp, &ptl))
> + if (follow_pte_pmd(vma->vm_mm, address, &range,
> +&ptep, &pmdp, &ptl))
>   continue;

The change of follow_pte_pmd() arguments looks unexpected. Why should that
care about mmu notifier range? I see it may be convenient but it doesn't look
like a good API to me.

> @@ -1139,11 +1140,15 @@ static ssize_t clear_refs_write(struct file *file, 
> const char __user *buf,
>   downgrade_write(&mm->mmap_sem);
>   break;
>   }
> - mmu_notifier_invalidate_range_start(mm, 0, -1);
> +
> + range.start = 0;
> + range.end = -1UL;
> + range.mm = mm;
> + mmu_notifier_invalidate_range_start(&range);

Also how about providing initializer for struct mmu_notifier_range? Or
something like DECLARE_MMU_NOTIFIER_RANGE? That will make sure that
unused arguments for particular notification places have defined values and
also if you add another mandatory argument (like you do in your third
patch), you just add another argument to the initializer and that way
the compiler makes sure you haven't missed any place. Finally the code will
remain more compact that way (less lines needed to initialize the struct).

Honza
-- 
Jan Kara 
SUSE Labs, CR
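
A rough sketch of the initializer being suggested; the name follows the
DECLARE_MMU_NOTIFIER_RANGE idea above, and the exact form is an assumption
rather than the final in-tree API:

	/* hypothetical helper based on the suggestion above */
	#define DECLARE_MMU_NOTIFIER_RANGE(name, _mm, _start, _end)	\
		struct mmu_notifier_range name = {			\
			.mm		= (_mm),			\
			.start		= (_start),			\
			.end		= (_end),			\
			.blockable	= true,				\
		}

	/* the clear_refs_write() hunk quoted earlier would then become: */
	DECLARE_MMU_NOTIFIER_RANGE(range, mm, 0, -1UL);
	mmu_notifier_invalidate_range_start(&range);

With such a macro, adding another mandatory field later only means
extending the macro's argument list, and the compiler then flags every
declaration site that was not updated - which is exactly the point made
above.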


Re: SLAB_TYPESAFE_BY_RCU without constructors (was Re: [PATCH v4 13/17] khwasan: add hooks implementation)

2018-08-06 Thread Jan Kara
On Wed 01-08-18 10:46:35, Dmitry Vyukov wrote:
> I guess it would be useful to have such extensive comment for each
> SLAB_TYPESAFE_BY_RCU use explaining why it is needed and how all the
> tricky aspects are handled.
> 
> For example, the one in jbd2 is interesting because it memsets the
> whole object before freeing it into SLAB_TYPESAFE_BY_RCU slab:
> 
> memset(jh, JBD2_POISON_FREE, sizeof(*jh));
> kmem_cache_free(jbd2_journal_head_cache, jh);
> 
> I guess there are also tricky ways how it can all work in the end
> (type-stable state is only a byte, or we check for all possible
> combinations of being overwritten with JBD2_POISON_FREE). But at first
> sight it does look fishy.

The RCU access is used from a single place:

fs/jbd2/transaction.c: jbd2_write_access_granted()

There are also quite a few comments explaining why what it does is safe. The
overwrite by JBD2_POISON_FREE is much older than this RCU stuff (honestly I
didn't know about it until this moment) and has nothing to do with the
safety of the RCU access.

        Honza

-- 
Jan Kara 
SUSE Labs, CR
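
For background, a schematic sketch of the SLAB_TYPESAFE_BY_RCU pattern the
thread is debating; the object type, slot and helpers here are hypothetical,
but the shape (lockless lookup, reference grab, identity recheck) is what
jbd2_write_access_granted() relies on:

	/* objects keep their type across free/reallocate cycles */
	cache = kmem_cache_create("demo_cache", sizeof(struct demo_obj), 0,
				  SLAB_TYPESAFE_BY_RCU, NULL);

	rcu_read_lock();
	obj = rcu_dereference(slot);
	if (obj && refcount_inc_not_zero(&obj->ref)) {
		/*
		 * The object may have been freed and reused for another
		 * instance of the same type while we looked it up, so
		 * revalidate its identity before trusting its contents.
		 */
		if (rcu_dereference(slot) != obj) {
			demo_put(obj);
			obj = NULL;
		}
	}
	rcu_read_unlock();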


Re: [PATCH v4 1/5] mm: add mkwrite param to vm_insert_mixed()

2017-07-24 Thread Jan Kara
On Mon 24-07-17 09:23:57, Ross Zwisler wrote:
> On Mon, Jul 24, 2017 at 01:25:30PM +0200, Jan Kara wrote:
> > > @@ -1658,14 +1658,28 @@ static int insert_pfn(struct vm_area_struct *vma, 
> > > unsigned long addr,
> > >   if (!pte)
> > >   goto out;
> > >   retval = -EBUSY;
> > > - if (!pte_none(*pte))
> > > - goto out_unlock;
> > > + if (!pte_none(*pte)) {
> > > + if (mkwrite) {
> > > + if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
> > 
> > Is the WARN_ON_ONCE() really appropriate here? Your testcase with private
> > mappings has triggered this situation if I'm right...
> 
> Yep, I think this WARN_ON_ONCE() is correct.  The test with private mappings
> had collisions between read-only DAX mappings which were being faulted in via
> insert_pfn(), and read/write COW page cache mappings which were being faulted
> in by wp_page_copy().
> 
> I was hitting a false-positive warning when I had the WARN_ON_ONCE() in
> insert_pfn() outside of the mkwrite case, i.e.:
> 
>   if (!pte_none(*pte)) {
>   if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
>   goto out_unlock;
>   if (mkwrite) {
>   entry = *pte;
>   goto out_mkwrite;
>   } else
>   goto out_unlock;
>   }
> 
> This was triggering when one thread was faulting in a read-only DAX mapping
> when another thread had already faulted in a read-write COW page cache page.
> 
> The patches I sent out have the warning in the mkwrite case, which would mean
> that we were getting a fault for a read/write PTE in insert_pfn() and the PFN
> didn't match what was already in the PTE.
> 
> This can't ever happen in the private mapping case because we will never
> install a read/write PTE for normal storage, only for COW page cache pages.
> Essentially I don't think we should ever be able to hit this warning, and if
> we do I'd like to get the bug report so that I can track down how it was
> happening and make sure that it's safe.  It is in the mkwrite path of
> insert_pfn() which is currently only used by the DAX code.
> 
> Does that make sense to you, or would you recommend leaving it out?  (If so,
> why?)

Ah, OK, makes sense. So feel free to add:

Reviewed-by: Jan Kara <j...@suse.cz>

Honza
-- 
Jan Kara <j...@suse.com>
SUSE Labs, CR


Re: [PATCH v4 5/5] dax: move all DAX radix tree defs to fs/dax.c

2017-07-24 Thread Jan Kara
On Fri 21-07-17 16:39:55, Ross Zwisler wrote:
> Now that we no longer insert struct page pointers in DAX radix trees the
> page cache code no longer needs to know anything about DAX exceptional
> entries.  Move all the DAX exceptional entry definitions from dax.h to
> fs/dax.c.
> 
> Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> Suggested-by: Jan Kara <j...@suse.cz>

Looks good. You can add:

Reviewed-by: Jan Kara <j...@suse.cz>

Honza


> ---
>  fs/dax.c| 34 ++
>  include/linux/dax.h | 41 -
>  2 files changed, 34 insertions(+), 41 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 0e27d90..e7acc45 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -54,6 +54,40 @@ static int __init init_dax_wait_table(void)
>  }
>  fs_initcall(init_dax_wait_table);
>  
> +/*
> + * We use lowest available bit in exceptional entry for locking, one bit for
> + * the entry size (PMD) and two more to tell us if the entry is a zero page 
> or
> + * an empty entry that is just used for locking.  In total four special bits.
> + *
> + * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the 
> ZERO_PAGE
> + * and EMPTY bits aren't set the entry is a normal DAX entry with a 
> filesystem
> + * block allocation.
> + */
> +#define RADIX_DAX_SHIFT  (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
> +#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
> +#define RADIX_DAX_PMD(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 
> 1))
> +#define RADIX_DAX_ZERO_PAGE  (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
> +#define RADIX_DAX_EMPTY  (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 
> 3))
> +
> +static unsigned long dax_radix_sector(void *entry)
> +{
> + return (unsigned long)entry >> RADIX_DAX_SHIFT;
> +}
> +
> +static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
> +{
> + return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
> + ((unsigned long)sector << RADIX_DAX_SHIFT) |
> + RADIX_DAX_ENTRY_LOCK);
> +}
> +
> +static unsigned int dax_radix_order(void *entry)
> +{
> + if ((unsigned long)entry & RADIX_DAX_PMD)
> + return PMD_SHIFT - PAGE_SHIFT;
> + return 0;
> +}
> +
>  static int dax_is_pmd_entry(void *entry)
>  {
>   return (unsigned long)entry & RADIX_DAX_PMD;
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index afa99bb..d0e3272 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -88,33 +88,6 @@ void dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, 
> void *addr,
>   size_t size);
>  void dax_write_cache(struct dax_device *dax_dev, bool wc);
>  
> -/*
> - * We use lowest available bit in exceptional entry for locking, one bit for
> - * the entry size (PMD) and two more to tell us if the entry is a zero page 
> or
> - * an empty entry that is just used for locking.  In total four special bits.
> - *
> - * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the 
> ZERO_PAGE
> - * and EMPTY bits aren't set the entry is a normal DAX entry with a 
> filesystem
> - * block allocation.
> - */
> -#define RADIX_DAX_SHIFT  (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
> -#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
> -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
> -#define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
> -#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
> -
> -static inline unsigned long dax_radix_sector(void *entry)
> -{
> - return (unsigned long)entry >> RADIX_DAX_SHIFT;
> -}
> -
> -static inline void *dax_radix_locked_entry(sector_t sector, unsigned long 
> flags)
> -{
> - return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
> - ((unsigned long)sector << RADIX_DAX_SHIFT) |
> - RADIX_DAX_ENTRY_LOCK);
> -}
> -
>  ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
>   const struct iomap_ops *ops);
>  int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
> @@ -136,20 +109,6 @@ static inline int __dax_zero_page_range(struct 
> block_device *bdev,
>  }
>  #endif
>  
> -#ifdef CONFIG_FS_DAX_PMD
> -static inline unsigned int dax_radix_order(void *entry)
> -{
> - if ((unsigned long)entry & RADIX_DAX_PMD)
> - return PMD_SHIFT - PAGE_SHIFT;
> - return 0;
> -}
> -#else
> -static inline unsigned int dax_radix_order(void *entry)
> -{
> - return 0;
> -}
> -#endif
> -
>  static inline bool dax_mapping(struct address_space *mapping)
>  {
>   return mapping->host && IS_DAX(mapping->host);
> -- 
> 2.9.4
> 
-- 
Jan Kara <j...@suse.com>
SUSE Labs, CR
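
A small worked example of the encoding defined above, assuming
RADIX_TREE_EXCEPTIONAL_SHIFT is 2 (its value in kernels of this era),
which makes RADIX_DAX_SHIFT equal to 6:

	/* pack sector 8 into a locked, PMD-sized entry */
	void *entry = dax_radix_locked_entry(8, RADIX_DAX_PMD);

	/*
	 * entry == RADIX_TREE_EXCEPTIONAL_ENTRY | RADIX_DAX_ENTRY_LOCK |
	 *	    RADIX_DAX_PMD | (8UL << 6)
	 *
	 * and the accessors recover the pieces:
	 */
	WARN_ON(dax_radix_sector(entry) != 8);
	WARN_ON(dax_radix_order(entry) != PMD_SHIFT - PAGE_SHIFT);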


Re: [PATCH v4 4/5] dax: remove DAX code from page_cache_tree_insert()

2017-07-24 Thread Jan Kara
On Fri 21-07-17 16:39:54, Ross Zwisler wrote:
> Now that we no longer insert struct page pointers in DAX radix trees we can
> remove the special casing for DAX in page_cache_tree_insert().  This also
> allows us to make dax_wake_mapping_entry_waiter() local to fs/dax.c,
> removing it from dax.h.
> 
> Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> Suggested-by: Jan Kara <j...@suse.cz>

Looks good. You can add:

Reviewed-by: Jan Kara <j...@suse.cz>

Honza


> ---
>  fs/dax.c|  2 +-
>  include/linux/dax.h |  2 --
>  mm/filemap.c| 13 ++---
>  3 files changed, 3 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index fb0e4c1..0e27d90 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -127,7 +127,7 @@ static int wake_exceptional_entry_func(wait_queue_entry_t 
> *wait, unsigned int mo
>   * correct waitqueue where tasks might be waiting for that old 'entry' and
>   * wake them.
>   */
> -void dax_wake_mapping_entry_waiter(struct address_space *mapping,
> +static void dax_wake_mapping_entry_waiter(struct address_space *mapping,
>   pgoff_t index, void *entry, bool wake_all)
>  {
>   struct exceptional_entry_key key;
> diff --git a/include/linux/dax.h b/include/linux/dax.h
> index 29cced8..afa99bb 100644
> --- a/include/linux/dax.h
> +++ b/include/linux/dax.h
> @@ -122,8 +122,6 @@ int dax_iomap_fault(struct vm_fault *vmf, enum 
> page_entry_size pe_size,
>  int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
>  int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
> pgoff_t index);
> -void dax_wake_mapping_entry_waiter(struct address_space *mapping,
> - pgoff_t index, void *entry, bool wake_all);
>  
>  #ifdef CONFIG_FS_DAX
>  int __dax_zero_page_range(struct block_device *bdev,
> diff --git a/mm/filemap.c b/mm/filemap.c
> index a497024..1bf1265 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -130,17 +130,8 @@ static int page_cache_tree_insert(struct address_space 
> *mapping,
>   return -EEXIST;
>  
>   mapping->nrexceptional--;
> - if (!dax_mapping(mapping)) {
> - if (shadowp)
> - *shadowp = p;
> - } else {
> - /* DAX can replace empty locked entry with a hole */
> - WARN_ON_ONCE(p !=
> - dax_radix_locked_entry(0, RADIX_DAX_EMPTY));
> - /* Wakeup waiters for exceptional entry lock */
> - dax_wake_mapping_entry_waiter(mapping, page->index, p,
> -   true);
> - }
> +         if (shadowp)
> + *shadowp = p;
>   }
>   __radix_tree_replace(&mapping->page_tree, node, slot, page,
>workingset_update_node, mapping);
> -- 
> 2.9.4
> 
-- 
Jan Kara <j...@suse.com>
SUSE Labs, CR


Re: [PATCH v4 3/5] dax: use common 4k zero page for dax mmap reads

2017-07-24 Thread Jan Kara
> case 2: read-only DAX storage => writeable DAX storage
> 
>   This distinction is made via vm_normal_page().  vm_normal_page()
>   returns false for the common 4k zero page, though, just as it does for
>   DAX ptes.  Instead of special casing the DAX + 4k zero page case, we will
>   simplify our DAX PTE page fault sequence so that it matches our DAX PMD
>   sequence, and get rid of the dax_pfn_mkwrite() helper.  We will instead
>   use dax_iomap_fault() to handle write-protection faults.
> 
>   This means that insert_pfn() needs to follow the lead of insert_pfn_pmd()
>   and allow us to pass in a 'mkwrite' flag.  If 'mkwrite' is set
>   insert_pfn() will do the work that was previously done by wp_page_reuse()
>   as part of the dax_pfn_mkwrite() call path.
> ***
> 
> Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>

The patch looks good to me. You can add:

Reviewed-by: Jan Kara <j...@suse.cz>

And I really like that we've got rid of these pagecache hole pages!

Honza
-- 
Jan Kara <j...@suse.com>
SUSE Labs, CR
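
To illustrate the ambiguity the series removes: a write-protect fault
handler keying off vm_normal_page() cannot tell the two cases apart,
since both return NULL. The helper below is purely illustrative, not
kernel code:

	static bool wp_fault_is_ambiguous(struct vm_area_struct *vma,
					  unsigned long addr, pte_t pte)
	{
		/*
		 * NULL for the common 4k zero page (case 1) and for a
		 * read-only DAX pfn mapping (case 2) alike, which is why
		 * the series makes the PTE path install a writeable entry
		 * up front instead of relying on a later mkwrite fault.
		 */
		return vm_normal_page(vma, addr, pte) == NULL;
	}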


Re: [PATCH v4 2/5] dax: relocate some dax functions

2017-07-24 Thread Jan Kara
On Fri 21-07-17 16:39:52, Ross Zwisler wrote:
> dax_load_hole() will soon need to call dax_insert_mapping_entry(), so it
> needs to be moved lower in dax.c so the definition exists.
> 
> dax_wake_mapping_entry_waiter() will soon be removed from dax.h and be made
> static to dax.c, so we need to move its definition above all its callers.
> 
> Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>

Looks good. You can add:

Reviewed-by: Jan Kara <j...@suse.cz>

Honza


> ---
>  fs/dax.c | 138 
> +++
>  1 file changed, 69 insertions(+), 69 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index c844a51..779dc5e 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -121,6 +121,31 @@ static int 
> wake_exceptional_entry_func(wait_queue_entry_t *wait, unsigned int mo
>  }
>  
>  /*
> + * We do not necessarily hold the mapping->tree_lock when we call this
> + * function so it is possible that 'entry' is no longer a valid item in the
> + * radix tree.  This is okay because all we really need to do is to find the
> + * correct waitqueue where tasks might be waiting for that old 'entry' and
> + * wake them.
> + */
> +void dax_wake_mapping_entry_waiter(struct address_space *mapping,
> + pgoff_t index, void *entry, bool wake_all)
> +{
> + struct exceptional_entry_key key;
> + wait_queue_head_t *wq;
> +
> + wq = dax_entry_waitqueue(mapping, index, entry, &key);
> +
> + /*
> +  * Checking for locked entry and prepare_to_wait_exclusive() happens
> +  * under mapping->tree_lock, ditto for entry handling in our callers.
> +  * So at this point all tasks that could have seen our entry locked
> +  * must be in the waitqueue and the following check will see them.
> +  */
> + if (waitqueue_active(wq))
> + __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
> +}
> +
> +/*
>   * Check whether the given slot is locked. The function must be called with
>   * mapping->tree_lock held
>   */
> @@ -392,31 +417,6 @@ static void *grab_mapping_entry(struct address_space 
> *mapping, pgoff_t index,
>   return entry;
>  }
>  
> -/*
> - * We do not necessarily hold the mapping->tree_lock when we call this
> - * function so it is possible that 'entry' is no longer a valid item in the
> - * radix tree.  This is okay because all we really need to do is to find the
> - * correct waitqueue where tasks might be waiting for that old 'entry' and
> - * wake them.
> - */
> -void dax_wake_mapping_entry_waiter(struct address_space *mapping,
> - pgoff_t index, void *entry, bool wake_all)
> -{
> - struct exceptional_entry_key key;
> - wait_queue_head_t *wq;
> -
> - wq = dax_entry_waitqueue(mapping, index, entry, &key);
> -
> - /*
> -  * Checking for locked entry and prepare_to_wait_exclusive() happens
> -  * under mapping->tree_lock, ditto for entry handling in our callers.
> -  * So at this point all tasks that could have seen our entry locked
> -  * must be in the waitqueue and the following check will see them.
> -  */
> - if (waitqueue_active(wq))
> - __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key);
> -}
> -
>  static int __dax_invalidate_mapping_entry(struct address_space *mapping,
> pgoff_t index, bool trunc)
>  {
> @@ -468,50 +468,6 @@ int dax_invalidate_mapping_entry_sync(struct 
> address_space *mapping,
>   return __dax_invalidate_mapping_entry(mapping, index, false);
>  }
>  
> -/*
> - * The user has performed a load from a hole in the file.  Allocating
> - * a new page in the file would cause excessive storage usage for
> - * workloads with sparse files.  We allocate a page cache page instead.
> - * We'll kick it out of the page cache if it's ever written to,
> - * otherwise it will simply fall out of the page cache under memory
> - * pressure without ever having been dirtied.
> - */
> -static int dax_load_hole(struct address_space *mapping, void **entry,
> -  struct vm_fault *vmf)
> -{
> - struct inode *inode = mapping->host;
> - struct page *page;
> - int ret;
> -
> - /* Hole page already exists? Return it...  */
> - if (!radix_tree_exceptional_entry(*entry)) {
> - page = *entry;
> - goto finish_fault;
> - }
> -
> - /* This will replace locked radix tree entry with a hole page */
> - page = find_or_create_page(mapping, vmf->pgoff,
> -vmf->gfp_mask | __GFP_ZERO);
> - if (!page

Re: [PATCH v4 1/5] mm: add mkwrite param to vm_insert_mixed()

2017-07-24 Thread Jan Kara
> @@ -1658,14 +1658,28 @@ static int insert_pfn(struct vm_area_struct *vma, 
> unsigned long addr,
>   if (!pte)
>   goto out;
>   retval = -EBUSY;
> - if (!pte_none(*pte))
> - goto out_unlock;
> + if (!pte_none(*pte)) {
> + if (mkwrite) {
> + if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))

Is the WARN_ON_ONCE() really appropriate here? Your testcase with private
mappings has triggered this situation if I'm right...

Otherwise the patch looks good to me.

        Honza

-- 
Jan Kara <j...@suse.com>
SUSE Labs, CR


Re: [PATCH v4 1/5] mm: add mkwrite param to vm_insert_mixed()

2017-07-24 Thread Jan Kara
On Sat 22-07-17 09:21:31, Dan Williams wrote:
> On Fri, Jul 21, 2017 at 3:39 PM, Ross Zwisler
> <ross.zwis...@linux.intel.com> wrote:
> > To be able to use the common 4k zero page in DAX we need to have our PTE
> > fault path look more like our PMD fault path where a PTE entry can be
> > marked as dirty and writeable as it is first inserted, rather than waiting
> > for a follow-up dax_pfn_mkwrite() => finish_mkwrite_fault() call.
> >
> > Right now we can rely on having a dax_pfn_mkwrite() call because we can
> > distinguish between these two cases in do_wp_page():
> >
> > case 1: 4k zero page => writable DAX storage
> > case 2: read-only DAX storage => writeable DAX storage
> >
> > This distinction is made via vm_normal_page().  vm_normal_page() returns
> > false for the common 4k zero page, though, just as it does for DAX ptes.
> > Instead of special casing the DAX + 4k zero page case, we will simplify our
> > DAX PTE page fault sequence so that it matches our DAX PMD sequence, and
> > get rid of the dax_pfn_mkwrite() helper.  We will instead use
> > dax_iomap_fault() to handle write-protection faults.
> >
> > This means that insert_pfn() needs to follow the lead of insert_pfn_pmd()
> > and allow us to pass in a 'mkwrite' flag.  If 'mkwrite' is set insert_pfn()
> > will do the work that was previously done by wp_page_reuse() as part of the
> > dax_pfn_mkwrite() call path.
> >
> > Signed-off-by: Ross Zwisler <ross.zwis...@linux.intel.com>
> > ---
> >  drivers/dax/device.c|  2 +-
> >  drivers/gpu/drm/exynos/exynos_drm_gem.c |  3 ++-
> >  drivers/gpu/drm/gma500/framebuffer.c|  2 +-
> >  drivers/gpu/drm/msm/msm_gem.c   |  3 ++-
> >  drivers/gpu/drm/omapdrm/omap_gem.c  |  6 --
> >  drivers/gpu/drm/ttm/ttm_bo_vm.c |  2 +-
> >  fs/dax.c|  2 +-
> >  include/linux/mm.h  |  2 +-
> >  mm/memory.c | 27 +--
> >  9 files changed, 34 insertions(+), 15 deletions(-)
> >
> > diff --git a/drivers/dax/device.c b/drivers/dax/device.c
> > index e9f3b3e..3973521 100644
> > --- a/drivers/dax/device.c
> > +++ b/drivers/dax/device.c
> > @@ -273,7 +273,7 @@ static int __dev_dax_pte_fault(struct dev_dax *dev_dax, 
> > struct vm_fault *vmf)
> >
> > pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
> >
> > -   rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
> > +   rc = vm_insert_mixed(vmf->vma, vmf->address, pfn, false);
> 
> Ugh, I generally find bool flags unreadable. They place a tax on
> jumping to function definition to recall what true and false mean. If
> we want to go this 'add an argument' route can we at least add an enum
> like:
> 
> enum {
> PTE_MKDIRTY,
> PTE_MKCLEAN,
> };
> 
> ...to differentiate the two cases?

So how I usually deal with this is that I create e.g.:

__vm_insert_mixed() that takes the bool argument, make vm_insert_mixed()
pass false, and vm_insert_mixed_mkwrite() pass true. That way there's no
code duplication, old call sites can stay unchanged, the naming clearly
says what's going on...

Honza
-- 
Jan Kara <j...@suse.com>
SUSE Labs, CR
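
A sketch of the wrapper scheme Jan describes, reusing the mkwrite-capable
insert_pfn() from patch 1 of the quoted series; the function bodies are
assumptions about the shape, not the final implementation:

	static int __vm_insert_mixed(struct vm_area_struct *vma,
				     unsigned long addr, pfn_t pfn,
				     bool mkwrite)
	{
		/* one shared implementation for both entry points */
		return insert_pfn(vma, addr, pfn, vma->vm_page_prot, mkwrite);
	}

	int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
			    pfn_t pfn)
	{
		return __vm_insert_mixed(vma, addr, pfn, false);
	}

	int vm_insert_mixed_mkwrite(struct vm_area_struct *vma,
				    unsigned long addr, pfn_t pfn)
	{
		return __vm_insert_mixed(vma, addr, pfn, true);
	}

Old call sites keep compiling unchanged, and the one caller that needs the
mkwrite behaviour says so in its function name rather than in an opaque
bool argument.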


[PATCH 08/10] mm: replace __access_remote_vm() write parameter with gup_flags

2016-10-19 Thread Jan Kara
On Thu 13-10-16 01:20:18, Lorenzo Stoakes wrote:
> This patch removes the write parameter from __access_remote_vm() and replaces 
> it
> with a gup_flags parameter as use of this function previously _implied_
> FOLL_FORCE, whereas after this patch callers explicitly pass this flag.
> 
> We make this explicit as use of FOLL_FORCE can result in surprising behaviour
> (and hence bugs) within the mm subsystem.
> 
> Signed-off-by: Lorenzo Stoakes 

So I'm not convinced this (and the following two patches) is actually
helping much. By grepping for FOLL_FORCE we will easily see that any caller
of access_remote_vm() gets those semantics and can continue the search
accordingly (it is much simpler than searching for all get_user_pages()
users and extracting from their parameter lists what they actually pass as
the 'force' argument). Sure, it makes it somewhat more visible to callers of
access_remote_vm() that they get FOLL_FORCE semantics, but OTOH it also
opens space for issues where a caller of access_remote_vm() actually
wants FOLL_FORCE (and currently all of them want it) and just mistakenly
does not set it. All in all I'd prefer to keep access_remote_vm() and
friends as is...

Honza

> ---
>  mm/memory.c | 23 +++
>  mm/nommu.c  |  9 ++---
>  2 files changed, 21 insertions(+), 11 deletions(-)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 20a9adb..79ebed3 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -3869,14 +3869,11 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
>   * given task for page fault accounting.
>   */
>  static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
> - unsigned long addr, void *buf, int len, int write)
> + unsigned long addr, void *buf, int len, unsigned int gup_flags)
>  {
>   struct vm_area_struct *vma;
>   void *old_buf = buf;
> - unsigned int flags = FOLL_FORCE;
> -
> - if (write)
> - flags |= FOLL_WRITE;
> + int write = gup_flags & FOLL_WRITE;
>  
>   down_read(&mm->mmap_sem);
>   /* ignore errors, just check how much was successfully transferred */
> @@ -3886,7 +3883,7 @@ static int __access_remote_vm(struct task_struct *tsk, 
> struct mm_struct *mm,
>   struct page *page = NULL;
>  
>   ret = get_user_pages_remote(tsk, mm, addr, 1,
> - flags, &page, &vma);
> + gup_flags, &page, &vma);
>   if (ret <= 0) {
>  #ifndef CONFIG_HAVE_IOREMAP_PROT
>   break;
> @@ -3945,7 +3942,12 @@ static int __access_remote_vm(struct task_struct *tsk, 
> struct mm_struct *mm,
>  int access_remote_vm(struct mm_struct *mm, unsigned long addr,
>   void *buf, int len, int write)
>  {
> - return __access_remote_vm(NULL, mm, addr, buf, len, write);
> + unsigned int flags = FOLL_FORCE;
> +
> + if (write)
> + flags |= FOLL_WRITE;
> +
> + return __access_remote_vm(NULL, mm, addr, buf, len, flags);
>  }
>  
>  /*
> @@ -3958,12 +3960,17 @@ int access_process_vm(struct task_struct *tsk, 
> unsigned long addr,
>  {
>   struct mm_struct *mm;
>   int ret;
> + unsigned int flags = FOLL_FORCE;
>  
>   mm = get_task_mm(tsk);
>   if (!mm)
>   return 0;
>  
> - ret = __access_remote_vm(tsk, mm, addr, buf, len, write);
> + if (write)
> + flags |= FOLL_WRITE;
> +
> + ret = __access_remote_vm(tsk, mm, addr, buf, len, flags);
> +
>   mmput(mm);
>  
>   return ret;
> diff --git a/mm/nommu.c b/mm/nommu.c
> index 70cb844..bde7df3 100644
> --- a/mm/nommu.c
> +++ b/mm/nommu.c
> @@ -1809,9 +1809,10 @@ void filemap_map_pages(struct fault_env *fe,
>  EXPORT_SYMBOL(filemap_map_pages);
>  
>  static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
> - unsigned long addr, void *buf, int len, int write)
> + unsigned long addr, void *buf, int len, unsigned int gup_flags)
>  {
>   struct vm_area_struct *vma;
> + int write = gup_flags & FOLL_WRITE;
>  
>   down_read(&mm->mmap_sem);
>  
> @@ -1853,7 +1854,8 @@ static int __access_remote_vm(struct task_struct *tsk, 
> struct mm_struct *mm,
>  int access_remote_vm(struct mm_struct *mm, unsigned long addr,
>   void *buf, int len, int write)
>  {
> - return __access_remote_vm(NULL, mm, addr, buf, len, write);
> + return __access_remote_vm(NULL, mm, addr, buf, len,
> + write ? FOLL_WRITE : 0);
>  }
>  
>  /*
> @@ -1871,7 +1873,8 @@ int access_process_vm(struct task_struct *tsk, unsigned 
> long addr, void *buf, in
>   if (!mm)
>   return 0;
>  
> - len = __access_remote_vm(tsk, mm, addr, buf, len, write);
> + len = __access_remote_vm(tsk, mm, addr, buf, len,
> + write ? FOLL_WRITE : 0);
>  
>   mmput(mm);
>   return len;
> -- 
> 2.10.0
> 
-- 
Jan Kara 
SUSE Labs, CR


[PATCH 07/10] mm: replace get_user_pages_remote() write/force parameters with gup_flags

2016-10-19 Thread Jan Kara
On Thu 13-10-16 01:20:17, Lorenzo Stoakes wrote:
> This patch removes the write and force parameters from get_user_pages_remote()
> and replaces them with a gup_flags parameter to make the use of FOLL_FORCE
> explicit in callers as use of this flag can result in surprising behaviour 
> (and
> hence bugs) within the mm subsystem.
> 
> Signed-off-by: Lorenzo Stoakes 

Looks good. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  drivers/gpu/drm/etnaviv/etnaviv_gem.c   |  7 +--
>  drivers/gpu/drm/i915/i915_gem_userptr.c |  6 +-
>  drivers/infiniband/core/umem_odp.c  |  7 +--
>  fs/exec.c   |  9 +++--
>  include/linux/mm.h  |  2 +-
>  kernel/events/uprobes.c |  6 --
>  mm/gup.c| 22 +++---
>  mm/memory.c |  6 +-
>  security/tomoyo/domain.c|  2 +-
>  9 files changed, 40 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c 
> b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
> index 5ce3603..0370b84 100644
> --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c
> +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c
> @@ -748,19 +748,22 @@ static struct page **etnaviv_gem_userptr_do_get_pages(
>   int ret = 0, pinned, npages = etnaviv_obj->base.size >> PAGE_SHIFT;
>   struct page **pvec;
>   uintptr_t ptr;
> + unsigned int flags = 0;
>  
>   pvec = drm_malloc_ab(npages, sizeof(struct page *));
>   if (!pvec)
>   return ERR_PTR(-ENOMEM);
>  
> + if (!etnaviv_obj->userptr.ro)
> + flags |= FOLL_WRITE;
> +
>   pinned = 0;
>   ptr = etnaviv_obj->userptr.ptr;
>  
>   down_read(&mm->mmap_sem);
>   while (pinned < npages) {
>   ret = get_user_pages_remote(task, mm, ptr, npages - pinned,
> - !etnaviv_obj->userptr.ro, 0,
> - pvec + pinned, NULL);
> + flags, pvec + pinned, NULL);
>   if (ret < 0)
>   break;
>  
> diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
> b/drivers/gpu/drm/i915/i915_gem_userptr.c
> index e537930..c6f780f 100644
> --- a/drivers/gpu/drm/i915/i915_gem_userptr.c
> +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
> @@ -508,6 +508,10 @@ __i915_gem_userptr_get_pages_worker(struct work_struct 
> *_work)
>   pvec = drm_malloc_gfp(npages, sizeof(struct page *), GFP_TEMPORARY);
>   if (pvec != NULL) {
>   struct mm_struct *mm = obj->userptr.mm->mm;
> + unsigned int flags = 0;
> +
> + if (!obj->userptr.read_only)
> + flags |= FOLL_WRITE;
>  
>   ret = -EFAULT;
>   if (atomic_inc_not_zero(&mm->mm_users)) {
> @@ -517,7 +521,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct 
> *_work)
>   (work->task, mm,
>obj->userptr.ptr + pinned * PAGE_SIZE,
>npages - pinned,
> -  !obj->userptr.read_only, 0,
> +  flags,
>pvec + pinned, NULL);
>   if (ret < 0)
>   break;
> diff --git a/drivers/infiniband/core/umem_odp.c 
> b/drivers/infiniband/core/umem_odp.c
> index 75077a0..1f0fe32 100644
> --- a/drivers/infiniband/core/umem_odp.c
> +++ b/drivers/infiniband/core/umem_odp.c
> @@ -527,6 +527,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 
> user_virt, u64 bcnt,
>   u64 off;
>   int j, k, ret = 0, start_idx, npages = 0;
>   u64 base_virt_addr;
> + unsigned int flags = 0;
>  
>   if (access_mask == 0)
>   return -EINVAL;
> @@ -556,6 +557,9 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 
> user_virt, u64 bcnt,
>   goto out_put_task;
>   }
>  
> + if (access_mask & ODP_WRITE_ALLOWED_BIT)
> + flags |= FOLL_WRITE;
> +
>   start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
>   k = start_idx;
>  
> @@ -574,8 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 
> user_virt, u64 bcnt,
>*/
>   npages = get_user_pages_remote(owning_process, owning_mm,
>   user_virt, gup_num_pages,
> - access_mask & O

[PATCH 06/10] mm: replace get_user_pages() write/force parameters with gup_flags

2016-10-19 Thread Jan Kara
On Thu 13-10-16 01:20:16, Lorenzo Stoakes wrote:
> This patch removes the write and force parameters from get_user_pages() and
> replaces them with a gup_flags parameter to make the use of FOLL_FORCE 
> explicit
> in callers as use of this flag can result in surprising behaviour (and hence
> bugs) within the mm subsystem.
> 
> Signed-off-by: Lorenzo Stoakes 

The patch looks good. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  arch/cris/arch-v32/drivers/cryptocop.c |  4 +---
>  arch/ia64/kernel/err_inject.c  |  2 +-
>  arch/x86/mm/mpx.c  |  5 ++---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c|  7 +--
>  drivers/gpu/drm/radeon/radeon_ttm.c|  3 ++-
>  drivers/gpu/drm/via/via_dmablit.c  |  4 ++--
>  drivers/infiniband/core/umem.c |  6 +-
>  drivers/infiniband/hw/mthca/mthca_memfree.c|  2 +-
>  drivers/infiniband/hw/qib/qib_user_pages.c |  3 ++-
>  drivers/infiniband/hw/usnic/usnic_uiom.c   |  5 -
>  drivers/media/v4l2-core/videobuf-dma-sg.c  |  7 +--
>  drivers/misc/mic/scif/scif_rma.c   |  3 +--
>  drivers/misc/sgi-gru/grufault.c|  2 +-
>  drivers/platform/goldfish/goldfish_pipe.c  |  3 ++-
>  drivers/rapidio/devices/rio_mport_cdev.c   |  3 ++-
>  .../vc04_services/interface/vchiq_arm/vchiq_2835_arm.c |  3 +--
>  .../vc04_services/interface/vchiq_arm/vchiq_arm.c  |  3 +--
>  drivers/virt/fsl_hypervisor.c  |  4 ++--
>  include/linux/mm.h |  2 +-
>  mm/gup.c   | 12 +++-
>  mm/mempolicy.c |  2 +-
>  mm/nommu.c | 18 
> --
>  22 files changed, 49 insertions(+), 54 deletions(-)
> 
> diff --git a/arch/cris/arch-v32/drivers/cryptocop.c 
> b/arch/cris/arch-v32/drivers/cryptocop.c
> index b5698c8..099e170 100644
> --- a/arch/cris/arch-v32/drivers/cryptocop.c
> +++ b/arch/cris/arch-v32/drivers/cryptocop.c
> @@ -2722,7 +2722,6 @@ static int cryptocop_ioctl_process(struct inode *inode, 
> struct file *filp, unsig
>   err = get_user_pages((unsigned long int)(oper.indata + prev_ix),
>noinpages,
>0,  /* read access only for in data */
> -  0, /* no force */
>inpages,
>NULL);
>  
> @@ -2736,8 +2735,7 @@ static int cryptocop_ioctl_process(struct inode *inode, 
> struct file *filp, unsig
>   if (oper.do_cipher){
>   err = get_user_pages((unsigned long int)oper.cipher_outdata,
>nooutpages,
> -  1, /* write access for out data */
> -  0, /* no force */
> +  FOLL_WRITE, /* write access for out data */
>outpages,
>NULL);
>   up_read(>mm->mmap_sem);
> diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c
> index 09f8457..5ed0ea9 100644
> --- a/arch/ia64/kernel/err_inject.c
> +++ b/arch/ia64/kernel/err_inject.c
> @@ -142,7 +142,7 @@ store_virtual_to_phys(struct device *dev, struct 
> device_attribute *attr,
>   u64 virt_addr=simple_strtoull(buf, NULL, 16);
>   int ret;
>  
> - ret = get_user_pages(virt_addr, 1, VM_READ, 0, NULL, NULL);
> + ret = get_user_pages(virt_addr, 1, FOLL_WRITE, NULL, NULL);
>   if (ret<=0) {
>  #ifdef ERR_INJ_DEBUG
>   printk("Virtual address %lx is not existing.\n",virt_addr);
> diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
> index 8047687..e4f8009 100644
> --- a/arch/x86/mm/mpx.c
> +++ b/arch/x86/mm/mpx.c
> @@ -544,10 +544,9 @@ static int mpx_resolve_fault(long __user *addr, int 
> write)
>  {
>   long gup_ret;
>   int nr_pages = 1;
> - int force = 0;
>  
> - gup_ret = get_user_pages((unsigned long)addr, nr_pages, write,
> - force, NULL, NULL);
> + gup_ret = get_user_pages((unsigned long)addr, nr_pages,
> + write ? FOLL_WRITE : 0, NULL, NULL);
>   /*
>* get_user_pages() returns number of pages gotten.
>* 0 means we failed to fault in and get anything,
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
>
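
All the conversions in this series follow the same mechanical pattern;
schematically (variable names illustrative):

	/* before: rights passed as two bare int parameters */
	ret = get_user_pages(start, nr_pages, write, force, pages, vmas);

	/* after: the caller spells the flags out explicitly */
	unsigned int gup_flags = 0;

	if (write)
		gup_flags |= FOLL_WRITE;
	if (force)
		gup_flags |= FOLL_FORCE;
	ret = get_user_pages(start, nr_pages, gup_flags, pages, vmas);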

[PATCH 05/10] mm: replace get_vaddr_frames() write/force parameters with gup_flags

2016-10-19 Thread Jan Kara
On Thu 13-10-16 01:20:15, Lorenzo Stoakes wrote:
> This patch removes the write and force parameters from get_vaddr_frames() and
> replaces them with a gup_flags parameter to make the use of FOLL_FORCE 
> explicit
> in callers as use of this flag can result in surprising behaviour (and hence
> bugs) within the mm subsystem.
> 
> Signed-off-by: Lorenzo Stoakes 

Looks good. You can add:

Reviewed-by: Jan Kara 

Honza

> ---
>  drivers/gpu/drm/exynos/exynos_drm_g2d.c|  3 ++-
>  drivers/media/platform/omap/omap_vout.c|  2 +-
>  drivers/media/v4l2-core/videobuf2-memops.c |  6 +-
>  include/linux/mm.h |  2 +-
>  mm/frame_vector.c  | 13 ++---
>  5 files changed, 11 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c 
> b/drivers/gpu/drm/exynos/exynos_drm_g2d.c
> index aa92dec..fbd13fa 100644
> --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c
> +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c
> @@ -488,7 +488,8 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct 
> drm_device *drm_dev,
>   goto err_free;
>   }
>  
> - ret = get_vaddr_frames(start, npages, true, true, g2d_userptr->vec);
> + ret = get_vaddr_frames(start, npages, FOLL_FORCE | FOLL_WRITE,
> + g2d_userptr->vec);
>   if (ret != npages) {
>   DRM_ERROR("failed to get user pages from userptr.\n");
>   if (ret < 0)
> diff --git a/drivers/media/platform/omap/omap_vout.c 
> b/drivers/media/platform/omap/omap_vout.c
> index e668dde..a31b95c 100644
> --- a/drivers/media/platform/omap/omap_vout.c
> +++ b/drivers/media/platform/omap/omap_vout.c
> @@ -214,7 +214,7 @@ static int omap_vout_get_userptr(struct videobuf_buffer 
> *vb, u32 virtp,
>   if (!vec)
>   return -ENOMEM;
>  
> - ret = get_vaddr_frames(virtp, 1, true, false, vec);
> + ret = get_vaddr_frames(virtp, 1, FOLL_WRITE, vec);
>   if (ret != 1) {
>   frame_vector_destroy(vec);
>   return -EINVAL;
> diff --git a/drivers/media/v4l2-core/videobuf2-memops.c 
> b/drivers/media/v4l2-core/videobuf2-memops.c
> index 3c3b517..1cd322e 100644
> --- a/drivers/media/v4l2-core/videobuf2-memops.c
> +++ b/drivers/media/v4l2-core/videobuf2-memops.c
> @@ -42,6 +42,10 @@ struct frame_vector *vb2_create_framevec(unsigned long 
> start,
>   unsigned long first, last;
>   unsigned long nr;
>   struct frame_vector *vec;
> + unsigned int flags = FOLL_FORCE;
> +
> + if (write)
> + flags |= FOLL_WRITE;
>  
>   first = start >> PAGE_SHIFT;
>   last = (start + length - 1) >> PAGE_SHIFT;
> @@ -49,7 +53,7 @@ struct frame_vector *vb2_create_framevec(unsigned long 
> start,
>   vec = frame_vector_create(nr);
>   if (!vec)
>   return ERR_PTR(-ENOMEM);
> - ret = get_vaddr_frames(start & PAGE_MASK, nr, write, true, vec);
> + ret = get_vaddr_frames(start & PAGE_MASK, nr, flags, vec);
>   if (ret < 0)
>   goto out_destroy;
>   /* We accept only complete set of PFNs */
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 27ab538..5ff084f6 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1305,7 +1305,7 @@ struct frame_vector {
>  struct frame_vector *frame_vector_create(unsigned int nr_frames);
>  void frame_vector_destroy(struct frame_vector *vec);
>  int get_vaddr_frames(unsigned long start, unsigned int nr_pfns,
> -  bool write, bool force, struct frame_vector *vec);
> +  unsigned int gup_flags, struct frame_vector *vec);
>  void put_vaddr_frames(struct frame_vector *vec);
>  int frame_vector_to_pages(struct frame_vector *vec);
>  void frame_vector_to_pfns(struct frame_vector *vec);
> diff --git a/mm/frame_vector.c b/mm/frame_vector.c
> index 81b6749..db77dcb 100644
> --- a/mm/frame_vector.c
> +++ b/mm/frame_vector.c
> @@ -11,10 +11,7 @@
>   * get_vaddr_frames() - map virtual addresses to pfns
>   * @start:   starting user address
>   * @nr_frames:   number of pages / pfns from start to map
> - * @write:   whether pages will be written to by the caller
> - * @force:   whether to force write access even if user mapping is
> - *   readonly. See description of the same argument of
> - get_user_pages().
> + * @gup_flags:   flags modifying lookup behaviour
>   * @vec: structure which receives pages / pfns of the addresses mapped.
>   *   It should have space for at least nr_frames entries.
>   *
> @@ -34,23 +31,17 @@
>   * This fu

[PATCH 04/10] mm: replace get_user_pages_locked() write/force parameters with gup_flags

2016-10-19 Thread Jan Kara
On Thu 13-10-16 01:20:14, Lorenzo Stoakes wrote:
> This patch removes the write and force parameters from get_user_pages_locked()
> and replaces them with a gup_flags parameter to make the use of FOLL_FORCE
> explicit in callers as use of this flag can result in surprising behaviour 
> (and
> hence bugs) within the mm subsystem.
> 
> Signed-off-by: Lorenzo Stoakes 

After our discussion the patch looks good to me. You can add:

Reviewed-by: Jan Kara 

        Honza
-- 
Jan Kara 
SUSE Labs, CR


[PATCH 04/10] mm: replace get_user_pages_locked() write/force parameters with gup_flags

2016-10-19 Thread Jan Kara
On Tue 18-10-16 14:56:09, Lorenzo Stoakes wrote:
> On Tue, Oct 18, 2016 at 02:54:25PM +0200, Jan Kara wrote:
> > > @@ -1282,7 +1282,7 @@ long get_user_pages(unsigned long start, unsigned 
> > > long nr_pages,
> > >   int write, int force, struct page **pages,
> > >   struct vm_area_struct **vmas);
> > >  long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
> > > - int write, int force, struct page **pages, int *locked);
> > > + unsigned int gup_flags, struct page **pages, int *locked);
> >
> > Hum, the prototype is inconsistent with e.g. __get_user_pages_unlocked()
> > where gup_flags come after **pages argument. Actually it makes more sense
> > to have it before **pages so that input arguments come first and output
> > arguments second but I don't care that much. But it definitely should be
> > consistent...
> 
> It was difficult to decide quite how to arrange parameters as there was
> inconsistency with regards to parameter ordering already - for example
> __get_user_pages() places its flags argument before pages whereas, as you 
> note,
> __get_user_pages_unlocked() puts them afterwards.
> 
> I ended up compromising by trying to match the existing ordering of the 
> function
> as much as I could by replacing write, force pairs with gup_flags in the same
> location (with the exception of get_user_pages_unlocked() which I felt should
> match __get_user_pages_unlocked() in signature) or if there was already a
> gup_flags parameter as in the case of __get_user_pages_unlocked() I simply
> removed the write, force pair and left the flags as the last parameter.
> 
> I am happy to rearrange parameters as needed, however I am not sure if it'd be
> worthwhile for me to do so (I am keen to try to avoid adding too much noise 
> here
> :)
> 
> If we were to rearrange parameters for consistency I'd suggest adjusting
> __get_user_pages_unlocked() to put gup_flags before pages and do the same with
> get_user_pages_unlocked(), let me know what you think.

Yeah, ok. If the inconsistency is already there, just leave it for now.

Honza
-- 
Jan Kara 
SUSE Labs, CR


[PATCH 04/10] mm: replace get_user_pages_locked() write/force parameters with gup_flags

2016-10-18 Thread Jan Kara
On Thu 13-10-16 01:20:14, Lorenzo Stoakes wrote:
> This patch removes the write and force parameters from get_user_pages_locked()
> and replaces them with a gup_flags parameter to make the use of FOLL_FORCE
> explicit in callers as use of this flag can result in surprising behaviour 
> (and
> hence bugs) within the mm subsystem.
> 
> Signed-off-by: Lorenzo Stoakes 
> ---
>  include/linux/mm.h |  2 +-
>  mm/frame_vector.c  |  8 +++-
>  mm/gup.c   | 12 +++-
>  mm/nommu.c |  5 -
>  4 files changed, 15 insertions(+), 12 deletions(-)
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 6adc4bc..27ab538 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1282,7 +1282,7 @@ long get_user_pages(unsigned long start, unsigned long 
> nr_pages,
>   int write, int force, struct page **pages,
>   struct vm_area_struct **vmas);
>  long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
> - int write, int force, struct page **pages, int *locked);
> + unsigned int gup_flags, struct page **pages, int *locked);

Hum, the prototype is inconsistent with e.g. __get_user_pages_unlocked()
where gup_flags come after **pages argument. Actually it makes more sense
to have it before **pages so that input arguments come first and output
arguments second but I don't care that much. But it definitely should be
consistent...

    Honza
-- 
Jan Kara 
SUSE Labs, CR


[PATCH 03/10] mm: replace get_user_pages_unlocked() write/force parameters with gup_flags

2016-10-18 Thread Jan Kara
On Thu 13-10-16 01:20:13, Lorenzo Stoakes wrote:
> This patch removes the write and force parameters from 
> get_user_pages_unlocked()
> and replaces them with a gup_flags parameter to make the use of FOLL_FORCE
> explicit in callers as use of this flag can result in surprising behaviour 
> (and
> hence bugs) within the mm subsystem.
> 
> Signed-off-by: Lorenzo Stoakes 

Looks good. You can add:

Reviewed-by: Jan Kara 

        Honza
-- 
Jan Kara 
SUSE Labs, CR


[PATCH 02/10] mm: remove write/force parameters from __get_user_pages_unlocked()

2016-10-18 Thread Jan Kara
On Thu 13-10-16 01:20:12, Lorenzo Stoakes wrote:
> This patch removes the write and force parameters from
> __get_user_pages_unlocked() to make the use of FOLL_FORCE explicit in callers 
> as
> use of this flag can result in surprising behaviour (and hence bugs) within 
> the
> mm subsystem.
> 
> Signed-off-by: Lorenzo Stoakes 

The patch looks good. You can add:

Reviewed-by: Jan Kara 

        Honza
-- 
Jan Kara 
SUSE Labs, CR


[PATCH 01/10] mm: remove write/force parameters from __get_user_pages_locked()

2016-10-18 Thread Jan Kara
On Thu 13-10-16 01:20:11, Lorenzo Stoakes wrote:
> This patch removes the write and force parameters from 
> __get_user_pages_locked()
> to make the use of FOLL_FORCE explicit in callers as use of this flag can 
> result
> in surprising behaviour (and hence bugs) within the mm subsystem.
> 
> Signed-off-by: Lorenzo Stoakes 

Looks good. You can add:

Reviewed-by: Jan Kara 

        Honza
-- 
Jan Kara 
SUSE Labs, CR


[PATCH] tree wide: Use kvfree() than conditional kfree()/vfree()

2015-11-09 Thread Jan Kara
On Mon 09-11-15 20:56:10, Tetsuo Handa wrote:
> There are many locations that do
> 
>   if (memory_was_allocated_by_vmalloc)
> vfree(ptr);
>   else
> kfree(ptr);
> 
> but kvfree() can handle both kmalloc()ed memory and vmalloc()ed memory
> using is_vmalloc_addr(). Unless callers have special reasons, we can
> replace this branch with kvfree(). Please check and reply if you found
> problems.
> 
> Signed-off-by: Tetsuo Handa 
> Acked-by: Michal Hocko 

You can add

Acked-by: Jan Kara 

for the UDF and fs/xattr.c parts.

Honza

> Cc: Russell King  # arm
> Cc:  # apei
> Cc:  # drbd
> Cc:  # mspec
> Cc:  # drm
> Cc: Oleg Drokin  # lustre
> Cc: Andreas Dilger  # lustre
> Cc:  # coda
> Cc:  # jffs2
> Cc: Jan Kara  # udf
> Cc:  # xattr
> Cc:  # ipc + mm
> Cc:  # ipv4
> ---
>  arch/arm/mm/dma-mapping.c  | 11 ++--
>  drivers/acpi/apei/erst.c   |  6 ++--
>  drivers/block/drbd/drbd_bitmap.c   | 26 +
>  drivers/block/drbd/drbd_int.h  |  3 --
>  drivers/char/mspec.c   | 15 ++
>  drivers/gpu/drm/drm_hashtab.c  |  5 +---
>  .../lustre/include/linux/libcfs/libcfs_private.h   |  8 ++
>  fs/coda/coda_linux.h   |  3 +-
>  fs/jffs2/build.c   |  8 ++
>  fs/jffs2/fs.c  |  5 +---
>  fs/jffs2/super.c   |  5 +---
>  fs/udf/super.c |  7 +
>  fs/xattr.c | 33 
> ++
>  ipc/sem.c  |  2 +-
>  ipc/util.c | 11 ++--
>  ipc/util.h |  2 +-
>  mm/percpu.c| 18 +---
>  mm/vmalloc.c   |  5 +---
>  net/ipv4/fib_trie.c|  4 +--
>  19 files changed, 46 insertions(+), 131 deletions(-)
> 
> diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
> index e62400e..492bf3e 100644
> --- a/arch/arm/mm/dma-mapping.c
> +++ b/arch/arm/mm/dma-mapping.c
> @@ -1200,10 +1200,7 @@ error:
>   while (i--)
>   if (pages[i])
>   __free_pages(pages[i], 0);
> - if (array_size <= PAGE_SIZE)
> - kfree(pages);
> - else
> - vfree(pages);
> + kvfree(pages);
>   return NULL;
>  }
>  
> @@ -1211,7 +1208,6 @@ static int __iommu_free_buffer(struct device *dev, 
> struct page **pages,
>  size_t size, struct dma_attrs *attrs)
>  {
>   int count = size >> PAGE_SHIFT;
> - int array_size = count * sizeof(struct page *);
>   int i;
>  
>   if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) {
> @@ -1222,10 +1218,7 @@ static int __iommu_free_buffer(struct device *dev, 
> struct page **pages,
>   __free_pages(pages[i], 0);
>   }
>  
> - if (array_size <= PAGE_SIZE)
> - kfree(pages);
> - else
> - vfree(pages);
> + kvfree(pages);
>   return 0;
>  }
>  
> diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c
> index 6682c5d..6e6bc10 100644
> --- a/drivers/acpi/apei/erst.c
> +++ b/drivers/acpi/apei/erst.c
> @@ -32,6 +32,7 @@
>  #include 
>  #include 
>  #include 
> +#include <linux/mm.h> /* kvfree() */
>  #include 
>  
>  #include "apei-internal.h"
> @@ -532,10 +533,7 @@ retry:
>   return -ENOMEM;
>   memcpy(new_entries, entries,
>  erst_record_id_cache.len * sizeof(entries[0]));
> - if (erst_record_id_cache.size < PAGE_SIZE)
> - kfree(entries);
> - else
> - vfree(entries);
> + kvfree(entries);
>   erst_record_id_cache.entries = entries = new_entries;
>   erst_record_id_cache.size = new_size;
>   }
> diff --git a/drivers/block/drbd/drbd_bitmap.c 
> b/drivers/block/drbd/drbd_bitmap.c
> index 9462d27..2daaafb 100644
> --- a/drivers/block/drbd/drbd_bitmap.c
> +++ b/drivers/block/drbd/drbd_bitmap.c
> @@ -364,12 +364,9 @@ static void bm_free_pages(struct page **pages, unsigned 
> long number)
>   }
>  }
>  
> -static void bm_vk_free(void *ptr, int v)
> +static inline void bm_vk_free(void *ptr)
>  {
> - if (v)
> - vfree(pt
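
The whole tree-wide patch reduces to one transformation; schematically:

	/* before: the caller must remember how the buffer was allocated */
	if (is_vmalloc_addr(ptr))
		vfree(ptr);
	else
		kfree(ptr);

	/* after: kvfree() performs the is_vmalloc_addr() test itself */
	kvfree(ptr);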

[PATCH 6/7] drm/i915: Convert to use get_user_page_unlocked()

2015-10-06 Thread Jan Kara
From: Jan Kara <j...@suse.cz>

Convert __i915_gem_userptr_get_pages_worker() to use
get_user_page_unlocked() so that we don't unnecessarily leak knowledge of
mm locking into driver code.

CC: Daniel Vetter 
CC: David Airlie 
CC: dri-devel at lists.freedesktop.org
Signed-off-by: Jan Kara 
---
 drivers/gpu/drm/i915/i915_gem_userptr.c | 13 ++---
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c 
b/drivers/gpu/drm/i915/i915_gem_userptr.c
index 8fd431bcdfd3..5138fe61d2fa 100644
--- a/drivers/gpu/drm/i915/i915_gem_userptr.c
+++ b/drivers/gpu/drm/i915/i915_gem_userptr.c
@@ -585,19 +585,18 @@ __i915_gem_userptr_get_pages_worker(struct work_struct 
*_work)
if (pvec != NULL) {
struct mm_struct *mm = obj->userptr.mm->mm;

-   down_read(&mm->mmap_sem);
while (pinned < num_pages) {
-   ret = get_user_pages(work->task, mm,
-obj->userptr.ptr + pinned * 
PAGE_SIZE,
-num_pages - pinned,
-!obj->userptr.read_only, 0,
-pvec + pinned, NULL);
+   ret = get_user_pages_unlocked(
+   work->task, mm,
+   obj->userptr.ptr + pinned * PAGE_SIZE,
+   num_pages - pinned,
+   !obj->userptr.read_only, 0,
+   pvec + pinned);
if (ret < 0)
break;

pinned += ret;
}
-   up_read(&mm->mmap_sem);
}

mutex_lock(>struct_mutex);
-- 
2.1.4



[PATCH 3/7] drm: Convert via driver to use get_user_pages_fast()

2015-10-06 Thread Jan Kara
From: Jan Kara <j...@suse.cz>

CC: David Airlie 
CC: dri-devel at lists.freedesktop.org
Signed-off-by: Jan Kara 
---
 drivers/gpu/drm/via/via_dmablit.c | 12 
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/via/via_dmablit.c 
b/drivers/gpu/drm/via/via_dmablit.c
index d0cbd5ecd7f0..d71add236e62 100644
--- a/drivers/gpu/drm/via/via_dmablit.c
+++ b/drivers/gpu/drm/via/via_dmablit.c
@@ -238,14 +238,10 @@ via_lock_all_dma_pages(drm_via_sg_info_t *vsg,  
drm_via_dmablit_t *xfer)
vsg->pages = vzalloc(sizeof(struct page *) * vsg->num_pages);
if (NULL == vsg->pages)
return -ENOMEM;
-   down_read(&current->mm->mmap_sem);
-   ret = get_user_pages(current, current->mm,
-(unsigned long)xfer->mem_addr,
-vsg->num_pages,
-(vsg->direction == DMA_FROM_DEVICE),
-0, vsg->pages, NULL);
-
-   up_read(&current->mm->mmap_sem);
+   ret = get_user_pages_fast((unsigned long)xfer->mem_addr,
+ vsg->num_pages,
+ (vsg->direction == DMA_FROM_DEVICE),
+ vsg->pages);
if (ret != vsg->num_pages) {
if (ret < 0)
return ret;
-- 
2.1.4



[PATCH 0/7] get_user_pages() cleanup

2015-10-06 Thread Jan Kara
From: Jan Kara <j...@suse.cz>

  Hello,

Now when the usage of get_user_pages() in media drivers got cleaned up, here
comes a series which removes knowledge about mmap_sem from a couple of other
drivers. Patches are trivial and standalone but please check, they are only
compile tested. If you are OK with them, either take them through your
respective trees or ack them and I can take care of pushing them to Linus
(probably through mm tree). Thanks.

After these patches there are some 12 call sites of get_user_pages() outside of
core code (mostly infiniband and RDMA). So we are slowly getting to the goal of
removing knowledge about page fault locking from drivers which will
consequently allow us to change the locking rules with reasonable effort.

Honza

CC: Jesper Nilsson 
CC: linux-cris-kernel at axis.com
CC: Mikael Starvik 
CC: linux-ia64 at vger.kernel.org
CC: Tony Luck 
CC: David Airlie 
CC: dri-devel at lists.freedesktop.org
CC: Timur Tabi 
CC: linux-rdma at vger.kernel.org
CC: Roland Dreier 
CC: Daniel Vetter 
CC: David Airlie 
CC: dri-devel at lists.freedesktop.org
CC: Andy Walls 
CC: linux-media at vger.kernel.org
CC: Mauro Carvalho Chehab 


[PATCH 9/9] [media] mm: Move get_vaddr_frames() behind a config option

2015-06-18 Thread Jan Kara
On Wed 10-06-15 09:37:20, Josh Triplett wrote:
> On Wed, Jun 10, 2015 at 06:20:52AM -0300, Mauro Carvalho Chehab wrote:
> > From: Jan Kara 
> > 
> > get_vaddr_frames() is used by relatively rare drivers so hide it and the
> > related functions behind a config option that is selected only by
> > drivers that need the infrastructure.
> > 
> > Suggested-by: Andrew Morton 
> > 
> > Signed-off-by: Jan Kara 
> > Signed-off-by: Hans Verkuil 
> > Signed-off-by: Mauro Carvalho Chehab 
> 
> Seems sensible to me.
> 
> Since this patch makes the kernel smaller, can you include the delta
> from bloat-o-meter between allnoconfig with and without this patch?

The results are:

add/remove: 0/6 grow/shrink: 0/0 up/down: 0/-868 (-868)
function                  old     new   delta
frame_vector_destroy       55       -     -55
frame_vector_to_pfns       56       -     -56
frame_vector_create        81       -     -81
put_vaddr_frames           93       -     -93
frame_vector_to_pages      98       -     -98
get_vaddr_frames          485       -    -485

I've added it to the changelog of the patch.

> Also, I assume you've compile-tested the kernel with allyesconfig minus
> the three options that now have "select FRAME_VECTOR", to make sure it
> builds?
  I did not, because the config option VIDEOBUF2_MEMOPS is a virtual one,
selected transitively by quite a few video drivers, and I didn't bother
tracking down all of them... But since that config option guards
compilation of the code I modified, I'm pretty confident I got it right.

Honza

> >  create mode 100644 mm/frame_vector.c
> > 
> > diff --git a/drivers/gpu/drm/exynos/Kconfig b/drivers/gpu/drm/exynos/Kconfig
> > index 0a6780367d28..fc678289cf79 100644
> > --- a/drivers/gpu/drm/exynos/Kconfig
> > +++ b/drivers/gpu/drm/exynos/Kconfig
> > @@ -71,6 +71,7 @@ config DRM_EXYNOS_VIDI
> >  config DRM_EXYNOS_G2D
> > bool "Exynos DRM G2D"
> > depends on DRM_EXYNOS && !VIDEO_SAMSUNG_S5P_G2D
> > +   select FRAME_VECTOR
> > help
> >   Choose this option if you want to use Exynos G2D for DRM.
> >  
> > diff --git a/drivers/media/platform/omap/Kconfig 
> > b/drivers/media/platform/omap/Kconfig
> > index dc2aaab54aef..217d613b0fe7 100644
> > --- a/drivers/media/platform/omap/Kconfig
> > +++ b/drivers/media/platform/omap/Kconfig
> > @@ -10,6 +10,7 @@ config VIDEO_OMAP2_VOUT
> > select OMAP2_DSS if HAS_IOMEM && ARCH_OMAP2PLUS
> > select OMAP2_VRFB if ARCH_OMAP2 || ARCH_OMAP3
> > select VIDEO_OMAP2_VOUT_VRFB if VIDEO_OMAP2_VOUT && OMAP2_VRFB
> > +   select FRAME_VECTOR
> > default n
> > ---help---
> >   V4L2 Display driver support for OMAP2/3 based boards.
> > diff --git a/drivers/media/v4l2-core/Kconfig 
> > b/drivers/media/v4l2-core/Kconfig
> > index f7a01a72eb9e..f38f6e387f04 100644
> > --- a/drivers/media/v4l2-core/Kconfig
> > +++ b/drivers/media/v4l2-core/Kconfig
> > @@ -73,6 +73,7 @@ config VIDEOBUF2_CORE
> >  
> >  config VIDEOBUF2_MEMOPS
> > tristate
> > +   select FRAME_VECTOR
> >  
> >  config VIDEOBUF2_DMA_CONTIG
> > tristate
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index 390214da4546..2ca52e9986f0 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -635,3 +635,6 @@ config MAX_STACK_SIZE_MB
> >   changed to a smaller value in which case that is used.
> >  
> >   A sane initial value is 80 MB.
> > +
> > +config FRAME_VECTOR
> > +   bool
> > diff --git a/mm/Makefile b/mm/Makefile
> > index 98c4eaeabdcb..be5d5c866305 100644
> > --- a/mm/Makefile
> > +++ b/mm/Makefile
> > @@ -78,3 +78,4 @@ obj-$(CONFIG_CMA) += cma.o
> >  obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
> >  obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
> >  obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
> > +obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
> > diff --git a/mm/frame_vector.c b/mm/frame_vector.c
> > new file mode 100644
> > index ..31a2bd5f41d5
> > --- /dev/null
> > +++ b/mm/frame_vector.c
> > @@ -0,0 +1,232 @@
> > +#include <linux/kernel.h>
> > +#include <linux/errno.h>
> > +#include <linux/err.h>
> > +#include <linux/mm.h>
> > +#include <linux/vmalloc.h>
> > +#include <linux/pagemap.h>
> > +#include <linux/sched.h>
> > +
> > +/*
> > + * get_vaddr_frames() - map virtual addresses to pfns
> > + * @start: starting user address
> > + * @nr_fra

[PATCH 2/9] mm: Provide new get_vaddr_frames() helper

2015-06-03 Thread Jan Kara
On Tue 02-06-15 15:29:12, Andrew Morton wrote:
> On Tue, 2 Jun 2015 17:23:00 +0200 Jan Kara  wrote:
> 
> > > That's a lump of new code which many kernels won't be needing.  Can we
> > > put all this in a new .c file and select it within drivers/media
> > > Kconfig?
> >   So the attached patch should do what you had in mind. OK?
> 
> lgtm.
> 
> >  drivers/gpu/drm/exynos/Kconfig  |   1 +
> >  drivers/media/platform/omap/Kconfig |   1 +
> >  drivers/media/v4l2-core/Kconfig |   1 +
> >  mm/Kconfig  |   3 +
> >  mm/Makefile |   1 +
> >  mm/frame-vec.c  | 233 
> > 
> 
> But frame_vector.c would be a more pleasing name.  For `struct frame_vector'.
  OK, makes sense. Updated patch attached.

Honza
-- 
Jan Kara 
SUSE Labs, CR
[Attachment: 0001-mm-Move-get_vaddr_frames-behind-a-config-option.patch, text/x-patch, 16477 bytes:
<http://lists.freedesktop.org/archives/dri-devel/attachments/20150603/1bb89e2a/attachment-0001.bin>]
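
For reference, since the discussion above is about what file to name after
struct frame_vector: the layout below is a sketch inferred from the accessors
used across this series (frame_vector_count(), frame_vector_pages(),
frame_vector_pfns()), not a hunk quoted from the patch:

struct frame_vector {
	unsigned int nr_allocated;	/* number of frames we have space for */
	unsigned int nr_frames;		/* number of frames stored in ptrs array */
	bool got_ref;			/* pages pinned via page references? */
	bool is_pfns;			/* does ptrs contain pages or pfns? */
	void *ptrs[0];			/* array of pinned pfns / pages */
};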


[PATCH 2/9] mm: Provide new get_vaddr_frames() helper

2015-06-02 Thread Jan Kara
On Thu 28-05-15 16:24:02, Andrew Morton wrote:
> On Wed, 13 May 2015 15:08:08 +0200 Jan Kara  wrote:
> 
> > Provide new function get_vaddr_frames().  This function maps virtual
> > addresses from given start and fills given array with page frame numbers of
> > the corresponding pages. If given start belongs to a normal vma, the
> > function grabs a reference to each of the pages to pin them in memory. If
> > start belongs to a VM_IO | VM_PFNMAP vma, we don't touch page structures.
> > The caller must make sure pfns aren't reused for anything else while he is
> > using them.
> > 
> > This function is created for various drivers to simplify handling of
> > their buffers.
> > 
> > Acked-by: Mel Gorman 
> > Acked-by: Vlastimil Babka 
> > Signed-off-by: Jan Kara 
> > ---
> >  include/linux/mm.h |  44 +++
> >  mm/gup.c   | 226 +
> 
> That's a lump of new code which many kernels won't be needing.  Can we
> put all this in a new .c file and select it within drivers/media
> Kconfig?
  So the attached patch should do what you had in mind. OK?

Honza
-- 
Jan Kara 
SUSE Labs, CR
[Attachment: 0001-mm-Move-get_vaddr_frames-behind-a-config-option.patch, text/x-patch, 16447 bytes:
<http://lists.freedesktop.org/archives/dri-devel/attachments/20150602/dec9e2f0/attachment.bin>]


[PATCH 2/9] mm: Provide new get_vaddr_frames() helper

2015-06-01 Thread Jan Kara
On Thu 28-05-15 16:24:02, Andrew Morton wrote:
> On Wed, 13 May 2015 15:08:08 +0200 Jan Kara  wrote:
> 
> > Provide new function get_vaddr_frames().  This function maps virtual
> > addresses from given start and fills given array with page frame numbers of
> > the corresponding pages. If given start belongs to a normal vma, the
> > function grabs a reference to each of the pages to pin them in memory. If
> > start belongs to a VM_IO | VM_PFNMAP vma, we don't touch page structures.
> > The caller must make sure pfns aren't reused for anything else while he is
> > using them.
> > 
> > This function is created for various drivers to simplify handling of
> > their buffers.
> > 
> > Acked-by: Mel Gorman 
> > Acked-by: Vlastimil Babka 
> > Signed-off-by: Jan Kara 
> > ---
> >  include/linux/mm.h |  44 +++
> >  mm/gup.c   | 226 +
> 
> That's a lump of new code which many kernels won't be needing.  Can we
> put all this in a new .c file and select it within drivers/media
> Kconfig?
  Yeah, makes sense. I'll write a patch. Hans, is it OK with you if I
just create a patch on top of the series you have in your tree?

Honza
-- 
Jan Kara 
SUSE Labs, CR


[PATCH 0/9 v5] Helper to abstract vma handling in media layer

2015-05-25 Thread Jan Kara
On Mon 25-05-15 13:37:56, Hans Verkuil wrote:
> Hi Jan,
> 
> On 05/13/2015 03:08 PM, Jan Kara wrote:
> >   Hello,
> > 
> > I'm sending the fifth version of my patch series to abstract vma handling
> > from the various media drivers. The patches got some review from mm people
> > and testing from device driver guys, so unless someone objects, the patches
> > will be queued in the media tree for the next merge window.
> 
> What is the current status? I saw one comment for patch 9, so I assume it
> is not quite ready yet.
> 
> Let me know when you think it is time to merge.
  There was a minor comment on the exynos patch - I've sent an updated
version of that patch as a reply to the comment. Do you want me to resend
the whole series? Otherwise I think you can just pick up the patches and
merge them.
Thanks!

Honza

> > After this patch set drivers have to know far fewer details about vmas,
> > their types, and locking. Also quite some code is removed from them. As a
> > bonus drivers automatically get VM_FAULT_RETRY handling. The primary
> > motivation for this series is to remove knowledge about mmap_sem locking
> > from as many places as possible so that we can change it with reasonable
> > effort.
> > 
> > The core of the series is the new helper get_vaddr_frames(), which is given
> > a virtual address and fills in PFNs / struct page pointers (depending on
> > VMA type) into the provided array. If the PFNs correspond to normal pages,
> > it also grabs references to these pages. The difference from
> > get_user_pages() is that this function can also deal with pfnmap and io
> > mappings, which is what the media drivers need.
> > 
> > I have tested the patches with the vivid driver so at least the vb2 code
> > got some exposure. Conversion of other drivers was just compile-tested (for
> > x86, so e.g. the exynos driver, which is only for the Samsung platform, is
> > completely untested).
> > 
> > Honza
> > Changes since v4:
> > * Minor cleanups and fixes pointed out by Mel and Vlasta
> > * Added Acked-by tags
> > 
> > Changes since v3:
> > * Added include  into mm/gup.c as it's needed for some archs
> > * Fixed error path for exynos driver
> > 
> > Changes since v2:
> > * Renamed functions and structures as Mel suggested
> > * Other minor changes suggested by Mel
> > * Rebased on top of 4.1-rc2
> > * Changed functions to get pointer to array of pages / pfns to perform
> >   conversion if necessary. This fixes a possible issue in the omap driver I
> >   may have introduced in v2 and generally makes the API less error-prone.
> 
-- 
Jan Kara 
SUSE Labs, CR
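
To make the calling convention concrete, the pattern the conversions in this
series follow looks roughly like the sketch below. Error handling is trimmed,
the write/force flags of 1, 1 mirror the exynos conversion quoted later in
the thread, and example_pin_buffer() is a made-up name, not a function from
the series:

#include <linux/kernel.h>
#include <linux/mm.h>	/* frame_vector API added by patch 2/9 */

static int example_pin_buffer(unsigned long start, unsigned int n_frames)
{
	struct frame_vector *vec;
	int ret;

	vec = frame_vector_create(n_frames);
	if (!vec)
		return -ENOMEM;

	/* Pin frames; returns the number of frames mapped or -errno. */
	ret = get_vaddr_frames(start, n_frames, 1, 1, vec);
	if (ret != n_frames)
		goto fail;

	if (frame_vector_to_pages(vec) >= 0) {
		/* Normal memory: struct page pointers are available. */
		struct page **pages = frame_vector_pages(vec);

		pr_info("pinned %u pages, first %p\n",
			frame_vector_count(vec), pages[0]);
	} else {
		/* VM_IO / VM_PFNMAP: only raw pfns are available. */
		unsigned long *pfns = frame_vector_pfns(vec);

		pr_info("mapped %u pfns, first %lu\n",
			frame_vector_count(vec), pfns[0]);
	}
	/* When done: put_vaddr_frames(vec); frame_vector_destroy(vec); */
	return 0;

fail:
	if (ret > 0)
		put_vaddr_frames(vec);	/* a partial pin still holds refs */
	frame_vector_destroy(vec);
	return ret < 0 ? ret : -EFAULT;
}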


[PATCH 9/9] drm/exynos: Convert g2d_userptr_get_dma_addr() to use get_vaddr_frames()

2015-05-14 Thread Jan Kara
On Thu 14-05-15 19:51:23, Inki Dae wrote:
> Hi,
> 
> On 2015-05-13 22:08, Jan Kara wrote:
> > Convert g2d_userptr_get_dma_addr() to pin pages using get_vaddr_frames().
> > This removes the knowledge about vmas and mmap_sem locking from the exynos
> > driver. Also it fixes a problem where the function was mapping a
> > user-provided address without holding mmap_sem.
> > 
> > Signed-off-by: Jan Kara 
> > ---
> >  drivers/gpu/drm/exynos/exynos_drm_g2d.c | 89 ++
> >  drivers/gpu/drm/exynos/exynos_drm_gem.c | 97 -
> >  2 files changed, 29 insertions(+), 157 deletions(-)
> > 
> > diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c
> > index 81a250830808..265519c0fe2d 100644
> > --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c
> > +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c
> > @@ -190,10 +190,8 @@ struct g2d_cmdlist_userptr {
> > dma_addr_t  dma_addr;
> > unsigned long   userptr;
> > unsigned long   size;
> > -   struct page **pages;
> > -   unsigned intnpages;
> > +   struct frame_vector *vec;
> > struct sg_table *sgt;
> > -   struct vm_area_struct   *vma;
> > atomic_trefcount;
> > boolin_pool;
> > boolout_of_list;
> > @@ -363,6 +361,7 @@ static void g2d_userptr_put_dma_addr(struct drm_device *drm_dev,
> >  {
> > struct g2d_cmdlist_userptr *g2d_userptr =
> > (struct g2d_cmdlist_userptr *)obj;
> > +   struct page **pages;
> >  
> > if (!obj)
> > return;
> > @@ -382,19 +381,21 @@ out:
> > exynos_gem_unmap_sgt_from_dma(drm_dev, g2d_userptr->sgt,
> > DMA_BIDIRECTIONAL);
> >  
> > -   exynos_gem_put_pages_to_userptr(g2d_userptr->pages,
> > -   g2d_userptr->npages,
> > -   g2d_userptr->vma);
> > +   pages = frame_vector_pages(g2d_userptr->vec);
> > +   if (!IS_ERR(pages)) {
> > +   int i;
> >  
> > -   exynos_gem_put_vma(g2d_userptr->vma);
> > +   for (i = 0; i < frame_vector_count(g2d_userptr->vec); i++)
> > +   set_page_dirty_lock(pages[i]);
> > +   }
> > +   put_vaddr_frames(g2d_userptr->vec);
> > +   frame_vector_destroy(g2d_userptr->vec);
> >  
> > if (!g2d_userptr->out_of_list)
> > list_del_init(&g2d_userptr->list);
> >  
> > sg_free_table(g2d_userptr->sgt);
> > kfree(g2d_userptr->sgt);
> > -
> > -   drm_free_large(g2d_userptr->pages);
> > kfree(g2d_userptr);
> >  }
> >  
> > @@ -413,6 +414,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
> > struct vm_area_struct *vma;
> > unsigned long start, end;
> > unsigned int npages, offset;
> > +   struct frame_vector *vec;
> > int ret;
> >  
> > if (!size) {
> > @@ -456,65 +458,37 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
> > return ERR_PTR(-ENOMEM);
> >  
> > atomic_set(&g2d_userptr->refcount, 1);
> > +   g2d_userptr->size = size;
> >  
> > start = userptr & PAGE_MASK;
> > offset = userptr & ~PAGE_MASK;
> > end = PAGE_ALIGN(userptr + size);
> > npages = (end - start) >> PAGE_SHIFT;
> > -   g2d_userptr->npages = npages;
> > -
> > -   pages = drm_calloc_large(npages, sizeof(struct page *));
> 
> The declaration of 'pages' isn't needed anymore because you removed it.
> 
> > -   if (!pages) {
> > -   DRM_ERROR("failed to allocate pages.\n");
> > -   ret = -ENOMEM;
> > +   vec = g2d_userptr->vec = frame_vector_create(npages);
> 
> I think you can use g2d_userptr->vec so it seems that vec isn't needed.
> 
> > +   if (!vec)
> > goto err_free;
> > -   }
> >  
> > -   down_read(&current->mm->mmap_sem);
> > -   vma = find_vma(current->mm, userptr);
> 
> For vma, ditto.
  Thanks for the review! Attached is a new version of the patch.

Honza

-- 
Jan Kara 
SUSE Labs, CR
[Attachment: 0001-drm-exynos-Convert-g2d_userptr_get_dma_addr-to-use-g.patch, text/x-patch, 7676 bytes:
<http://lists.freedesktop.org/archives/dri-devel/attachments/20150514/cf217dc2/attachment.bin>]


[PATCH 9/9] drm/exynos: Convert g2d_userptr_get_dma_addr() to use get_vaddr_frames()

2015-05-13 Thread Jan Kara
Convert g2d_userptr_get_dma_addr() to pin pages using get_vaddr_frames().
This removes the knowledge about vmas and mmap_sem locking from the exynos
driver. Also it fixes a problem where the function was mapping a
user-provided address without holding mmap_sem.

Signed-off-by: Jan Kara 
---
 drivers/gpu/drm/exynos/exynos_drm_g2d.c | 89 ++
 drivers/gpu/drm/exynos/exynos_drm_gem.c | 97 -
 2 files changed, 29 insertions(+), 157 deletions(-)

diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c
index 81a250830808..265519c0fe2d 100644
--- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c
+++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c
@@ -190,10 +190,8 @@ struct g2d_cmdlist_userptr {
dma_addr_t  dma_addr;
unsigned long   userptr;
unsigned long   size;
-   struct page **pages;
-   unsigned intnpages;
+   struct frame_vector *vec;
struct sg_table *sgt;
-   struct vm_area_struct   *vma;
atomic_trefcount;
boolin_pool;
boolout_of_list;
@@ -363,6 +361,7 @@ static void g2d_userptr_put_dma_addr(struct drm_device *drm_dev,
 {
struct g2d_cmdlist_userptr *g2d_userptr =
(struct g2d_cmdlist_userptr *)obj;
+   struct page **pages;

if (!obj)
return;
@@ -382,19 +381,21 @@ out:
exynos_gem_unmap_sgt_from_dma(drm_dev, g2d_userptr->sgt,
DMA_BIDIRECTIONAL);

-   exynos_gem_put_pages_to_userptr(g2d_userptr->pages,
-   g2d_userptr->npages,
-   g2d_userptr->vma);
+   pages = frame_vector_pages(g2d_userptr->vec);
+   if (!IS_ERR(pages)) {
+   int i;

-   exynos_gem_put_vma(g2d_userptr->vma);
+   for (i = 0; i < frame_vector_count(g2d_userptr->vec); i++)
+   set_page_dirty_lock(pages[i]);
+   }
+   put_vaddr_frames(g2d_userptr->vec);
+   frame_vector_destroy(g2d_userptr->vec);

if (!g2d_userptr->out_of_list)
list_del_init(&g2d_userptr->list);

sg_free_table(g2d_userptr->sgt);
kfree(g2d_userptr->sgt);
-
-   drm_free_large(g2d_userptr->pages);
kfree(g2d_userptr);
 }

@@ -413,6 +414,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
struct vm_area_struct *vma;
unsigned long start, end;
unsigned int npages, offset;
+   struct frame_vector *vec;
int ret;

if (!size) {
@@ -456,65 +458,37 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev,
return ERR_PTR(-ENOMEM);

atomic_set(&g2d_userptr->refcount, 1);
+   g2d_userptr->size = size;

start = userptr & PAGE_MASK;
offset = userptr & ~PAGE_MASK;
end = PAGE_ALIGN(userptr + size);
npages = (end - start) >> PAGE_SHIFT;
-   g2d_userptr->npages = npages;
-
-   pages = drm_calloc_large(npages, sizeof(struct page *));
-   if (!pages) {
-   DRM_ERROR("failed to allocate pages.\n");
-   ret = -ENOMEM;
+   vec = g2d_userptr->vec = frame_vector_create(npages);
+   if (!vec)
goto err_free;
-   }

-   down_read(&current->mm->mmap_sem);
-   vma = find_vma(current->mm, userptr);
-   if (!vma) {
-   up_read(&current->mm->mmap_sem);
-   DRM_ERROR("failed to get vm region.\n");
+   ret = get_vaddr_frames(start, npages, 1, 1, vec);
+   if (ret != npages) {
+   DRM_ERROR("failed to get user pages from userptr.\n");
+   if (ret < 0)
+   goto err_destroy_framevec;
ret = -EFAULT;
-   goto err_free_pages;
+   goto err_put_framevec;
}
-
-   if (vma->vm_end < userptr + size) {
-   up_read(&current->mm->mmap_sem);
-   DRM_ERROR("vma is too small.\n");
+   if (frame_vector_to_pages(vec) < 0) {
ret = -EFAULT;
-   goto err_free_pages;
+   goto err_put_framevec;
}

-   g2d_userptr->vma = exynos_gem_get_vma(vma);
-   if (!g2d_userptr->vma) {
-   up_read(&current->mm->mmap_sem);
-   DRM_ERROR("failed to copy vma.\n");
-   ret = -ENOMEM;
-   goto err_free_pages;
-   }
-
-   g2d_userptr->size = size;
-
-   ret = exynos_gem_get_pages_from_userptr(start & PAGE_MASK,
-   npages, pages, vma);
-   if (ret < 0) {
-   up_read(&current->mm->mmap_sem);
-
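
The release side shown in g2d_userptr_put_dma_addr() above condenses to a
pattern that repeats across the series: dirty any real pages, then unpin and
free the vector. A sketch (example_unpin_buffer() is a made-up name):

#include <linux/err.h>
#include <linux/mm.h>

static void example_unpin_buffer(struct frame_vector *vec)
{
	/* frame_vector_pages() returns an ERR_PTR() when the vector holds
	 * raw pfns (VM_IO/VM_PFNMAP), so only real pages get dirtied. */
	struct page **pages = frame_vector_pages(vec);

	if (!IS_ERR(pages)) {
		int i;

		for (i = 0; i < frame_vector_count(vec); i++)
			set_page_dirty_lock(pages[i]);
	}
	put_vaddr_frames(vec);
	frame_vector_destroy(vec);
}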

[PATCH 8/9] media: vb2: Remove unused functions

2015-05-13 Thread Jan Kara
Conversion to the use of pinned pfns made some functions unused. Remove
them. Also there's no need to lock mmap_sem in __buf_prepare() anymore.

Acked-by: Marek Szyprowski 
Tested-by: Marek Szyprowski 
Signed-off-by: Jan Kara 
---
 drivers/media/v4l2-core/videobuf2-memops.c | 114 -
 include/media/videobuf2-memops.h   |   6 --
 2 files changed, 120 deletions(-)

diff --git a/drivers/media/v4l2-core/videobuf2-memops.c b/drivers/media/v4l2-core/videobuf2-memops.c
index 0ec186d41b9b..48c6a49c4928 100644
--- a/drivers/media/v4l2-core/videobuf2-memops.c
+++ b/drivers/media/v4l2-core/videobuf2-memops.c
@@ -23,120 +23,6 @@
 #include 

 /**
- * vb2_get_vma() - acquire and lock the virtual memory area
- * @vma:   given virtual memory area
- *
- * This function attempts to acquire an area mapped in the userspace for
- * the duration of a hardware operation. The area is "locked" by performing
- * the same set of operation that are done when process calls fork() and
- * memory areas are duplicated.
- *
- * Returns a copy of a virtual memory region on success or NULL.
- */
-struct vm_area_struct *vb2_get_vma(struct vm_area_struct *vma)
-{
-   struct vm_area_struct *vma_copy;
-
-   vma_copy = kmalloc(sizeof(*vma_copy), GFP_KERNEL);
-   if (vma_copy == NULL)
-   return NULL;
-
-   if (vma->vm_ops && vma->vm_ops->open)
-   vma->vm_ops->open(vma);
-
-   if (vma->vm_file)
-   get_file(vma->vm_file);
-
-   memcpy(vma_copy, vma, sizeof(*vma));
-
-   vma_copy->vm_mm = NULL;
-   vma_copy->vm_next = NULL;
-   vma_copy->vm_prev = NULL;
-
-   return vma_copy;
-}
-EXPORT_SYMBOL_GPL(vb2_get_vma);
-
-/**
- * vb2_put_userptr() - release a userspace virtual memory area
- * @vma:   virtual memory region associated with the area to be released
- *
- * This function releases the previously acquired memory area after a hardware
- * operation.
- */
-void vb2_put_vma(struct vm_area_struct *vma)
-{
-   if (!vma)
-   return;
-
-   if (vma->vm_ops && vma->vm_ops->close)
-   vma->vm_ops->close(vma);
-
-   if (vma->vm_file)
-   fput(vma->vm_file);
-
-   kfree(vma);
-}
-EXPORT_SYMBOL_GPL(vb2_put_vma);
-
-/**
- * vb2_get_contig_userptr() - lock physically contiguous userspace mapped memory
- * @vaddr: starting virtual address of the area to be verified
- * @size:  size of the area
- * @res_paddr: will return physical address for the given vaddr
- * @res_vma:   will return locked copy of struct vm_area for the given area
- *
- * This function will go through memory area of size @size mapped at @vaddr and
- * verify that the underlying physical pages are contiguous. If they are
- * contiguous the virtual memory area is locked and a @res_vma is filled with
- * the copy and @res_pa set to the physical address of the buffer.
- *
- * Returns 0 on success.
- */
-int vb2_get_contig_userptr(unsigned long vaddr, unsigned long size,
-  struct vm_area_struct **res_vma, dma_addr_t *res_pa)
-{
-   struct mm_struct *mm = current->mm;
-   struct vm_area_struct *vma;
-   unsigned long offset, start, end;
-   unsigned long this_pfn, prev_pfn;
-   dma_addr_t pa = 0;
-
-   start = vaddr;
-   offset = start & ~PAGE_MASK;
-   end = start + size;
-
-   vma = find_vma(mm, start);
-
-   if (vma == NULL || vma->vm_end < end)
-   return -EFAULT;
-
-   for (prev_pfn = 0; start < end; start += PAGE_SIZE) {
-   int ret = follow_pfn(vma, start, &this_pfn);
-   if (ret)
-   return ret;
-
-   if (prev_pfn == 0)
-   pa = this_pfn << PAGE_SHIFT;
-   else if (this_pfn != prev_pfn + 1)
-   return -EFAULT;
-
-   prev_pfn = this_pfn;
-   }
-
-   /*
-* Memory is contigous, lock vma and return to the caller
-*/
-   *res_vma = vb2_get_vma(vma);
-   if (*res_vma == NULL)
-   return -ENOMEM;
-
-   *res_pa = pa + offset;
-   return 0;
-}
-EXPORT_SYMBOL_GPL(vb2_get_contig_userptr);
-
-/**
  * vb2_create_framevec() - map virtual addresses to pfns
  * @start: Virtual user address where we start mapping
  * @length:Length of a range to map
diff --git a/include/media/videobuf2-memops.h b/include/media/videobuf2-memops.h
index 2f0564ff5f31..830b5239fd8b 100644
--- a/include/media/videobuf2-memops.h
+++ b/include/media/videobuf2-memops.h
@@ -31,12 +31,6 @@ struct vb2_vmarea_handler {

 extern const struct vm_operations_struct vb2_common_vm_ops;

-int vb2_get_contig_userptr(unsigned long vaddr, unsigned long size,
-  struct vm_area_struct **res_vma, dma_addr_t *res_pa);
-
-struct vm_area_struct *vb2_get_vma(struct vm_area_struct *vma);
-void vb2_p
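
The vb2 conversions in this series go through a vb2_create_framevec() wrapper
(its kerneldoc is visible as context in the memops hunk above, but the hunk
that adds it is not quoted in this archive). Judging from the callers, which
invoke vb2_create_framevec(vaddr, size, write) and expect a filled frame
vector or an ERR_PTR(), it presumably amounts to something like this sketch
(example_create_framevec() is a made-up name, not the actual implementation):

#include <linux/err.h>
#include <linux/mm.h>
#include <linux/types.h>

static struct frame_vector *example_create_framevec(unsigned long start,
						    unsigned long length,
						    bool write)
{
	unsigned int nr_frames;
	unsigned long first, last;
	struct frame_vector *vec;
	int ret;

	first = start >> PAGE_SHIFT;
	last = (start + length - 1) >> PAGE_SHIFT;
	nr_frames = last - first + 1;

	vec = frame_vector_create(nr_frames);
	if (!vec)
		return ERR_PTR(-ENOMEM);

	ret = get_vaddr_frames(start & PAGE_MASK, nr_frames, write, true, vec);
	if (ret < 0)
		goto out_destroy;
	/* The vb2 callers rely on getting the complete set of frames. */
	if (ret != nr_frames) {
		ret = -EFAULT;
		goto out_release;
	}
	return vec;

out_release:
	put_vaddr_frames(vec);
out_destroy:
	frame_vector_destroy(vec);
	return ERR_PTR(ret);
}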

[PATCH 7/9] media: vb2: Convert vb2_dc_get_userptr() to use frame vector

2015-05-13 Thread Jan Kara
Convert vb2_dc_get_userptr() to use the frame vector infrastructure. While
we are doing that, there's no need to allocate a page array, and some code
can be simplified.

Acked-by: Marek Szyprowski 
Tested-by: Marek Szyprowski 
Signed-off-by: Jan Kara 
---
 drivers/media/v4l2-core/videobuf2-dma-contig.c | 214 -
 1 file changed, 34 insertions(+), 180 deletions(-)

diff --git a/drivers/media/v4l2-core/videobuf2-dma-contig.c b/drivers/media/v4l2-core/videobuf2-dma-contig.c
index 620c4aa78881..e6cea452302b 100644
--- a/drivers/media/v4l2-core/videobuf2-dma-contig.c
+++ b/drivers/media/v4l2-core/videobuf2-dma-contig.c
@@ -32,15 +32,13 @@ struct vb2_dc_buf {
dma_addr_t  dma_addr;
enum dma_data_direction dma_dir;
struct sg_table *dma_sgt;
+   struct frame_vector *vec;

/* MMAP related */
struct vb2_vmarea_handler   handler;
atomic_trefcount;
struct sg_table *sgt_base;

-   /* USERPTR related */
-   struct vm_area_struct   *vma;
-
/* DMABUF related */
struct dma_buf_attachment   *db_attach;
 };
@@ -49,24 +47,6 @@ struct vb2_dc_buf {
 /*scatterlist table functions*/
 /*/

-
-static void vb2_dc_sgt_foreach_page(struct sg_table *sgt,
-   void (*cb)(struct page *pg))
-{
-   struct scatterlist *s;
-   unsigned int i;
-
-   for_each_sg(sgt->sgl, s, sgt->orig_nents, i) {
-   struct page *page = sg_page(s);
-   unsigned int n_pages = PAGE_ALIGN(s->offset + s->length)
-   >> PAGE_SHIFT;
-   unsigned int j;
-
-   for (j = 0; j < n_pages; ++j, ++page)
-   cb(page);
-   }
-}
-
 static unsigned long vb2_dc_get_contiguous_size(struct sg_table *sgt)
 {
struct scatterlist *s;
@@ -429,92 +409,12 @@ static struct dma_buf *vb2_dc_get_dmabuf(void *buf_priv, unsigned long flags)
 /*   callbacks for USERPTR buffers   */
 /*/

-static inline int vma_is_io(struct vm_area_struct *vma)
-{
-   return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
-}
-
-static int vb2_dc_get_user_pfn(unsigned long start, int n_pages,
-   struct vm_area_struct *vma, unsigned long *res)
-{
-   unsigned long pfn, start_pfn, prev_pfn;
-   unsigned int i;
-   int ret;
-
-   if (!vma_is_io(vma))
-   return -EFAULT;
-
-   ret = follow_pfn(vma, start, &pfn);
-   if (ret)
-   return ret;
-
-   start_pfn = pfn;
-   start += PAGE_SIZE;
-
-   for (i = 1; i < n_pages; ++i, start += PAGE_SIZE) {
-   prev_pfn = pfn;
-   ret = follow_pfn(vma, start, &pfn);
-
-   if (ret) {
-   pr_err("no page for address %lu\n", start);
-   return ret;
-   }
-   if (pfn != prev_pfn + 1)
-   return -EINVAL;
-   }
-
-   *res = start_pfn;
-   return 0;
-}
-
-static int vb2_dc_get_user_pages(unsigned long start, struct page **pages,
-   int n_pages, struct vm_area_struct *vma,
-   enum dma_data_direction dma_dir)
-{
-   if (vma_is_io(vma)) {
-   unsigned int i;
-
-   for (i = 0; i < n_pages; ++i, start += PAGE_SIZE) {
-   unsigned long pfn;
-   int ret = follow_pfn(vma, start, );
-
-   if (!pfn_valid(pfn))
-   return -EINVAL;
-
-   if (ret) {
-   pr_err("no page for address %lu\n", start);
-   return ret;
-   }
-   pages[i] = pfn_to_page(pfn);
-   }
-   } else {
-   int n;
-
-   n = get_user_pages(current, current->mm, start & PAGE_MASK,
-   n_pages, dma_dir == DMA_FROM_DEVICE, 1, pages, NULL);
-   /* negative error means that no page was pinned */
-   n = max(n, 0);
-   if (n != n_pages) {
-   pr_err("got only %d of %d user pages\n", n, n_pages);
-   while (n)
-   put_page(pages[--n]);
-   return -EFAULT;
-   }
-   }
-
-   return 0;
-}
-
-static void vb2_dc_put_dirty_page(struct page *page)
-{
-   set_page_dirty_lock(page);
-   put_page(page);
-}
-
 static void vb2_dc_put_userptr(void *buf_priv)
 {
struct vb2_dc_buf *buf = buf_priv;
struct sg_table *sgt = buf->dma_sgt;
+   int i;
+   struct page **pages;

if (sgt) {
DEFINE_DMA_ATTRS(attrs);
@@ -526,15 +426,15 @@ static void vb2_dc_put_userptr(void *buf_priv)

[PATCH 6/9] media: vb2: Convert vb2_vmalloc_get_userptr() to use frame vector

2015-05-13 Thread Jan Kara
Convert vb2_vmalloc_get_userptr() to use the frame vector infrastructure.
While we are doing that, there's no need to allocate a page array, and
some code can be simplified.

Acked-by: Marek Szyprowski 
Tested-by: Marek Szyprowski 
Signed-off-by: Jan Kara 
---
 drivers/media/v4l2-core/videobuf2-vmalloc.c | 94 +++--
 1 file changed, 36 insertions(+), 58 deletions(-)

diff --git a/drivers/media/v4l2-core/videobuf2-vmalloc.c b/drivers/media/v4l2-core/videobuf2-vmalloc.c
index 0ba40be21ebd..d2ce81fa2cdf 100644
--- a/drivers/media/v4l2-core/videobuf2-vmalloc.c
+++ b/drivers/media/v4l2-core/videobuf2-vmalloc.c
@@ -23,11 +23,9 @@

 struct vb2_vmalloc_buf {
void*vaddr;
-   struct page **pages;
-   struct vm_area_struct   *vma;
+   struct frame_vector *vec;
enum dma_data_direction dma_dir;
unsigned long   size;
-   unsigned intn_pages;
atomic_trefcount;
struct vb2_vmarea_handler   handler;
struct dma_buf  *dbuf;
@@ -76,10 +74,8 @@ static void *vb2_vmalloc_get_userptr(void *alloc_ctx, unsigned long vaddr,
 enum dma_data_direction dma_dir)
 {
struct vb2_vmalloc_buf *buf;
-   unsigned long first, last;
-   int n_pages, offset;
-   struct vm_area_struct *vma;
-   dma_addr_t physp;
+   struct frame_vector *vec;
+   int n_pages, offset, i;

buf = kzalloc(sizeof(*buf), GFP_KERNEL);
if (!buf)
@@ -88,53 +84,36 @@ static void *vb2_vmalloc_get_userptr(void *alloc_ctx, unsigned long vaddr,
buf->dma_dir = dma_dir;
offset = vaddr & ~PAGE_MASK;
buf->size = size;
-
-   down_read(&current->mm->mmap_sem);
-   vma = find_vma(current->mm, vaddr);
-   if (vma && (vma->vm_flags & VM_PFNMAP) && (vma->vm_pgoff)) {
-   if (vb2_get_contig_userptr(vaddr, size, &vma, &physp))
-   goto fail_pages_array_alloc;
-   buf->vma = vma;
-   buf->vaddr = (__force void *)ioremap_nocache(physp, size);
-   if (!buf->vaddr)
-   goto fail_pages_array_alloc;
+   vec = vb2_create_framevec(vaddr, size, dma_dir == DMA_FROM_DEVICE);
+   if (IS_ERR(vec))
+   goto fail_pfnvec_create;
+   buf->vec = vec;
+   n_pages = frame_vector_count(vec);
+   if (frame_vector_to_pages(vec) < 0) {
+   unsigned long *nums = frame_vector_pfns(vec);
+
+   /*
+* We cannot get page pointers for these pfns. Check memory is
+* physically contiguous and use direct mapping.
+*/
+   for (i = 1; i < n_pages; i++)
+   if (nums[i-1] + 1 != nums[i])
+   goto fail_map;
+   buf->vaddr = (__force void *)
+   ioremap_nocache(nums[0] << PAGE_SHIFT, size);
} else {
-   first = vaddr >> PAGE_SHIFT;
-   last  = (vaddr + size - 1) >> PAGE_SHIFT;
-   buf->n_pages = last - first + 1;
-   buf->pages = kzalloc(buf->n_pages * sizeof(struct page *),
-GFP_KERNEL);
-   if (!buf->pages)
-   goto fail_pages_array_alloc;
-
-   /* current->mm->mmap_sem is taken by videobuf2 core */
-   n_pages = get_user_pages(current, current->mm,
-vaddr & PAGE_MASK, buf->n_pages,
-dma_dir == DMA_FROM_DEVICE,
-1, /* force */
-buf->pages, NULL);
-   if (n_pages != buf->n_pages)
-   goto fail_get_user_pages;
-
-   buf->vaddr = vm_map_ram(buf->pages, buf->n_pages, -1,
+   buf->vaddr = vm_map_ram(frame_vector_pages(vec), n_pages, -1,
PAGE_KERNEL);
-   if (!buf->vaddr)
-   goto fail_get_user_pages;
}
-   up_read(&current->mm->mmap_sem);

+   if (!buf->vaddr)
+   goto fail_map;
buf->vaddr += offset;
return buf;

-fail_get_user_pages:
-   pr_debug("get_user_pages requested/got: %d/%d]\n", n_pages,
-buf->n_pages);
-   while (--n_pages >= 0)
-   put_page(buf->pages[n_pages]);
-   kfree(buf->pages);
-
-fail_pages_array_alloc:
-   up_read(&current->mm->mmap_sem);
+fail_map:
+   vb2_destroy_framevec(vec);
+fail_pfnvec_create:
kfree(buf);

return NULL;
@@ -145,22 +124,21 @@ static void vb2_vmalloc_put_userptr(void *buf_p
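
The pfnmap branch in the hunk above is the part that preserves the old
vb2_get_contig_userptr() behaviour: when frame_vector_to_pages() fails, the
buffer is accepted only if its pfns are physically contiguous, and it is then
mapped directly. As a standalone sketch (example_map_contig_pfns() is a
made-up name):

#include <linux/io.h>
#include <linux/mm.h>

static void __iomem *example_map_contig_pfns(struct frame_vector *vec,
					     unsigned long size)
{
	unsigned long *nums = frame_vector_pfns(vec);
	int i, n_frames = frame_vector_count(vec);

	/* No struct pages here: accept only physically contiguous ranges. */
	for (i = 1; i < n_frames; i++)
		if (nums[i - 1] + 1 != nums[i])
			return NULL;

	return ioremap_nocache(nums[0] << PAGE_SHIFT, size);
}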

[PATCH 5/9] media: vb2: Convert vb2_dma_sg_get_userptr() to use frame vector

2015-05-13 Thread Jan Kara
Acked-by: Marek Szyprowski 
Tested-by: Marek Szyprowski 
Signed-off-by: Jan Kara 
---
 drivers/media/v4l2-core/videobuf2-dma-sg.c | 97 +-
 1 file changed, 15 insertions(+), 82 deletions(-)

diff --git a/drivers/media/v4l2-core/videobuf2-dma-sg.c b/drivers/media/v4l2-core/videobuf2-dma-sg.c
index afd4b514affc..4ee1b3fbfe2a 100644
--- a/drivers/media/v4l2-core/videobuf2-dma-sg.c
+++ b/drivers/media/v4l2-core/videobuf2-dma-sg.c
@@ -38,6 +38,7 @@ struct vb2_dma_sg_buf {
struct device   *dev;
void*vaddr;
struct page **pages;
+   struct frame_vector *vec;
int offset;
enum dma_data_direction dma_dir;
struct sg_table sg_table;
@@ -51,7 +52,6 @@ struct vb2_dma_sg_buf {
unsigned intnum_pages;
atomic_trefcount;
struct vb2_vmarea_handler   handler;
-   struct vm_area_struct   *vma;

struct dma_buf_attachment   *db_attach;
 };
@@ -224,25 +224,17 @@ static void vb2_dma_sg_finish(void *buf_priv)
dma_sync_sg_for_cpu(buf->dev, sgt->sgl, sgt->nents, buf->dma_dir);
 }

-static inline int vma_is_io(struct vm_area_struct *vma)
-{
-   return !!(vma->vm_flags & (VM_IO | VM_PFNMAP));
-}
-
 static void *vb2_dma_sg_get_userptr(void *alloc_ctx, unsigned long vaddr,
unsigned long size,
enum dma_data_direction dma_dir)
 {
struct vb2_dma_sg_conf *conf = alloc_ctx;
struct vb2_dma_sg_buf *buf;
-   unsigned long first, last;
-   int num_pages_from_user;
-   struct vm_area_struct *vma;
struct sg_table *sgt;
DEFINE_DMA_ATTRS(attrs);
+   struct frame_vector *vec;

dma_set_attr(DMA_ATTR_SKIP_CPU_SYNC, &attrs);
-
buf = kzalloc(sizeof *buf, GFP_KERNEL);
if (!buf)
return NULL;
@@ -253,63 +245,19 @@ static void *vb2_dma_sg_get_userptr(void *alloc_ctx, unsigned long vaddr,
buf->offset = vaddr & ~PAGE_MASK;
buf->size = size;
buf->dma_sgt = &buf->sg_table;
+   vec = vb2_create_framevec(vaddr, size, buf->dma_dir == DMA_FROM_DEVICE);
+   if (IS_ERR(vec))
+   goto userptr_fail_pfnvec;
+   buf->vec = vec;

-   first = (vaddr   & PAGE_MASK) >> PAGE_SHIFT;
-   last  = ((vaddr + size - 1) & PAGE_MASK) >> PAGE_SHIFT;
-   buf->num_pages = last - first + 1;
-
-   buf->pages = kzalloc(buf->num_pages * sizeof(struct page *),
-GFP_KERNEL);
-   if (!buf->pages)
-   goto userptr_fail_alloc_pages;
-
-   down_read(&current->mm->mmap_sem);
-   vma = find_vma(current->mm, vaddr);
-   if (!vma) {
-   dprintk(1, "no vma for address %lu\n", vaddr);
-   goto userptr_fail_find_vma;
-   }
-
-   if (vma->vm_end < vaddr + size) {
-   dprintk(1, "vma at %lu is too small for %lu bytes\n",
-   vaddr, size);
-   goto userptr_fail_find_vma;
-   }
-
-   buf->vma = vb2_get_vma(vma);
-   if (!buf->vma) {
-   dprintk(1, "failed to copy vma\n");
-   goto userptr_fail_find_vma;
-   }
-
-   if (vma_is_io(buf->vma)) {
-   for (num_pages_from_user = 0;
-num_pages_from_user < buf->num_pages;
-++num_pages_from_user, vaddr += PAGE_SIZE) {
-   unsigned long pfn;
-
-   if (follow_pfn(vma, vaddr, &pfn)) {
-   dprintk(1, "no page for address %lu\n", vaddr);
-   break;
-   }
-   buf->pages[num_pages_from_user] = pfn_to_page(pfn);
-   }
-   } else
-   num_pages_from_user = get_user_pages(current, current->mm,
-vaddr & PAGE_MASK,
-buf->num_pages,
-buf->dma_dir == DMA_FROM_DEVICE,
-1, /* force */
-buf->pages,
-NULL);
-   up_read(&current->mm->mmap_sem);
-
-   if (num_pages_from_user != buf->num_pages)
-   goto userptr_fail_get_user_pages;
+   buf->pages = frame_vector_pages(vec);
+   if (IS_ERR(buf->pages))
+   goto userptr_fail_sgtable;
+   buf->num_pages = frame_vector_count(vec);

if (sg_alloc_table_from_pages(buf->dma_sgt, buf->pages,
buf->num_pages, buf->offset, size, 0))
-  

[PATCH 3/9] media: omap_vout: Convert omap_vout_uservirt_to_phys() to use get_vaddr_pfns()

2015-05-13 Thread Jan Kara
Convert omap_vout_uservirt_to_phys() to use get_vaddr_pfns() instead of
a hand-made mapping of virtual addresses to physical addresses. Also, the
function leaked a page reference from get_user_pages(), so fix that by
properly releasing the reference when omap_vout_buffer_release() is
called.

Signed-off-by: Jan Kara 
---
 drivers/media/platform/omap/omap_vout.c | 67 +++--
 1 file changed, 31 insertions(+), 36 deletions(-)

diff --git a/drivers/media/platform/omap/omap_vout.c b/drivers/media/platform/omap/omap_vout.c
index 17b189a81ec5..0e4b3cfacc5d 100644
--- a/drivers/media/platform/omap/omap_vout.c
+++ b/drivers/media/platform/omap/omap_vout.c
@@ -195,46 +195,34 @@ static int omap_vout_try_format(struct v4l2_pix_format *pix)
 }

 /*
- * omap_vout_uservirt_to_phys: This inline function is used to convert user
- * space virtual address to physical address.
+ * omap_vout_get_userptr: Convert user space virtual address to physical
+ * address.
  */
-static unsigned long omap_vout_uservirt_to_phys(unsigned long virtp)
+static int omap_vout_get_userptr(struct videobuf_buffer *vb, u32 virtp,
+u32 *physp)
 {
-   unsigned long physp = 0;
-   struct vm_area_struct *vma;
-   struct mm_struct *mm = current->mm;
+   struct frame_vector *vec;
+   int ret;

/* For kernel direct-mapped memory, take the easy way */
-   if (virtp >= PAGE_OFFSET)
-   return virt_to_phys((void *) virtp);
-
-   down_read(&current->mm->mmap_sem);
-   vma = find_vma(mm, virtp);
-   if (vma && (vma->vm_flags & VM_IO) && vma->vm_pgoff) {
-   /* this will catch, kernel-allocated, mmaped-to-usermode
-  addresses */
-   physp = (vma->vm_pgoff << PAGE_SHIFT) + (virtp - vma->vm_start);
-   up_read(&current->mm->mmap_sem);
-   } else {
-   /* otherwise, use get_user_pages() for general userland pages */
-   int res, nr_pages = 1;
-   struct page *pages;
+   if (virtp >= PAGE_OFFSET) {
+   *physp = virt_to_phys((void *)virtp);
+   return 0;
+   }

-   res = get_user_pages(current, current->mm, virtp, nr_pages, 1,
-   0, &pages, NULL);
-   up_read(&current->mm->mmap_sem);
+   vec = frame_vector_create(1);
+   if (!vec)
+   return -ENOMEM;

-   if (res == nr_pages) {
-   physp =  __pa(page_address(&pages[0]) +
-   (virtp & ~PAGE_MASK));
-   } else {
-   printk(KERN_WARNING VOUT_NAME
-   "get_user_pages failed\n");
-   return 0;
-   }
+   ret = get_vaddr_frames(virtp, 1, true, false, vec);
+   if (ret != 1) {
+   frame_vector_destroy(vec);
+   return -EINVAL;
}
+   *physp = __pfn_to_phys(frame_vector_pfns(vec)[0]);
+   vb->priv = vec;

-   return physp;
+   return 0;
 }

 /*
@@ -788,11 +776,15 @@ static int omap_vout_buffer_prepare(struct videobuf_queue *q,
 * address of the buffer
 */
if (V4L2_MEMORY_USERPTR == vb->memory) {
+   int ret;
+
if (0 == vb->baddr)
return -EINVAL;
/* Physical address */
-   vout->queued_buf_addr[vb->i] = (u8 *)
-   omap_vout_uservirt_to_phys(vb->baddr);
+   ret = omap_vout_get_userptr(vb, vb->baddr,
+   (u32 *)&vout->queued_buf_addr[vb->i]);
+   if (ret < 0)
+   return ret;
} else {
unsigned long addr, dma_addr;
unsigned long size;
@@ -841,9 +833,12 @@ static void omap_vout_buffer_release(struct videobuf_queue *q,
struct omap_vout_device *vout = q->priv_data;

vb->state = VIDEOBUF_NEEDS_INIT;
+   if (vb->memory == V4L2_MEMORY_USERPTR && vb->priv) {
+   struct frame_vector *vec = vb->priv;

-   if (V4L2_MEMORY_MMAP != vout->memory)
-   return;
+   put_vaddr_frames(vec);
+   frame_vector_destroy(vec);
+   }
 }

 /*
-- 
2.1.4


