Re: [PATCH v4 2/3] dma-buf: add DMA_BUF_SET_NAME ioctls

2019-06-13 Thread Suren Baghdasaryan
On Wed, Jun 12, 2019 at 2:48 PM 'Chenbo Feng' via kernel-team
 wrote:
>
> From: Greg Hackmann 
>
> This patch adds complimentary DMA_BUF_SET_NAME  ioctls, which lets
> userspace processes attach a free-form name to each buffer.
>
> This information can be extremely helpful for tracking and accounting
> shared buffers.  For example, on Android, we know what each buffer will
> be used for at allocation time: GL, multimedia, camera, etc.  The
> userspace allocator can use DMA_BUF_SET_NAME to associate that
> information with the buffer, so we can later give developers a
> breakdown of how much memory they're allocating for graphics, camera,
> etc.
>
> Signed-off-by: Greg Hackmann 
> Signed-off-by: Chenbo Feng 
> ---
>  drivers/dma-buf/dma-buf.c| 64 ++--
>  include/linux/dma-buf.h  |  5 ++-
>  include/uapi/linux/dma-buf.h |  3 ++
>  3 files changed, 68 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> index ffd5a2ad7d6f..87a928c93c1a 100644
> --- a/drivers/dma-buf/dma-buf.c
> +++ b/drivers/dma-buf/dma-buf.c
> @@ -48,8 +48,24 @@ struct dma_buf_list {
>
>  static struct dma_buf_list db_list;
>
> +static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int buflen)
> +{
> +   struct dma_buf *dmabuf;
> +   char name[DMA_BUF_NAME_LEN];
> +   size_t ret = 0;
> +
> +   dmabuf = dentry->d_fsdata;
> +   mutex_lock(>lock);
> +   if (dmabuf->name)
> +   ret = strlcpy(name, dmabuf->name, DMA_BUF_NAME_LEN);
> +   mutex_unlock(>lock);
> +
> +   return dynamic_dname(dentry, buffer, buflen, "/%s:%s",
> +dentry->d_name.name, ret > 0 ? name : "");
> +}
> +
>  static const struct dentry_operations dma_buf_dentry_ops = {
> -   .d_dname = simple_dname,
> +   .d_dname = dmabuffs_dname,
>  };
>
>  static struct vfsmount *dma_buf_mnt;
> @@ -297,6 +313,42 @@ static __poll_t dma_buf_poll(struct file *file, 
> poll_table *poll)
> return events;
>  }
>
> +/**
> + * dma_buf_set_name - Set a name to a specific dma_buf to track the usage.
> + * The name of the dma-buf buffer can only be set when the dma-buf is not
> + * attached to any devices. It could theoritically support changing the
> + * name of the dma-buf if the same piece of memory is used for multiple
> + * purpose between different devices.
> + *
> + * @dmabuf [in] dmabuf buffer that will be renamed.
> + * @buf:   [in] A piece of userspace memory that contains the name of
> + *  the dma-buf.
> + *
> + * Returns 0 on success. If the dma-buf buffer is already attached to
> + * devices, return -EBUSY.
> + *
> + */
> +static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user *buf)
> +{
> +   char *name = strndup_user(buf, DMA_BUF_NAME_LEN);
> +   long ret = 0;
> +
> +   if (IS_ERR(name))
> +   return PTR_ERR(name);
> +
> +   mutex_lock(>lock);
> +   if (!list_empty(>attachments)) {
> +   ret = -EBUSY;

Are you missing kfree(name) here?

> +   goto out_unlock;
> +   }
> +   kfree(dmabuf->name);
> +   dmabuf->name = name;
> +
> +out_unlock:
> +   mutex_unlock(>lock);
> +   return ret;
> +}
> +
>  static long dma_buf_ioctl(struct file *file,
>   unsigned int cmd, unsigned long arg)
>  {
> @@ -335,6 +387,10 @@ static long dma_buf_ioctl(struct file *file,
> ret = dma_buf_begin_cpu_access(dmabuf, direction);
>
> return ret;
> +
> +   case DMA_BUF_SET_NAME:
> +   return dma_buf_set_name(dmabuf, (const char __user *)arg);
> +
> default:
> return -ENOTTY;
> }
> @@ -376,6 +432,7 @@ static struct file *dma_buf_getfile(struct dma_buf 
> *dmabuf, int flags)
> goto err_alloc_file;
> file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
> file->private_data = dmabuf;
> +   file->f_path.dentry->d_fsdata = dmabuf;
>
> return file;
>
> @@ -1082,12 +1139,13 @@ static int dma_buf_debug_show(struct seq_file *s, 
> void *unused)
> continue;
> }
>
> -   seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\n",
> +   seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n",
> buf_obj->size,
> buf_obj->file->f_flags, buf_obj->file->f_mode,
> file_count(buf_obj->file),
> buf_obj->exp_name,
> -   file_inode(buf_obj->file)->i_ino);
> +   file_inode(buf_obj->file)->i_ino,
> +   buf_obj->name ?: "");
>
> robj = buf_obj->resv;
> while (true) {
> diff --git a/include/linux/dma-buf.h b/include/linux/dma-buf.h
> index 58725f890b5b..582998e19df6 100644
> --- 

Re: [RESEND PATCH v3 2/3] dma-buf: add DMA_BUF_{GET,SET}_NAME ioctls

2019-06-13 Thread Suren Baghdasaryan
On Wed, Jun 12, 2019 at 2:49 PM 'Chenbo Feng' via kernel-team
 wrote:
>
> On Wed, Jun 12, 2019 at 7:43 AM Sumit Semwal  wrote:
> >
> > Hello Chenbo,
> >
> > Thanks very much for your patches. Other than a couple tiny nits
> > below, I think these look good, and I will merge them before the end
> > of this week.
> > On Tue, 11 Jun 2019 at 05:32, Chenbo Feng  wrote:
> > >
> > > From: Greg Hackmann 
> > >
> > > This patch adds complimentary DMA_BUF_SET_NAME and DMA_BUF_GET_NAME
> > > ioctls, which lets userspace processes attach a free-form name to each
> > > buffer.
> > This should remove the _GET_NAME bit since it's not there anymore.
> > >
> > > This information can be extremely helpful for tracking and accounting
> > > shared buffers.  For example, on Android, we know what each buffer will
> > > be used for at allocation time: GL, multimedia, camera, etc.  The
> > > userspace allocator can use DMA_BUF_SET_NAME to associate that
> > > information with the buffer, so we can later give developers a
> > > breakdown of how much memory they're allocating for graphics, camera,
> > > etc.
> > >
> > > Signed-off-by: Greg Hackmann 
> > > Signed-off-by: Chenbo Feng 
> > > ---
> > >  drivers/dma-buf/dma-buf.c| 49 +---
> > >  include/linux/dma-buf.h  |  5 +++-
> > >  include/uapi/linux/dma-buf.h |  3 +++
> > >  3 files changed, 53 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
> > > index ffd5a2ad7d6f..c1da5f9ce44d 100644
> > > --- a/drivers/dma-buf/dma-buf.c
> > > +++ b/drivers/dma-buf/dma-buf.c
> > > @@ -48,8 +48,24 @@ struct dma_buf_list {
> > >
> > >  static struct dma_buf_list db_list;
> > >
> > > +static char *dmabuffs_dname(struct dentry *dentry, char *buffer, int 
> > > buflen)
> > > +{
> > > +   struct dma_buf *dmabuf;
> > > +   char name[DMA_BUF_NAME_LEN];
> > > +   size_t ret = 0;
> > > +
> > > +   dmabuf = dentry->d_fsdata;
> > > +   mutex_lock(>lock);
> > > +   if (dmabuf->name)
> > > +   ret = strlcpy(name, dmabuf->name, DMA_BUF_NAME_LEN);
> > > +   mutex_unlock(>lock);
> > > +
> > > +   return dynamic_dname(dentry, buffer, buflen, "/%s:%s",
> > > +dentry->d_name.name, ret > 0 ? name : "");
> > > +}
> > > +
> > >  static const struct dentry_operations dma_buf_dentry_ops = {
> > > -   .d_dname = simple_dname,
> > > +   .d_dname = dmabuffs_dname,
> > >  };
> > >
> > >  static struct vfsmount *dma_buf_mnt;
> > > @@ -297,6 +313,27 @@ static __poll_t dma_buf_poll(struct file *file, 
> > > poll_table *poll)
> > > return events;
> > >  }
> > >
> > > +static long dma_buf_set_name(struct dma_buf *dmabuf, const char __user 
> > > *buf)
> > > +{
> > > +   char *name = strndup_user(buf, DMA_BUF_NAME_LEN);
> > > +   long ret = 0;
> > > +
> > > +   if (IS_ERR(name))
> > > +   return PTR_ERR(name);
> > > +
> > > +   mutex_lock(>lock);
> > > +   if (!list_empty(>attachments)) {
> > > +   ret = -EBUSY;

Sorry, I commented on an older version... I think you need kfree(name); here.

> > > +   goto out_unlock;
> > > +   }
> > We might also want to document this better - that name change for a
> > buffer is still allowed if it doesn't have any attached devices after
> > its usage is done but before it is destroyed? (theoritically it could
> > be reused with a different name?)
> >
> > > +   kfree(dmabuf->name);
> > > +   dmabuf->name = name;
> > > +
> > > +out_unlock:
> > > +   mutex_unlock(>lock);
> > > +   return ret;
> > > +}
> > > +
> > >  static long dma_buf_ioctl(struct file *file,
> > >   unsigned int cmd, unsigned long arg)
> > >  {
> > > @@ -335,6 +372,10 @@ static long dma_buf_ioctl(struct file *file,
> > > ret = dma_buf_begin_cpu_access(dmabuf, direction);
> > >
> > > return ret;
> > > +
> > > +   case DMA_BUF_SET_NAME:
> > > +   return dma_buf_set_name(dmabuf, (const char __user *)arg);
> > > +
> > > default:
> > > return -ENOTTY;
> > > }
> > > @@ -376,6 +417,7 @@ static struct file *dma_buf_getfile(struct dma_buf 
> > > *dmabuf, int flags)
> > > goto err_alloc_file;
> > > file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
> > > file->private_data = dmabuf;
> > > +   file->f_path.dentry->d_fsdata = dmabuf;
> > >
> > > return file;
> > >
> > > @@ -1082,12 +1124,13 @@ static int dma_buf_debug_show(struct seq_file *s, 
> > > void *unused)
> > > continue;
> > > }
> > >
> > > -   seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\n",
> > > +   seq_printf(s, "%08zu\t%08x\t%08x\t%08ld\t%s\t%08lu\t%s\n",
> > > buf_obj->size,
> > > buf_obj->file->f_flags, 
> > > 

Re: [RFC][PATCH 1/3] dma-buf: heaps: Add deferred-free-helper library code

2020-12-23 Thread Suren Baghdasaryan
Hi John,
Just a couple nits, otherwise looks sane to me.

On Thu, Dec 17, 2020 at 3:06 PM John Stultz  wrote:
>
> This patch provides infrastructure for deferring buffer frees.
>
> This is a feature ION provided which when used with some form
> of a page pool, provides a nice performance boost in an
> allocation microbenchmark. The reason it helps is it allows the
> page-zeroing to be done out of the normal allocation/free path,
> and pushed off to a kthread.

I suggest adding some more description for this API and how it can be
used. IIUC there are 2 uses: lazy deferred freeing using kthread, and
object pooling. no_pool parameter I think deserves some explanation
(disallows pooling when system is under memory pressure).

>
> As not all heaps will find this useful, its implemented as
> a optional helper library that heaps can utilize.
>
> Cc: Sumit Semwal 
> Cc: Liam Mark 
> Cc: Chris Goldsworthy 
> Cc: Laura Abbott 
> Cc: Brian Starkey 
> Cc: Hridya Valsaraju 
> Cc: Suren Baghdasaryan 
> Cc: Sandeep Patil 
> Cc: Daniel Mentz 
> Cc: Ørjan Eide 
> Cc: Robin Murphy 
> Cc: Ezequiel Garcia 
> Cc: Simon Ser 
> Cc: James Jones 
> Cc: linux-me...@vger.kernel.org
> Cc: dri-devel@lists.freedesktop.org
> Signed-off-by: John Stultz 
> ---
>  drivers/dma-buf/heaps/Kconfig|   3 +
>  drivers/dma-buf/heaps/Makefile   |   1 +
>  drivers/dma-buf/heaps/deferred-free-helper.c | 136 +++
>  drivers/dma-buf/heaps/deferred-free-helper.h |  15 ++
>  4 files changed, 155 insertions(+)
>  create mode 100644 drivers/dma-buf/heaps/deferred-free-helper.c
>  create mode 100644 drivers/dma-buf/heaps/deferred-free-helper.h
>
> diff --git a/drivers/dma-buf/heaps/Kconfig b/drivers/dma-buf/heaps/Kconfig
> index a5eef06c4226..ecf65204f714 100644
> --- a/drivers/dma-buf/heaps/Kconfig
> +++ b/drivers/dma-buf/heaps/Kconfig
> @@ -1,3 +1,6 @@
> +config DMABUF_HEAPS_DEFERRED_FREE
> +   bool
> +
>  config DMABUF_HEAPS_SYSTEM
> bool "DMA-BUF System Heap"
> depends on DMABUF_HEAPS
> diff --git a/drivers/dma-buf/heaps/Makefile b/drivers/dma-buf/heaps/Makefile
> index 974467791032..4e7839875615 100644
> --- a/drivers/dma-buf/heaps/Makefile
> +++ b/drivers/dma-buf/heaps/Makefile
> @@ -1,3 +1,4 @@
>  # SPDX-License-Identifier: GPL-2.0
> +obj-$(CONFIG_DMABUF_HEAPS_DEFERRED_FREE) += deferred-free-helper.o
>  obj-$(CONFIG_DMABUF_HEAPS_SYSTEM)  += system_heap.o
>  obj-$(CONFIG_DMABUF_HEAPS_CMA) += cma_heap.o
> diff --git a/drivers/dma-buf/heaps/deferred-free-helper.c 
> b/drivers/dma-buf/heaps/deferred-free-helper.c
> new file mode 100644
> index ..b8f54860454f
> --- /dev/null
> +++ b/drivers/dma-buf/heaps/deferred-free-helper.c
> @@ -0,0 +1,136 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Deferred dmabuf freeing helper
> + *
> + * Copyright (C) 2020 Linaro, Ltd.
> + *
> + * Based on the ION page pool code
> + * Copyright (C) 2011 Google, Inc.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "deferred-free-helper.h"
> +
> +static LIST_HEAD(free_list);
> +static size_t list_size;
> +wait_queue_head_t freelist_waitqueue;
> +struct task_struct *freelist_task;
> +static DEFINE_MUTEX(free_list_lock);
> +
> +enum {
> +   USE_POOL = 0,
> +   SKIP_POOL = 1,
> +};

This enum is used for a bool parameter. Either make it part of the
public API or eliminate and use bool instead.

> +
> +void deferred_free(struct deferred_freelist_item *item,
> +  void (*free)(struct deferred_freelist_item*, bool),
> +  size_t size)
> +{
> +   INIT_LIST_HEAD(>list);
> +   item->size = size;
> +   item->free = free;
> +
> +   mutex_lock(_list_lock);
> +   list_add(>list, _list);
> +   list_size += size;
> +   mutex_unlock(_list_lock);
> +   wake_up(_waitqueue);
> +}
> +
> +static size_t free_one_item(bool nopool)
> +{
> +   size_t size = 0;
> +   struct deferred_freelist_item *item;
> +
> +   mutex_lock(_list_lock);
> +   if (list_empty(_list)) {
> +   mutex_unlock(_list_lock);
> +   return 0;
> +   }
> +   item = list_first_entry(_list, struct deferred_freelist_item, 
> list);
> +   list_del(>list);
> +   size = item->size;
> +   list_size -= size;
> +   mutex_unlock(_list_lock);
> +
> +   item->free(item, nopool);
> +   return size;
> +}
> +
> +static unsigned long get_freelist_size(void)
> +{
> +   unsigned long size;
> +
> +   mutex_lock(_list_lock);
> +  

Re: [RESEND][PATCH 2/3] dma-buf: heaps: Add a WARN_ON should the vmap_cnt go negative

2021-01-23 Thread Suren Baghdasaryan
On Thu, Jan 21, 2021 at 11:56 PM Sumit Semwal  wrote:
>
> Hi John, Suren,
>
>
> On Wed, 20 Jan 2021 at 02:15, John Stultz  wrote:
> >
> > We shouldn't vunmap more then we vmap, but if we do, make
> > sure we complain loudly.
>
> I was checking the general usage of vunmap in the kernel, and I
> couldn't find many instances where we need to WARN_ON for the vunmap
> count more than vmap count. Is there a specific need for this in the heaps?

Hi Sumit,
My worry was that buffer->vmap_cnt could silently go negative. But if
this warning is not consistent with other places we do refcounted
vmap/vunmap then feel free to ignore my suggestion.
Thanks!

>
> Best,
> Sumit.
> >
> > Cc: Sumit Semwal 
> > Cc: Liam Mark 
> > Cc: Laura Abbott 
> > Cc: Brian Starkey 
> > Cc: Hridya Valsaraju 
> > Cc: Suren Baghdasaryan 
> > Cc: Sandeep Patil 
> > Cc: Daniel Mentz 
> > Cc: Chris Goldsworthy 
> > Cc: Ørjan Eide 
> > Cc: Robin Murphy 
> > Cc: Ezequiel Garcia 
> > Cc: Simon Ser 
> > Cc: James Jones 
> > Cc: linux-me...@vger.kernel.org
> > Cc: dri-devel@lists.freedesktop.org
> > Suggested-by: Suren Baghdasaryan 
> > Signed-off-by: John Stultz 
> > ---
> >  drivers/dma-buf/heaps/cma_heap.c| 1 +
> >  drivers/dma-buf/heaps/system_heap.c | 1 +
> >  2 files changed, 2 insertions(+)
> >
> > diff --git a/drivers/dma-buf/heaps/cma_heap.c 
> > b/drivers/dma-buf/heaps/cma_heap.c
> > index 364fc2f3e499..0c76cbc3fb11 100644
> > --- a/drivers/dma-buf/heaps/cma_heap.c
> > +++ b/drivers/dma-buf/heaps/cma_heap.c
> > @@ -232,6 +232,7 @@ static void cma_heap_vunmap(struct dma_buf *dmabuf, 
> > struct dma_buf_map *map)
> > struct cma_heap_buffer *buffer = dmabuf->priv;
> >
> > mutex_lock(>lock);
> > +   WARN_ON(buffer->vmap_cnt == 0);
> > if (!--buffer->vmap_cnt) {
> > vunmap(buffer->vaddr);
> > buffer->vaddr = NULL;
> > diff --git a/drivers/dma-buf/heaps/system_heap.c 
> > b/drivers/dma-buf/heaps/system_heap.c
> > index 405351aad2a8..2321c91891f6 100644
> > --- a/drivers/dma-buf/heaps/system_heap.c
> > +++ b/drivers/dma-buf/heaps/system_heap.c
> > @@ -273,6 +273,7 @@ static void system_heap_vunmap(struct dma_buf *dmabuf, 
> > struct dma_buf_map *map)
> > struct system_heap_buffer *buffer = dmabuf->priv;
> >
> > mutex_lock(>lock);
> > +   WARN_ON(buffer->vmap_cnt == 0);
> > if (!--buffer->vmap_cnt) {
> > vunmap(buffer->vaddr);
> > buffer->vaddr = NULL;
> > --
> > 2.17.1
> >
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH v2 2/2] dma-buf: heaps: Map system heap pages as managed by linux vm

2021-02-04 Thread Suren Baghdasaryan
On Wed, Feb 3, 2021 at 12:06 AM Christian König
 wrote:
>
> Am 03.02.21 um 03:02 schrieb Suren Baghdasaryan:
> > On Tue, Feb 2, 2021 at 5:39 PM Minchan Kim  wrote:
> >> On Tue, Feb 02, 2021 at 04:31:34PM -0800, Suren Baghdasaryan wrote:
> >>> Currently system heap maps its buffers with VM_PFNMAP flag using
> >>> remap_pfn_range. This results in such buffers not being accounted
> >>> for in PSS calculations because vm treats this memory as having no
> >>> page structs. Without page structs there are no counters representing
> >>> how many processes are mapping a page and therefore PSS calculation
> >>> is impossible.
> >>> Historically, ION driver used to map its buffers as VM_PFNMAP areas
> >>> due to memory carveouts that did not have page structs [1]. That
> >>> is not the case anymore and it seems there was desire to move away
> >>> from remap_pfn_range [2].
> >>> Dmabuf system heap design inherits this ION behavior and maps its
> >>> pages using remap_pfn_range even though allocated pages are backed
> >>> by page structs.
> >>> Replace remap_pfn_range with vm_insert_page, following Laura's suggestion
> >>> in [1]. This would allow correct PSS calculation for dmabufs.
> >>>
> >>> [1] 
> >>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fdriverdev-devel.linuxdriverproject.narkive.com%2Fv0fJGpaD%2Fusing-ion-memory-for-direct-iodata=04%7C01%7Cchristian.koenig%40amd.com%7Cb4c145b86dd0472c943c08d8c7e7ba4b%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637479145389160353%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=W1N%2B%2BlcFDaRSvXdSPe5hPNMRByHfGkU7Uc3cmM3FCTU%3Dreserved=0
> >>> [2] 
> >>> https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fdriverdev.linuxdriverproject.org%2Fpipermail%2Fdriverdev-devel%2F2018-October%2F127519.htmldata=04%7C01%7Cchristian.koenig%40amd.com%7Cb4c145b86dd0472c943c08d8c7e7ba4b%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637479145389160353%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=jQxSzKEr52lUcAIx%2FuBHMJ7yOgof%2FVMlW9%2BB2f%2FoS%2FE%3Dreserved=0
> >>> (sorry, could not find lore links for these discussions)
> >>>
> >>> Suggested-by: Laura Abbott 
> >>> Signed-off-by: Suren Baghdasaryan 
> >> Reviewed-by: Minchan Kim 
> >>
> >> A note: This patch makes dmabuf system heap accounted as PSS so
> >> if someone has relies on the size, they will see the bloat.
> >> IIRC, there was some debate whether PSS accounting for their
> >> buffer is correct or not. If it'd be a problem, we need to
> >> discuss how to solve it(maybe, vma->vm_flags and reintroduce
> >> remap_pfn_range for them to be respected).
> > I did not see debates about not including *mapped* dmabufs into PSS
> > calculation. I remember people were discussing how to account dmabufs
> > referred only by the FD but that is a different discussion. If the
> > buffer is mapped into the address space of a process then IMHO
> > including it into PSS of that process is not controversial.
>
> Well, I think it is. And to be honest this doesn't looks like a good
> idea to me since it will eventually lead to double accounting of system
> heap DMA-bufs.

Thanks for the comment! Could you please expand on this double
accounting issue? Do you mean userspace could double account dmabufs
because it expects dmabufs not to be part of PSS or is there some
in-kernel accounting mechanism that would be broken by this?

>
> As discussed multiple times it is illegal to use the struct page of a
> DMA-buf. This case here is a bit special since it is the owner of the
> pages which does that, but I'm not sure if this won't cause problems
> elsewhere as well.

I would be happy to keep things as they are but calculating dmabuf
contribution to PSS without struct pages is extremely inefficient and
becomes a real pain when we consider the possibilities of partial
mappings, when not the entire dmabuf is being mapped.
Calculating this would require parsing /proc/pid/maps for the process,
finding dmabuf mappings and the size for each one, then parsing
/proc/pid/maps for ALL processes in the system to see if the same
dmabufs are used by other processes and only then calculating the PSS.
I hope that explains the desire to use already existing struct pages
to obtain PSS in a much more efficient way.

>
> A more appropriate solution would be to held processes accountable for
> resources they have allocated through device drivers.

Are you suggesting some new kernel mechanism to account resources
allocated by a process via a driver? If so, any details?

>
> Regards,
> Christian.
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an 
> email to kernel-team+unsubscr...@android.com.
>
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/2] mm: replace BUG_ON in vm_insert_page with a return of an error

2021-02-03 Thread Suren Baghdasaryan
On Tue, Feb 2, 2021 at 5:31 PM Minchan Kim  wrote:
>
> On Tue, Feb 02, 2021 at 04:31:33PM -0800, Suren Baghdasaryan wrote:
> > Replace BUG_ON(vma->vm_flags & VM_PFNMAP) in vm_insert_page with
> > WARN_ON_ONCE and returning an error. This is to ensure users of the
> > vm_insert_page that set VM_PFNMAP are notified of the wrong flag usage
> > and get an indication of an error without panicing the kernel.
> > This will help identifying drivers that need to clear VM_PFNMAP before
> > using dmabuf system heap which is moving to use vm_insert_page.
> >
> > Suggested-by: Christoph Hellwig 
> > Signed-off-by: Suren Baghdasaryan 
> > ---
> >  mm/memory.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/mm/memory.c b/mm/memory.c
> > index feff48e1465a..e503c9801cd9 100644
> > --- a/mm/memory.c
> > +++ b/mm/memory.c
> > @@ -1827,7 +1827,8 @@ int vm_insert_page(struct vm_area_struct *vma, 
> > unsigned long addr,
> >   return -EINVAL;
> >   if (!(vma->vm_flags & VM_MIXEDMAP)) {
> >   BUG_ON(mmap_read_trylock(vma->vm_mm));
>
> Better to replace above BUG_ON with WARN_ON_ONCE, too?

If nobody objects I'll do that in the next respin. Thanks!

>
> --
> To unsubscribe from this group and stop receiving emails from it, send an 
> email to kernel-team+unsubscr...@android.com.
>
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH v2 2/2] dma-buf: heaps: Map system heap pages as managed by linux vm

2021-02-03 Thread Suren Baghdasaryan
On Tue, Feb 2, 2021 at 5:39 PM Minchan Kim  wrote:
>
> On Tue, Feb 02, 2021 at 04:31:34PM -0800, Suren Baghdasaryan wrote:
> > Currently system heap maps its buffers with VM_PFNMAP flag using
> > remap_pfn_range. This results in such buffers not being accounted
> > for in PSS calculations because vm treats this memory as having no
> > page structs. Without page structs there are no counters representing
> > how many processes are mapping a page and therefore PSS calculation
> > is impossible.
> > Historically, ION driver used to map its buffers as VM_PFNMAP areas
> > due to memory carveouts that did not have page structs [1]. That
> > is not the case anymore and it seems there was desire to move away
> > from remap_pfn_range [2].
> > Dmabuf system heap design inherits this ION behavior and maps its
> > pages using remap_pfn_range even though allocated pages are backed
> > by page structs.
> > Replace remap_pfn_range with vm_insert_page, following Laura's suggestion
> > in [1]. This would allow correct PSS calculation for dmabufs.
> >
> > [1] 
> > https://driverdev-devel.linuxdriverproject.narkive.com/v0fJGpaD/using-ion-memory-for-direct-io
> > [2] 
> > http://driverdev.linuxdriverproject.org/pipermail/driverdev-devel/2018-October/127519.html
> > (sorry, could not find lore links for these discussions)
> >
> > Suggested-by: Laura Abbott 
> > Signed-off-by: Suren Baghdasaryan 
> Reviewed-by: Minchan Kim 
>
> A note: This patch makes dmabuf system heap accounted as PSS so
> if someone has relies on the size, they will see the bloat.
> IIRC, there was some debate whether PSS accounting for their
> buffer is correct or not. If it'd be a problem, we need to
> discuss how to solve it(maybe, vma->vm_flags and reintroduce
> remap_pfn_range for them to be respected).

I did not see debates about not including *mapped* dmabufs into PSS
calculation. I remember people were discussing how to account dmabufs
referred only by the FD but that is a different discussion. If the
buffer is mapped into the address space of a process then IMHO
including it into PSS of that process is not controversial.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCH 1/2] mm: replace BUG_ON in vm_insert_page with a return of an error

2021-02-03 Thread Suren Baghdasaryan
Replace BUG_ON(vma->vm_flags & VM_PFNMAP) in vm_insert_page with
WARN_ON_ONCE and returning an error. This is to ensure users of the
vm_insert_page that set VM_PFNMAP are notified of the wrong flag usage
and get an indication of an error without panicing the kernel.
This will help identifying drivers that need to clear VM_PFNMAP before
using dmabuf system heap which is moving to use vm_insert_page.

Suggested-by: Christoph Hellwig 
Signed-off-by: Suren Baghdasaryan 
---
 mm/memory.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mm/memory.c b/mm/memory.c
index feff48e1465a..e503c9801cd9 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1827,7 +1827,8 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned 
long addr,
return -EINVAL;
if (!(vma->vm_flags & VM_MIXEDMAP)) {
BUG_ON(mmap_read_trylock(vma->vm_mm));
-   BUG_ON(vma->vm_flags & VM_PFNMAP);
+   if (WARN_ON_ONCE(vma->vm_flags & VM_PFNMAP))
+   return -EINVAL;
vma->vm_flags |= VM_MIXEDMAP;
}
return insert_page(vma, addr, page, vma->vm_page_prot);
-- 
2.30.0.365.g02bc693789-goog

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 1/2] mm: replace BUG_ON in vm_insert_page with a return of an error

2021-02-03 Thread Suren Baghdasaryan
On Tue, Feb 2, 2021 at 5:55 PM Matthew Wilcox  wrote:
>
> On Tue, Feb 02, 2021 at 04:31:33PM -0800, Suren Baghdasaryan wrote:
> > Replace BUG_ON(vma->vm_flags & VM_PFNMAP) in vm_insert_page with
> > WARN_ON_ONCE and returning an error. This is to ensure users of the
> > vm_insert_page that set VM_PFNMAP are notified of the wrong flag usage
> > and get an indication of an error without panicing the kernel.
> > This will help identifying drivers that need to clear VM_PFNMAP before
> > using dmabuf system heap which is moving to use vm_insert_page.
>
> NACK.
>
> The system may not _panic_, but it is clearly now _broken_.  The device
> doesn't work, and so the system is useless.  You haven't really improved
> anything here.  Just bloated the kernel with yet another _ONCE variable
> that in a normal system will never ever ever be triggered.

We had a discussion in https://lore.kernel.org/patchwork/patch/1372409
about how some DRM drivers set up their VMAs with VM_PFNMAP before
mapping them. We want to use vm_insert_page instead of remap_pfn_range
in the dmabuf heaps so that this memory is visible in PSS. However if
a driver that sets VM_PFNMAP tries to use a dmabuf heap, it will step
into this BUG_ON. We wanted to catch and gradually fix such drivers
but without causing a panic in the process. I hope this clarifies the
reasons why I'm making this change and I'm open to other ideas if they
would address this issue in a better way.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Linaro-mm-sig] [PATCH 1/2] mm: replace BUG_ON in vm_insert_page with a return of an error

2021-02-05 Thread Suren Baghdasaryan
On Thu, Feb 4, 2021 at 7:55 AM Alex Deucher  wrote:
>
> On Thu, Feb 4, 2021 at 3:16 AM Christian König  
> wrote:
> >
> > Am 03.02.21 um 22:41 schrieb Suren Baghdasaryan:
> > > [SNIP]
> > >>> How many semi-unrelated buffer accounting schemes does google come up 
> > >>> with?
> > >>>
> > >>> We're at three with this one.
> > >>>
> > >>> And also we _cannot_ required that all dma-bufs are backed by struct
> > >>> page, so requiring struct page to make this work is a no-go.
> > >>>
> > >>> Second, we do not want to all get_user_pages and friends to work on
> > >>> dma-buf, it causes all kinds of pain. Yes on SoC where dma-buf are
> > >>> exclusively in system memory you can maybe get away with this, but
> > >>> dma-buf is supposed to work in more places than just Android SoCs.
> > >> I just realized that vm_inser_page doesn't even work for CMA, it would
> > >> upset get_user_pages pretty badly - you're trying to pin a page in
> > >> ZONE_MOVEABLE but you can't move it because it's rather special.
> > >> VM_SPECIAL is exactly meant to catch this stuff.
> > > Thanks for the input, Daniel! Let me think about the cases you pointed 
> > > out.
> > >
> > > IMHO, the issue with PSS is the difficulty of calculating this metric
> > > without struct page usage. I don't think that problem becomes easier
> > > if we use cgroups or any other API. I wanted to enable existing PSS
> > > calculation mechanisms for the dmabufs known to be backed by struct
> > > pages (since we know how the heap allocated that memory), but sounds
> > > like this would lead to problems that I did not consider.
> >
> > Yeah, using struct page indeed won't work. We discussed that multiple
> > times now and Daniel even has a patch to mangle the struct page pointers
> > inside the sg_table object to prevent abuse in that direction.
> >
> > On the other hand I totally agree that we need to do something on this
> > side which goes beyong what cgroups provide.
> >
> > A few years ago I came up with patches to improve the OOM killer to
> > include resources bound to the processes through file descriptors. I
> > unfortunately can't find them of hand any more and I'm currently to busy
> > to dig them up.
>
> https://lists.freedesktop.org/archives/dri-devel/2015-September/089778.html
> I think there was a more recent discussion, but I can't seem to find it.

Thanks for the pointer!
Appreciate the time everyone took to explain the issues.
Thanks,
Suren.

>
> Alex
>
> >
> > In general I think we need to make it possible that both the in kernel
> > OOM killer as well as userspace processes and handlers have access to
> > that kind of data.
> >
> > The fdinfo approach as suggested in the other thread sounds like the
> > easiest solution to me.
> >
> > Regards,
> > Christian.
> >
> > > Thanks,
> > > Suren.
> > >
> > >
> >
> > ___
> > dri-devel mailing list
> > dri-devel@lists.freedesktop.org
> > https://lists.freedesktop.org/mailman/listinfo/dri-devel
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [RFC][PATCH v6 1/7] drm: Add a sharable drm page-pool implementation

2021-02-09 Thread Suren Baghdasaryan
On Tue, Feb 9, 2021 at 4:57 AM Christian König  wrote:
>
> Am 09.02.21 um 13:11 schrieb Christian König:
> > [SNIP]
>  +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
>  +{
>  + spin_lock(>lock);
>  + list_add_tail(>lru, >items);
>  + pool->count++;
>  + atomic_long_add(1 << pool->order, _pages);
>  + spin_unlock(>lock);
>  +
>  + mod_node_page_state(page_pgdat(page),
>  NR_KERNEL_MISC_RECLAIMABLE,
>  + 1 << pool->order);
> >>> Hui what? What should that be good for?
> >> This is a carryover from the ION page pool implementation:
> >> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit.kernel.org%2Fpub%2Fscm%2Flinux%2Fkernel%2Fgit%2Ftorvalds%2Flinux.git%2Ftree%2Fdrivers%2Fstaging%2Fandroid%2Fion%2Fion_page_pool.c%3Fh%3Dv5.10%23n28data=04%7C01%7Cchristian.koenig%40amd.com%7Cc4eadb0a9cf6491d99ba08d8ca173457%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637481548325174885%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=FUjZK5NSDMUYfU7vGeE4fDU2HCF%2FYyNBwc30aoLLPQ4%3Dreserved=0
> >>
> >>
> >> My sense is it helps with the vmstat/meminfo accounting so folks can
> >> see the cached pages are shrinkable/freeable. This maybe falls under
> >> other dmabuf accounting/stats discussions, so I'm happy to remove it
> >> for now, or let the drivers using the shared page pool logic handle
> >> the accounting themselves?
>
> Intentionally separated the discussion for that here.
>
> As far as I can see this is just bluntly incorrect.
>
> Either the page is reclaimable or it is part of our pool and freeable
> through the shrinker, but never ever both.

IIRC the original motivation for counting ION pooled pages as
reclaimable was to include them into /proc/meminfo's MemAvailable
calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
non-slab kernel pages" seems like a good place to account for them but
I might be wrong.

>
> In the best case this just messes up the accounting, in the worst case
> it can cause memory corruption.
>
> Christian.
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [RFC][PATCH v6 1/7] drm: Add a sharable drm page-pool implementation

2021-02-09 Thread Suren Baghdasaryan
On Tue, Feb 9, 2021 at 9:46 AM Christian König  wrote:
>
>
>
> Am 09.02.21 um 18:33 schrieb Suren Baghdasaryan:
> > On Tue, Feb 9, 2021 at 4:57 AM Christian König  
> > wrote:
> >> Am 09.02.21 um 13:11 schrieb Christian König:
> >>> [SNIP]
> >>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> >>>>>> +{
> >>>>>> + spin_lock(>lock);
> >>>>>> + list_add_tail(>lru, >items);
> >>>>>> + pool->count++;
> >>>>>> + atomic_long_add(1 << pool->order, _pages);
> >>>>>> + spin_unlock(>lock);
> >>>>>> +
> >>>>>> + mod_node_page_state(page_pgdat(page),
> >>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> >>>>>> + 1 << pool->order);
> >>>>> Hui what? What should that be good for?
> >>>> This is a carryover from the ION page pool implementation:
> >>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit.kernel.org%2Fpub%2Fscm%2Flinux%2Fkernel%2Fgit%2Ftorvalds%2Flinux.git%2Ftree%2Fdrivers%2Fstaging%2Fandroid%2Fion%2Fion_page_pool.c%3Fh%3Dv5.10%23n28data=04%7C01%7Cchristian.koenig%40amd.com%7Cdff8edcd4d147a5b08d8cd20cff2%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637484888114923580%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=9%2BIBC0tezSV6Ci4S3kWfW%2BQvJm4mdunn3dF6C0kyfCw%3Dreserved=0
> >>>>
> >>>>
> >>>> My sense is it helps with the vmstat/meminfo accounting so folks can
> >>>> see the cached pages are shrinkable/freeable. This maybe falls under
> >>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
> >>>> for now, or let the drivers using the shared page pool logic handle
> >>>> the accounting themselves?
> >> Intentionally separated the discussion for that here.
> >>
> >> As far as I can see this is just bluntly incorrect.
> >>
> >> Either the page is reclaimable or it is part of our pool and freeable
> >> through the shrinker, but never ever both.
> > IIRC the original motivation for counting ION pooled pages as
> > reclaimable was to include them into /proc/meminfo's MemAvailable
> > calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> > non-slab kernel pages" seems like a good place to account for them but
> > I might be wrong.
>
> Yeah, that's what I see here as well. But exactly that is utterly nonsense.
>
> Those pages are not "free" in the sense that get_free_page could return
> them directly.

Any ideas where these pages would fit better? We do want to know that
under memory pressure these pages can be made available (which is I
think what MemAvailable means).

>
> Regards,
> Christian.
>
> >
> >> In the best case this just messes up the accounting, in the worst case
> >> it can cause memory corruption.
> >>
> >> Christian.
>
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [RFC][PATCH v6 1/7] drm: Add a sharable drm page-pool implementation

2021-02-09 Thread Suren Baghdasaryan
On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter  wrote:
>
> On Tue, Feb 9, 2021 at 6:46 PM Christian König  
> wrote:
> >
> >
> >
> > Am 09.02.21 um 18:33 schrieb Suren Baghdasaryan:
> > > On Tue, Feb 9, 2021 at 4:57 AM Christian König  
> > > wrote:
> > >> Am 09.02.21 um 13:11 schrieb Christian König:
> > >>> [SNIP]
> > >>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page 
> > >>>>>> *page)
> > >>>>>> +{
> > >>>>>> + spin_lock(>lock);
> > >>>>>> + list_add_tail(>lru, >items);
> > >>>>>> + pool->count++;
> > >>>>>> + atomic_long_add(1 << pool->order, _pages);
> > >>>>>> + spin_unlock(>lock);
> > >>>>>> +
> > >>>>>> + mod_node_page_state(page_pgdat(page),
> > >>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> > >>>>>> + 1 << pool->order);
> > >>>>> Hui what? What should that be good for?
> > >>>> This is a carryover from the ION page pool implementation:
> > >>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit.kernel.org%2Fpub%2Fscm%2Flinux%2Fkernel%2Fgit%2Ftorvalds%2Flinux.git%2Ftree%2Fdrivers%2Fstaging%2Fandroid%2Fion%2Fion_page_pool.c%3Fh%3Dv5.10%23n28data=04%7C01%7Cchristian.koenig%40amd.com%7Cdff8edcd4d147a5b08d8cd20cff2%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637484888114923580%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=9%2BIBC0tezSV6Ci4S3kWfW%2BQvJm4mdunn3dF6C0kyfCw%3Dreserved=0
> > >>>>
> > >>>>
> > >>>> My sense is it helps with the vmstat/meminfo accounting so folks can
> > >>>> see the cached pages are shrinkable/freeable. This maybe falls under
> > >>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
> > >>>> for now, or let the drivers using the shared page pool logic handle
> > >>>> the accounting themselves?
> > >> Intentionally separated the discussion for that here.
> > >>
> > >> As far as I can see this is just bluntly incorrect.
> > >>
> > >> Either the page is reclaimable or it is part of our pool and freeable
> > >> through the shrinker, but never ever both.
> > > IIRC the original motivation for counting ION pooled pages as
> > > reclaimable was to include them into /proc/meminfo's MemAvailable
> > > calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> > > non-slab kernel pages" seems like a good place to account for them but
> > > I might be wrong.
> >
> > Yeah, that's what I see here as well. But exactly that is utterly nonsense.
> >
> > Those pages are not "free" in the sense that get_free_page could return
> > them directly.
>
> Well on Android that is kinda true, because Android has it's
> oom-killer (way back it was just a shrinker callback, not sure how it
> works now), which just shot down all the background apps. So at least
> some of that (everything used by background apps) is indeed
> reclaimable on Android.
>
> But that doesn't hold on Linux in general, so we can't really do this
> for common code.
>
> Also I had a long meeting with Suren, John and other googles
> yesterday, and the aim is now to try and support all the Android gpu
> memory accounting needs with cgroups. That should work, and it will
> allow Android to handle all the Android-ism in a clean way in upstream
> code. Or that's at least the plan.
>
> I think the only thing we identified that Android still needs on top
> is the dma-buf sysfs stuff, so that shared buffers (which on Android
> are always dma-buf, and always stay around as dma-buf fd throughout
> their lifetime) can be listed/analyzed with full detail.
>
> But aside from this the plan for all the per-process or per-heap
> account, oom-killer integration and everything else is planned to be
> done with cgroups.

Until cgroups are ready we probably will need to add a sysfs node to
report the total dmabuf pool size and I think that would cover our
current accounting need here.
As I mentioned, not including dmabuf pools into MemAvailable would
affect that stat and I'm wondering if pools should be considered as
part of MemAvailable or not. Since MemAvailable includes SReclaimable
I think it makes sense to include them but maybe there are other
considerations that I'm missing?

> Android (for now) only needs to account overall gpu
> memory since none of it is swappable on android drivers anyway, plus
> no vram, so not much needed.
>
> Cheers, Daniel
>
> >
> > Regards,
> > Christian.
> >
> > >
> > >> In the best case this just messes up the accounting, in the worst case
> > >> it can cause memory corruption.
> > >>
> > >> Christian.
> >
>
>
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> http://blog.ffwll.ch
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [RFC][PATCH v6 1/7] drm: Add a sharable drm page-pool implementation

2021-02-10 Thread Suren Baghdasaryan
On Wed, Feb 10, 2021 at 9:21 AM Daniel Vetter  wrote:
>
> On Wed, Feb 10, 2021 at 5:39 PM Suren Baghdasaryan  wrote:
> >
> > On Wed, Feb 10, 2021 at 5:06 AM Daniel Vetter  wrote:
> > >
> > > On Tue, Feb 09, 2021 at 12:16:51PM -0800, Suren Baghdasaryan wrote:
> > > > On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter  wrote:
> > > > >
> > > > > On Tue, Feb 9, 2021 at 6:46 PM Christian König 
> > > > >  wrote:
> > > > > >
> > > > > >
> > > > > >
> > > > > > Am 09.02.21 um 18:33 schrieb Suren Baghdasaryan:
> > > > > > > On Tue, Feb 9, 2021 at 4:57 AM Christian König 
> > > > > > >  wrote:
> > > > > > >> Am 09.02.21 um 13:11 schrieb Christian König:
> > > > > > >>> [SNIP]
> > > > > > >>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct 
> > > > > > >>>>>> page *page)
> > > > > > >>>>>> +{
> > > > > > >>>>>> + spin_lock(>lock);
> > > > > > >>>>>> + list_add_tail(>lru, >items);
> > > > > > >>>>>> + pool->count++;
> > > > > > >>>>>> + atomic_long_add(1 << pool->order, _pages);
> > > > > > >>>>>> + spin_unlock(>lock);
> > > > > > >>>>>> +
> > > > > > >>>>>> + mod_node_page_state(page_pgdat(page),
> > > > > > >>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> > > > > > >>>>>> + 1 << pool->order);
> > > > > > >>>>> Hui what? What should that be good for?
> > > > > > >>>> This is a carryover from the ION page pool implementation:
> > > > > > >>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit.kernel.org%2Fpub%2Fscm%2Flinux%2Fkernel%2Fgit%2Ftorvalds%2Flinux.git%2Ftree%2Fdrivers%2Fstaging%2Fandroid%2Fion%2Fion_page_pool.c%3Fh%3Dv5.10%23n28data=04%7C01%7Cchristian.koenig%40amd.com%7Cdff8edcd4d147a5b08d8cd20cff2%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637484888114923580%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=9%2BIBC0tezSV6Ci4S3kWfW%2BQvJm4mdunn3dF6C0kyfCw%3Dreserved=0
> > > > > > >>>>
> > > > > > >>>>
> > > > > > >>>> My sense is it helps with the vmstat/meminfo accounting so 
> > > > > > >>>> folks can
> > > > > > >>>> see the cached pages are shrinkable/freeable. This maybe falls 
> > > > > > >>>> under
> > > > > > >>>> other dmabuf accounting/stats discussions, so I'm happy to 
> > > > > > >>>> remove it
> > > > > > >>>> for now, or let the drivers using the shared page pool logic 
> > > > > > >>>> handle
> > > > > > >>>> the accounting themselves?
> > > > > > >> Intentionally separated the discussion for that here.
> > > > > > >>
> > > > > > >> As far as I can see this is just bluntly incorrect.
> > > > > > >>
> > > > > > >> Either the page is reclaimable or it is part of our pool and 
> > > > > > >> freeable
> > > > > > >> through the shrinker, but never ever both.
> > > > > > > IIRC the original motivation for counting ION pooled pages as
> > > > > > > reclaimable was to include them into /proc/meminfo's MemAvailable
> > > > > > > calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> > > > > > > non-slab kernel pages" seems like a good place to account for 
> > > > > > > them but
> > > > > > > I might be wrong.
> > > > > >
> > > > > > Yeah, that's what I see here as well. But exactly that is utterly 
> > > > > > nonsense.
> > > > > >
> > > > > > Those pages are not "free" in the sense that get_free_page could 
> > > > > > return
> > > > > > them directly.
> > > > >
> > > > > W

Re: [RFC][PATCH v6 1/7] drm: Add a sharable drm page-pool implementation

2021-02-10 Thread Suren Baghdasaryan
On Wed, Feb 10, 2021 at 5:06 AM Daniel Vetter  wrote:
>
> On Tue, Feb 09, 2021 at 12:16:51PM -0800, Suren Baghdasaryan wrote:
> > On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter  wrote:
> > >
> > > On Tue, Feb 9, 2021 at 6:46 PM Christian König  
> > > wrote:
> > > >
> > > >
> > > >
> > > > Am 09.02.21 um 18:33 schrieb Suren Baghdasaryan:
> > > > > On Tue, Feb 9, 2021 at 4:57 AM Christian König 
> > > > >  wrote:
> > > > >> Am 09.02.21 um 13:11 schrieb Christian König:
> > > > >>> [SNIP]
> > > > >>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page 
> > > > >>>>>> *page)
> > > > >>>>>> +{
> > > > >>>>>> + spin_lock(>lock);
> > > > >>>>>> + list_add_tail(>lru, >items);
> > > > >>>>>> + pool->count++;
> > > > >>>>>> + atomic_long_add(1 << pool->order, _pages);
> > > > >>>>>> + spin_unlock(>lock);
> > > > >>>>>> +
> > > > >>>>>> + mod_node_page_state(page_pgdat(page),
> > > > >>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> > > > >>>>>> + 1 << pool->order);
> > > > >>>>> Hui what? What should that be good for?
> > > > >>>> This is a carryover from the ION page pool implementation:
> > > > >>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit.kernel.org%2Fpub%2Fscm%2Flinux%2Fkernel%2Fgit%2Ftorvalds%2Flinux.git%2Ftree%2Fdrivers%2Fstaging%2Fandroid%2Fion%2Fion_page_pool.c%3Fh%3Dv5.10%23n28data=04%7C01%7Cchristian.koenig%40amd.com%7Cdff8edcd4d147a5b08d8cd20cff2%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637484888114923580%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=9%2BIBC0tezSV6Ci4S3kWfW%2BQvJm4mdunn3dF6C0kyfCw%3Dreserved=0
> > > > >>>>
> > > > >>>>
> > > > >>>> My sense is it helps with the vmstat/meminfo accounting so folks 
> > > > >>>> can
> > > > >>>> see the cached pages are shrinkable/freeable. This maybe falls 
> > > > >>>> under
> > > > >>>> other dmabuf accounting/stats discussions, so I'm happy to remove 
> > > > >>>> it
> > > > >>>> for now, or let the drivers using the shared page pool logic handle
> > > > >>>> the accounting themselves?
> > > > >> Intentionally separated the discussion for that here.
> > > > >>
> > > > >> As far as I can see this is just bluntly incorrect.
> > > > >>
> > > > >> Either the page is reclaimable or it is part of our pool and freeable
> > > > >> through the shrinker, but never ever both.
> > > > > IIRC the original motivation for counting ION pooled pages as
> > > > > reclaimable was to include them into /proc/meminfo's MemAvailable
> > > > > calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> > > > > non-slab kernel pages" seems like a good place to account for them but
> > > > > I might be wrong.
> > > >
> > > > Yeah, that's what I see here as well. But exactly that is utterly 
> > > > nonsense.
> > > >
> > > > Those pages are not "free" in the sense that get_free_page could return
> > > > them directly.
> > >
> > > Well on Android that is kinda true, because Android has it's
> > > oom-killer (way back it was just a shrinker callback, not sure how it
> > > works now), which just shot down all the background apps. So at least
> > > some of that (everything used by background apps) is indeed
> > > reclaimable on Android.
> > >
> > > But that doesn't hold on Linux in general, so we can't really do this
> > > for common code.
> > >
> > > Also I had a long meeting with Suren, John and other googles
> > > yesterday, and the aim is now to try and support all the Android gpu
> > > memory accounting needs with cgroups. That should work, and it will
> > > allow Android to handle all the Android-ism in a clean way in upstream
> > > code. Or that's at least the plan.
> > 

Re: [RFC][PATCH v6 1/7] drm: Add a sharable drm page-pool implementation

2021-02-10 Thread Suren Baghdasaryan
On Wed, Feb 10, 2021 at 10:32 AM Christian König
 wrote:
>
>
>
> Am 10.02.21 um 17:39 schrieb Suren Baghdasaryan:
> > On Wed, Feb 10, 2021 at 5:06 AM Daniel Vetter  wrote:
> >> On Tue, Feb 09, 2021 at 12:16:51PM -0800, Suren Baghdasaryan wrote:
> >>> On Tue, Feb 9, 2021 at 12:03 PM Daniel Vetter  wrote:
> >>>> On Tue, Feb 9, 2021 at 6:46 PM Christian König 
> >>>>  wrote:
> >>>>>
> >>>>>
> >>>>> Am 09.02.21 um 18:33 schrieb Suren Baghdasaryan:
> >>>>>> On Tue, Feb 9, 2021 at 4:57 AM Christian König 
> >>>>>>  wrote:
> >>>>>>> Am 09.02.21 um 13:11 schrieb Christian König:
> >>>>>>>> [SNIP]
> >>>>>>>>>>> +void drm_page_pool_add(struct drm_page_pool *pool, struct page 
> >>>>>>>>>>> *page)
> >>>>>>>>>>> +{
> >>>>>>>>>>> + spin_lock(>lock);
> >>>>>>>>>>> + list_add_tail(>lru, >items);
> >>>>>>>>>>> + pool->count++;
> >>>>>>>>>>> + atomic_long_add(1 << pool->order, _pages);
> >>>>>>>>>>> + spin_unlock(>lock);
> >>>>>>>>>>> +
> >>>>>>>>>>> + mod_node_page_state(page_pgdat(page),
> >>>>>>>>>>> NR_KERNEL_MISC_RECLAIMABLE,
> >>>>>>>>>>> + 1 << pool->order);
> >>>>>>>>>> Hui what? What should that be good for?
> >>>>>>>>> This is a carryover from the ION page pool implementation:
> >>>>>>>>> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgit.kernel.org%2Fpub%2Fscm%2Flinux%2Fkernel%2Fgit%2Ftorvalds%2Flinux.git%2Ftree%2Fdrivers%2Fstaging%2Fandroid%2Fion%2Fion_page_pool.c%3Fh%3Dv5.10%23n28data=04%7C01%7Cchristian.koenig%40amd.com%7Cbb7155447ee149a49f3a08d8cde2685d%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637485719618339413%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=IYsJoAd7SUo12V7tS3CCRqNVm569iy%2FtoXQqm2MdC1g%3Dreserved=0
> >>>>>>>>>
> >>>>>>>>>
> >>>>>>>>> My sense is it helps with the vmstat/meminfo accounting so folks can
> >>>>>>>>> see the cached pages are shrinkable/freeable. This maybe falls under
> >>>>>>>>> other dmabuf accounting/stats discussions, so I'm happy to remove it
> >>>>>>>>> for now, or let the drivers using the shared page pool logic handle
> >>>>>>>>> the accounting themselves?
> >>>>>>> Intentionally separated the discussion for that here.
> >>>>>>>
> >>>>>>> As far as I can see this is just bluntly incorrect.
> >>>>>>>
> >>>>>>> Either the page is reclaimable or it is part of our pool and freeable
> >>>>>>> through the shrinker, but never ever both.
> >>>>>> IIRC the original motivation for counting ION pooled pages as
> >>>>>> reclaimable was to include them into /proc/meminfo's MemAvailable
> >>>>>> calculations. NR_KERNEL_MISC_RECLAIMABLE defined as "reclaimable
> >>>>>> non-slab kernel pages" seems like a good place to account for them but
> >>>>>> I might be wrong.
> >>>>> Yeah, that's what I see here as well. But exactly that is utterly 
> >>>>> nonsense.
> >>>>>
> >>>>> Those pages are not "free" in the sense that get_free_page could return
> >>>>> them directly.
> >>>> Well on Android that is kinda true, because Android has it's
> >>>> oom-killer (way back it was just a shrinker callback, not sure how it
> >>>> works now), which just shot down all the background apps. So at least
> >>>> some of that (everything used by background apps) is indeed
> >>>> reclaimable on Android.
> >>>>
> >>>> But that doesn't hold on Linux in general, so we can't really do this
> >>>> for common code.
> >>>>
> >>>> Also I had a long meeting with Suren, John and other googles
> >>>> yesterday, 

Re: [RFC][PATCH v6 1/7] drm: Add a sharable drm page-pool implementation

2021-02-05 Thread Suren Baghdasaryan
On Fri, Feb 5, 2021 at 12:47 PM John Stultz  wrote:
>
> On Fri, Feb 5, 2021 at 12:47 AM Christian König
>  wrote:
> > Am 05.02.21 um 09:06 schrieb John Stultz:
> > > diff --git a/drivers/gpu/drm/page_pool.c b/drivers/gpu/drm/page_pool.c
> > > new file mode 100644
> > > index ..2139f86e6ca7
> > > --- /dev/null
> > > +++ b/drivers/gpu/drm/page_pool.c
> > > @@ -0,0 +1,220 @@
> > > +// SPDX-License-Identifier: GPL-2.0
> >
> > Please use a BSD/MIT compatible license if you want to copy this from
> > the TTM code.
>
> Hrm. This may be problematic, as it's not just TTM code, but some of
> the TTM logic integrated into a page-pool implementation I wrote based
> on logic from the ION code (which was GPL-2.0 before it was dropped).
> So I don't think I can just make it MIT.  Any extra context on the
> need for MIT, or suggestions on how to best resolve this?
>
> > > +int drm_page_pool_get_size(struct drm_page_pool *pool)
> > > +{
> > > + int ret;
> > > +
> > > + spin_lock(>lock);
> > > + ret = pool->count;
> > > + spin_unlock(>lock);
> >
> > Maybe use an atomic for the count instead?
> >
>
> I can do that, but am curious as to the benefit? We are mostly using
> count where we already have to take the pool->lock anyway, and this
> drm_page_pool_get_size() is only used for debugfs output so far, so I
> don't expect it to be a hot path.
>
>
> > > +void drm_page_pool_add(struct drm_page_pool *pool, struct page *page)
> > > +{
> > > + spin_lock(>lock);
> > > + list_add_tail(>lru, >items);
> > > + pool->count++;
> > > + atomic_long_add(1 << pool->order, _pages);
> > > + spin_unlock(>lock);
> > > +
> > > + mod_node_page_state(page_pgdat(page), NR_KERNEL_MISC_RECLAIMABLE,
> > > + 1 << pool->order);
> >
> > Hui what? What should that be good for?
>
> This is a carryover from the ION page pool implementation:
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/drivers/staging/android/ion/ion_page_pool.c?h=v5.10#n28
>
> My sense is it helps with the vmstat/meminfo accounting so folks can
> see the cached pages are shrinkable/freeable. This maybe falls under
> other dmabuf accounting/stats discussions, so I'm happy to remove it
> for now, or let the drivers using the shared page pool logic handle
> the accounting themselves?

Yep, ION pools were accounted for as reclaimable kernel memory because
they could be dropped when the system is under memory pressure.

>
>
> > > +static struct page *drm_page_pool_remove(struct drm_page_pool *pool)
> > > +{
> > > + struct page *page;
> > > +
> > > + if (!pool->count)
> > > + return NULL;
> >
> > Better use list_first_entry_or_null instead of checking the count.
> >
> > This way you can also pull the lock into the function.
>
> Yea, that cleans a number of things up nicely. Thank you!
>
>
> > > +struct drm_page_pool *drm_page_pool_create(unsigned int order,
> > > +int (*free_page)(struct page *p, 
> > > unsigned int order))
> > > +{
> > > + struct drm_page_pool *pool = kmalloc(sizeof(*pool), GFP_KERNEL);
> >
> > Why not making this an embedded object? We should not see much dynamic
> > pool creation.
>
> Yea, it felt cleaner at the time this way, but I think I will need to
> switch to an embedded object in order to resolve the memory usage
> issue you pointed out with growing the ttm_pool_dma, so thank you for
> the suggestion!
>
>
> > > +void drm_page_pool_destroy(struct drm_page_pool *pool)
> > > +{
> > > + struct page *page;
> > > +
> > > + /* Remove us from the pool list */
> > > + mutex_lock(_list_lock);
> > > + list_del(>list);
> > > + mutex_unlock(_list_lock);
> > > +
> > > + /* Free any remaining pages in the pool */
> > > + spin_lock(>lock);
> >
> > Locking should be unnecessary when the pool is destroyed anyway.
>
> I guess if we've already pruned ourself from the pool list, then your
> right, we can't race with the shrinker and it's maybe not necessary.
> But it also seems easier to consistently follow the locking rules in a
> very unlikely path rather than leaning on subtlety.  Either way, I
> think this becomes moot if I make the improvements you suggest to
> drm_page_pool_remove().
>
> > > +static int drm_page_pool_shrink_one(void)
> > > +{
> > > + struct drm_page_pool *pool;
> > > + struct page *page;
> > > + int nr_freed = 0;
> > > +
> > > + mutex_lock(_list_lock);
> > > + pool = list_first_entry(_list, typeof(*pool), list);
> > > +
> > > + spin_lock(>lock);
> > > + page = drm_page_pool_remove(pool);
> > > + spin_unlock(>lock);
> > > +
> > > + if (page)
> > > + nr_freed = drm_page_pool_free_pages(pool, page);
> > > +
> > > + list_move_tail(>list, _list);
> >
> > Better to move this up, directly after the list_first_entry().
>
> Sounds good!
>
> Thanks so much for your review and feedback! I'll try to get some of
> the easy suggestions 

[PATCH v2 4/6] mm: replace vma->vm_flags indirect modification in ksm_madvise

2023-01-25 Thread Suren Baghdasaryan
Replace indirect modifications to vma->vm_flags with calls to modifier
functions to be able to track flag changes and to keep vma locking
correctness. Add a BUG_ON check in ksm_madvise() to catch indirect
vm_flags modification attempts.

Signed-off-by: Suren Baghdasaryan 
---
 arch/powerpc/kvm/book3s_hv_uvmem.c | 5 -
 arch/s390/mm/gmap.c| 5 -
 mm/khugepaged.c| 2 ++
 mm/ksm.c   | 2 ++
 4 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c 
b/arch/powerpc/kvm/book3s_hv_uvmem.c
index 1d67baa5557a..325a7a47d348 100644
--- a/arch/powerpc/kvm/book3s_hv_uvmem.c
+++ b/arch/powerpc/kvm/book3s_hv_uvmem.c
@@ -393,6 +393,7 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm,
 {
unsigned long gfn = memslot->base_gfn;
unsigned long end, start = gfn_to_hva(kvm, gfn);
+   unsigned long vm_flags;
int ret = 0;
struct vm_area_struct *vma;
int merge_flag = (merge) ? MADV_MERGEABLE : MADV_UNMERGEABLE;
@@ -409,12 +410,14 @@ static int kvmppc_memslot_page_merge(struct kvm *kvm,
ret = H_STATE;
break;
}
+   vm_flags = vma->vm_flags;
ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
- merge_flag, >vm_flags);
+ merge_flag, _flags);
if (ret) {
ret = H_STATE;
break;
}
+   reset_vm_flags(vma, vm_flags);
start = vma->vm_end;
} while (end > vma->vm_end);
 
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 3a695b8a1e3c..d5eb47dcdacb 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2587,14 +2587,17 @@ int gmap_mark_unmergeable(void)
 {
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
+   unsigned long vm_flags;
int ret;
VMA_ITERATOR(vmi, mm, 0);
 
for_each_vma(vmi, vma) {
+   vm_flags = vma->vm_flags;
ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
- MADV_UNMERGEABLE, >vm_flags);
+ MADV_UNMERGEABLE, _flags);
if (ret)
return ret;
+   reset_vm_flags(vma, vm_flags);
}
mm->def_flags &= ~VM_MERGEABLE;
return 0;
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 8abc59345bf2..76b24cd0c179 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -354,6 +354,8 @@ struct attribute_group khugepaged_attr_group = {
 int hugepage_madvise(struct vm_area_struct *vma,
 unsigned long *vm_flags, int advice)
 {
+   /* vma->vm_flags can be changed only using modifier functions */
+   BUG_ON(vm_flags == >vm_flags);
switch (advice) {
case MADV_HUGEPAGE:
 #ifdef CONFIG_S390
diff --git a/mm/ksm.c b/mm/ksm.c
index 04f1c8c2df11..992b2be9f5e6 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2573,6 +2573,8 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long 
start,
struct mm_struct *mm = vma->vm_mm;
int err;
 
+   /* vma->vm_flags can be changed only using modifier functions */
+   BUG_ON(vm_flags == >vm_flags);
switch (advice) {
case MADV_MERGEABLE:
/*
-- 
2.39.1



[PATCH v2 0/6] introduce vm_flags modifier functions

2023-01-25 Thread Suren Baghdasaryan
This patchset was originally published as a part of per-VMA locking [1] and
was split after suggestion that it's viable on its own and to facilitate
the review process. It is now a preprequisite for the next version of per-VMA
lock patchset, which reuses vm_flags modifier functions to lock the VMA when
vm_flags are being updated.

VMA vm_flags modifications are usually done under exclusive mmap_lock
protection because this attrubute affects other decisions like VMA merging
or splitting and races should be prevented. Introduce vm_flags modifier
functions to enforce correct locking.

[1] https://lore.kernel.org/all/20230109205336.3665937-1-sur...@google.com/

The patchset applies cleanly over mm-unstable branch of mm tree.

My apologies for an extremely large distribution list. The patch touches
lots of files and many are in arch/ and drivers/.

Suren Baghdasaryan (6):
  mm: introduce vma->vm_flags modifier functions
  mm: replace VM_LOCKED_CLEAR_MASK with VM_LOCKED_MASK
  mm: replace vma->vm_flags direct modifications with modifier calls
  mm: replace vma->vm_flags indirect modification in ksm_madvise
  mm: introduce mod_vm_flags_nolock and use it in untrack_pfn
  mm: export dump_mm()

 arch/arm/kernel/process.c |  2 +-
 arch/ia64/mm/init.c   |  8 +--
 arch/loongarch/include/asm/tlb.h  |  2 +-
 arch/powerpc/kvm/book3s_hv_uvmem.c|  5 +-
 arch/powerpc/kvm/book3s_xive_native.c |  2 +-
 arch/powerpc/mm/book3s64/subpage_prot.c   |  2 +-
 arch/powerpc/platforms/book3s/vas-api.c   |  2 +-
 arch/powerpc/platforms/cell/spufs/file.c  | 14 ++---
 arch/s390/mm/gmap.c   |  8 +--
 arch/x86/entry/vsyscall/vsyscall_64.c |  2 +-
 arch/x86/kernel/cpu/sgx/driver.c  |  2 +-
 arch/x86/kernel/cpu/sgx/virt.c|  2 +-
 arch/x86/mm/pat/memtype.c | 14 +++--
 arch/x86/um/mem_32.c  |  2 +-
 drivers/acpi/pfr_telemetry.c  |  2 +-
 drivers/android/binder.c  |  3 +-
 drivers/char/mspec.c  |  2 +-
 drivers/crypto/hisilicon/qm.c |  2 +-
 drivers/dax/device.c  |  2 +-
 drivers/dma/idxd/cdev.c   |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c   |  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c  |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_events.c   |  4 +-
 drivers/gpu/drm/amd/amdkfd/kfd_process.c  |  4 +-
 drivers/gpu/drm/drm_gem.c |  2 +-
 drivers/gpu/drm/drm_gem_dma_helper.c  |  3 +-
 drivers/gpu/drm/drm_gem_shmem_helper.c|  2 +-
 drivers/gpu/drm/drm_vm.c  |  8 +--
 drivers/gpu/drm/etnaviv/etnaviv_gem.c |  2 +-
 drivers/gpu/drm/exynos/exynos_drm_gem.c   |  4 +-
 drivers/gpu/drm/gma500/framebuffer.c  |  2 +-
 drivers/gpu/drm/i810/i810_dma.c   |  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c  |  4 +-
 drivers/gpu/drm/mediatek/mtk_drm_gem.c|  2 +-
 drivers/gpu/drm/msm/msm_gem.c |  2 +-
 drivers/gpu/drm/omapdrm/omap_gem.c|  3 +-
 drivers/gpu/drm/rockchip/rockchip_drm_gem.c   |  3 +-
 drivers/gpu/drm/tegra/gem.c   |  5 +-
 drivers/gpu/drm/ttm/ttm_bo_vm.c   |  3 +-
 drivers/gpu/drm/virtio/virtgpu_vram.c |  2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c  |  2 +-
 drivers/gpu/drm/xen/xen_drm_front_gem.c   |  3 +-
 drivers/hsi/clients/cmt_speech.c  |  2 +-
 drivers/hwtracing/intel_th/msu.c  |  2 +-
 drivers/hwtracing/stm/core.c  |  2 +-
 drivers/infiniband/hw/hfi1/file_ops.c |  4 +-
 drivers/infiniband/hw/mlx5/main.c |  4 +-
 drivers/infiniband/hw/qib/qib_file_ops.c  | 13 +++--
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c  |  2 +-
 .../infiniband/hw/vmw_pvrdma/pvrdma_verbs.c   |  2 +-
 .../common/videobuf2/videobuf2-dma-contig.c   |  2 +-
 .../common/videobuf2/videobuf2-vmalloc.c  |  2 +-
 drivers/media/v4l2-core/videobuf-dma-contig.c |  2 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c |  4 +-
 drivers/media/v4l2-core/videobuf-vmalloc.c|  2 +-
 drivers/misc/cxl/context.c|  2 +-
 drivers/misc/habanalabs/common/memory.c   |  2 +-
 drivers/misc/habanalabs/gaudi/gaudi.c |  4 +-
 drivers/misc/habanalabs/gaudi2/gaudi2.c   |  8 +--
 drivers/misc/habanalabs/goya/goya.c   |  4 +-
 drivers/misc/ocxl/context.c   |  4 +-
 drivers/misc/ocxl/sysfs.c |  2 +-
 drivers/misc/open-dice.c  |  4 +-
 drivers/misc/sgi-gru/grufile.c|  4 +-
 drivers/misc/uacce/uacce.c|  2 +-
 drivers/sbus/char/oradax.c|  2 +-
 drivers/scsi/cxlflash/ocxl_hw.c   |  2 +-
 drivers/

[PATCH v2 3/6] mm: replace vma->vm_flags direct modifications with modifier calls

2023-01-25 Thread Suren Baghdasaryan
Replace direct modifications to vma->vm_flags with calls to modifier
functions to be able to track flag changes and to keep vma locking
correctness.

Signed-off-by: Suren Baghdasaryan 
---
 arch/arm/kernel/process.c  |  2 +-
 arch/ia64/mm/init.c|  8 
 arch/loongarch/include/asm/tlb.h   |  2 +-
 arch/powerpc/kvm/book3s_xive_native.c  |  2 +-
 arch/powerpc/mm/book3s64/subpage_prot.c|  2 +-
 arch/powerpc/platforms/book3s/vas-api.c|  2 +-
 arch/powerpc/platforms/cell/spufs/file.c   | 14 +++---
 arch/s390/mm/gmap.c|  3 +--
 arch/x86/entry/vsyscall/vsyscall_64.c  |  2 +-
 arch/x86/kernel/cpu/sgx/driver.c   |  2 +-
 arch/x86/kernel/cpu/sgx/virt.c |  2 +-
 arch/x86/mm/pat/memtype.c  |  6 +++---
 arch/x86/um/mem_32.c   |  2 +-
 drivers/acpi/pfr_telemetry.c   |  2 +-
 drivers/android/binder.c   |  3 +--
 drivers/char/mspec.c   |  2 +-
 drivers/crypto/hisilicon/qm.c  |  2 +-
 drivers/dax/device.c   |  2 +-
 drivers/dma/idxd/cdev.c|  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c|  2 +-
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c   |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c  |  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_events.c|  4 ++--
 drivers/gpu/drm/amd/amdkfd/kfd_process.c   |  4 ++--
 drivers/gpu/drm/drm_gem.c  |  2 +-
 drivers/gpu/drm/drm_gem_dma_helper.c   |  3 +--
 drivers/gpu/drm/drm_gem_shmem_helper.c |  2 +-
 drivers/gpu/drm/drm_vm.c   |  8 
 drivers/gpu/drm/etnaviv/etnaviv_gem.c  |  2 +-
 drivers/gpu/drm/exynos/exynos_drm_gem.c|  4 ++--
 drivers/gpu/drm/gma500/framebuffer.c   |  2 +-
 drivers/gpu/drm/i810/i810_dma.c|  2 +-
 drivers/gpu/drm/i915/gem/i915_gem_mman.c   |  4 ++--
 drivers/gpu/drm/mediatek/mtk_drm_gem.c |  2 +-
 drivers/gpu/drm/msm/msm_gem.c  |  2 +-
 drivers/gpu/drm/omapdrm/omap_gem.c |  3 +--
 drivers/gpu/drm/rockchip/rockchip_drm_gem.c|  3 +--
 drivers/gpu/drm/tegra/gem.c|  5 ++---
 drivers/gpu/drm/ttm/ttm_bo_vm.c|  3 +--
 drivers/gpu/drm/virtio/virtgpu_vram.c  |  2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_ttm_glue.c   |  2 +-
 drivers/gpu/drm/xen/xen_drm_front_gem.c|  3 +--
 drivers/hsi/clients/cmt_speech.c   |  2 +-
 drivers/hwtracing/intel_th/msu.c   |  2 +-
 drivers/hwtracing/stm/core.c   |  2 +-
 drivers/infiniband/hw/hfi1/file_ops.c  |  4 ++--
 drivers/infiniband/hw/mlx5/main.c  |  4 ++--
 drivers/infiniband/hw/qib/qib_file_ops.c   | 13 ++---
 drivers/infiniband/hw/usnic/usnic_ib_verbs.c   |  2 +-
 drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c|  2 +-
 .../media/common/videobuf2/videobuf2-dma-contig.c  |  2 +-
 drivers/media/common/videobuf2/videobuf2-vmalloc.c |  2 +-
 drivers/media/v4l2-core/videobuf-dma-contig.c  |  2 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c  |  4 ++--
 drivers/media/v4l2-core/videobuf-vmalloc.c |  2 +-
 drivers/misc/cxl/context.c |  2 +-
 drivers/misc/habanalabs/common/memory.c|  2 +-
 drivers/misc/habanalabs/gaudi/gaudi.c  |  4 ++--
 drivers/misc/habanalabs/gaudi2/gaudi2.c|  8 
 drivers/misc/habanalabs/goya/goya.c|  4 ++--
 drivers/misc/ocxl/context.c|  4 ++--
 drivers/misc/ocxl/sysfs.c  |  2 +-
 drivers/misc/open-dice.c   |  4 ++--
 drivers/misc/sgi-gru/grufile.c |  4 ++--
 drivers/misc/uacce/uacce.c |  2 +-
 drivers/sbus/char/oradax.c |  2 +-
 drivers/scsi/cxlflash/ocxl_hw.c|  2 +-
 drivers/scsi/sg.c  |  2 +-
 drivers/staging/media/atomisp/pci/hmm/hmm_bo.c |  2 +-
 drivers/staging/media/deprecated/meye/meye.c   |  4 ++--
 .../media/deprecated/stkwebcam/stk-webcam.c|  2 +-
 drivers/target/target_core_user.c  |  2 +-
 drivers/uio/uio.c  |  2 +-
 drivers/usb/core/devio.c   |  3 +--
 drivers/usb/mon/mon_bin.c  |  3 +--
 drivers/vdpa/vdpa_user/iova_domain.c   |  2 +-
 drivers/vfio/pci/vfio_pci_core.c   |  2 +-
 drivers/vhost/vdpa.c   |  2 +-
 drivers/video/fbdev/68328f

Re: [PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 10:37 AM Matthew Wilcox  wrote:
>
> On Wed, Jan 25, 2023 at 08:49:50AM -0800, Suren Baghdasaryan wrote:
> > On Wed, Jan 25, 2023 at 1:10 AM Peter Zijlstra  wrote:
> > > > + /*
> > > > +  * Flags, see mm.h.
> > > > +  * WARNING! Do not modify directly.
> > > > +  * Use {init|reset|set|clear|mod}_vm_flags() functions instead.
> > > > +  */
> > > > + unsigned long vm_flags;
> > >
> > > We have __private and ACCESS_PRIVATE() to help with enforcing this.
> >
> > Thanks for pointing this out, Peter! I guess for that I'll need to
> > convert all read accesses and provide get_vm_flags() too? That will
> > cause some additional churt (a quick search shows 801 hits over 248
> > files) but maybe it's worth it? I think Michal suggested that too in
> > another patch. Should I do that while we are at it?
>
> Here's a trick I saw somewhere in the VFS:
>
> union {
> const vm_flags_t vm_flags;
> vm_flags_t __private __vm_flags;
> };
>
> Now it can be read by anybody but written only by those using
> ACCESS_PRIVATE.

Huh, this is quite nice! I think it does not save us from the cases
when vma->vm_flags is passed by a reference and modified indirectly,
like in ksm_madvise()? Though maybe such usecases are so rare (I found
only 2 cases) that we can ignore this?


Re: [PATCH v2 5/6] mm: introduce mod_vm_flags_nolock and use it in untrack_pfn

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 1:42 AM Michal Hocko  wrote:
>
> On Wed 25-01-23 00:38:50, Suren Baghdasaryan wrote:
> > In cases when VMA flags are modified after VMA was isolated and mmap_lock
> > was downgraded, flags modifications would result in an assertion because
> > mmap write lock is not held.
> > Introduce mod_vm_flags_nolock to be used in such situation.
> > Pass a hint to untrack_pfn to conditionally use mod_vm_flags_nolock for
> > flags modification and to avoid assertion.
>
> The changelog nor the documentation of mod_vm_flags_nolock
> really explain when it is safe to use it. This is really important for
> future potential users.

True. I'll add clarification in the comments and in the changelog. Thanks!

>
> > Signed-off-by: Suren Baghdasaryan 
> > ---
> >  arch/x86/mm/pat/memtype.c | 10 +++---
> >  include/linux/mm.h| 12 +---
> >  include/linux/pgtable.h   |  5 +++--
> >  mm/memory.c   | 13 +++--
> >  mm/memremap.c |  4 ++--
> >  mm/mmap.c | 16 ++--
> >  6 files changed, 38 insertions(+), 22 deletions(-)
> >
> > diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
> > index ae9645c900fa..d8adc0b42cf2 100644
> > --- a/arch/x86/mm/pat/memtype.c
> > +++ b/arch/x86/mm/pat/memtype.c
> > @@ -1046,7 +1046,7 @@ void track_pfn_insert(struct vm_area_struct *vma, 
> > pgprot_t *prot, pfn_t pfn)
> >   * can be for the entire vma (in which case pfn, size are zero).
> >   */
> >  void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
> > -  unsigned long size)
> > +  unsigned long size, bool mm_wr_locked)
> >  {
> >   resource_size_t paddr;
> >   unsigned long prot;
> > @@ -1065,8 +1065,12 @@ void untrack_pfn(struct vm_area_struct *vma, 
> > unsigned long pfn,
> >   size = vma->vm_end - vma->vm_start;
> >   }
> >   free_pfn_range(paddr, size);
> > - if (vma)
> > - clear_vm_flags(vma, VM_PAT);
> > + if (vma) {
> > + if (mm_wr_locked)
> > + clear_vm_flags(vma, VM_PAT);
> > + else
> > + mod_vm_flags_nolock(vma, 0, VM_PAT);
> > + }
> >  }
> >
> >  /*
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 55335edd1373..48d49930c411 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -656,12 +656,18 @@ static inline void clear_vm_flags(struct 
> > vm_area_struct *vma,
> >   vma->vm_flags &= ~flags;
> >  }
> >
> > +static inline void mod_vm_flags_nolock(struct vm_area_struct *vma,
> > +unsigned long set, unsigned long clear)
> > +{
> > + vma->vm_flags |= set;
> > + vma->vm_flags &= ~clear;
> > +}
> > +
> >  static inline void mod_vm_flags(struct vm_area_struct *vma,
> >   unsigned long set, unsigned long clear)
> >  {
> >   mmap_assert_write_locked(vma->vm_mm);
> > - vma->vm_flags |= set;
> > - vma->vm_flags &= ~clear;
> > + mod_vm_flags_nolock(vma, set, clear);
> >  }
> >
> >  static inline void vma_set_anonymous(struct vm_area_struct *vma)
> > @@ -2087,7 +2093,7 @@ static inline void zap_vma_pages(struct 
> > vm_area_struct *vma)
> >  }
> >  void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
> >   struct vm_area_struct *start_vma, unsigned long start,
> > - unsigned long end);
> > + unsigned long end, bool mm_wr_locked);
> >
> >  struct mmu_notifier_range;
> >
> > diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
> > index 5fd45454c073..c63cd44777ec 100644
> > --- a/include/linux/pgtable.h
> > +++ b/include/linux/pgtable.h
> > @@ -1185,7 +1185,8 @@ static inline int track_pfn_copy(struct 
> > vm_area_struct *vma)
> >   * can be for the entire vma (in which case pfn, size are zero).
> >   */
> >  static inline void untrack_pfn(struct vm_area_struct *vma,
> > -unsigned long pfn, unsigned long size)
> > +unsigned long pfn, unsigned long size,
> > +bool mm_wr_locked)
> >  {
> >  }
> >
> > @@ -1203,7 +1204,7 @@ extern void track_pfn_insert(struct vm_area_struct 
> > *vma, pgprot_t *prot,
> >pfn_t pfn);
> >  extern int track_pfn_copy

Re: [PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 1:10 AM Peter Zijlstra  wrote:
>
> On Wed, Jan 25, 2023 at 12:38:46AM -0800, Suren Baghdasaryan wrote:
>
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 2d6d790d9bed..6c7c70bf50dd 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -491,7 +491,13 @@ struct vm_area_struct {
> >* See vmf_insert_mixed_prot() for discussion.
> >*/
> >   pgprot_t vm_page_prot;
> > - unsigned long vm_flags; /* Flags, see mm.h. */
> > +
> > + /*
> > +  * Flags, see mm.h.
> > +  * WARNING! Do not modify directly.
> > +  * Use {init|reset|set|clear|mod}_vm_flags() functions instead.
> > +  */
> > + unsigned long vm_flags;
>
> We have __private and ACCESS_PRIVATE() to help with enforcing this.

Thanks for pointing this out, Peter! I guess for that I'll need to
convert all read accesses and provide get_vm_flags() too? That will
cause some additional churt (a quick search shows 801 hits over 248
files) but maybe it's worth it? I think Michal suggested that too in
another patch. Should I do that while we are at it?

>


[PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions

2023-01-25 Thread Suren Baghdasaryan
vm_flags are among VMA attributes which affect decisions like VMA merging
and splitting. Therefore all vm_flags modifications are performed after
taking exclusive mmap_lock to prevent vm_flags updates racing with such
operations. Introduce modifier functions for vm_flags to be used whenever
flags are updated. This way we can better check and control correct
locking behavior during these updates.

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/mm.h   | 37 +
 include/linux/mm_types.h |  8 +++-
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c2f62bdce134..b71f2809caac 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -627,6 +627,43 @@ static inline void vma_init(struct vm_area_struct *vma, 
struct mm_struct *mm)
INIT_LIST_HEAD(>anon_vma_chain);
 }
 
+/* Use when VMA is not part of the VMA tree and needs no locking */
+static inline void init_vm_flags(struct vm_area_struct *vma,
+unsigned long flags)
+{
+   vma->vm_flags = flags;
+}
+
+/* Use when VMA is part of the VMA tree and modifications need coordination */
+static inline void reset_vm_flags(struct vm_area_struct *vma,
+ unsigned long flags)
+{
+   mmap_assert_write_locked(vma->vm_mm);
+   init_vm_flags(vma, flags);
+}
+
+static inline void set_vm_flags(struct vm_area_struct *vma,
+   unsigned long flags)
+{
+   mmap_assert_write_locked(vma->vm_mm);
+   vma->vm_flags |= flags;
+}
+
+static inline void clear_vm_flags(struct vm_area_struct *vma,
+ unsigned long flags)
+{
+   mmap_assert_write_locked(vma->vm_mm);
+   vma->vm_flags &= ~flags;
+}
+
+static inline void mod_vm_flags(struct vm_area_struct *vma,
+   unsigned long set, unsigned long clear)
+{
+   mmap_assert_write_locked(vma->vm_mm);
+   vma->vm_flags |= set;
+   vma->vm_flags &= ~clear;
+}
+
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
 {
vma->vm_ops = NULL;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 2d6d790d9bed..6c7c70bf50dd 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -491,7 +491,13 @@ struct vm_area_struct {
 * See vmf_insert_mixed_prot() for discussion.
 */
pgprot_t vm_page_prot;
-   unsigned long vm_flags; /* Flags, see mm.h. */
+
+   /*
+* Flags, see mm.h.
+* WARNING! Do not modify directly.
+* Use {init|reset|set|clear|mod}_vm_flags() functions instead.
+*/
+   unsigned long vm_flags;
 
/*
 * For areas with an address space and backing store,
-- 
2.39.1



Re: [PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 10:33 AM Matthew Wilcox  wrote:
>
> On Wed, Jan 25, 2023 at 12:38:46AM -0800, Suren Baghdasaryan wrote:
> > +/* Use when VMA is not part of the VMA tree and needs no locking */
> > +static inline void init_vm_flags(struct vm_area_struct *vma,
> > +  unsigned long flags)
> > +{
> > + vma->vm_flags = flags;
>
> vm_flags are supposed to have type vm_flags_t.  That's not been
> fully realised yet, but perhaps we could avoid making it worse?
>
> >   pgprot_t vm_page_prot;
> > - unsigned long vm_flags; /* Flags, see mm.h. */
> > +
> > + /*
> > +  * Flags, see mm.h.
> > +  * WARNING! Do not modify directly.
> > +  * Use {init|reset|set|clear|mod}_vm_flags() functions instead.
> > +  */
> > + unsigned long vm_flags;
>
> Including changing this line to vm_flags_t

Good point. Will make the change. Thanks!


Re: [PATCH v2 3/6] mm: replace vma->vm_flags direct modifications with modifier calls

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 1:30 AM 'Michal Hocko' via kernel-team
 wrote:
>
> On Wed 25-01-23 00:38:48, Suren Baghdasaryan wrote:
> > Replace direct modifications to vma->vm_flags with calls to modifier
> > functions to be able to track flag changes and to keep vma locking
> > correctness.
>
> Is this a manual (git grep) based work or have you used Coccinele for
> the patch generation?

It was a manual "search and replace" and in the process I temporarily
renamed vm_flags to ensure I did not miss any usage.

>
> My potentially incomplete check
> $ git grep ">[[:space:]]*vm_flags[[:space:]]*[&|^]="
>
> shows that nothing should be left after this. There is still quite a lot
> of direct checks of the flags (more than 600). Maybe it would be good to
> make flags accessible only via accessors which would also prevent any
> future direct setting of those flags in uncontrolled way as well.

Yes, I think Peter's suggestion in the first patch would also require
that. Much more churn but probably worth it for the future
maintenance. I'll add a patch which converts all readers as well.

>
> Anyway
> Acked-by: Michal Hocko 

Thanks for all the reviews!

> --
> Michal Hocko
> SUSE Labs
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an 
> email to kernel-team+unsubscr...@android.com.
>


[PATCH v2 2/6] mm: replace VM_LOCKED_CLEAR_MASK with VM_LOCKED_MASK

2023-01-25 Thread Suren Baghdasaryan
To simplify the usage of VM_LOCKED_CLEAR_MASK in clear_vm_flags(),
replace it with VM_LOCKED_MASK bitmask and convert all users.

Signed-off-by: Suren Baghdasaryan 
---
 include/linux/mm.h | 4 ++--
 kernel/fork.c  | 2 +-
 mm/hugetlb.c   | 4 ++--
 mm/mlock.c | 6 +++---
 mm/mmap.c  | 6 +++---
 mm/mremap.c| 2 +-
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index b71f2809caac..da62bdd627bf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -421,8 +421,8 @@ extern unsigned int kobjsize(const void *objp);
 /* This mask defines which mm->def_flags a process can inherit its parent */
 #define VM_INIT_DEF_MASK   VM_NOHUGEPAGE
 
-/* This mask is used to clear all the VMA flags used by mlock */
-#define VM_LOCKED_CLEAR_MASK   (~(VM_LOCKED | VM_LOCKONFAULT))
+/* This mask represents all the VMA flag bits used by mlock */
+#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT)
 
 /* Arch-specific flags to clear when updating VM flags on protection change */
 #ifndef VM_ARCH_CLEAR
diff --git a/kernel/fork.c b/kernel/fork.c
index 6683c1b0f460..03d472051236 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -669,7 +669,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
tmp->anon_vma = NULL;
} else if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
-   tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT);
+   clear_vm_flags(tmp, VM_LOCKED_MASK);
file = tmp->vm_file;
if (file) {
struct address_space *mapping = file->f_mapping;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d20c8b09890e..4ecdbad9a451 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6973,8 +6973,8 @@ static unsigned long page_table_shareable(struct 
vm_area_struct *svma,
unsigned long s_end = sbase + PUD_SIZE;
 
/* Allow segments to share if only one is marked locked */
-   unsigned long vm_flags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
-   unsigned long svm_flags = svma->vm_flags & VM_LOCKED_CLEAR_MASK;
+   unsigned long vm_flags = vma->vm_flags & ~VM_LOCKED_MASK;
+   unsigned long svm_flags = svma->vm_flags & ~VM_LOCKED_MASK;
 
/*
 * match the virtual addresses, permission and the alignment of the
diff --git a/mm/mlock.c b/mm/mlock.c
index 0336f52e03d7..5c4fff93cd6b 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -497,7 +497,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t 
len,
if (vma->vm_start != tmp)
return -ENOMEM;
 
-   newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+   newflags = vma->vm_flags & ~VM_LOCKED_MASK;
newflags |= flags;
/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
tmp = vma->vm_end;
@@ -661,7 +661,7 @@ static int apply_mlockall_flags(int flags)
struct vm_area_struct *vma, *prev = NULL;
vm_flags_t to_add = 0;
 
-   current->mm->def_flags &= VM_LOCKED_CLEAR_MASK;
+   current->mm->def_flags &= ~VM_LOCKED_MASK;
if (flags & MCL_FUTURE) {
current->mm->def_flags |= VM_LOCKED;
 
@@ -681,7 +681,7 @@ static int apply_mlockall_flags(int flags)
for_each_vma(vmi, vma) {
vm_flags_t newflags;
 
-   newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK;
+   newflags = vma->vm_flags & ~VM_LOCKED_MASK;
newflags |= to_add;
 
/* Ignore errors */
diff --git a/mm/mmap.c b/mm/mmap.c
index d4abc6feced1..323bd253b25a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2671,7 +2671,7 @@ unsigned long mmap_region(struct file *file, unsigned 
long addr,
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
-   vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+   clear_vm_flags(vma, VM_LOCKED_MASK);
else
mm->locked_vm += (len >> PAGE_SHIFT);
}
@@ -3340,8 +3340,8 @@ static struct vm_area_struct *__install_special_mapping(
vma->vm_start = addr;
vma->vm_end = addr + len;
 
-   vma->vm_flags = vm_flags | mm->def_flags | VM_DONTEXPAND | VM_SOFTDIRTY;
-   vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
+   init_vm_flags(vma, (vm_flags | mm->def_flags |
+ VM_DONTEXPAND | VM_SOFTDIRTY) & ~VM_LOCKED_MASK);
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
 
vma->vm_ops = ops;
diff --git a/mm/mremap.c b/mm/mremap.c
index 1b3ee02bead7..35db9752c

Re: [PATCH v2 4/6] mm: replace vma->vm_flags indirect modification in ksm_madvise

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 9:08 AM Michal Hocko  wrote:
>
> On Wed 25-01-23 08:57:48, Suren Baghdasaryan wrote:
> > On Wed, Jan 25, 2023 at 1:38 AM 'Michal Hocko' via kernel-team
> >  wrote:
> > >
> > > On Wed 25-01-23 00:38:49, Suren Baghdasaryan wrote:
> > > > Replace indirect modifications to vma->vm_flags with calls to modifier
> > > > functions to be able to track flag changes and to keep vma locking
> > > > correctness. Add a BUG_ON check in ksm_madvise() to catch indirect
> > > > vm_flags modification attempts.
> > >
> > > Those BUG_ONs scream to much IMHO. KSM is an MM internal code so I
> > > gueess we should be willing to trust it.
> >
> > Yes, but I really want to prevent an indirect misuse since it was not
> > easy to find these. If you feel strongly about it I will remove them
> > or if you have a better suggestion I'm all for it.
>
> You can avoid that by making flags inaccesible directly, right?

Ah, you mean Peter's suggestion of using __private? I guess that would
cover it. I'll drop these BUG_ONs in the next version. Thanks!

>
> --
> Michal Hocko
> SUSE Labs


[PATCH v2 5/6] mm: introduce mod_vm_flags_nolock and use it in untrack_pfn

2023-01-25 Thread Suren Baghdasaryan
In cases when VMA flags are modified after VMA was isolated and mmap_lock
was downgraded, flags modifications would result in an assertion because
mmap write lock is not held.
Introduce mod_vm_flags_nolock to be used in such situation.
Pass a hint to untrack_pfn to conditionally use mod_vm_flags_nolock for
flags modification and to avoid assertion.

Signed-off-by: Suren Baghdasaryan 
---
 arch/x86/mm/pat/memtype.c | 10 +++---
 include/linux/mm.h| 12 +---
 include/linux/pgtable.h   |  5 +++--
 mm/memory.c   | 13 +++--
 mm/memremap.c |  4 ++--
 mm/mmap.c | 16 ++--
 6 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/arch/x86/mm/pat/memtype.c b/arch/x86/mm/pat/memtype.c
index ae9645c900fa..d8adc0b42cf2 100644
--- a/arch/x86/mm/pat/memtype.c
+++ b/arch/x86/mm/pat/memtype.c
@@ -1046,7 +1046,7 @@ void track_pfn_insert(struct vm_area_struct *vma, 
pgprot_t *prot, pfn_t pfn)
  * can be for the entire vma (in which case pfn, size are zero).
  */
 void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
-unsigned long size)
+unsigned long size, bool mm_wr_locked)
 {
resource_size_t paddr;
unsigned long prot;
@@ -1065,8 +1065,12 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned 
long pfn,
size = vma->vm_end - vma->vm_start;
}
free_pfn_range(paddr, size);
-   if (vma)
-   clear_vm_flags(vma, VM_PAT);
+   if (vma) {
+   if (mm_wr_locked)
+   clear_vm_flags(vma, VM_PAT);
+   else
+   mod_vm_flags_nolock(vma, 0, VM_PAT);
+   }
 }
 
 /*
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 55335edd1373..48d49930c411 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -656,12 +656,18 @@ static inline void clear_vm_flags(struct vm_area_struct 
*vma,
vma->vm_flags &= ~flags;
 }
 
+static inline void mod_vm_flags_nolock(struct vm_area_struct *vma,
+  unsigned long set, unsigned long clear)
+{
+   vma->vm_flags |= set;
+   vma->vm_flags &= ~clear;
+}
+
 static inline void mod_vm_flags(struct vm_area_struct *vma,
unsigned long set, unsigned long clear)
 {
mmap_assert_write_locked(vma->vm_mm);
-   vma->vm_flags |= set;
-   vma->vm_flags &= ~clear;
+   mod_vm_flags_nolock(vma, set, clear);
 }
 
 static inline void vma_set_anonymous(struct vm_area_struct *vma)
@@ -2087,7 +2093,7 @@ static inline void zap_vma_pages(struct vm_area_struct 
*vma)
 }
 void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt,
struct vm_area_struct *start_vma, unsigned long start,
-   unsigned long end);
+   unsigned long end, bool mm_wr_locked);
 
 struct mmu_notifier_range;
 
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5fd45454c073..c63cd44777ec 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1185,7 +1185,8 @@ static inline int track_pfn_copy(struct vm_area_struct 
*vma)
  * can be for the entire vma (in which case pfn, size are zero).
  */
 static inline void untrack_pfn(struct vm_area_struct *vma,
-  unsigned long pfn, unsigned long size)
+  unsigned long pfn, unsigned long size,
+  bool mm_wr_locked)
 {
 }
 
@@ -1203,7 +1204,7 @@ extern void track_pfn_insert(struct vm_area_struct *vma, 
pgprot_t *prot,
 pfn_t pfn);
 extern int track_pfn_copy(struct vm_area_struct *vma);
 extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
-   unsigned long size);
+   unsigned long size, bool mm_wr_locked);
 extern void untrack_pfn_moved(struct vm_area_struct *vma);
 #endif
 
diff --git a/mm/memory.c b/mm/memory.c
index d6902065e558..5b11b50e2c4a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1613,7 +1613,7 @@ void unmap_page_range(struct mmu_gather *tlb,
 static void unmap_single_vma(struct mmu_gather *tlb,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr,
-   struct zap_details *details)
+   struct zap_details *details, bool mm_wr_locked)
 {
unsigned long start = max(vma->vm_start, start_addr);
unsigned long end;
@@ -1628,7 +1628,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
uprobe_munmap(vma, start, end);
 
if (unlikely(vma->vm_flags & VM_PFNMAP))
-   untrack_pfn(vma, 0, 0);
+   untrack_pfn(vma, 0, 0, mm_wr_locked);
 
if (start != end) {
if (unlikely(is_vm_hugetlb_page(vma))) {
@@ -1675,7 +1675,7 @@ static void unmap_single_vma(struct mmu_gather *tlb,
  */
 void unmap_vmas

Re: [PATCH v2 4/6] mm: replace vma->vm_flags indirect modification in ksm_madvise

2023-01-25 Thread Suren Baghdasaryan
On Wed, Jan 25, 2023 at 1:38 AM 'Michal Hocko' via kernel-team
 wrote:
>
> On Wed 25-01-23 00:38:49, Suren Baghdasaryan wrote:
> > Replace indirect modifications to vma->vm_flags with calls to modifier
> > functions to be able to track flag changes and to keep vma locking
> > correctness. Add a BUG_ON check in ksm_madvise() to catch indirect
> > vm_flags modification attempts.
>
> Those BUG_ONs scream to much IMHO. KSM is an MM internal code so I
> gueess we should be willing to trust it.

Yes, but I really want to prevent an indirect misuse since it was not
easy to find these. If you feel strongly about it I will remove them
or if you have a better suggestion I'm all for it.

>
> > Signed-off-by: Suren Baghdasaryan 
>
> Acked-by: Michal Hocko 
> --
> Michal Hocko
> SUSE Labs
>
> --
> To unsubscribe from this group and stop receiving emails from it, send an 
> email to kernel-team+unsubscr...@android.com.
>


[PATCH v2 6/6] mm: export dump_mm()

2023-01-25 Thread Suren Baghdasaryan
mmap_assert_write_locked() is used in vm_flags modifiers. Because
mmap_assert_write_locked() uses dump_mm() and vm_flags are sometimes
modified from from inside a module, it's necessary to export
dump_mm() function.

Signed-off-by: Suren Baghdasaryan 
---
 mm/debug.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mm/debug.c b/mm/debug.c
index 9d3d893dc7f4..96d594e16292 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -215,6 +215,7 @@ void dump_mm(const struct mm_struct *mm)
mm->def_flags, >def_flags
);
 }
+EXPORT_SYMBOL(dump_mm);
 
 static bool page_init_poisoning __read_mostly = true;
 
-- 
2.39.1



Re: [PATCH v2 1/6] mm: introduce vma->vm_flags modifier functions

2023-01-26 Thread Suren Baghdasaryan
On Thu, Jan 26, 2023 at 7:09 AM Matthew Wilcox  wrote:
>
> On Thu, Jan 26, 2023 at 04:50:59PM +0200, Mike Rapoport wrote:
> > On Thu, Jan 26, 2023 at 11:17:09AM +0200, Mike Rapoport wrote:
> > > On Wed, Jan 25, 2023 at 12:38:46AM -0800, Suren Baghdasaryan wrote:
> > > > +/* Use when VMA is not part of the VMA tree and needs no locking */
> > > > +static inline void init_vm_flags(struct vm_area_struct *vma,
> > > > +  unsigned long flags)
> > >
> > > I'd suggest to make it vm_flags_init() etc.
> >
> > Thinking more about it, it will be even clearer to name these 
> > vma_flags_xyz()
>
> Perhaps vma_VERB_flags()?
>
> vma_init_flags()
> vma_reset_flags()
> vma_set_flags()
> vma_clear_flags()
> vma_mod_flags()

Due to excessive email bouncing I posted the v3 of this patchset using
the original per-VMA patchset's distribution list. That might have
dropped Mike from the list. Sorry about that Mike, I'll add you to my
usual list of suspects :)
The v3 is here:
https://lore.kernel.org/all/20230125233554.153109-1-sur...@google.com/
and Andrew did suggest the same renames, so I'll be posting v4 with
those changes later today.
Thanks for the feedback!

>