RE: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Namjae Jeon
> 
> > -----Original Message-----
> > From: Namjae Jeon 
> > Sent: Sunday, March 21, 2021 10:14 PM
> > To: linux-fsde...@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
> > c...@vger.kernel.org
> > Cc: linux-cifsd-de...@lists.sourceforge.net; smfre...@gmail.com;
> > senozhat...@chromium.org; hyc@gmail.com; v...@zeniv.linux.org.uk;
> > h...@lst.de; h...@infradead.org; ronniesahlb...@gmail.com;
> > aurelien.ap...@gmail.com; aap...@suse.com; sand...@sandeen.net;
> > dan.carpen...@oracle.com; colin.k...@canonical.com;
> > rdun...@infradead.org; Namjae Jeon ;
> > Sergey Senozhatsky ; Steve French
> > 
> > Subject: [PATCH 3/5] cifsd: add file operations
> >
> > This adds file operations and buffer pool for cifsd.
> >
> > Signed-off-by: Namjae Jeon 
> > Signed-off-by: Sergey Senozhatsky 
> > Signed-off-by: Hyunchul Lee 
> > Acked-by: Ronnie Sahlberg 
> > Signed-off-by: Steve French 
> > ---
> >  fs/cifsd/buffer_pool.c |  292 ++
> >  fs/cifsd/buffer_pool.h |   28 +
> >  fs/cifsd/vfs.c | 1989 
> >  fs/cifsd/vfs.h |  314 +++
> >  fs/cifsd/vfs_cache.c   |  851 +
> >  fs/cifsd/vfs_cache.h   |  213 +
> >  6 files changed, 3687 insertions(+)
> >  create mode 100644 fs/cifsd/buffer_pool.c
> >  create mode 100644 fs/cifsd/buffer_pool.h
> >  create mode 100644 fs/cifsd/vfs.c
> >  create mode 100644 fs/cifsd/vfs.h
> >  create mode 100644 fs/cifsd/vfs_cache.c
> >  create mode 100644 fs/cifsd/vfs_cache.h
> >
> > diff --git a/fs/cifsd/buffer_pool.c b/fs/cifsd/buffer_pool.c
> > new file mode 100644
> > index ..864fea547c68
> > --- /dev/null
> > +++ b/fs/cifsd/buffer_pool.c
> > @@ -0,0 +1,292 @@
> > +// SPDX-License-Identifier: GPL-2.0-or-later
> > +/*
> > + *   Copyright (C) 2018 Samsung Electronics Co., Ltd.
> > + */
> > +
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +#include 
> > +
> > +#include "glob.h"
> > +#include "buffer_pool.h"
> > +#include "connection.h"
> > +#include "mgmt/ksmbd_ida.h"
> > +
> > +static struct kmem_cache *filp_cache;
> > +
> > +struct wm {
> > +   struct list_head    list;
> > +   unsigned int        sz;
> > +   char                buffer[0];
> > +};
> > +
> > +struct wm_list {
> > +   struct list_head    list;
> > +   unsigned int        sz;
> > +
> > +   spinlock_t          wm_lock;
> > +   int                 avail_wm;
> > +   struct list_head    idle_wm;
> > +   wait_queue_head_t   wm_wait;
> > +};
> > +
> > +static LIST_HEAD(wm_lists);
> > +static DEFINE_RWLOCK(wm_lists_lock);
> > +
> > +void *ksmbd_alloc(size_t size)
> > +{
> > +   return kvmalloc(size, GFP_KERNEL | __GFP_ZERO);
> > +}
> > +
> > +void ksmbd_free(void *ptr)
> > +{
> > +   kvfree(ptr);
> > +}
> > +
> > +static struct wm *wm_alloc(size_t sz, gfp_t flags)
> > +{
> > +   struct wm *wm;
> > +   size_t alloc_sz = sz + sizeof(struct wm);
> > +
> > +   wm = kvmalloc(alloc_sz, flags);
> > +   if (!wm)
> > +   return NULL;
> > +   wm->sz = sz;
> > +   return wm;
> > +}
> > +
> > +static int register_wm_size_class(size_t sz)
> > +{
> > +   struct wm_list *l, *nl;
> > +
> > +   nl = kvmalloc(sizeof(struct wm_list), GFP_KERNEL);
> > +   if (!nl)
> > +   return -ENOMEM;
> > +
> > +   nl->sz = sz;
> > +   spin_lock_init(&nl->wm_lock);
> > +   INIT_LIST_HEAD(&nl->idle_wm);
> > +   INIT_LIST_HEAD(&nl->list);
> > +   init_waitqueue_head(&nl->wm_wait);
> > +   nl->avail_wm = 0;
> > +
> > +   write_lock(&wm_lists_lock);
> > +   list_for_each_entry(l, &wm_lists, list) {
> > +   if (l->sz == sz) {
> > +   write_unlock(&wm_lists_lock);
> > +   kvfree(nl);
> > +   return 0;
> > +   }
> > +   }
> > +
> > +   list_add(&nl->list, &wm_lists);
> > +   write_unlock(&wm_lists_lock);
> > +   return 0;
> > +}
> > +
> > +static struct wm_list *match_wm_list(size_t size)
> > +{
> > +   struct wm_list *l, *rl = NULL;
> > +
> > +   read_lock(&wm_lists_lock);
> > +   list_for_each_entry(l, &wm_lists, list) {
> > +   if (l->sz == size) {
> > +   rl = l;
> > +   break;
> > +   }
> > +   }
> > +   read_unlock(&wm_lists_lock);
> > +   return rl;
> > +}
> > +
> > +static struct wm *find_wm(size_t size)
> > +{
> > +   struct wm_list *wm_list;
> > +   struct wm *wm;
> > +
> > +   wm_list = match_wm_list(size);
> > +   if (!wm_list) {
> > +   if (register_wm_size_class(size))
> > +   return NULL;
> > +   wm_list = match_wm_list(size);
> > +   }
> > +
> > +   if (!wm_list)
> > +   return NULL;
> > +
> > +   while (1) {
> > +   spin_lock(&wm_list->wm_lock);
> > +   if (!list_empty(&wm_list->idle_wm)) {
> > +   wm = list_entry(wm_list->idle_wm.next,
> > +   struct wm,
> > +   list);
> > +   list_del(&wm->list);
> > +   spin_unlock(&wm_list->wm_lock);
> 

RE: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Namjae Jeon
> On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote:
> > This adds file operations and buffer pool for cifsd.
> 
> Some random notes:
> 
> > +static void rollback_path_modification(char *filename)
> > +{
> > +   if (filename) {
> > +   filename--;
> > +   *filename = '/';
> What an odd way to spell filename[-1] = '/';...
Okay. Will fix.
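Presumably something like this (a minimal sketch of the suggested spelling):

static void rollback_path_modification(char *filename)
{
	if (filename)
		filename[-1] = '/';	/* restore the '/' that was cut off earlier */
}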
> 
> > +int ksmbd_vfs_inode_permission(struct dentry *dentry, int acc_mode,
> > +bool delete)
> > +{
> 
> > +   if (delete) {
> > +   struct dentry *parent;
> > +
> > +   parent = dget_parent(dentry);
> > +   if (!parent)
> > +   return -EINVAL;
> > +
> > +   if (inode_permission(&init_user_ns, d_inode(parent), MAY_EXEC | MAY_WRITE)) {
> > +   dput(parent);
> > +   return -EACCES;
> > +   }
> > +   dput(parent);
> 
> Who's to guarantee that parent is stable?  IOW, by the time of that
> inode_permission() call dentry might very well not be a child of that thing...
Okay, Will fix.
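One possible shape for that fix, a sketch only: lock the parent's i_rwsem
and re-check d_parent before the permission call (the -EAGAIN and `ret`
here are made up for illustration):

	parent = dget_parent(dentry);
	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
	if (dentry->d_parent != parent) {
		/* raced with rename: dentry is no longer a child of parent */
		inode_unlock(d_inode(parent));
		dput(parent);
		return -EAGAIN;
	}
	ret = inode_permission(&init_user_ns, d_inode(parent),
			       MAY_EXEC | MAY_WRITE);
	inode_unlock(d_inode(parent));
	dput(parent);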
> 
> > +   parent = dget_parent(dentry);
> > +   if (!parent)
> > +   return 0;
> > +
> > +   if (!inode_permission(&init_user_ns, d_inode(parent), MAY_EXEC | MAY_WRITE))
> > +   *daccess |= FILE_DELETE_LE;
> 
> Ditto.
Okay.
> 
> > +int ksmbd_vfs_mkdir(struct ksmbd_work *work,
> > +   const char *name,
> > +   umode_t mode)
> 
> 
> > +   err = vfs_mkdir(&init_user_ns, d_inode(path.dentry), dentry, mode);
> > +   if (!err) {
> > +   ksmbd_vfs_inherit_owner(work, d_inode(path.dentry),
> > +   d_inode(dentry));
> 
> ->mkdir() might very well return success, with dentry left unhashed negative.
> Look at the callers of vfs_mkdir() to see how it should be handled.
Okay, Will fix.
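Roughly the pattern other vfs_mkdir() callers use (a sketch; it assumes the
parent's i_rwsem is still held at this point, and the `out` label is
hypothetical):

	err = vfs_mkdir(&init_user_ns, d_inode(path.dentry), dentry, mode);
	if (!err && d_unhashed(dentry)) {
		/* ->mkdir() succeeded but left the dentry unhashed negative;
		 * look the result up again before using it */
		struct dentry *d;

		d = lookup_one_len(dentry->d_name.name, dentry->d_parent,
				   dentry->d_name.len);
		if (IS_ERR(d)) {
			err = PTR_ERR(d);
			goto out;
		}
		dput(dentry);
		dentry = d;
	}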
> 
> > +static int check_lock_range(struct file *filp,
> > +   loff_t start,
> > +   loff_t end,
> > +   unsigned char type)
> > +{
> > +   struct file_lock *flock;
> > +   struct file_lock_context *ctx = file_inode(filp)->i_flctx;
> > +   int error = 0;
> > +
> > +   if (!ctx || list_empty_careful(&ctx->flc_posix))
> > +   return 0;
> > +
> > +   spin_lock(&ctx->flc_lock);
> > +   list_for_each_entry(flock, &ctx->flc_posix, fl_list) {
> > +   /* check conflict locks */
> > +   if (flock->fl_end >= start && end >= flock->fl_start) {
> > +   if (flock->fl_type == F_RDLCK) {
> > +   if (type == WRITE) {
> > +   ksmbd_err("not allow write by shared 
> > lock\n");
> > +   error = 1;
> > +   goto out;
> > +   }
> > +   } else if (flock->fl_type == F_WRLCK) {
> > +   /* check owner in lock */
> > +   if (flock->fl_file != filp) {
> > +   error = 1;
> > +   ksmbd_err("not allow rw access by 
> > exclusive lock from other
> opens\n");
> > +   goto out;
> > +   }
> > +   }
> > +   }
> > +   }
> > +out:
> > +   spin_unlock(&ctx->flc_lock);
> > +   return error;
> > +}
> 
> WTF is that doing in smbd?
This code was added to pass the smb2 lock tests in Samba's smbtorture.
Will fix it.
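If a conflict check has to stay at all, one option (an untested sketch;
the fl_owner convention would have to match however the server sets its
own locks) is to let the VFS do the walk via posix_test_lock():

	struct file_lock fl = {
		.fl_start = start,
		.fl_end   = end,
		.fl_type  = (type == WRITE) ? F_WRLCK : F_RDLCK,
		.fl_flags = FL_POSIX,
		.fl_file  = filp,
	};

	/* posix_test_lock() sets fl_type to F_UNLCK if nothing conflicts */
	posix_test_lock(filp, &fl);
	return fl.fl_type != F_UNLCK;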
> 
> > +   filp = fp->filp;
> > +   inode = d_inode(filp->f_path.dentry);
> 
> That should be file_inode().  Try it on overlayfs, watch it do interesting 
> things...
Okay.
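I.e., presumably:

	filp = fp->filp;
	inode = file_inode(filp);	/* correct on overlayfs and other stacked filesystems */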
> 
> > +   nbytes = kernel_read(filp, rbuf, count, pos);
> > +   if (nbytes < 0) {
> > +   name = d_path(&filp->f_path, namebuf, sizeof(namebuf));
> > +   if (IS_ERR(name))
> > +   name = "(error)";
> > +   ksmbd_err("smb read failed for (%s), err = %zd\n",
> > +   name, nbytes);
> 
> Do you really want the full pathname here?  For (presumably) spew into syslog?
No, Will fix.
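Something shorter instead, perhaps (sketch):

	nbytes = kernel_read(filp, rbuf, count, pos);
	if (nbytes < 0)
		ksmbd_err("smb read failed, err = %zd\n", nbytes);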
> 
> > +int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
> > +{
> > +   struct path parent;
> > +   struct dentry *dir, *dentry;
> > +   char *last;
> > +   int err = -ENOENT;
> > +
> > +   last = extract_last_component(name);
> > +   if (!last)
> > +   return -ENOENT;
> 
> Yeccchhh...
I guess I'll change it to return err instead of -ENOENT.

> 
> > +   if (ksmbd_override_fsids(work))
> > +   return -ENOMEM;
> > +
> > +   err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &parent);
> > +   if (err) {
> > +   ksmbd_debug(VFS, "can't get %s, err %d\n", name, err);
> > +   ksmbd_revert_fsids(work);
> > +   rollback_path_modification(last);
> > +   return err;
> > +   }
> > +
> > +   dir = parent.dentry;
> > +   if (!d_inode(dir))
> > +   goto out;
> 
> Really?  When does that happen?
Will fix.
> 
> > +static int __ksmbd_vfs_rename(struct ksmbd_work *work,
> > + struct dentry *src_dent_parent,
> > +  

Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Sergey Senozhatsky
On (21/03/22 17:09), Matthew Wilcox wrote:
> On Mon, Mar 22, 2021 at 06:03:21PM +0900, Sergey Senozhatsky wrote:
> > On (21/03/22 08:15), Matthew Wilcox wrote:
> > > 
> > > What's the scenario for which your allocator performs better than slub
> > > 
> > 
> > IIRC request and reply buffers can be up to 4M in size. So this stuff
> > just allocates a number of fat buffers and keeps them around so that
> > it doesn't have to vmalloc(4M) for every request and every response.
> 
> Hang on a minute, this speaks to a deeper design problem.  If we're doing
> a 'request' or 'reply' that is that large, the I/O should be coming from
> or going to the page cache.  If it goes via a 4MB virtually contiguous
> buffer first, that's a memcpy that could/should be avoided.

But huge vmalloc buffers are still needed. For instance, `ls -la` in
a directory with a huge number of entries.

> But now i'm looking for how ksmbd_find_buffer() is used, and it isn't.
> So it looks like someone came to the same conclusion I did, but forgot
> to delete the wm code.

Yes, I think it's disabled by default and requires some smb.conf
configuration. So I guess that wm code can be removed. Especially given
that

> That said, there are plenty of opportunities to optimise the vmalloc code,
> and that's worth pursuing.

That would be really interesting to see!

> And here's the receive path which contains
> the memcpy that should be avoided (ok, it's actually the call to ->read;
> we shouldn't be reading in the entire 4MB but rather the header):

> + conn->request_buf = ksmbd_alloc_request(size);
> + if (!conn->request_buf)
> + continue;
> +
> + memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf));
> + if (!ksmbd_smb_request(conn))
> + break;
> +
> + /*
> +  * We already read 4 bytes to find out PDU size, now
> +  * read in PDU
> +  */
> + size = t->ops->read(t, conn->request_buf + 4, pdu_size);


// A side note, it seems that the maximum read/write/trans buffer size that
// Windows supports is 8MB, not 4MB.


Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Matthew Wilcox
On Mon, Mar 22, 2021 at 06:03:21PM +0900, Sergey Senozhatsky wrote:
> On (21/03/22 08:15), Matthew Wilcox wrote:
> > 
> > What's the scenario for which your allocator performs better than slub
> > 
> 
> IIRC request and reply buffers can be up to 4M in size. So this stuff
> just allocates a number of fat buffers and keeps them around so that
> it doesn't have to vmalloc(4M) for every request and every response.

Hang on a minute, this speaks to a deeper design problem.  If we're doing
a 'request' or 'reply' that is that large, the I/O should be coming from
or going to the page cache.  If it goes via a 4MB virtually contiguous
buffer first, that's a memcpy that could/should be avoided.

But now i'm looking for how ksmbd_find_buffer() is used, and it isn't.
So it looks like someone came to the same conclusion I did, but forgot
to delete the wm code.

That said, there are plenty of opportunities to optimise the vmalloc code,
and that's worth pursuing.  And here's the receive path which contains
the memcpy that should be avoided (ok, it's actually the call to ->read;
we shouldn't be reading in the entire 4MB but rather the header):

+   conn->request_buf = ksmbd_alloc_request(size);
+   if (!conn->request_buf)
+   continue;
+
+   memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf));
+   if (!ksmbd_smb_request(conn))
+   break;
+
+   /*
+* We already read 4 bytes to find out PDU size, now
+* read in PDU
+*/
+   size = t->ops->read(t, conn->request_buf + 4, pdu_size);



RE: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Schaufler, Casey
> -----Original Message-----
> From: Namjae Jeon 
> Sent: Sunday, March 21, 2021 10:14 PM
> To: linux-fsde...@vger.kernel.org; linux-kernel@vger.kernel.org; linux-
> c...@vger.kernel.org
> Cc: linux-cifsd-de...@lists.sourceforge.net; smfre...@gmail.com;
> senozhat...@chromium.org; hyc@gmail.com; v...@zeniv.linux.org.uk;
> h...@lst.de; h...@infradead.org; ronniesahlb...@gmail.com;
> aurelien.ap...@gmail.com; aap...@suse.com; sand...@sandeen.net;
> dan.carpen...@oracle.com; colin.k...@canonical.com;
> rdun...@infradead.org; Namjae Jeon ;
> Sergey Senozhatsky ; Steve French
> 
> Subject: [PATCH 3/5] cifsd: add file operations
> 
> This adds file operations and buffer pool for cifsd.
> 
> Signed-off-by: Namjae Jeon 
> Signed-off-by: Sergey Senozhatsky 
> Signed-off-by: Hyunchul Lee 
> Acked-by: Ronnie Sahlberg 
> Signed-off-by: Steve French 
> ---
>  fs/cifsd/buffer_pool.c |  292 ++
>  fs/cifsd/buffer_pool.h |   28 +
>  fs/cifsd/vfs.c | 1989 
>  fs/cifsd/vfs.h |  314 +++
>  fs/cifsd/vfs_cache.c   |  851 +
>  fs/cifsd/vfs_cache.h   |  213 +
>  6 files changed, 3687 insertions(+)
>  create mode 100644 fs/cifsd/buffer_pool.c
>  create mode 100644 fs/cifsd/buffer_pool.h
>  create mode 100644 fs/cifsd/vfs.c
>  create mode 100644 fs/cifsd/vfs.h
>  create mode 100644 fs/cifsd/vfs_cache.c
>  create mode 100644 fs/cifsd/vfs_cache.h
> 
> diff --git a/fs/cifsd/buffer_pool.c b/fs/cifsd/buffer_pool.c
> new file mode 100644
> index ..864fea547c68
> --- /dev/null
> +++ b/fs/cifsd/buffer_pool.c
> @@ -0,0 +1,292 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + *   Copyright (C) 2018 Samsung Electronics Co., Ltd.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "glob.h"
> +#include "buffer_pool.h"
> +#include "connection.h"
> +#include "mgmt/ksmbd_ida.h"
> +
> +static struct kmem_cache *filp_cache;
> +
> +struct wm {
> + struct list_head    list;
> + unsigned int        sz;
> + char                buffer[0];
> +};
> +
> +struct wm_list {
> + struct list_head    list;
> + unsigned int        sz;
> +
> + spinlock_t          wm_lock;
> + int                 avail_wm;
> + struct list_head    idle_wm;
> + wait_queue_head_t   wm_wait;
> +};
> +
> +static LIST_HEAD(wm_lists);
> +static DEFINE_RWLOCK(wm_lists_lock);
> +
> +void *ksmbd_alloc(size_t size)
> +{
> + return kvmalloc(size, GFP_KERNEL | __GFP_ZERO);
> +}
> +
> +void ksmbd_free(void *ptr)
> +{
> + kvfree(ptr);
> +}
> +
> +static struct wm *wm_alloc(size_t sz, gfp_t flags)
> +{
> + struct wm *wm;
> + size_t alloc_sz = sz + sizeof(struct wm);
> +
> + wm = kvmalloc(alloc_sz, flags);
> + if (!wm)
> + return NULL;
> + wm->sz = sz;
> + return wm;
> +}
> +
> +static int register_wm_size_class(size_t sz)
> +{
> + struct wm_list *l, *nl;
> +
> + nl = kvmalloc(sizeof(struct wm_list), GFP_KERNEL);
> + if (!nl)
> + return -ENOMEM;
> +
> + nl->sz = sz;
> + spin_lock_init(&nl->wm_lock);
> + INIT_LIST_HEAD(&nl->idle_wm);
> + INIT_LIST_HEAD(&nl->list);
> + init_waitqueue_head(&nl->wm_wait);
> + nl->avail_wm = 0;
> +
> + write_lock(&wm_lists_lock);
> + list_for_each_entry(l, &wm_lists, list) {
> + if (l->sz == sz) {
> + write_unlock(&wm_lists_lock);
> + kvfree(nl);
> + return 0;
> + }
> + }
> +
> + list_add(&nl->list, &wm_lists);
> + write_unlock(&wm_lists_lock);
> + return 0;
> +}
> +
> +static struct wm_list *match_wm_list(size_t size)
> +{
> + struct wm_list *l, *rl = NULL;
> +
> + read_lock(&wm_lists_lock);
> + list_for_each_entry(l, &wm_lists, list) {
> + if (l->sz == size) {
> + rl = l;
> + break;
> + }
> + }
> + read_unlock(&wm_lists_lock);
> + return rl;
> +}
> +
> +static struct wm *find_wm(size_t size)
> +{
> + struct wm_list *wm_list;
> + struct wm *wm;
> +
> + wm_list = match_wm_list(size);
> + if (!wm_list) {
> + if (register_wm_size_class(size))
> + return NULL;
> + wm_list = match_wm_list(size);
> + }
> +
> + if (!wm_list)
> + return NULL;
> +
> + while (1) {
> + spin_lock(&wm_list->wm_lock);
> + if (!list_empty(&wm_list->idle_wm)) {
> + wm = list_entry(wm_list->idle_wm.next,
> + struct wm,
> + list);
> + list_del(&wm->list);
> + spin_unlock(&wm_list->wm_lock);
> + return wm;
> + }
> +
> + if (wm_list->avail_wm > num_online_cpus()) {
> + spin_unlock(&wm_list->wm_lock);
> + 

Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Matthew Wilcox
On Mon, Mar 22, 2021 at 02:57:18PM +0100, Christoph Hellwig wrote:
> On Mon, Mar 22, 2021 at 06:03:21PM +0900, Sergey Senozhatsky wrote:
> > On (21/03/22 08:15), Matthew Wilcox wrote:
> > > 
> > > What's the scenario for which your allocator performs better than slub
> > > 
> > 
> > IIRC request and reply buffers can be up to 4M in size. So this stuff
> > just allocates a number of fat buffers and keeps them around so that
> > it doesn't have to vmalloc(4M) for every request and every response.
> 
> Do we have any data suggesting it is faster than vmalloc?

Oh, I have no trouble believing it's faster than vmalloc.  Here's
the fast(!) path that always has memory available, never does retries.
I'm calling out the things I perceive as expensive on the right hand side.
Also, I'm taking the 4MB size as the example.

vmalloc()
  __vmalloc_node()
__vmalloc_node_range()
  __get_vm_area_node()
[allocates vm_struct]
alloc_vmap_area()
[allocates vmap_area]
[takes free_vmap_area_lock]
  __alloc_vmap_area()
find_vmap_lowest_match
[walks free_vmap_area_root]
[takes vmap_area_lock]
  __vmalloc_area_node()
... array_size is 8KiB, we call __vmalloc_node
__vmalloc_node
[everything we did above, all over again,
 two more allocations, two more lock acquire]
alloc_pages_node(), 1024 times
vmap_pages_range_noflush()
  vmap_range_noflush()
[allocate at least two pages for PTEs]

There's definitely some low-hanging fruit here.  __vmalloc_area_node()
should probably call kvmalloc_node() instead of __vmalloc_node() for
table sizes > 4KiB.  But a lot of this is inherent to how vmalloc works,
and we need to put a cache in front of it.  Just not this one.
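A sketch of that first suggestion (hypothetical; `pages`, `array_size`,
`nested_gfp` and `node` are the existing locals in __vmalloc_area_node()):

	/* the page-pointer array for a 4MB mapping is 8KiB; use the
	 * kmalloc-with-vmalloc-fallback helper instead of recursing
	 * into __vmalloc_node() */
	if (array_size > PAGE_SIZE)
		pages = kvmalloc_node(array_size, nested_gfp, node);
	else
		pages = kmalloc_node(array_size, nested_gfp, node);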


Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Christoph Hellwig
On Mon, Mar 22, 2021 at 06:03:21PM +0900, Sergey Senozhatsky wrote:
> On (21/03/22 08:15), Matthew Wilcox wrote:
> > 
> > What's the scenario for which your allocator performs better than slub
> > 
> 
> IIRC request and reply buffers can be up to 4M in size. So this stuff
> just allocates a number of fat buffers and keeps them around so that
> it doesn't have to vmalloc(4M) for every request and every response.

Do we have any data suggesting it is faster than vmalloc?


Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Matthew Wilcox
On Mon, Mar 22, 2021 at 06:03:21PM +0900, Sergey Senozhatsky wrote:
> On (21/03/22 08:15), Matthew Wilcox wrote:
> > 
> > What's the scenario for which your allocator performs better than slub
> > 
> 
> IIRC request and reply buffers can be up to 4M in size. So this stuff
> just allocates a number of fat buffers and keeps them around so that
> it doesn't have to vmalloc(4M) for every request and every response.

That makes a lot more sense; I was thrown off by the kvmalloc, which
is usually used for allocations that might be smaller than PAGE_SIZE.

So what this patch is really saying is that vmalloc() should include
some caching, so it can defer freeing until there's memory pressure
or it's built up a large (percpu) backlog of freed areas.

Vlad, have you thought about this?


Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Sergey Senozhatsky
On (21/03/22 10:04), Dan Carpenter wrote:
> 
> On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote:
> > +void *ksmbd_alloc(size_t size)
> > +{
> > +   return kvmalloc(size, GFP_KERNEL | __GFP_ZERO);
> 
> 
> This patch adds a bunch of wrappers around kvmalloc().  Don't do that.

Agreed. This was not cleaned up properly. The initial implementation
of that function (IIRC it was sometime in 2018) basically contained a
kvmalloc() implementation of its own, because back then Samsung used a
kernel version that simply didn't have kvmalloc() (< KERNEL_VERSION(5, 0, 0)).


Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Sergey Senozhatsky
On (21/03/22 07:02), Al Viro wrote:
> On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote:
> > +static struct ksmbd_file *__ksmbd_lookup_fd(struct ksmbd_file_table *ft,
> > +   unsigned int id)
> > +{
> > +   bool unclaimed = true;
> > +   struct ksmbd_file *fp;
> > +
> > +   read_lock(&ft->lock);
> > +   fp = idr_find(ft->idr, id);
> > +   if (fp)
> > +   fp = ksmbd_fp_get(fp);
> > +
> > +   if (fp && fp->f_ci) {
> > +   read_lock(&fp->f_ci->m_lock);
> > +   unclaimed = list_empty(&fp->node);
> > +   read_unlock(&fp->f_ci->m_lock);
> > +   }
> > +   read_unlock(&ft->lock);
> > +
> > +   if (fp && unclaimed) {
> > +   atomic_dec(&fp->refcount);
> > +   return NULL;
> > +   }
>
> Can that atomic_dec() end up dropping the last remaining reference?

Yes, I think it should increment the refcount only for a "claimed" fp.
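A sketch of that, taking the reference only after the claim check and
keeping everything under ft->lock:

	read_lock(&ft->lock);
	fp = idr_find(ft->idr, id);
	if (fp && fp->f_ci) {
		read_lock(&fp->f_ci->m_lock);
		unclaimed = list_empty(&fp->node);
		read_unlock(&fp->f_ci->m_lock);
	}
	if (fp && !unclaimed)
		fp = ksmbd_fp_get(fp);	/* reference taken only for a claimed fp */
	else
		fp = NULL;
	read_unlock(&ft->lock);
	return fp;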


Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Sergey Senozhatsky
On (21/03/22 08:15), Matthew Wilcox wrote:
> 
> What's the scenario for which your allocator performs better than slub
> 

IIRC request and reply buffers can be up to 4M in size. So this stuff
just allocates a number of fat buffers and keeps them around so that
it doesn't have to vmalloc(4M) for every request and every response.


Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Matthew Wilcox
On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote:
> This adds file operations and buffer pool for cifsd.

Why do you want this buffer pool?  Do you not trust the
slab allocator to be able to do its job?  Because what you have
here looks slower than the slab allocator.

Let's follow this through for the best-case scenario (a buffer of the right
size already exists):

> +void *ksmbd_find_buffer(size_t size)
> +{
> + struct wm *wm;
> +
> + wm = find_wm(size);
> +
> + WARN_ON(!wm);
> + if (wm)
> + return wm->buffer;
> + return NULL;
> +}

OK, simple, we just call find_wm().

> +static struct wm *find_wm(size_t size)
> +{
> + struct wm_list *wm_list;
> + struct wm *wm;
> +
> + wm_list = match_wm_list(size);

First we find the list for this buffer ...

> +static struct wm_list *match_wm_list(size_t size)
> +{
> + struct wm_list *l, *rl = NULL;
> +
> + read_lock(&wm_lists_lock);
> + list_for_each_entry(l, &wm_lists, list) {
> + if (l->sz == size) {
> + rl = l;
> + break;
> + }
> + }
> + read_unlock(&wm_lists_lock);
> + return rl;
> +}

... by taking an rwlock, and walking a linked list?!  Uh ...

> + while (1) {
> + spin_lock(&wm_list->wm_lock);
> + if (!list_empty(&wm_list->idle_wm)) {
> + wm = list_entry(wm_list->idle_wm.next,
> + struct wm,
> + list);
> + list_del(&wm->list);
> + spin_unlock(&wm_list->wm_lock);
> + return wm;

Great!  We found one!  And all it cost us was acquiring a global rwlock,
walking a linked list to find a wmlist, then a per-wmlist spinlock.

Meanwhile, there's no guarantee the buffer we found is on the local
NUMA node.

Compare to slub, allocating from a kmem_cache (assuming you create
one for each buffer size ...):

void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size);

static __always_inline void *slab_alloc(struct kmem_cache *s,
gfp_t gfpflags, unsigned long addr, size_t orig_size)
return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size);

static __always_inline void *slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
do {
tid = this_cpu_read(s->cpu_slab->tid);
c = raw_cpu_ptr(s->cpu_slab);
} while (IS_ENABLED(CONFIG_PREEMPTION) &&
 unlikely(tid != READ_ONCE(c->tid)));
object = c->freelist;
page = c->page;
if (unlikely(!object || !page || !node_match(page, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c);
} else {
void *next_object = get_freepointer_safe(s, object);
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
object, tid,
next_object, next_tid(tid)))) {

note_cmpxchg_failure("slab_alloc", s, tid);
goto redo;
}
prefetch_freepointer(s, next_object);
stat(s, ALLOC_FASTPATH);

No lock, anywhere.  Lots of percpu goodness, so you get memory allocated
on your local node.

What's the scenario for which your allocator performs better than slub,
on a typical machine that serves enough SMB that it's worth having an
in-kernel SMBD?



Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Dan Carpenter
On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote:
> +void *ksmbd_alloc(size_t size)
> +{
> + return kvmalloc(size, GFP_KERNEL | __GFP_ZERO);


This patch adds a bunch of wrappers around kvmalloc().  Don't do that.
Just use kvmalloc() directly instead.  They just make the code hard to
read.  kvmalloc() is not appropriate for small allocations.  This
wrapper hides a GFP_KERNEL, which may lead to scheduling-in-atomic bugs,
and the implicit zeroing makes the code hard to read.
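In other words, call sites would just spell it out directly, e.g.:

	/* allocation at the call site, flags and zeroing visible: */
	conn->request_buf = kvzalloc(size, GFP_KERNEL);
	if (!conn->request_buf)
		return -ENOMEM;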

> +}
> +
> +void ksmbd_free(void *ptr)
> +{
> + kvfree(ptr);
> +}
> +
> +static struct wm *wm_alloc(size_t sz, gfp_t flags)
> +{
> + struct wm *wm;
> + size_t alloc_sz = sz + sizeof(struct wm);
  ^^

Check for integer overflow.
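E.g. with struct_size() from <linux/overflow.h>, which saturates to
SIZE_MAX instead of wrapping (a sketch):

	size_t alloc_sz = struct_size(wm, buffer, sz);

	if (alloc_sz == SIZE_MAX)	/* sz + sizeof(struct wm) overflowed */
		return NULL;
	wm = kvmalloc(alloc_sz, flags);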

> +
> + wm = kvmalloc(alloc_sz, flags);
> + if (!wm)
> + return NULL;
> + wm->sz = sz;
> + return wm;
> +}
> +
> +static int register_wm_size_class(size_t sz)
> +{
> + struct wm_list *l, *nl;
> +
> + nl = kvmalloc(sizeof(struct wm_list), GFP_KERNEL);

Just use kmalloc() for small allocations.

> + if (!nl)
> + return -ENOMEM;
> +
> + nl->sz = sz;
> + spin_lock_init(&nl->wm_lock);
> + INIT_LIST_HEAD(&nl->idle_wm);
> + INIT_LIST_HEAD(&nl->list);
> + init_waitqueue_head(&nl->wm_wait);
> + nl->avail_wm = 0;
> +
> + write_lock(&wm_lists_lock);
> + list_for_each_entry(l, &wm_lists, list) {
> + if (l->sz == sz) {
> + write_unlock(&wm_lists_lock);
> + kvfree(nl);
> + return 0;
> + }
> + }
> +
> + list_add(&nl->list, &wm_lists);
> + write_unlock(&wm_lists_lock);
> + return 0;
> +}
> +
> +static struct wm_list *match_wm_list(size_t size)
> +{
> + struct wm_list *l, *rl = NULL;
> +
> + read_lock(&wm_lists_lock);
> + list_for_each_entry(l, &wm_lists, list) {
> + if (l->sz == size) {
> + rl = l;
> + break;
> + }
> + }
> + read_unlock(&wm_lists_lock);
> + return rl;
> +}
> +
> +static struct wm *find_wm(size_t size)
> +{
> + struct wm_list *wm_list;
> + struct wm *wm;
> +
> + wm_list = match_wm_list(size);
> + if (!wm_list) {
> + if (register_wm_size_class(size))
> + return NULL;
> + wm_list = match_wm_list(size);
> + }
> +
> + if (!wm_list)
> + return NULL;
> +
> + while (1) {
> + spin_lock(&wm_list->wm_lock);
> + if (!list_empty(&wm_list->idle_wm)) {
> + wm = list_entry(wm_list->idle_wm.next,
> + struct wm,
> + list);
> + list_del(&wm->list);
> + spin_unlock(&wm_list->wm_lock);
> + return wm;
> + }
> +
> + if (wm_list->avail_wm > num_online_cpus()) {
> + spin_unlock(&wm_list->wm_lock);
> + wait_event(wm_list->wm_wait,
> + !list_empty(&wm_list->idle_wm));
> + continue;
> + }
> +
> + wm_list->avail_wm++;

I don't think we should increment this until after the allocation
succeeds?
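Something like this, perhaps (sketch). Note the early increment is also
what the num_online_cpus() cap counts, so two racing callers could then
both slip past that check; the reservation would need rethinking too:

		spin_unlock(&wm_list->wm_lock);

		wm = wm_alloc(size, GFP_KERNEL);
		if (!wm) {
			wait_event(wm_list->wm_wait,
				   !list_empty(&wm_list->idle_wm));
			continue;
		}
		spin_lock(&wm_list->wm_lock);
		wm_list->avail_wm++;	/* count it only once the allocation exists */
		spin_unlock(&wm_list->wm_lock);
		break;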

> + spin_unlock(&wm_list->wm_lock);
> +
> + wm = wm_alloc(size, GFP_KERNEL);
> + if (!wm) {
> + spin_lock(&wm_list->wm_lock);
> + wm_list->avail_wm--;
> + spin_unlock(&wm_list->wm_lock);
> + wait_event(wm_list->wm_wait,
> + !list_empty(&wm_list->idle_wm));
> + continue;
> + }
> + break;
> + }
> +
> + return wm;
> +}

regards,
dan carpenter



Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Al Viro
On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote:

> +static struct ksmbd_file *__ksmbd_lookup_fd(struct ksmbd_file_table *ft,
> + unsigned int id)
> +{
> + bool unclaimed = true;
> + struct ksmbd_file *fp;
> +
> + read_lock(&ft->lock);
> + fp = idr_find(ft->idr, id);
> + if (fp)
> + fp = ksmbd_fp_get(fp);
> +
> + if (fp && fp->f_ci) {
> + read_lock(&fp->f_ci->m_lock);
> + unclaimed = list_empty(&fp->node);
> + read_unlock(&fp->f_ci->m_lock);
> + }
> + read_unlock(&ft->lock);
> +
> + if (fp && unclaimed) {
> + atomic_dec(&fp->refcount);
> + return NULL;
> + }

Can that atomic_dec() end up dropping the last remaining reference?
If not, what's to prevent that?


Re: [PATCH 3/5] cifsd: add file operations

2021-03-22 Thread Al Viro
On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote:
> This adds file operations and buffer pool for cifsd.

Some random notes:

> +static void rollback_path_modification(char *filename)
> +{
> + if (filename) {
> + filename--;
> + *filename = '/';
What an odd way to spell filename[-1] = '/';...

> +int ksmbd_vfs_inode_permission(struct dentry *dentry, int acc_mode, bool delete)
> +{

> + if (delete) {
> + struct dentry *parent;
> +
> + parent = dget_parent(dentry);
> + if (!parent)
> + return -EINVAL;
> +
> + if (inode_permission(&init_user_ns, d_inode(parent), MAY_EXEC | MAY_WRITE)) {
> + dput(parent);
> + return -EACCES;
> + }
> + dput(parent);

Who's to guarantee that parent is stable?  IOW, by the time of that
inode_permission() call dentry might very well not be a child of that thing...

> + parent = dget_parent(dentry);
> + if (!parent)
> + return 0;
> +
> + if (!inode_permission(&init_user_ns, d_inode(parent), MAY_EXEC | MAY_WRITE))
> + *daccess |= FILE_DELETE_LE;

Ditto.

> +int ksmbd_vfs_mkdir(struct ksmbd_work *work,
> + const char *name,
> + umode_t mode)


> + err = vfs_mkdir(&init_user_ns, d_inode(path.dentry), dentry, mode);
> + if (!err) {
> + ksmbd_vfs_inherit_owner(work, d_inode(path.dentry),
> + d_inode(dentry));

->mkdir() might very well return success, with dentry left unhashed negative.
Look at the callers of vfs_mkdir() to see how it should be handled.

> +static int check_lock_range(struct file *filp,
> + loff_t start,
> + loff_t end,
> + unsigned char type)
> +{
> + struct file_lock *flock;
> + struct file_lock_context *ctx = file_inode(filp)->i_flctx;
> + int error = 0;
> +
> + if (!ctx || list_empty_careful(&ctx->flc_posix))
> + return 0;
> +
> + spin_lock(&ctx->flc_lock);
> + list_for_each_entry(flock, &ctx->flc_posix, fl_list) {
> + /* check conflict locks */
> + if (flock->fl_end >= start && end >= flock->fl_start) {
> + if (flock->fl_type == F_RDLCK) {
> + if (type == WRITE) {
> + ksmbd_err("not allow write by shared 
> lock\n");
> + error = 1;
> + goto out;
> + }
> + } else if (flock->fl_type == F_WRLCK) {
> + /* check owner in lock */
> + if (flock->fl_file != filp) {
> + error = 1;
> + ksmbd_err("not allow rw access by 
> exclusive lock from other opens\n");
> + goto out;
> + }
> + }
> + }
> + }
> +out:
> + spin_unlock(&ctx->flc_lock);
> + return error;
> +}

WTF is that doing in smbd?

> + filp = fp->filp;
> + inode = d_inode(filp->f_path.dentry);

That should be file_inode().  Try it on overlayfs, watch it do interesting 
things...

> + nbytes = kernel_read(filp, rbuf, count, pos);
> + if (nbytes < 0) {
> + name = d_path(&filp->f_path, namebuf, sizeof(namebuf));
> + if (IS_ERR(name))
> + name = "(error)";
> + ksmbd_err("smb read failed for (%s), err = %zd\n",
> + name, nbytes);

Do you really want the full pathname here?  For (presumably) spew into syslog?

> +int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name)
> +{
> + struct path parent;
> + struct dentry *dir, *dentry;
> + char *last;
> + int err = -ENOENT;
> +
> + last = extract_last_component(name);
> + if (!last)
> + return -ENOENT;

Yeccchhh...

> + if (ksmbd_override_fsids(work))
> + return -ENOMEM;
> +
> + err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &parent);
> + if (err) {
> + ksmbd_debug(VFS, "can't get %s, err %d\n", name, err);
> + ksmbd_revert_fsids(work);
> + rollback_path_modification(last);
> + return err;
> + }
> +
> + dir = parent.dentry;
> + if (!d_inode(dir))
> + goto out;

Really?  When does that happen?

> +static int __ksmbd_vfs_rename(struct ksmbd_work *work,
> +   struct dentry *src_dent_parent,
> +   struct dentry *src_dent,
> +   struct dentry *dst_dent_parent,
> +   struct dentry *trap_dent,
> +   char *dst_name)
> +{
> + struct dentry *dst_dent;
> + int err;
> +
> + spin_lock(&src_dent->d_lock);
> +