RE: [PATCH 3/5] cifsd: add file operations
> > > -Original Message- > > From: Namjae Jeon > > Sent: Sunday, March 21, 2021 10:14 PM > > To: linux-fsde...@vger.kernel.org; linux-kernel@vger.kernel.org; linux- > > c...@vger.kernel.org > > Cc: linux-cifsd-de...@lists.sourceforge.net; smfre...@gmail.com; > > senozhat...@chromium.org; hyc@gmail.com; v...@zeniv.linux.org.uk; > > h...@lst.de; h...@infradead.org; ronniesahlb...@gmail.com; > > aurelien.ap...@gmail.com; aap...@suse.com; sand...@sandeen.net; > > dan.carpen...@oracle.com; colin.k...@canonical.com; > > rdun...@infradead.org; Namjae Jeon ; > > Sergey Senozhatsky ; Steve French > > > > Subject: [PATCH 3/5] cifsd: add file operations > > > > This adds file operations and buffer pool for cifsd. > > > > Signed-off-by: Namjae Jeon > > Signed-off-by: Sergey Senozhatsky > > Signed-off-by: Hyunchul Lee > > Acked-by: Ronnie Sahlberg > > Signed-off-by: Steve French > > --- > > fs/cifsd/buffer_pool.c | 292 ++ > > fs/cifsd/buffer_pool.h | 28 + > > fs/cifsd/vfs.c | 1989 > > fs/cifsd/vfs.h | 314 +++ > > fs/cifsd/vfs_cache.c | 851 + > > fs/cifsd/vfs_cache.h | 213 + > > 6 files changed, 3687 insertions(+) > > create mode 100644 fs/cifsd/buffer_pool.c > > create mode 100644 fs/cifsd/buffer_pool.h > > create mode 100644 fs/cifsd/vfs.c > > create mode 100644 fs/cifsd/vfs.h > > create mode 100644 fs/cifsd/vfs_cache.c > > create mode 100644 fs/cifsd/vfs_cache.h > > > > diff --git a/fs/cifsd/buffer_pool.c b/fs/cifsd/buffer_pool.c > > new file mode 100644 > > index ..864fea547c68 > > --- /dev/null > > +++ b/fs/cifsd/buffer_pool.c > > @@ -0,0 +1,292 @@ > > +// SPDX-License-Identifier: GPL-2.0-or-later > > +/* > > + * Copyright (C) 2018 Samsung Electronics Co., Ltd. 
> > + */ > > + > > +#include > > +#include > > +#include > > +#include > > +#include > > +#include > > +#include > > + > > +#include "glob.h" > > +#include "buffer_pool.h" > > +#include "connection.h" > > +#include "mgmt/ksmbd_ida.h" > > + > > +static struct kmem_cache *filp_cache; > > + > > +struct wm { > > + struct list_headlist; > > + unsigned intsz; > > + charbuffer[0]; > > +}; > > + > > +struct wm_list { > > + struct list_headlist; > > + unsigned intsz; > > + > > + spinlock_t wm_lock; > > + int avail_wm; > > + struct list_headidle_wm; > > + wait_queue_head_t wm_wait; > > +}; > > + > > +static LIST_HEAD(wm_lists); > > +static DEFINE_RWLOCK(wm_lists_lock); > > + > > +void *ksmbd_alloc(size_t size) > > +{ > > + return kvmalloc(size, GFP_KERNEL | __GFP_ZERO); > > +} > > + > > +void ksmbd_free(void *ptr) > > +{ > > + kvfree(ptr); > > +} > > + > > +static struct wm *wm_alloc(size_t sz, gfp_t flags) > > +{ > > + struct wm *wm; > > + size_t alloc_sz = sz + sizeof(struct wm); > > + > > + wm = kvmalloc(alloc_sz, flags); > > + if (!wm) > > + return NULL; > > + wm->sz = sz; > > + return wm; > > +} > > + > > +static int register_wm_size_class(size_t sz) > > +{ > > + struct wm_list *l, *nl; > > + > > + nl = kvmalloc(sizeof(struct wm_list), GFP_KERNEL); > > + if (!nl) > > + return -ENOMEM; > > + > > + nl->sz = sz; > > + spin_lock_init(>wm_lock); > > + INIT_LIST_HEAD(>idle_wm); > > + INIT_LIST_HEAD(>list); > > + init_waitqueue_head(>wm_wait); > > + nl->avail_wm = 0; > > + > > + write_lock(_lists_lock); > > + list_for_each_entry(l, _lists, list) { > > + if (l->sz == sz) { > > + write_unlock(_lists_lock); > > + kvfree(nl); > > + return 0; > > + } > > + } > > + > > + list_add(>list, _lists); > > + write_unlock(_lists_lock); > > + return 0; > > +} > > + > > +static struct wm_list *match_wm_list(size_t size) > > +{ > > + struct wm_list *l, *rl = NULL; > > + > > + read_lock(_lists_lock); > > + list_for_each_entry(l, _lists, list) { > > + if (l->sz == size) { > > + rl = l; > > + 
break; > > + } > > + } > > + read_unlock(_lists_lock); > > + return rl; > > +} > > + > > +static struct wm *find_wm(size_t size) > > +{ > > + struct wm_list *wm_list; > > + struct wm *wm; > > + > > + wm_list = match_wm_list(size); > > + if (!wm_list) { > > + if (register_wm_size_class(size)) > > + return NULL; > > + wm_list = match_wm_list(size); > > + } > > + > > + if (!wm_list) > > + return NULL; > > + > > + while (1) { > > + spin_lock(_list->wm_lock); > > + if (!list_empty(_list->idle_wm)) { > > + wm = list_entry(wm_list->idle_wm.next, > > + struct wm, > > + list); > > + list_del(>list); > > + spin_unlock(_list->wm_lock); >
RE: [PATCH 3/5] cifsd: add file operations
> On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote: > > This adds file operations and buffer pool for cifsd. > > Some random notes: > > > +static void rollback_path_modification(char *filename) { > > + if (filename) { > > + filename--; > > + *filename = '/'; > What an odd way to spell filename[-1] = '/';... Okay. Will fix. > > > +int ksmbd_vfs_inode_permission(struct dentry *dentry, int acc_mode, > > +bool delete) { > > > + if (delete) { > > + struct dentry *parent; > > + > > + parent = dget_parent(dentry); > > + if (!parent) > > + return -EINVAL; > > + > > + if (inode_permission(_user_ns, d_inode(parent), MAY_EXEC | > > MAY_WRITE)) { > > + dput(parent); > > + return -EACCES; > > + } > > + dput(parent); > > Who's to guarantee that parent is stable? IOW, by the time of that > inode_permission() call dentry might very well not be a child of that thing... Okay, Will fix. > > > + parent = dget_parent(dentry); > > + if (!parent) > > + return 0; > > + > > + if (!inode_permission(_user_ns, d_inode(parent), MAY_EXEC | > > MAY_WRITE)) > > + *daccess |= FILE_DELETE_LE; > > Ditto. Okay. > > > +int ksmbd_vfs_mkdir(struct ksmbd_work *work, > > + const char *name, > > + umode_t mode) > > > > + err = vfs_mkdir(_user_ns, d_inode(path.dentry), dentry, mode); > > + if (!err) { > > + ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), > > + d_inode(dentry)); > > ->mkdir() might very well return success, with dentry left unhashed negative. > Look at the callers of vfs_mkdir() to see how it should be handled. Okay, Will fix. 
> > > +static int check_lock_range(struct file *filp, > > + loff_t start, > > + loff_t end, > > + unsigned char type) > > +{ > > + struct file_lock *flock; > > + struct file_lock_context *ctx = file_inode(filp)->i_flctx; > > + int error = 0; > > + > > + if (!ctx || list_empty_careful(>flc_posix)) > > + return 0; > > + > > + spin_lock(>flc_lock); > > + list_for_each_entry(flock, >flc_posix, fl_list) { > > + /* check conflict locks */ > > + if (flock->fl_end >= start && end >= flock->fl_start) { > > + if (flock->fl_type == F_RDLCK) { > > + if (type == WRITE) { > > + ksmbd_err("not allow write by shared > > lock\n"); > > + error = 1; > > + goto out; > > + } > > + } else if (flock->fl_type == F_WRLCK) { > > + /* check owner in lock */ > > + if (flock->fl_file != filp) { > > + error = 1; > > + ksmbd_err("not allow rw access by > > exclusive lock from other > opens\n"); > > + goto out; > > + } > > + } > > + } > > + } > > +out: > > + spin_unlock(>flc_lock); > > + return error; > > +} > > WTF is that doing in smbd? This code was added to pass the smb2 lock test of samba's smbtorture. Will fix it. > > > + filp = fp->filp; > > + inode = d_inode(filp->f_path.dentry); > > That should be file_inode(). Try it on overlayfs, watch it do interesting > things... Okay. > > > + nbytes = kernel_read(filp, rbuf, count, pos); > > + if (nbytes < 0) { > > + name = d_path(>f_path, namebuf, sizeof(namebuf)); > > + if (IS_ERR(name)) > > + name = "(error)"; > > + ksmbd_err("smb read failed for (%s), err = %zd\n", > > + name, nbytes); > > Do you really want the full pathname here? For (presumably) spew into syslog? No, Will fix. > > > +int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) { > > + struct path parent; > > + struct dentry *dir, *dentry; > > + char *last; > > + int err = -ENOENT; > > + > > + last = extract_last_component(name); > > + if (!last) > > + return -ENOENT; > > Yeccchhh... I guess I change it err instead of -ENOENT. 
> > > + if (ksmbd_override_fsids(work)) > > + return -ENOMEM; > > + > > + err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, ); > > + if (err) { > > + ksmbd_debug(VFS, "can't get %s, err %d\n", name, err); > > + ksmbd_revert_fsids(work); > > + rollback_path_modification(last); > > + return err; > > + } > > + > > + dir = parent.dentry; > > + if (!d_inode(dir)) > > + goto out; > > Really? When does that happen? Will fix. > > > +static int __ksmbd_vfs_rename(struct ksmbd_work *work, > > + struct dentry *src_dent_parent, > > +
Re: [PATCH 3/5] cifsd: add file operations
On (21/03/22 17:09), Matthew Wilcox wrote: > On Mon, Mar 22, 2021 at 06:03:21PM +0900, Sergey Senozhatsky wrote: > > On (21/03/22 08:15), Matthew Wilcox wrote: > > > > > > What's the scenario for which your allocator performs better than slub > > > > > > > IIRC request and reply buffers can be up to 4M in size. So this stuff > > just allocates a number of fat buffers and keeps them around so that > > it doesn't have to vmalloc(4M) for every request and every response. > > Hang on a minute, this speaks to a deeper design problem. If we're doing > a 'request' or 'reply' that is that large, the I/O should be coming from > or going to the page cache. If it goes via a 4MB virtually contiguous > buffer first, that's a memcpy that could/should be avoided. But huge vmalloc buffers are still needed. For instance, `ls -la` in a directory with a huge number of entries. > But now i'm looking for how ksmbd_find_buffer() is used, and it isn't. > So it looks like someone came to the same conclusion I did, but forgot > to delete the wm code. Yes, I think it's disabled by default and requires some smb.conf configuration. So I guess that wm code can be removed. Especially given that > That said, there are plenty of opportunities to optimise the vmalloc code, > and that's worth pursuing. That would be really interesting to see! > And here's the receive path which contains > the memcpy that should be avoided (ok, it's actually the call to ->read; > we shouldn't be reading in the entire 4MB but rather the header): > + conn->request_buf = ksmbd_alloc_request(size); > + if (!conn->request_buf) > + continue; > + > + memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf)); > + if (!ksmbd_smb_request(conn)) > + break; > + > + /* > + * We already read 4 bytes to find out PDU size, now > + * read in PDU > + */ > + size = t->ops->read(t, conn->request_buf + 4, pdu_size); // A side note, it seems that the maximum read/write/trans buffer size that // windows supports is 8MB, not 4MB.
Re: [PATCH 3/5] cifsd: add file operations
On Mon, Mar 22, 2021 at 06:03:21PM +0900, Sergey Senozhatsky wrote: > On (21/03/22 08:15), Matthew Wilcox wrote: > > > > What's the scenario for which your allocator performs better than slub > > > > IIRC request and reply buffers can be up to 4M in size. So this stuff > just allocates a number of fat buffers and keeps them around so that > it doesn't have to vmalloc(4M) for every request and every response. Hang on a minute, this speaks to a deeper design problem. If we're doing a 'request' or 'reply' that is that large, the I/O should be coming from or going to the page cache. If it goes via a 4MB virtually contiguous buffer first, that's a memcpy that could/should be avoided. But now i'm looking for how ksmbd_find_buffer() is used, and it isn't. So it looks like someone came to the same conclusion I did, but forgot to delete the wm code. That said, there are plenty of opportunities to optimise the vmalloc code, and that's worth pursuing. And here's the receive path which contains the memcpy that should be avoided (ok, it's actually the call to ->read; we shouldn't be reading in the entire 4MB but rather the header): + conn->request_buf = ksmbd_alloc_request(size); + if (!conn->request_buf) + continue; + + memcpy(conn->request_buf, hdr_buf, sizeof(hdr_buf)); + if (!ksmbd_smb_request(conn)) + break; + + /* +* We already read 4 bytes to find out PDU size, now +* read in PDU +*/ + size = t->ops->read(t, conn->request_buf + 4, pdu_size);
RE: [PATCH 3/5] cifsd: add file operations
> -Original Message- > From: Namjae Jeon > Sent: Sunday, March 21, 2021 10:14 PM > To: linux-fsde...@vger.kernel.org; linux-kernel@vger.kernel.org; linux- > c...@vger.kernel.org > Cc: linux-cifsd-de...@lists.sourceforge.net; smfre...@gmail.com; > senozhat...@chromium.org; hyc@gmail.com; v...@zeniv.linux.org.uk; > h...@lst.de; h...@infradead.org; ronniesahlb...@gmail.com; > aurelien.ap...@gmail.com; aap...@suse.com; sand...@sandeen.net; > dan.carpen...@oracle.com; colin.k...@canonical.com; > rdun...@infradead.org; Namjae Jeon ; > Sergey Senozhatsky ; Steve French > > Subject: [PATCH 3/5] cifsd: add file operations > > This adds file operations and buffer pool for cifsd. > > Signed-off-by: Namjae Jeon > Signed-off-by: Sergey Senozhatsky > Signed-off-by: Hyunchul Lee > Acked-by: Ronnie Sahlberg > Signed-off-by: Steve French > --- > fs/cifsd/buffer_pool.c | 292 ++ > fs/cifsd/buffer_pool.h | 28 + > fs/cifsd/vfs.c | 1989 > fs/cifsd/vfs.h | 314 +++ > fs/cifsd/vfs_cache.c | 851 + > fs/cifsd/vfs_cache.h | 213 + > 6 files changed, 3687 insertions(+) > create mode 100644 fs/cifsd/buffer_pool.c > create mode 100644 fs/cifsd/buffer_pool.h > create mode 100644 fs/cifsd/vfs.c > create mode 100644 fs/cifsd/vfs.h > create mode 100644 fs/cifsd/vfs_cache.c > create mode 100644 fs/cifsd/vfs_cache.h > > diff --git a/fs/cifsd/buffer_pool.c b/fs/cifsd/buffer_pool.c > new file mode 100644 > index ..864fea547c68 > --- /dev/null > +++ b/fs/cifsd/buffer_pool.c > @@ -0,0 +1,292 @@ > +// SPDX-License-Identifier: GPL-2.0-or-later > +/* > + * Copyright (C) 2018 Samsung Electronics Co., Ltd. 
> + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include "glob.h" > +#include "buffer_pool.h" > +#include "connection.h" > +#include "mgmt/ksmbd_ida.h" > + > +static struct kmem_cache *filp_cache; > + > +struct wm { > + struct list_headlist; > + unsigned intsz; > + charbuffer[0]; > +}; > + > +struct wm_list { > + struct list_headlist; > + unsigned intsz; > + > + spinlock_t wm_lock; > + int avail_wm; > + struct list_headidle_wm; > + wait_queue_head_t wm_wait; > +}; > + > +static LIST_HEAD(wm_lists); > +static DEFINE_RWLOCK(wm_lists_lock); > + > +void *ksmbd_alloc(size_t size) > +{ > + return kvmalloc(size, GFP_KERNEL | __GFP_ZERO); > +} > + > +void ksmbd_free(void *ptr) > +{ > + kvfree(ptr); > +} > + > +static struct wm *wm_alloc(size_t sz, gfp_t flags) > +{ > + struct wm *wm; > + size_t alloc_sz = sz + sizeof(struct wm); > + > + wm = kvmalloc(alloc_sz, flags); > + if (!wm) > + return NULL; > + wm->sz = sz; > + return wm; > +} > + > +static int register_wm_size_class(size_t sz) > +{ > + struct wm_list *l, *nl; > + > + nl = kvmalloc(sizeof(struct wm_list), GFP_KERNEL); > + if (!nl) > + return -ENOMEM; > + > + nl->sz = sz; > + spin_lock_init(>wm_lock); > + INIT_LIST_HEAD(>idle_wm); > + INIT_LIST_HEAD(>list); > + init_waitqueue_head(>wm_wait); > + nl->avail_wm = 0; > + > + write_lock(_lists_lock); > + list_for_each_entry(l, _lists, list) { > + if (l->sz == sz) { > + write_unlock(_lists_lock); > + kvfree(nl); > + return 0; > + } > + } > + > + list_add(>list, _lists); > + write_unlock(_lists_lock); > + return 0; > +} > + > +static struct wm_list *match_wm_list(size_t size) > +{ > + struct wm_list *l, *rl = NULL; > + > + read_lock(_lists_lock); > + list_for_each_entry(l, _lists, list) { > + if (l->sz == size) { > + rl = l; > + break; > + } > + } > + read_unlock(_lists_lock); > + return rl; > +} > + > +static struct wm *find_wm(size_t size) > +{ > + struct wm_list *wm_list; > + struct wm *wm; > + > + wm_list = 
match_wm_list(size); > + if (!wm_list) { > + if (register_wm_size_class(size)) > + return NULL; > + wm_list = match_wm_list(size); > + } > + > + if (!wm_list) > + return NULL; > + > + while (1) { > + spin_lock(_list->wm_lock); > + if (!list_empty(_list->idle_wm)) { > + wm = list_entry(wm_list->idle_wm.next, > + struct wm, > + list); > + list_del(>list); > + spin_unlock(_list->wm_lock); > + return wm; > + } > + > + if (wm_list->avail_wm > num_online_cpus()) { > + spin_unlock(_list->wm_lock); > +
Re: [PATCH 3/5] cifsd: add file operations
On Mon, Mar 22, 2021 at 02:57:18PM +0100, Christoph Hellwig wrote: > On Mon, Mar 22, 2021 at 06:03:21PM +0900, Sergey Senozhatsky wrote: > > On (21/03/22 08:15), Matthew Wilcox wrote: > > > > > > What's the scenario for which your allocator performs better than slub > > > > > > > IIRC request and reply buffers can be up to 4M in size. So this stuff > > just allocates a number of fat buffers and keeps them around so that > > it doesn't have to vmalloc(4M) for every request and every response. > > Do we have any data suggesting it is faster than vmalloc? Oh, I have no trouble believing it's faster than vmalloc. Here's the fast(!) path that always has memory available, never does retries. I'm calling out the things I perceive as expensive on the right hand side. Also, I'm taking the 4MB size as the example. vmalloc() __vmalloc_node() __vmalloc_node_range() __get_vm_area_node() [allocates vm_struct] alloc_vmap_area() [allocates vmap_area] [takes free_vmap_area_lock] __alloc_vmap_area() find_vmap_lowest_match [walks free_vmap_area_root] [takes vmap_area_lock] __vmalloc_area_node() ... array_size is 8KiB, we call __vmalloc_node __vmalloc_node [everything we did above, all over again, two more allocations, two more lock acquire] alloc_pages_node(), 1024 times vmap_pages_range_noflush() vmap_range_noflush() [allocate at least two pages for PTEs] There's definitely some low-hanging fruit here. __vmalloc_area_node() should probably call kvmalloc_node() instead of __vmalloc_node() for table sizes > 4KiB. But a lot of this is inherent to how vmalloc works, and we need to put a cache in front of it. Just not this one.
Re: [PATCH 3/5] cifsd: add file operations
On Mon, Mar 22, 2021 at 06:03:21PM +0900, Sergey Senozhatsky wrote: > On (21/03/22 08:15), Matthew Wilcox wrote: > > > > What's the scenario for which your allocator performs better than slub > > > > IIRC request and reply buffers can be up to 4M in size. So this stuff > just allocates a number of fat buffers and keeps them around so that > it doesn't have to vmalloc(4M) for every request and every response. Do we have any data suggesting it is faster than vmalloc?
Re: [PATCH 3/5] cifsd: add file operations
On Mon, Mar 22, 2021 at 06:03:21PM +0900, Sergey Senozhatsky wrote: > On (21/03/22 08:15), Matthew Wilcox wrote: > > > > What's the scenario for which your allocator performs better than slub > > > > IIRC request and reply buffers can be up to 4M in size. So this stuff > just allocates a number of fat buffers and keeps them around so that > it doesn't have to vmalloc(4M) for every request and every response. That makes a lot more sense; I was thrown off by the kvmalloc, which is usually used for allocations that might be smaller than PAGE_SIZE. So what this patch is really saying is that vmalloc() should include some caching, so it can defer freeing until there's memory pressure or it's built up a large (percpu) backlog of freed areas. Vlad, have you thought about this?
Re: [PATCH 3/5] cifsd: add file operations
On (21/03/22 10:04), Dan Carpenter wrote: > > On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote: > > +void *ksmbd_alloc(size_t size) > > +{ > > + return kvmalloc(size, GFP_KERNEL | __GFP_ZERO); > > > This patch adds a bunch of wrappers around kvmalloc(). Don't do that. Agreed. This was not cleaned up properly. The initial implementation of that function (IIRC... it was sometime in 2018) basically contained a kvmalloc() implementation, because back in the day Samsung used a kernel version that simply didn't have kvmalloc() ( < KERNEL_VERSION(5, 0, 0))
Re: [PATCH 3/5] cifsd: add file operations
On (21/03/22 07:02), Al Viro wrote: > On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote: > > +static struct ksmbd_file *__ksmbd_lookup_fd(struct ksmbd_file_table *ft, > > + unsigned int id) > > +{ > > + bool unclaimed = true; > > + struct ksmbd_file *fp; > > + > > + read_lock(>lock); > > + fp = idr_find(ft->idr, id); > > + if (fp) > > + fp = ksmbd_fp_get(fp); > > + > > + if (fp && fp->f_ci) { > > + read_lock(>f_ci->m_lock); > > + unclaimed = list_empty(>node); > > + read_unlock(>f_ci->m_lock); > > + } > > + read_unlock(>lock); > > + > > + if (fp && unclaimed) { > > + atomic_dec(>refcount); > > + return NULL; > > + } > > Can that atomic_dec() end up dropping the last remaining reference? Yes, I think it should increment refcount only for "claimed" fp.
Re: [PATCH 3/5] cifsd: add file operations
On (21/03/22 08:15), Matthew Wilcox wrote: > > What's the scenario for which your allocator performs better than slub > IIRC request and reply buffers can be up to 4M in size. So this stuff just allocates a number of fat buffers and keeps them around so that it doesn't have to vmalloc(4M) for every request and every response.
Re: [PATCH 3/5] cifsd: add file operations
On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote: > This adds file operations and buffer pool for cifsd. Why do you want this buffer pool? Do you not trust the slab allocator to be able to do its job? Because what you have here looks slower than the slab allocator. Let's follow this through for the best-case scenario (a buffer of the right size already exists): > +void *ksmbd_find_buffer(size_t size) > +{ > + struct wm *wm; > + > + wm = find_wm(size); > + > + WARN_ON(!wm); > + if (wm) > + return wm->buffer; > + return NULL; > +} OK, simple, we just call find_wm(). > +static struct wm *find_wm(size_t size) > +{ > + struct wm_list *wm_list; > + struct wm *wm; > + > + wm_list = match_wm_list(size); First we find the list for this buffer ... > +static struct wm_list *match_wm_list(size_t size) > +{ > + struct wm_list *l, *rl = NULL; > + > + read_lock(_lists_lock); > + list_for_each_entry(l, _lists, list) { > + if (l->sz == size) { > + rl = l; > + break; > + } > + } > + read_unlock(_lists_lock); > + return rl; > +} ... by taking an rwlock, and walking a linked list?! Uh ... > + while (1) { > + spin_lock(_list->wm_lock); > + if (!list_empty(_list->idle_wm)) { > + wm = list_entry(wm_list->idle_wm.next, > + struct wm, > + list); > + list_del(>list); > + spin_unlock(_list->wm_lock); > + return wm; Great! We found one! And all it cost us was acquiring a global rwlock, walking a linked list to find a wmlist, then a per-wmlist spinlock. Meanwhile, there's no guarantee the buffer we found is on the local NUMA node. 
Compare to slub, allocating from a kmem_cache (assuming you create one for each buffer size ...): void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) void *ret = slab_alloc(s, gfpflags, _RET_IP_, s->object_size); static __always_inline void *slab_alloc(struct kmem_cache *s, gfp_t gfpflags, unsigned long addr, size_t orig_size) return slab_alloc_node(s, gfpflags, NUMA_NO_NODE, addr, orig_size); static __always_inline void *slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, size_t orig_size) do { tid = this_cpu_read(s->cpu_slab->tid); c = raw_cpu_ptr(s->cpu_slab); } while (IS_ENABLED(CONFIG_PREEMPTION) && unlikely(tid != READ_ONCE(c->tid))); object = c->freelist; page = c->page; if (unlikely(!object || !page || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); } else { void *next_object = get_freepointer_safe(s, object); if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, object, tid, next_object, next_tid(tid { note_cmpxchg_failure("slab_alloc", s, tid); goto redo; } prefetch_freepointer(s, next_object); stat(s, ALLOC_FASTPATH); No lock, anywhere. Lots of percpu goodness, so you get memory allocated on your local node. What's the scenario for which your allocator performs better than slub, on a typical machine that serves enough SMB that it's worth having an in-kernel SMBD?
Re: [PATCH 3/5] cifsd: add file operations
On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote: > +void *ksmbd_alloc(size_t size) > +{ > + return kvmalloc(size, GFP_KERNEL | __GFP_ZERO); This patch adds a bunch of wrappers around kvmalloc(). Don't do that. Just use kvmalloc() directly instead. They just make the code hard to read. kvmalloc() is not appropriate for small allocations. This wrapper hides a GFP_KERNEL which may lead to scheduling in atomic bugs and the secret ZEROing makes the code hard to read. > +} > + > +void ksmbd_free(void *ptr) > +{ > + kvfree(ptr); > +} > + > +static struct wm *wm_alloc(size_t sz, gfp_t flags) > +{ > + struct wm *wm; > + size_t alloc_sz = sz + sizeof(struct wm); ^^ Check for integer overflow. > + > + wm = kvmalloc(alloc_sz, flags); > + if (!wm) > + return NULL; > + wm->sz = sz; > + return wm; > +} > + > +static int register_wm_size_class(size_t sz) > +{ > + struct wm_list *l, *nl; > + > + nl = kvmalloc(sizeof(struct wm_list), GFP_KERNEL); Just use kmalloc() for small allocations. > + if (!nl) > + return -ENOMEM; > + > + nl->sz = sz; > + spin_lock_init(>wm_lock); > + INIT_LIST_HEAD(>idle_wm); > + INIT_LIST_HEAD(>list); > + init_waitqueue_head(>wm_wait); > + nl->avail_wm = 0; > + > + write_lock(_lists_lock); > + list_for_each_entry(l, _lists, list) { > + if (l->sz == sz) { > + write_unlock(_lists_lock); > + kvfree(nl); > + return 0; > + } > + } > + > + list_add(>list, _lists); > + write_unlock(_lists_lock); > + return 0; > +} > + > +static struct wm_list *match_wm_list(size_t size) > +{ > + struct wm_list *l, *rl = NULL; > + > + read_lock(_lists_lock); > + list_for_each_entry(l, _lists, list) { > + if (l->sz == size) { > + rl = l; > + break; > + } > + } > + read_unlock(_lists_lock); > + return rl; > +} > + > +static struct wm *find_wm(size_t size) > +{ > + struct wm_list *wm_list; > + struct wm *wm; > + > + wm_list = match_wm_list(size); > + if (!wm_list) { > + if (register_wm_size_class(size)) > + return NULL; > + wm_list = match_wm_list(size); > + } > + > + if 
(!wm_list) > + return NULL; > + > + while (1) { > + spin_lock(_list->wm_lock); > + if (!list_empty(_list->idle_wm)) { > + wm = list_entry(wm_list->idle_wm.next, > + struct wm, > + list); > + list_del(>list); > + spin_unlock(_list->wm_lock); > + return wm; > + } > + > + if (wm_list->avail_wm > num_online_cpus()) { > + spin_unlock(_list->wm_lock); > + wait_event(wm_list->wm_wait, > +!list_empty(_list->idle_wm)); > + continue; > + } > + > + wm_list->avail_wm++; I don't think we should increment this until after the allocation succeeds? > + spin_unlock(_list->wm_lock); > + > + wm = wm_alloc(size, GFP_KERNEL); > + if (!wm) { > + spin_lock(_list->wm_lock); > + wm_list->avail_wm--; > + spin_unlock(_list->wm_lock); > + wait_event(wm_list->wm_wait, > +!list_empty(_list->idle_wm)); > + continue; > + } > + break; > + } > + > + return wm; > +} regards, dan carpenter
Re: [PATCH 3/5] cifsd: add file operations
On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote: > +static struct ksmbd_file *__ksmbd_lookup_fd(struct ksmbd_file_table *ft, > + unsigned int id) > +{ > + bool unclaimed = true; > + struct ksmbd_file *fp; > + > + read_lock(>lock); > + fp = idr_find(ft->idr, id); > + if (fp) > + fp = ksmbd_fp_get(fp); > + > + if (fp && fp->f_ci) { > + read_lock(>f_ci->m_lock); > + unclaimed = list_empty(>node); > + read_unlock(>f_ci->m_lock); > + } > + read_unlock(>lock); > + > + if (fp && unclaimed) { > + atomic_dec(>refcount); > + return NULL; > + } Can that atomic_dec() end up dropping the last remaining reference? If not, what's to prevent that?
Re: [PATCH 3/5] cifsd: add file operations
On Mon, Mar 22, 2021 at 02:13:42PM +0900, Namjae Jeon wrote: > This adds file operations and buffer pool for cifsd. Some random notes: > +static void rollback_path_modification(char *filename) > +{ > + if (filename) { > + filename--; > + *filename = '/'; What an odd way to spell filename[-1] = '/';... > +int ksmbd_vfs_inode_permission(struct dentry *dentry, int acc_mode, bool > delete) > +{ > + if (delete) { > + struct dentry *parent; > + > + parent = dget_parent(dentry); > + if (!parent) > + return -EINVAL; > + > + if (inode_permission(_user_ns, d_inode(parent), MAY_EXEC | > MAY_WRITE)) { > + dput(parent); > + return -EACCES; > + } > + dput(parent); Who's to guarantee that parent is stable? IOW, by the time of that inode_permission() call dentry might very well not be a child of that thing... > + parent = dget_parent(dentry); > + if (!parent) > + return 0; > + > + if (!inode_permission(_user_ns, d_inode(parent), MAY_EXEC | > MAY_WRITE)) > + *daccess |= FILE_DELETE_LE; Ditto. > +int ksmbd_vfs_mkdir(struct ksmbd_work *work, > + const char *name, > + umode_t mode) > + err = vfs_mkdir(_user_ns, d_inode(path.dentry), dentry, mode); > + if (!err) { > + ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), > + d_inode(dentry)); ->mkdir() might very well return success, with dentry left unhashed negative. Look at the callers of vfs_mkdir() to see how it should be handled. 
> +static int check_lock_range(struct file *filp, > + loff_t start, > + loff_t end, > + unsigned char type) > +{ > + struct file_lock *flock; > + struct file_lock_context *ctx = file_inode(filp)->i_flctx; > + int error = 0; > + > + if (!ctx || list_empty_careful(>flc_posix)) > + return 0; > + > + spin_lock(>flc_lock); > + list_for_each_entry(flock, >flc_posix, fl_list) { > + /* check conflict locks */ > + if (flock->fl_end >= start && end >= flock->fl_start) { > + if (flock->fl_type == F_RDLCK) { > + if (type == WRITE) { > + ksmbd_err("not allow write by shared > lock\n"); > + error = 1; > + goto out; > + } > + } else if (flock->fl_type == F_WRLCK) { > + /* check owner in lock */ > + if (flock->fl_file != filp) { > + error = 1; > + ksmbd_err("not allow rw access by > exclusive lock from other opens\n"); > + goto out; > + } > + } > + } > + } > +out: > + spin_unlock(>flc_lock); > + return error; > +} WTF is that doing in smbd? > + filp = fp->filp; > + inode = d_inode(filp->f_path.dentry); That should be file_inode(). Try it on overlayfs, watch it do interesting things... > + nbytes = kernel_read(filp, rbuf, count, pos); > + if (nbytes < 0) { > + name = d_path(>f_path, namebuf, sizeof(namebuf)); > + if (IS_ERR(name)) > + name = "(error)"; > + ksmbd_err("smb read failed for (%s), err = %zd\n", > + name, nbytes); Do you really want the full pathname here? For (presumably) spew into syslog? > +int ksmbd_vfs_remove_file(struct ksmbd_work *work, char *name) > +{ > + struct path parent; > + struct dentry *dir, *dentry; > + char *last; > + int err = -ENOENT; > + > + last = extract_last_component(name); > + if (!last) > + return -ENOENT; Yeccchhh... 
> + if (ksmbd_override_fsids(work)) > + return -ENOMEM; > + > + err = kern_path(name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, ); > + if (err) { > + ksmbd_debug(VFS, "can't get %s, err %d\n", name, err); > + ksmbd_revert_fsids(work); > + rollback_path_modification(last); > + return err; > + } > + > + dir = parent.dentry; > + if (!d_inode(dir)) > + goto out; Really? When does that happen? > +static int __ksmbd_vfs_rename(struct ksmbd_work *work, > + struct dentry *src_dent_parent, > + struct dentry *src_dent, > + struct dentry *dst_dent_parent, > + struct dentry *trap_dent, > + char *dst_name) > +{ > + struct dentry *dst_dent; > + int err; > + > + spin_lock(_dent->d_lock); > +