Re: [Virtio-fs] [PATCH 00/18] virtiofs: Fix various races and cleanups round 1
On 2019/9/10 0:14, Stefan Hajnoczi wrote: > On Sun, Sep 08, 2019 at 07:53:55PM +0800, piaojun wrote: >> >> >> On 2019/9/6 19:52, Miklos Szeredi wrote: >>> On Fri, Sep 6, 2019 at 12:36 PM Stefan Hajnoczi wrote: >>>> >>>> On Fri, Sep 06, 2019 at 10:15:14AM +0200, Miklos Szeredi wrote: >>>>> On Thu, Sep 5, 2019 at 9:49 PM Vivek Goyal wrote: >>>>>> >>>>>> Hi, >>>>>> >>>>>> Michael Tsirkin pointed out issues w.r.t various locking related TODO >>>>>> items and races w.r.t device removal. >>>>>> >>>>>> In this first round of cleanups, I have taken care of most pressing >>>>>> issues. >>>>>> >>>>>> These patches apply on top of following. >>>>>> >>>>>> git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git#virtiofs-v4 >>>>>> >>>>>> I have tested these patches with mount/umount and device removal using >>>>>> qemu monitor. For example. >>>>> >>>>> Is device removal mandatory? Can't this be made a non-removable >>>>> device? Is there a good reason why removing the virtio-fs device >>>>> makes sense? >>>> >>>> Hot plugging and unplugging virtio PCI adapters is common. I'd very >>>> much like removal to work from the beginning. >>> >>> Can you give an example use case? >> >> I think VirtFS migration need hot plugging, or it may cause QEMU crash >> or some problems. > > Live migration is currently unsupported. Hot unplugging the virtio-fs > device would allow the guest to live migrate successfully, so it's a > useful feature to work around the missing live migration support. > > Is this what you mean? Agreed, migration support is necessary for user, and hot plugging/unplugging is also common for virtio device. Jun
Re: [Virtio-fs] [PATCH 00/18] virtiofs: Fix various races and cleanups round 1
On 2019/9/6 19:52, Miklos Szeredi wrote: > On Fri, Sep 6, 2019 at 12:36 PM Stefan Hajnoczi wrote: >> >> On Fri, Sep 06, 2019 at 10:15:14AM +0200, Miklos Szeredi wrote: >>> On Thu, Sep 5, 2019 at 9:49 PM Vivek Goyal wrote: Hi, Michael Tsirkin pointed out issues w.r.t various locking related TODO items and races w.r.t device removal. In this first round of cleanups, I have taken care of most pressing issues. These patches apply on top of following. git://git.kernel.org/pub/scm/linux/kernel/git/mszeredi/fuse.git#virtiofs-v4 I have tested these patches with mount/umount and device removal using qemu monitor. For example. >>> >>> Is device removal mandatory? Can't this be made a non-removable >>> device? Is there a good reason why removing the virtio-fs device >>> makes sense? >> >> Hot plugging and unplugging virtio PCI adapters is common. I'd very >> much like removal to work from the beginning. > > Can you give an example use case? I think VirtFS migration need hot plugging, or it may cause QEMU crash or some problems. Thanks, Jun
Re: [Virtio-fs] [PATCH 16/18] virtiofs: Use virtio_fs_mutex for races w.r.t ->remove and mount path
On 2019/9/6 3:48, Vivek Goyal wrote: > It is possible that a mount is in progress and device is being removed at > the same time. Use virtio_fs_mutex to avoid races. > > This also takes care of bunch of races and removes some TODO items. > > Signed-off-by: Vivek Goyal > --- > fs/fuse/virtio_fs.c | 32 ++-- > 1 file changed, 22 insertions(+), 10 deletions(-) > > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > index 29ec2f5bbbe2..c483482185b6 100644 > --- a/fs/fuse/virtio_fs.c > +++ b/fs/fuse/virtio_fs.c > @@ -13,7 +13,9 @@ > #include > #include "fuse_i.h" > > -/* List of virtio-fs device instances and a lock for the list */ > +/* List of virtio-fs device instances and a lock for the list. Also provides > + * mutual exclusion in device removal and mounting path > + */ > static DEFINE_MUTEX(virtio_fs_mutex); > static LIST_HEAD(virtio_fs_instances); > > @@ -72,17 +74,19 @@ static void release_virtiofs_obj(struct kref *ref) > kfree(vfs); > } > > +/* Make sure virtiofs_mutex is held */ Typo? virtiofs_mutex->virtio_fs_mutex Jun > static void virtiofs_put(struct virtio_fs *fs) > { > - mutex_lock(&virtio_fs_mutex); > kref_put(&fs->refcount, release_virtiofs_obj); > - mutex_unlock(&virtio_fs_mutex); > } > > static void virtio_fs_put(struct fuse_iqueue *fiq) > { > struct virtio_fs *vfs = fiq->priv; > + > + mutex_lock(&virtio_fs_mutex); > virtiofs_put(vfs); > + mutex_unlock(&virtio_fs_mutex); > } > > static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) > @@ -596,9 +600,8 @@ static void virtio_fs_remove(struct virtio_device *vdev) > struct virtio_fs *fs = vdev->priv; > > mutex_lock(&virtio_fs_mutex); > + /* This device is going away. No one should get new reference */ > list_del_init(&fs->list); > - mutex_unlock(&virtio_fs_mutex); > - > virtio_fs_stop_all_queues(fs); > virtio_fs_drain_all_queues(fs); > vdev->config->reset(vdev); > @@ -607,6 +610,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) > vdev->priv = NULL; > /* Put device reference on virtio_fs object */ > virtiofs_put(fs); > + mutex_unlock(&virtio_fs_mutex); > } > > #ifdef CONFIG_PM_SLEEP > @@ -978,10 +982,15 @@ static int virtio_fs_fill_super(struct super_block *sb) > .no_force_umount = true, > }; > > - /* TODO lock */ > - if (fs->vqs[VQ_REQUEST].fud) { > - pr_err("virtio-fs: device already in use\n"); > - err = -EBUSY; > + mutex_lock(&virtio_fs_mutex); > + > + /* After holding mutex, make sure virtiofs device is still there. > + * Though we are holding a refernce to it, drive ->remove might > + * still have cleaned up virtual queues. In that case bail out. > + */ > + err = -EINVAL; > + if (list_empty(&fs->list)) { > + pr_info("virtio-fs: tag <%s> not found\n", fs->tag); > goto err; > } > > @@ -1007,7 +1016,6 @@ static int virtio_fs_fill_super(struct super_block *sb) > > fc = fs->vqs[VQ_REQUEST].fud->fc; > > - /* TODO take fuse_mutex around this loop? */ > for (i = 0; i < fs->nvqs; i++) { > struct virtio_fs_vq *fsvq = &fs->vqs[i]; > > @@ -1020,6 +1028,7 @@ static int virtio_fs_fill_super(struct super_block *sb) > /* Previous unmount will stop all queues. Start these again */ > virtio_fs_start_all_queues(fs); > fuse_send_init(fc, init_req); > + mutex_unlock(&virtio_fs_mutex); > return 0; > > err_free_init_req: > @@ -1027,6 +1036,7 @@ static int virtio_fs_fill_super(struct super_block *sb) > err_free_fuse_devs: > virtio_fs_free_devs(fs); > err: > + mutex_unlock(&virtio_fs_mutex); > return err; > } > > @@ -1100,7 +1110,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc) > > fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); > if (!fc) { > + mutex_lock(&virtio_fs_mutex); > virtiofs_put(fs); > + mutex_unlock(&virtio_fs_mutex); > return -ENOMEM; > } > >
Re: [Virtio-fs] [PATCH 15/18] virtiofs: Make virtio_fs object refcounted
On 2019/9/6 3:48, Vivek Goyal wrote: > This object is used both by fuse_connection as well virt device. So make > this object reference counted and that makes it easy to define life cycle > of the object. > > Now deivce can be removed while filesystem is still mounted. This will > cleanup all the virtqueues but virtio_fs object will still be around and > will be cleaned when filesystem is unmounted and sb/fc drops its reference. > > Removing a device also stops all virt queues and any new reuqest gets > error -ENOTCONN. All existing in flight requests are drained before > ->remove returns. > > Signed-off-by: Vivek Goyal > --- > fs/fuse/virtio_fs.c | 52 + > 1 file changed, 43 insertions(+), 9 deletions(-) > > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > index 01bbf2c0e144..29ec2f5bbbe2 100644 > --- a/fs/fuse/virtio_fs.c > +++ b/fs/fuse/virtio_fs.c > @@ -37,6 +37,7 @@ struct virtio_fs_vq { > > /* A virtio-fs device instance */ > struct virtio_fs { > + struct kref refcount; > struct list_head list;/* on virtio_fs_instances */ > char *tag; > struct virtio_fs_vq *vqs; > @@ -63,6 +64,27 @@ static inline struct fuse_pqueue *vq_to_fpq(struct > virtqueue *vq) > return &vq_to_fsvq(vq)->fud->pq; > } > > +static void release_virtiofs_obj(struct kref *ref) > +{ > + struct virtio_fs *vfs = container_of(ref, struct virtio_fs, refcount); > + > + kfree(vfs->vqs); > + kfree(vfs); > +} > + > +static void virtio_fs_put(struct virtio_fs *fs) > +{ > + mutex_lock(&virtio_fs_mutex); > + kref_put(&fs->refcount, release_virtiofs_obj); > + mutex_unlock(&virtio_fs_mutex); > +} > + > +static void virtio_fs_put(struct fuse_iqueue *fiq) > +{ > + struct virtio_fs *vfs = fiq->priv; > + virtiofs_put(vfs); > +} It's a little confusing that virtiofs_put() looks like virtiofs_put(), and could we use __virtio_fs_put to replace virtio_fs_put? Thanks, Jun
Re: [Virtio-fs] [PATCH 08/18] virtiofs: Drain all pending requests during ->remove time
On 2019/9/6 3:48, Vivek Goyal wrote: > When device is going away, drain all pending requests. > > Signed-off-by: Vivek Goyal > --- > fs/fuse/virtio_fs.c | 83 - > 1 file changed, 51 insertions(+), 32 deletions(-) > > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > index 90e7b2f345e5..d5730a50b303 100644 > --- a/fs/fuse/virtio_fs.c > +++ b/fs/fuse/virtio_fs.c > @@ -63,6 +63,55 @@ static inline struct fuse_pqueue *vq_to_fpq(struct > virtqueue *vq) > return &vq_to_fsvq(vq)->fud->pq; > } > > +static void virtio_fs_drain_queue(struct virtio_fs_vq *fsvq) > +{ > + WARN_ON(fsvq->in_flight < 0); > + > + /* Wait for in flight requests to finish.*/ blank space missed after *finish.*. > + while (1) { > + spin_lock(&fsvq->lock); > + if (!fsvq->in_flight) { > + spin_unlock(&fsvq->lock); > + break; > + } > + spin_unlock(&fsvq->lock); > + usleep_range(1000, 2000); > + } > + > + flush_work(&fsvq->done_work); > + flush_delayed_work(&fsvq->dispatch_work); > +} > + > +static inline void drain_hiprio_queued_reqs(struct virtio_fs_vq *fsvq) Should we add *virtio_fs* prefix for this function? And I wonder if there are only forget reqs to drain? Maybe we should call it *virtio_fs_drain_queued_forget_reqs* or someone containing *forget_reqs*. Thanks, Jun > +{ > + struct virtio_fs_forget *forget; > + > + spin_lock(&fsvq->lock); > + while (1) { > + forget = list_first_entry_or_null(&fsvq->queued_reqs, > + struct virtio_fs_forget, list); > + if (!forget) > + break; > + list_del(&forget->list); > + kfree(forget); > + } > + spin_unlock(&fsvq->lock); > +} > + > +static void virtio_fs_drain_all_queues(struct virtio_fs *fs) > +{ > + struct virtio_fs_vq *fsvq; > + int i; > + > + for (i = 0; i < fs->nvqs; i++) { > + fsvq = &fs->vqs[i]; > + if (i == VQ_HIPRIO) > + drain_hiprio_queued_reqs(fsvq); > + > + virtio_fs_drain_queue(fsvq); > + } > +} > + > /* Add a new instance to the list or return -EEXIST if tag name exists*/ > static int virtio_fs_add_instance(struct virtio_fs *fs) > { > @@ -511,6 +560,7 @@ static void virtio_fs_remove(struct virtio_device *vdev) > struct virtio_fs *fs = vdev->priv; > > virtio_fs_stop_all_queues(fs); > + virtio_fs_drain_all_queues(fs); > vdev->config->reset(vdev); > virtio_fs_cleanup_vqs(vdev, fs); > > @@ -865,37 +915,6 @@ __releases(fiq->waitq.lock) > } > } > > -static void virtio_fs_flush_hiprio_queue(struct virtio_fs_vq *fsvq) > -{ > - struct virtio_fs_forget *forget; > - > - WARN_ON(fsvq->in_flight < 0); > - > - /* Go through pending forget requests and free them */ > - spin_lock(&fsvq->lock); > - while (1) { > - forget = list_first_entry_or_null(&fsvq->queued_reqs, > - struct virtio_fs_forget, list); > - if (!forget) > - break; > - list_del(&forget->list); > - kfree(forget); > - } > - > - spin_unlock(&fsvq->lock); > - > - /* Wait for in flight requests to finish.*/ > - while (1) { > - spin_lock(&fsvq->lock); > - if (!fsvq->in_flight) { > - spin_unlock(&fsvq->lock); > - break; > - } > - spin_unlock(&fsvq->lock); > - usleep_range(1000, 2000); > - } > -} > - > const static struct fuse_iqueue_ops virtio_fs_fiq_ops = { > .wake_forget_and_unlock = virtio_fs_wake_forget_and_unlock, > .wake_interrupt_and_unlock = virtio_fs_wake_interrupt_and_unlock, > @@ -988,7 +1007,7 @@ static void virtio_kill_sb(struct super_block *sb) > spin_lock(&fsvq->lock); > fsvq->connected = false; > spin_unlock(&fsvq->lock); > - virtio_fs_flush_hiprio_queue(fsvq); > + virtio_fs_drain_all_queues(vfs); > > fuse_kill_sb_anon(sb); > virtio_fs_free_devs(vfs); >
Re: [Virtio-fs] [PATCH 04/19] virtio: Implement get_shm_region for PCI transport
On 2019/8/26 21:06, Vivek Goyal wrote: > On Mon, Aug 26, 2019 at 09:43:08AM +0800, piaojun wrote: > > [..] >>> +static bool vp_get_shm_region(struct virtio_device *vdev, >>> + struct virtio_shm_region *region, u8 id) >>> +{ >>> + struct virtio_pci_device *vp_dev = to_vp_device(vdev); >>> + struct pci_dev *pci_dev = vp_dev->pci_dev; >>> + u8 bar; >>> + u64 offset, len; >>> + phys_addr_t phys_addr; >>> + size_t bar_len; >>> + char *bar_name; >> >> 'char *bar_name' should be cleaned up to avoid compiling warning. And I >> wonder if you mix tab and blankspace for code indent? Or it's just my >> email display problem? > > Will get rid of now unused bar_name. > OK > Generally git flags if there are tab/space issues. I did not see any. So > if you see something, point it out and I will fix it. > cohuck found the same indent problem and pointed them in another email. Jun
Re: [Virtio-fs] [PATCH 04/19] virtio: Implement get_shm_region for PCI transport
On 2019/8/22 1:57, Vivek Goyal wrote: > From: Sebastien Boeuf > > On PCI the shm regions are found using capability entries; > find a region by searching for the capability. > > Cc: k...@vger.kernel.org > Signed-off-by: Sebastien Boeuf > Signed-off-by: Dr. David Alan Gilbert > Signed-off-by: kbuild test robot > --- > drivers/virtio/virtio_pci_modern.c | 108 + > include/uapi/linux/virtio_pci.h| 11 ++- > 2 files changed, 118 insertions(+), 1 deletion(-) > > diff --git a/drivers/virtio/virtio_pci_modern.c > b/drivers/virtio/virtio_pci_modern.c > index 7abcc50838b8..1cdedd93f42a 100644 > --- a/drivers/virtio/virtio_pci_modern.c > +++ b/drivers/virtio/virtio_pci_modern.c > @@ -443,6 +443,112 @@ static void del_vq(struct virtio_pci_vq_info *info) > vring_del_virtqueue(vq); > } > > +static int virtio_pci_find_shm_cap(struct pci_dev *dev, > + u8 required_id, > + u8 *bar, u64 *offset, u64 *len) > +{ > + int pos; > + > +for (pos = pci_find_capability(dev, PCI_CAP_ID_VNDR); > + pos > 0; > + pos = pci_find_next_capability(dev, pos, PCI_CAP_ID_VNDR)) { > + u8 type, cap_len, id; > +u32 tmp32; > +u64 res_offset, res_length; > + > + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, > + cfg_type), > + &type); > +if (type != VIRTIO_PCI_CAP_SHARED_MEMORY_CFG) > +continue; > + > + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, > + cap_len), > + &cap_len); > + if (cap_len != sizeof(struct virtio_pci_cap64)) { > + printk(KERN_ERR "%s: shm cap with bad size offset: %d > size: %d\n", > + __func__, pos, cap_len); > +continue; > +} > + > + pci_read_config_byte(dev, pos + offsetof(struct virtio_pci_cap, > + id), > + &id); > +if (id != required_id) > +continue; > + > +/* Type, and ID match, looks good */ > +pci_read_config_byte(dev, pos + offsetof(struct > virtio_pci_cap, > + bar), > + bar); > + > +/* Read the lower 32bit of length and offset */ > +pci_read_config_dword(dev, pos + offsetof(struct > virtio_pci_cap, offset), > + &tmp32); > +res_offset = tmp32; > +pci_read_config_dword(dev, pos + offsetof(struct > virtio_pci_cap, length), > + &tmp32); > +res_length = tmp32; > + > +/* and now the top half */ > +pci_read_config_dword(dev, > + pos + offsetof(struct virtio_pci_cap64, > + offset_hi), > + &tmp32); > +res_offset |= ((u64)tmp32) << 32; > +pci_read_config_dword(dev, > + pos + offsetof(struct virtio_pci_cap64, > + length_hi), > + &tmp32); > +res_length |= ((u64)tmp32) << 32; > + > +*offset = res_offset; > +*len = res_length; > + > +return pos; > +} > +return 0; > +} > + > +static bool vp_get_shm_region(struct virtio_device *vdev, > + struct virtio_shm_region *region, u8 id) > +{ > + struct virtio_pci_device *vp_dev = to_vp_device(vdev); > + struct pci_dev *pci_dev = vp_dev->pci_dev; > + u8 bar; > + u64 offset, len; > + phys_addr_t phys_addr; > + size_t bar_len; > + char *bar_name; 'char *bar_name' should be cleaned up to avoid compiling warning. And I wonder if you mix tab and blankspace for code indent? Or it's just my email display problem? Thanks, Jun
[PATCH] scsi/virio_scsi.c: do not call virtscsi_remove_vqs() in virtscsi_init() to avoid crash bug
If some error happened before find_vqs, error branch will goto virtscsi_remove_vqs to free vqs. Actually the vqs have not been allocated successfully, so this will cause wild-pointer-free problem. So virtscsi_remove_vqs could be deleted as no error will happen after find_vqs. Signed-off-by: Jun Piao --- drivers/scsi/virtio_scsi.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c index 1c72db9..da0fd74 100644 --- a/drivers/scsi/virtio_scsi.c +++ b/drivers/scsi/virtio_scsi.c @@ -833,8 +833,6 @@ static int virtscsi_init(struct virtio_device *vdev, kfree(names); kfree(callbacks); kfree(vqs); - if (err) - virtscsi_remove_vqs(vdev); return err; } --
Re: [PATCH v3 2/2] net/9p: add a per-client fcall kmem_cache
Thanks for clearing my doubt, and you can add: Acked-by: Jun Piao On 2018/8/10 9:41, Dominique Martinet wrote: > piaojun wrote on Fri, Aug 10, 2018: >> Could you help paste the test result of before-after-applied this patch in >> comment? And please see my comments below. > > Thanks the the review, do you mean the commit message? > > I'll add the summary I wrote in reply to your question a few mails > before. > Yes, I mean the commit message. > >>> diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h >>> index e23896116d9a..645266b74652 100644 >>> --- a/include/net/9p/9p.h >>> +++ b/include/net/9p/9p.h >>> @@ -336,6 +336,9 @@ enum p9_qid_t { >>> #define P9_NOFID (u32)(~0) >>> #define P9_MAXWELEM16 >>> >>> +/* Minimal header size: len + id + tag */ >> >> Here should be 'size + id + tag'. > > hm I didn't want to repeat size, but I guess people do refer to that > field as size. > I'll actually rewrite it as: > Minimal header size: size[4] type[1] tag[2] > It looks better. >>> + kmem_cache_destroy(clnt->fcall_cache); >> >> I'm afraid that we should check NULL for clnt->fcall_cache. > > kmem_cache_destroy() in mm/slab_common.c does the null check for us: > -- > void kmem_cache_destroy(struct kmem_cache *s) > { > int err; > > if (unlikely(!s)) > return; > -- > OK, it makes sense.
Re: [PATCH v3 2/2] net/9p: add a per-client fcall kmem_cache
Hi Dominique, Could you help paste the test result of before-after-applied this patch in comment? And please see my comments below. On 2018/8/9 22:33, Dominique Martinet wrote: > From: Dominique Martinet > > Having a specific cache for the fcall allocations helps speed up > allocations a bit, especially in case of non-"round" msizes. > > The caches will automatically be merged if there are multiple caches > of items with the same size so we do not need to try to share a cache > between different clients of the same size. > > Since the msize is negotiated with the server, only allocate the cache > after that negotiation has happened - previous allocations or > allocations of different sizes (e.g. zero-copy fcall) are made with > kmalloc directly. > > Signed-off-by: Dominique Martinet > Cc: Matthew Wilcox > Cc: Greg Kurz > Cc: Jun Piao > --- > v2: > - Add a pointer to the cache in p9_fcall to make sure a buffer > allocated with kmalloc gets freed with kfree and vice-versa > This could have been smaller with a bool but this spares having > to look at the client so looked a bit cleaner, I'm expecting this > patch will need a v3 one way or another so I went for the bolder > approach - please say if you think a smaller item is better ; I *think* > nothing relies on this being ordered the same way as the data on the > wire (struct isn't packed anyway) so we can move id after tag and add > another u8 to not have any overhead > > - added likely() to cache existence check in allocation, but nothing > for msize check or free because of zc request being of different size > > v3: > - defined the userdata region in the cache, as read or write calls can > access the buffer directly (lead to warnings with HARDENED_USERCOPY=y) > > > > I didn't get any comment on v2 for this patch, but I'm still not fully > happy with this in all honesty. > > Part of me believes we might be better off always allocating from the > cache even for zero-copy headers, it's a waste of space but the buffers > are there and being reused constantly for non-zc calls, and mixing > kmallocs in could lead to these buffers being really freed and > reallocated all the time instead of reusing the slab buffers > continuously. > > That would let me move the likely() for the fast path, with the only > exception being the TVERSION initial call on mount, for which I still > don't have any nice idea on how to handle, using a different size in > p9_client_rpc or prepare_req if the type is TVERSION and I'm really not > happy with that... > > > include/net/9p/9p.h | 4 > include/net/9p/client.h | 1 + > net/9p/client.c | 40 +++- > 3 files changed, 40 insertions(+), 5 deletions(-) > > diff --git a/include/net/9p/9p.h b/include/net/9p/9p.h > index e23896116d9a..645266b74652 100644 > --- a/include/net/9p/9p.h > +++ b/include/net/9p/9p.h > @@ -336,6 +336,9 @@ enum p9_qid_t { > #define P9_NOFID (u32)(~0) > #define P9_MAXWELEM 16 > > +/* Minimal header size: len + id + tag */ Here should be 'size + id + tag'. > +#define P9_HDRSZ 7 > + > /* ample room for Twrite/Rread header */ > #define P9_IOHDRSZ 24 > > @@ -558,6 +561,7 @@ struct p9_fcall { > size_t offset; > size_t capacity; > > + struct kmem_cache *cache; > u8 *sdata; > }; > > diff --git a/include/net/9p/client.h b/include/net/9p/client.h > index c2671d40bb6b..735f3979d559 100644 > --- a/include/net/9p/client.h > +++ b/include/net/9p/client.h > @@ -123,6 +123,7 @@ struct p9_client { > struct p9_trans_module *trans_mod; > enum p9_trans_status status; > void *trans; > + struct kmem_cache *fcall_cache; > > union { > struct { > diff --git a/net/9p/client.c b/net/9p/client.c > index ed78751aee7c..6c57ab1294d7 100644 > --- a/net/9p/client.c > +++ b/net/9p/client.c > @@ -192,6 +192,9 @@ static int parse_opts(char *opts, struct p9_client *clnt) > goto free_and_return; > } > > + if (clnt->trans_mod) { > + pr_warn("Had trans %s\n", > clnt->trans_mod->name); > + } > v9fs_put_trans(clnt->trans_mod); > clnt->trans_mod = v9fs_get_trans_by_name(s); > if (clnt->trans_mod == NULL) { > @@ -231,9 +234,16 @@ static int parse_opts(char *opts, struct p9_client *clnt) > return ret; > } > > -static int p9_fcall_init(struct p9_fcall *fc, int alloc_msize) > +static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, > + int alloc_msize) > { > - fc->sdata = kmalloc(alloc_msize, GFP_NOFS); > + if (likely(c->fcall_cache) && alloc_msize == c->msize) { > + fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); > + fc->cache = c->fcall_cache; > + } else { > + fc->sdata = kmalloc(alloc_msize, GFP_NOFS); > +
Re: [PATCH v3 1/2] net/9p: embed fcall in req to round down buffer allocs
LGTM On 2018/8/9 22:33, Dominique Martinet wrote: > From: Dominique Martinet > > 'msize' is often a power of two, or at least page-aligned, so avoiding > an overhead of two dozen bytes for each allocation will help the > allocator do its work and reduce memory fragmentation. > > Suggested-by: Matthew Wilcox > Signed-off-by: Dominique Martinet > Reviewed-by: Greg Kurz Acked-by: Jun Piao > Cc: Matthew Wilcox > Cc: Jun Piao > --- > v2: > - Add extra label to not free uninitialized memory on alloc failure > - Rename p9_fcall_alloc to 9p_fcall_init > - Add a p9_fcall_fini function to echo to init > > v3: > - use p9_call_fini() in rdma_request() instead of kfree, the code > was in the following patch in previous version > > include/net/9p/client.h | 5 +- > net/9p/client.c | 167 +--- > net/9p/trans_fd.c | 12 +-- > net/9p/trans_rdma.c | 29 +++ > net/9p/trans_virtio.c | 18 ++--- > net/9p/trans_xen.c | 12 +-- > 6 files changed, 125 insertions(+), 118 deletions(-) > > diff --git a/include/net/9p/client.h b/include/net/9p/client.h > index a4dc42c53d18..c2671d40bb6b 100644 > --- a/include/net/9p/client.h > +++ b/include/net/9p/client.h > @@ -95,8 +95,8 @@ struct p9_req_t { > int status; > int t_err; > wait_queue_head_t wq; > - struct p9_fcall *tc; > - struct p9_fcall *rc; > + struct p9_fcall tc; > + struct p9_fcall rc; > void *aux; > struct list_head req_list; > }; > @@ -230,6 +230,7 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char > *name, int mode, > kgid_t gid, struct p9_qid *); > int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 > *status); > int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl); > +void p9_fcall_fini(struct p9_fcall *fc); > struct p9_req_t *p9_tag_lookup(struct p9_client *, u16); > void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status); > > diff --git a/net/9p/client.c b/net/9p/client.c > index 88db45966740..ed78751aee7c 100644 > --- a/net/9p/client.c > +++ b/net/9p/client.c > @@ -231,16 +231,20 @@ static int parse_opts(char *opts, struct p9_client > *clnt) > return ret; > } > > -static struct p9_fcall *p9_fcall_alloc(int alloc_msize) > +static int p9_fcall_init(struct p9_fcall *fc, int alloc_msize) > { > - struct p9_fcall *fc; > - fc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS); > - if (!fc) > - return NULL; > + fc->sdata = kmalloc(alloc_msize, GFP_NOFS); > + if (!fc->sdata) > + return -ENOMEM; > fc->capacity = alloc_msize; > - fc->sdata = (char *) fc + sizeof(struct p9_fcall); > - return fc; > + return 0; > +} > + > +void p9_fcall_fini(struct p9_fcall *fc) > +{ > + kfree(fc->sdata); > } > +EXPORT_SYMBOL(p9_fcall_fini); > > static struct kmem_cache *p9_req_cache; > > @@ -263,13 +267,13 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned > int max_size) > if (!req) > return NULL; > > - req->tc = p9_fcall_alloc(alloc_msize); > - req->rc = p9_fcall_alloc(alloc_msize); > - if (!req->tc || !req->rc) > + if (p9_fcall_init(&req->tc, alloc_msize)) > + goto free_req; > + if (p9_fcall_init(&req->rc, alloc_msize)) > goto free; > > - p9pdu_reset(req->tc); > - p9pdu_reset(req->rc); > + p9pdu_reset(&req->tc); > + p9pdu_reset(&req->rc); > req->status = REQ_STATUS_ALLOC; > init_waitqueue_head(&req->wq); > INIT_LIST_HEAD(&req->req_list); > @@ -281,7 +285,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned > int max_size) > GFP_NOWAIT); > else > tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT); > - req->tc->tag = tag; > + req->tc.tag = tag; > spin_unlock_irq(&c->lock); > idr_preload_end(); > if (tag < 0) > @@ -290,8 +294,9 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned > int max_size) > return req; > > free: > - kfree(req->tc); > - kfree(req->rc); > + p9_fcall_fini(&req->tc); > + p9_fcall_fini(&req->rc); > +free_req: > kmem_cache_free(p9_req_cache, req); > return ERR_PTR(-ENOMEM); > } > @@ -329,14 +334,14 @@ EXPORT_SYMBOL(p9_tag_lookup); > static void p9_free_req(struct p9_client *c, struct p9_req_t *r) > { > unsigned long flags; > - u16 tag = r->tc->tag; > + u16 tag = r->tc.tag; > > p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); > spin_lock_irqsave(&c->lock, flags); > idr_remove(&c->reqs, tag); > spin_unlock_irqrestore(&c->lock, flags); > - kfree(r->tc); > - kfree(r->rc); > + p9_fcall_fini(&r->tc); > + p9_fcall_fini(&r->rc); > kmem_cache_free(p9_req_cache, r); > } > > @@ -368,7 +373,7 @@ static void p9_tag_cleanup(struct p9_client *c) > */ >
Re: [PATCH] net/9p/trans_virtio.c: decrease the refcount of 9p virtio device when removing it
Hi Dominique, On 2018/8/8 17:40, Dominique Martinet wrote: > piaojun wrote on Wed, Aug 08, 2018: >> I try to remove 9pnet_virtio.ko by 'rmmod 9pnet_virtio' as I want to >> replace it without rebooting system. > > I do that all the time when testing, it works for me. > What exact kernel commit are you running? > My kernel commit id 6edf1d4cb0acde, and I replace the 9p code with 9p-next. And I wonder if this will work well? >> Here I have not mount 9pfs yet, so the refcount is still 0. >> >> Before rmmod: >> # lsmod | grep 9p >> 9pnet_virtio 20480 0 >> 9pnet 106496 1 9pnet_virtio >> virtio_ring28672 5 >> virtio_scsi,9pnet_virtio,virtio_pci,virtio_blk,virtio_net >> virtio 16384 5 >> virtio_scsi,9pnet_virtio,virtio_pci,virtio_blk,virtio_net >> >> After rmmod: >> # lsmod | grep 9p >> 9pnet_virtio 20480 0 >> 9pnet 106496 1 9pnet_virtio >> virtio_ring28672 5 >> virtio_scsi,9pnet_virtio,virtio_pci,virtio_blk,virtio_net >> virtio 16384 5 >> virtio_scsi,9pnet_virtio,virtio_pci,virtio_blk,virtio_net >> >> Normally 9pnet_virtio should be invisible after rmmod like this: >> # lsmod | grep 9p >> 9pnet 106496 0 > > Right, that obviously didn't work... > > But on the other hand, if I apply your commit and load/unload > 9pnet_virtio 5-10 times (I ran it in a loop) I get KASAN errors because > we put too many of these refs ; that doesn't happen without your patch > so it's apparently wrong. > I'm curious how that could make modprobe work better for you as well, it > shouldn't depend on that... > > Maybe `modprobe -r` might give a better error, or something in dmesg? > In my testing, `modprobe -r` has the same behavior with rmmod.
Re: [PATCH] net/9p/trans_virtio.c: decrease the refcount of 9p virtio device when removing it
Hi Dominique, On 2018/8/8 16:36, Dominique Martinet wrote: > piaojun wrote on Wed, Aug 08, 2018: >> I found that 9pnet_virtio.ko could not be removed by rmmod command, and I >> could still found it by lsmod. The reason is that we forgot decrease the >> refcount of 9p virtio device by kobject_put. So we should put refcount in >> p9_virtio_remove. > > > Hmm, I cannot seem to reproduce this. Can you give more details on how > to get into stuck state? > I tried mounting something, accessing the sysfs files, etc to no avail. > > lsmod gives a counter to how many references there are to the module, > you can use that to debug a bit. > For example here I get this line just after loading the module: > 9pnet_virtio 32768 0 > > Then after mounting something there is one reference: > 9pnet_virtio 32768 1 > > Then unmounting puts that back to 0 and 'modprobe -r' (or rmmod) works. > I try to remove 9pnet_virtio.ko by 'rmmod 9pnet_virtio' as I want to replace it without rebooting system. Here I have not mount 9pfs yet, so the refcount is still 0. Before rmmod: # lsmod | grep 9p 9pnet_virtio 20480 0 9pnet 106496 1 9pnet_virtio virtio_ring28672 5 virtio_scsi,9pnet_virtio,virtio_pci,virtio_blk,virtio_net virtio 16384 5 virtio_scsi,9pnet_virtio,virtio_pci,virtio_blk,virtio_net After rmmod: # lsmod | grep 9p 9pnet_virtio 20480 0 9pnet 106496 1 9pnet_virtio virtio_ring28672 5 virtio_scsi,9pnet_virtio,virtio_pci,virtio_blk,virtio_net virtio 16384 5 virtio_scsi,9pnet_virtio,virtio_pci,virtio_blk,virtio_net Normally 9pnet_virtio should be invisible after rmmod like this: # lsmod | grep 9p 9pnet 106496 0 > > > > I dislike saying the next part but I think form also is important, > please bear with me: > > - shorter subject line, please. For example, you can lose 20 characters > by reodering words so there is no need for pronouns > "net/9p/virtio: decrease 9p virtio device refcount on removal" > > - I personally dislike commit messages that are "novelized" (that is, > put yourself as an actor and describe what you were doing) > That seems to be somewhat accepted as looking at the kernel's git log I > see some (few) commits using "I ..." that are not pull request messages > but if possible please avoid this style and try to describe facts, how > things are wrong, what got fixed and if required how. > To give an example again, this says the same thing: > "The 9pnet_virtio module could not be unloaded because we forgot to > decrease the refcount of the 9p virtio device with kobject_put. > > Put the reference in 9p_virtio_remove" > Your suggestion really makes sense, and I will make some improvment later. Thanks, Jun > > Thanks, >
[PATCH] net/9p/trans_virtio.c: decrease the refcount of 9p virtio device when removing it
I found that 9pnet_virtio.ko could not be removed by rmmod command, and I could still found it by lsmod. The reason is that we forgot decrease the refcount of 9p virtio device by kobject_put. So we should put refcount in p9_virtio_remove. Signed-off-by: Jun Piao --- net/9p/trans_virtio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 46a5ab2..a00e992 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -710,7 +710,8 @@ static void p9_virtio_remove(struct virtio_device *vdev) vdev->config->del_vqs(vdev); sysfs_remove_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); - kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE); + kobject_uevent(&(vdev->dev.kobj), KOBJ_REMOVE); + kobject_put(&(vdev->dev.kobj)); kfree(chan->tag); kfree(chan->vc_wq); kfree(chan); --
[PATCH v3] net/9p/trans_virtio.c: add null terminal for mount tag
chan->tag is Non-null terminated which will result in printing messy code when debugging code. So we should add '\0' for tag to make the code more convenient and robust. In addition, I drop char->tag_len to simplify the code. Signed-off-by: Jun Piao --- net/9p/trans_virtio.c | 17 +++-- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index d422bfc..46a5ab2 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -89,10 +89,8 @@ struct virtio_chan { unsigned long p9_max_pages; /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; - - int tag_len; /* -* tag name to identify a mount Non-null terminated +* tag name to identify a mount null terminated */ char *tag; @@ -525,14 +523,15 @@ static ssize_t p9_mount_tag_show(struct device *dev, { struct virtio_chan *chan; struct virtio_device *vdev; + int tag_len; vdev = dev_to_virtio(dev); chan = vdev->priv; + tag_len = strlen(chan->tag); - memcpy(buf, chan->tag, chan->tag_len); - buf[chan->tag_len] = 0; + memcpy(buf, chan->tag, tag_len + 1); - return chan->tag_len + 1; + return tag_len + 1; } static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL); @@ -585,7 +584,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) err = -EINVAL; goto out_free_vq; } - tag = kmalloc(tag_len, GFP_KERNEL); + tag = kzalloc(tag_len + 1, GFP_KERNEL); if (!tag) { err = -ENOMEM; goto out_free_vq; @@ -594,7 +593,6 @@ static int p9_virtio_probe(struct virtio_device *vdev) virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag), tag, tag_len); chan->tag = tag; - chan->tag_len = tag_len; err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); if (err) { goto out_free_tag; @@ -654,8 +652,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) mutex_lock(&virtio_9p_lock); list_for_each_entry(chan, &virtio_chan_list, chan_list) { - if (!strncmp(devname, chan->tag, chan->tag_len) && - strlen(devname) == chan->tag_len) { + if (!strcmp(devname, chan->tag)) { if (!chan->inuse) { chan->inuse = true; found = 1; --
Re: [V9fs-developer] [PATCH v2] net/9p/trans_virtio.c: add null terminal for mount tag
Hi Greg and Dominique, We'd better reach an agreement about the patch fix. In my opinion, replacing strlen(chan->tag) with a local variable sounds reasonable, and changing strncmp to strcmp may be little beneficial, as strcmp is more dangerours such as buffer-flow. So I'd like to hear your suggestion for the next version of the patch, or this patch is good enough? Thanks, Jun On 2018/8/2 21:23, Greg Kurz wrote: > On Thu, 2 Aug 2018 09:59:38 +0800 > piaojun wrote: > >> Hi Dominique, >> >> On 2018/8/2 9:54, Dominique Martinet wrote: >>> piaojun wrote on Thu, Aug 02, 2018: >>>> chan->tag is Non-null terminated which will result in printing messy code >>>> when debugging code. So we should add '\0' for tag to make the code more >>>> convenient and robust. In addition, I drop char->tag_len to simplify the >>>> code. >>> >>> Some new lines in commit message would be appreciated. >>> >>> That aside, I have a couple of nitpicks, but it looks good to me - thanks >>> >>>> >>>> Signed-off-by: Jun Piao >>>> --- >>>> net/9p/trans_virtio.c | 15 +-- >>>> 1 file changed, 5 insertions(+), 10 deletions(-) >>>> >>>> diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c >>>> index d422bfc..0fe9c37 100644 >>>> --- a/net/9p/trans_virtio.c >>>> +++ b/net/9p/trans_virtio.c >>>> @@ -89,10 +89,8 @@ struct virtio_chan { >>>>unsigned long p9_max_pages; >>>>/* Scatterlist: can be too big for stack. */ >>>>struct scatterlist sg[VIRTQUEUE_NUM]; >>>> - >>>> - int tag_len; >>>>/* >>>> - * tag name to identify a mount Non-null terminated >>>> + * tag name to identify a mount null terminated >>>> */ >>>>char *tag; >>>> >>>> @@ -529,10 +527,9 @@ static ssize_t p9_mount_tag_show(struct device *dev, >>>>vdev = dev_to_virtio(dev); >>>>chan = vdev->priv; >>>> >>>> - memcpy(buf, chan->tag, chan->tag_len); >>>> - buf[chan->tag_len] = 0; >>>> + memcpy(buf, chan->tag, strlen(chan->tag) + 1); >>>> >>>> - return chan->tag_len + 1; >>>> + return strlen(chan->tag) + 1; >>> >>> Use a local variable for strlen(chan->tag)? >>> >> Yes, local variable looks better. >> >>>> } >>>> >>>> static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL); >>>> @@ -585,7 +582,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) >>>>err = -EINVAL; >>>>goto out_free_vq; >>>>} >>>> - tag = kmalloc(tag_len, GFP_KERNEL); >>>> + tag = kzalloc(tag_len + 1, GFP_KERNEL); >>>>if (!tag) { >>>>err = -ENOMEM; >>>>goto out_free_vq; >>>> @@ -594,7 +591,6 @@ static int p9_virtio_probe(struct virtio_device *vdev) >>>>virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag), >>>> tag, tag_len); >>>>chan->tag = tag; >>>> - chan->tag_len = tag_len; >>>>err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); >>>>if (err) { >>>>goto out_free_tag; >>>> @@ -654,8 +650,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) >>>> >>>>mutex_lock(&virtio_9p_lock); >>>>list_for_each_entry(chan, &virtio_chan_list, chan_list) { >>>> - if (!strncmp(devname, chan->tag, chan->tag_len) && >>>> - strlen(devname) == chan->tag_len) { >>>> + if (!strncmp(devname, chan->tag, strlen(chan->tag) + 1)) { >>> >>> strncmp(x, y, strlen(y)+1) is precisely what strcmp does so let's use >>> the simpler version >>> >> strcmp looks simpler, and I will wait for a while to hear more >> suggestions, and then post another patch for these fixes. >> > > Nothing more to add. Please go ahead. > >> Thanks, >> Jun >> >> -- >> Check out the vibrant tech community on one of the world's most >> engaging tech sites, Slashdot.org! http://sdm.link/slashdot >> ___ >> V9fs-developer mailing list >> v9fs-develo...@lists.sourceforge.net >> https://lists.sourceforge.net/lists/listinfo/v9fs-developer > > . >
Re: [PATCH v2] net/9p/trans_virtio.c: add null terminal for mount tag
Hi Dominique, On 2018/8/2 9:54, Dominique Martinet wrote: > piaojun wrote on Thu, Aug 02, 2018: >> chan->tag is Non-null terminated which will result in printing messy code >> when debugging code. So we should add '\0' for tag to make the code more >> convenient and robust. In addition, I drop char->tag_len to simplify the >> code. > > Some new lines in commit message would be appreciated. > > That aside, I have a couple of nitpicks, but it looks good to me - thanks > >> >> Signed-off-by: Jun Piao >> --- >> net/9p/trans_virtio.c | 15 +-- >> 1 file changed, 5 insertions(+), 10 deletions(-) >> >> diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c >> index d422bfc..0fe9c37 100644 >> --- a/net/9p/trans_virtio.c >> +++ b/net/9p/trans_virtio.c >> @@ -89,10 +89,8 @@ struct virtio_chan { >> unsigned long p9_max_pages; >> /* Scatterlist: can be too big for stack. */ >> struct scatterlist sg[VIRTQUEUE_NUM]; >> - >> -int tag_len; >> /* >> - * tag name to identify a mount Non-null terminated >> + * tag name to identify a mount null terminated >> */ >> char *tag; >> >> @@ -529,10 +527,9 @@ static ssize_t p9_mount_tag_show(struct device *dev, >> vdev = dev_to_virtio(dev); >> chan = vdev->priv; >> >> -memcpy(buf, chan->tag, chan->tag_len); >> -buf[chan->tag_len] = 0; >> +memcpy(buf, chan->tag, strlen(chan->tag) + 1); >> >> -return chan->tag_len + 1; >> +return strlen(chan->tag) + 1; > > Use a local variable for strlen(chan->tag)? > Yes, local variable looks better. >> } >> >> static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL); >> @@ -585,7 +582,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) >> err = -EINVAL; >> goto out_free_vq; >> } >> -tag = kmalloc(tag_len, GFP_KERNEL); >> +tag = kzalloc(tag_len + 1, GFP_KERNEL); >> if (!tag) { >> err = -ENOMEM; >> goto out_free_vq; >> @@ -594,7 +591,6 @@ static int p9_virtio_probe(struct virtio_device *vdev) >> virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag), >> tag, tag_len); >> chan->tag = tag; >> -chan->tag_len = tag_len; >> err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); >> if (err) { >> goto out_free_tag; >> @@ -654,8 +650,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) >> >> mutex_lock(&virtio_9p_lock); >> list_for_each_entry(chan, &virtio_chan_list, chan_list) { >> -if (!strncmp(devname, chan->tag, chan->tag_len) && >> -strlen(devname) == chan->tag_len) { >> +if (!strncmp(devname, chan->tag, strlen(chan->tag) + 1)) { > > strncmp(x, y, strlen(y)+1) is precisely what strcmp does so let's use > the simpler version > strcmp looks simpler, and I will wait for a while to hear more suggestions, and then post another patch for these fixes. Thanks, Jun
[PATCH v2] net/9p/trans_virtio.c: add null terminal for mount tag
chan->tag is Non-null terminated which will result in printing messy code when debugging code. So we should add '\0' for tag to make the code more convenient and robust. In addition, I drop char->tag_len to simplify the code. Signed-off-by: Jun Piao --- net/9p/trans_virtio.c | 15 +-- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index d422bfc..0fe9c37 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -89,10 +89,8 @@ struct virtio_chan { unsigned long p9_max_pages; /* Scatterlist: can be too big for stack. */ struct scatterlist sg[VIRTQUEUE_NUM]; - - int tag_len; /* -* tag name to identify a mount Non-null terminated +* tag name to identify a mount null terminated */ char *tag; @@ -529,10 +527,9 @@ static ssize_t p9_mount_tag_show(struct device *dev, vdev = dev_to_virtio(dev); chan = vdev->priv; - memcpy(buf, chan->tag, chan->tag_len); - buf[chan->tag_len] = 0; + memcpy(buf, chan->tag, strlen(chan->tag) + 1); - return chan->tag_len + 1; + return strlen(chan->tag) + 1; } static DEVICE_ATTR(mount_tag, 0444, p9_mount_tag_show, NULL); @@ -585,7 +582,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) err = -EINVAL; goto out_free_vq; } - tag = kmalloc(tag_len, GFP_KERNEL); + tag = kzalloc(tag_len + 1, GFP_KERNEL); if (!tag) { err = -ENOMEM; goto out_free_vq; @@ -594,7 +591,6 @@ static int p9_virtio_probe(struct virtio_device *vdev) virtio_cread_bytes(vdev, offsetof(struct virtio_9p_config, tag), tag, tag_len); chan->tag = tag; - chan->tag_len = tag_len; err = sysfs_create_file(&(vdev->dev.kobj), &dev_attr_mount_tag.attr); if (err) { goto out_free_tag; @@ -654,8 +650,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) mutex_lock(&virtio_9p_lock); list_for_each_entry(chan, &virtio_chan_list, chan_list) { - if (!strncmp(devname, chan->tag, chan->tag_len) && - strlen(devname) == chan->tag_len) { + if (!strncmp(devname, chan->tag, strlen(chan->tag) + 1)) { if (!chan->inuse) { chan->inuse = true; found = 1; --
Re: [PATCH] net/9p/trans_virtio.c: add a terminal char for mount tag
Hi Dominique and Greg, Thanks for your reviewing, and I will try to simplify other related code according your suggestions in patch v2. Thanks, Jun On 2018/8/1 20:09, Dominique Martinet wrote: > Greg Kurz wrote on Wed, Aug 01, 2018: >> So this patch basically turns chan->tag into a nul terminated string, >> which is indeed more convenient and robust. Maybe you can update the >> rest of the code accordingly and drop chan->tag_len then ? > > If we can use that to simplify some other part of the code, that's > certainly more appealing to me :) > > >> FWIW, 9P strings received from the client are also converted to >> nul terminated strings: > > Oh, good to know; I guess that makes sense when these strings come and > go from/to other components of the kernel that likely expect that. >
Re: [PATCH] net/9p/trans_virtio.c: add a terminal char for mount tag
Hi Dominique, On 2018/8/1 16:11, Dominique Martinet wrote: > piaojun wrote on Wed, Aug 01, 2018: >> chan->tag has no terminal char at last which will result in printing messy >> code when debugging code. So we should add '\0' for tag. > > 9p is full of non null-terminated string so I'm not sure how I feel > about it, is there anything wrong with how this is used or was this just > when you tried to printf it? There is nothing wrong at the places using tag, as they calculated the tag_len carefully. Adding '\0' for it will make the code more robust. And I'm glad to hear others' opinions. Thanks, Jun > > If it's just for debugging I'd suggest using the printf format "%.*s" > with "chan->tag_len, chan->tag" arguments, > > > That said it's not like this is costly, so I'll take it if someone else > thinks this is helpful > >> >> Signed-off-by: Jun Piao >> --- >> net/9p/trans_virtio.c | 2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) >> >> diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c >> index d422bfc..49d71d6 100644 >> --- a/net/9p/trans_virtio.c >> +++ b/net/9p/trans_virtio.c >> @@ -585,7 +585,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) >> err = -EINVAL; >> goto out_free_vq; >> } >> -tag = kmalloc(tag_len, GFP_KERNEL); >> +tag = kzalloc(tag_len + 1, GFP_KERNEL); >> if (!tag) { >> err = -ENOMEM; >> goto out_free_vq; >> --
[PATCH] net/9p/trans_virtio.c: add a terminal char for mount tag
chan->tag has no terminal char at last which will result in printing messy code when debugging code. So we should add '\0' for tag. Signed-off-by: Jun Piao --- net/9p/trans_virtio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index d422bfc..49d71d6 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -585,7 +585,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) err = -EINVAL; goto out_free_vq; } - tag = kmalloc(tag_len, GFP_KERNEL); + tag = kzalloc(tag_len + 1, GFP_KERNEL); if (!tag) { err = -ENOMEM; goto out_free_vq; --
Re: [V9fs-developer] [PATCH 2/2] net/9p: add a per-client fcall kmem_cache
On 2018/7/31 9:35, Dominique Martinet wrote: > piaojun wrote on Tue, Jul 31, 2018: >> Could you help paste some test result before-and-after the patch applied? > > The only performance tests I did were sent to the list a couple of mails > earlier, you can find it here: > http://lkml.kernel.org/r/20180730093101.GA7894@nautica > > In particular, the results for benchmark on small writes just before and > after this patch, without KASAN (these are the same numbers as in the > link, hardware/setup is described there): > - no alloc (4.18-rc7 request cache): 65.4k req/s > - non-power of two alloc, no patch: 61.6k req/s > - power of two alloc, no patch: 62.2k req/s > - non-power of two alloc, with patch: 64.7k req/s > - power of two alloc, with patch: 65.1k req/s > > I'm rather happy with the result, I didn't expect using a dedicated > cache would bring this much back but it's certainly worth it. > It looks like an obvious promotion. >>> @@ -1011,6 +1034,7 @@ void p9_client_destroy(struct p9_client *clnt) >>> >>> p9_tag_cleanup(clnt); >>> >>> + kmem_cache_destroy(clnt->fcall_cache); >> >> We could set NULL for fcall_cache in case of use-after-free. >> >>> kfree(clnt); > > Hmm, I understand where this comes from, but I'm not sure I agree. > If someone tries to access the client while/after it is freed things are > going to break anyway, I'd rather let things break as obviously as > possible than try to cover it up. > Setting NULL is not a big matter, and I will hear others' opinion.
Re: [V9fs-developer] [PATCH 1/2] net/9p: embed fcall in req to round down buffer allocs
On 2018/7/31 9:12, Dominique Martinet wrote: > piaojun wrote on Tue, Jul 31, 2018: >> This is really a *big* patch, but the modification seems no harm. And I >> suggest running testcases to cover this. Please see my comments below. > > I'm always running tests, but more never hurt - please help ;) I'm glad to help testing, and actually I'm going to run some testcases in xfs-test for 9p. > > For reference I'm running a subset of cthon04[1], ltp[2] and some custom > tests like these[3][4] > > [1] https://fedorapeople.org/cgit/steved/public_git/cthon04.git/ > [2] https://github.com/linux-test-project/ltp > [3] https://github.com/phdeniel/sigmund/blob/master/modules/allfs.inc#L208 > [4] https://github.com/phdeniel/sigmund/blob/master/modules/allfs.inc#L251 > >>> [...] >>> @@ -263,13 +261,13 @@ p9_tag_alloc(struct p9_client *c, int8_t type, >>> unsigned int max_size) >>> if (!req) >>> return NULL; >>> >>> - req->tc = p9_fcall_alloc(alloc_msize); >>> - req->rc = p9_fcall_alloc(alloc_msize); >>> - if (!req->tc || !req->rc) >>> + if (p9_fcall_alloc(&req->tc, alloc_msize)) >>> + goto free; >>> + if (p9_fcall_alloc(&req->rc, alloc_msize)) >>> goto free; >>> >>> - p9pdu_reset(req->tc); >>> - p9pdu_reset(req->rc); >>> + p9pdu_reset(&req->tc); >>> + p9pdu_reset(&req->rc); >>> req->status = REQ_STATUS_ALLOC; >>> init_waitqueue_head(&req->wq); >>> INIT_LIST_HEAD(&req->req_list); >>> @@ -281,7 +279,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned >>> int max_size) >>> GFP_NOWAIT); >>> else >>> tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT); >>> - req->tc->tag = tag; >>> + req->tc.tag = tag; >>> spin_unlock_irq(&c->lock); >>> idr_preload_end(); >>> if (tag < 0) >>> @@ -290,8 +288,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned >>> int max_size) >>> return req; >>> >>> free: >>> - kfree(req->tc); >>> - kfree(req->rc); >>> + kfree(req->tc.sdata); >>> + kfree(req->rc.sdata); >> >> I wonder if we will free a wild pointer as 'sdata' has not been initialized >> NULL. > > Good point, it's possible to jump here if the first fcall_alloc failed > since this declustered the two allocations. > > Please consider this added to the previous patch (I'll send a v2 after > this has had more time for review, you can find the amended commit in my > 9p-test tree meanwhile): > -8<- > diff --git a/net/9p/client.c b/net/9p/client.c > index ba99a94a12c9..fe030ef1c076 100644 > --- a/net/9p/client.c > +++ b/net/9p/client.c > @@ -262,7 +262,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned > int max_size) > return NULL; > > if (p9_fcall_alloc(&req->tc, alloc_msize)) > - goto free; > + goto free_req; > if (p9_fcall_alloc(&req->rc, alloc_msize)) > goto free; > > @@ -290,6 +290,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned > int max_size) > free: > kfree(req->tc.sdata); > kfree(req->rc.sdata); > +free_req: > kmem_cache_free(p9_req_cache, req); > return ERR_PTR(-ENOMEM); > } > -8<- > > The second goto doesn't need changing because rc.sdata will be set to > NULL if the allocation failed >
Re: [V9fs-developer] [PATCH 2/2] net/9p: add a per-client fcall kmem_cache
Hi Dominique, Could you help paste some test result before-and-after the patch applied? And I have a little suggestion in comments below. On 2018/7/30 17:34, Dominique Martinet wrote: > From: Dominique Martinet > > Having a specific cache for the fcall allocations helps speed up > allocations a bit, especially in case of non-"round" msizes. > > The caches will automatically be merged if there are multiple caches > of items with the same size so we do not need to try to share a cache > between different clients of the same size. > > Since the msize is negotiated with the server, only allocate the cache > after that negotiation has happened - previous allocations or > allocations of different sizes (e.g. zero-copy fcall) are made with > kmalloc directly. > > Signed-off-by: Dominique Martinet > --- > include/net/9p/client.h | 2 ++ > net/9p/client.c | 40 > net/9p/trans_rdma.c | 2 +- > 3 files changed, 35 insertions(+), 9 deletions(-) > > diff --git a/include/net/9p/client.h b/include/net/9p/client.h > index 4b4ac1362ad5..8d9bc7402a42 100644 > --- a/include/net/9p/client.h > +++ b/include/net/9p/client.h > @@ -123,6 +123,7 @@ struct p9_client { > struct p9_trans_module *trans_mod; > enum p9_trans_status status; > void *trans; > + struct kmem_cache *fcall_cache; > > union { > struct { > @@ -230,6 +231,7 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char > *name, int mode, > kgid_t gid, struct p9_qid *); > int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 > *status); > int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *fl); > +void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc); > struct p9_req_t *p9_tag_lookup(struct p9_client *, u16); > void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status); > > diff --git a/net/9p/client.c b/net/9p/client.c > index ba99a94a12c9..215e3b1ed7b4 100644 > --- a/net/9p/client.c > +++ b/net/9p/client.c > @@ -231,15 +231,34 @@ static int parse_opts(char *opts, struct p9_client > *clnt) > return ret; > } > > -static int p9_fcall_alloc(struct p9_fcall *fc, int alloc_msize) > +static int p9_fcall_alloc(struct p9_client *c, struct p9_fcall *fc, > + int alloc_msize) > { > - fc->sdata = kmalloc(alloc_msize, GFP_NOFS); > + if (c->fcall_cache && alloc_msize == c->msize) > + fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); > + else > + fc->sdata = kmalloc(alloc_msize, GFP_NOFS); > if (!fc->sdata) > return -ENOMEM; > fc->capacity = alloc_msize; > return 0; > } > > +void p9_fcall_free(struct p9_client *c, struct p9_fcall *fc) > +{ > + /* sdata can be NULL for interrupted requests in trans_rdma, > + * and kmem_cache_free does not do NULL-check for us > + */ > + if (unlikely(!fc->sdata)) > + return; > + > + if (c->fcall_cache && fc->capacity == c->msize) > + kmem_cache_free(c->fcall_cache, fc->sdata); > + else > + kfree(fc->sdata); > +} > +EXPORT_SYMBOL(p9_fcall_free); > + > static struct kmem_cache *p9_req_cache; > > /** > @@ -261,9 +280,9 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned > int max_size) > if (!req) > return NULL; > > - if (p9_fcall_alloc(&req->tc, alloc_msize)) > + if (p9_fcall_alloc(c, &req->tc, alloc_msize)) > goto free; > - if (p9_fcall_alloc(&req->rc, alloc_msize)) > + if (p9_fcall_alloc(c, &req->rc, alloc_msize)) > goto free; > > p9pdu_reset(&req->tc); > @@ -288,8 +307,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned > int max_size) > return req; > > free: > - kfree(req->tc.sdata); > - kfree(req->rc.sdata); > + p9_fcall_free(c, &req->tc); > + p9_fcall_free(c, &req->rc); > kmem_cache_free(p9_req_cache, req); > return ERR_PTR(-ENOMEM); > } > @@ -333,8 +352,8 @@ static void p9_free_req(struct p9_client *c, struct > p9_req_t *r) > spin_lock_irqsave(&c->lock, flags); > idr_remove(&c->reqs, tag); > spin_unlock_irqrestore(&c->lock, flags); > - kfree(r->tc.sdata); > - kfree(r->rc.sdata); > + p9_fcall_free(c, &r->tc); > + p9_fcall_free(c, &r->rc); > kmem_cache_free(p9_req_cache, r); > } > > @@ -944,6 +963,7 @@ struct p9_client *p9_client_create(const char *dev_name, > char *options) > > clnt->trans_mod = NULL; > clnt->trans = NULL; > + clnt->fcall_cache = NULL; > > client_id = utsname()->nodename; > memcpy(clnt->name, client_id, strlen(client_id) + 1); > @@ -980,6 +1000,9 @@ struct p9_client *p9_client_create(const char *dev_name, > char *options) > if (err) > goto close_trans; > > + clnt->fcall_cache = kmem_cache_create("9p-fcall-cache", clnt->msize, > +
Re: [V9fs-developer] [PATCH 1/2] net/9p: embed fcall in req to round down buffer allocs
Hi Dominique, This is really a *big* patch, but the modification seems no harm. And I suggest running testcases to cover this. Please see my comments below. On 2018/7/30 17:34, Dominique Martinet wrote: > From: Dominique Martinet > > 'msize' is often a power of two, or at least page-aligned, so avoiding > an overhead of two dozen bytes for each allocation will help the > allocator do its work and reduce memory fragmentation. > > Suggested-by: Matthew Wilcox > Signed-off-by: Dominique Martinet > --- > include/net/9p/client.h | 4 +- > net/9p/client.c | 160 > net/9p/trans_fd.c | 12 +-- > net/9p/trans_rdma.c | 29 > net/9p/trans_virtio.c | 18 ++--- > net/9p/trans_xen.c | 12 +-- > 6 files changed, 117 insertions(+), 118 deletions(-) > > diff --git a/include/net/9p/client.h b/include/net/9p/client.h > index a4dc42c53d18..4b4ac1362ad5 100644 > --- a/include/net/9p/client.h > +++ b/include/net/9p/client.h > @@ -95,8 +95,8 @@ struct p9_req_t { > int status; > int t_err; > wait_queue_head_t wq; > - struct p9_fcall *tc; > - struct p9_fcall *rc; > + struct p9_fcall tc; > + struct p9_fcall rc; > void *aux; > struct list_head req_list; > }; > diff --git a/net/9p/client.c b/net/9p/client.c > index 88db45966740..ba99a94a12c9 100644 > --- a/net/9p/client.c > +++ b/net/9p/client.c > @@ -231,15 +231,13 @@ static int parse_opts(char *opts, struct p9_client > *clnt) > return ret; > } > > -static struct p9_fcall *p9_fcall_alloc(int alloc_msize) > +static int p9_fcall_alloc(struct p9_fcall *fc, int alloc_msize) > { > - struct p9_fcall *fc; > - fc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS); > - if (!fc) > - return NULL; > + fc->sdata = kmalloc(alloc_msize, GFP_NOFS); > + if (!fc->sdata) > + return -ENOMEM; > fc->capacity = alloc_msize; > - fc->sdata = (char *) fc + sizeof(struct p9_fcall); > - return fc; > + return 0; > } > > static struct kmem_cache *p9_req_cache; > @@ -263,13 +261,13 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned > int max_size) > if (!req) > return NULL; > > - req->tc = p9_fcall_alloc(alloc_msize); > - req->rc = p9_fcall_alloc(alloc_msize); > - if (!req->tc || !req->rc) > + if (p9_fcall_alloc(&req->tc, alloc_msize)) > + goto free; > + if (p9_fcall_alloc(&req->rc, alloc_msize)) > goto free; > > - p9pdu_reset(req->tc); > - p9pdu_reset(req->rc); > + p9pdu_reset(&req->tc); > + p9pdu_reset(&req->rc); > req->status = REQ_STATUS_ALLOC; > init_waitqueue_head(&req->wq); > INIT_LIST_HEAD(&req->req_list); > @@ -281,7 +279,7 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned > int max_size) > GFP_NOWAIT); > else > tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT); > - req->tc->tag = tag; > + req->tc.tag = tag; > spin_unlock_irq(&c->lock); > idr_preload_end(); > if (tag < 0) > @@ -290,8 +288,8 @@ p9_tag_alloc(struct p9_client *c, int8_t type, unsigned > int max_size) > return req; > > free: > - kfree(req->tc); > - kfree(req->rc); > + kfree(req->tc.sdata); > + kfree(req->rc.sdata); I wonder if we will free a wild pointer as 'sdata' has not been initialized NULL. > kmem_cache_free(p9_req_cache, req); > return ERR_PTR(-ENOMEM); > } > @@ -329,14 +327,14 @@ EXPORT_SYMBOL(p9_tag_lookup); > static void p9_free_req(struct p9_client *c, struct p9_req_t *r) > { > unsigned long flags; > - u16 tag = r->tc->tag; > + u16 tag = r->tc.tag; > > p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); > spin_lock_irqsave(&c->lock, flags); > idr_remove(&c->reqs, tag); > spin_unlock_irqrestore(&c->lock, flags); > - kfree(r->tc); > - kfree(r->rc); > + kfree(r->tc.sdata); > + kfree(r->rc.sdata); > kmem_cache_free(p9_req_cache, r); > } > > @@ -368,7 +366,7 @@ static void p9_tag_cleanup(struct p9_client *c) > */ > void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) > { > - p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc->tag); > + p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc.tag); > > /* >* This barrier is needed to make sure any change made to req before > @@ -378,7 +376,7 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t > *req, int status) > req->status = status; > > wake_up(&req->wq); > - p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); > + p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); > } > EXPORT_SYMBOL(p9_client_cb); > > @@ -449,18 +447,18 @@ static int p9_check_errors(struct p9_client *c, struct > p9_req_t *req) > int err; > int ecode; > > - err = p9_parse_header(req->rc, NULL, &type
Re: [PATCH] fs/9p/xattr.c: catch the error of p9_client_clunk when setting xattr failed
Hi Dominique, On 2018/7/25 11:32, Dominique Martinet wrote: > piaojun wrote on Wed, Jul 25, 2018: >> In my testing, v9fs_fid_xattr_set will return successfully even if the >> backend ext4 filesystem has no space to store xattr key-value. That will >> cause inconsistent behavior between front end and back end. The reason is >> that lsetxattr will be triggered by p9_client_clunk, and unfortunately we >> did not catch the error. This patch will catch the error to notify upper >> caller. >> >> p9_client_clunk (in 9p) >> p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid); >> v9fs_clunk (in qemu) >> put_fid >> free_fid >> v9fs_xattr_fid_clunk >> v9fs_co_lsetxattr >> s->ops->lsetxattr >> ext4_xattr_user_set (in host ext4 filesystem) >> >> Signed-off-by: Jun Piao > > Good catch! > > LGTM on its own but see comment below for further error checking. > Please let me know if you want to do this in a v2 or submit a separate > patch altogether, I can pick this one up as it is. > >> --- >> fs/9p/xattr.c | 6 -- >> 1 file changed, 4 insertions(+), 2 deletions(-) >> >> diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c >> index f329eee..352abc3 100644 >> --- a/fs/9p/xattr.c >> +++ b/fs/9p/xattr.c >> @@ -105,7 +105,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char >> *name, >> { >> struct kvec kvec = {.iov_base = (void *)value, .iov_len = value_len}; >> struct iov_iter from; >> -int retval; >> +int retval, err; >> >> iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len); >> >> @@ -126,7 +126,9 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char >> *name, >> retval); >> else >> p9_client_write(fid, 0, &from, &retval); > > I'm not sure what to do about this but it's also possible for > p9_client_write to not write the full length without setting and error. > > We should probably compare the return value of p9_client_write and > value_len to detect short writes and return an error in this case. > It looks like we could identify short writes error from the *err. If no error case breaks the while loop, the total equal to the whole data len. Thanks, Jun >> -p9_client_clunk(fid); >> +err = p9_client_clunk(fid); >> +if (!retval && err) >> +retval = err; >> return retval; >> } >> >> -- > . >
[PATCH] fs/9p/xattr.c: catch the error of p9_client_clunk when setting xattr failed
In my testing, v9fs_fid_xattr_set will return successfully even if the backend ext4 filesystem has no space to store xattr key-value. That will cause inconsistent behavior between front end and back end. The reason is that lsetxattr will be triggered by p9_client_clunk, and unfortunately we did not catch the error. This patch will catch the error to notify upper caller. p9_client_clunk (in 9p) p9_client_rpc(clnt, P9_TCLUNK, "d", fid->fid); v9fs_clunk (in qemu) put_fid free_fid v9fs_xattr_fid_clunk v9fs_co_lsetxattr s->ops->lsetxattr ext4_xattr_user_set (in host ext4 filesystem) Signed-off-by: Jun Piao --- fs/9p/xattr.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index f329eee..352abc3 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -105,7 +105,7 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, { struct kvec kvec = {.iov_base = (void *)value, .iov_len = value_len}; struct iov_iter from; - int retval; + int retval, err; iov_iter_kvec(&from, WRITE | ITER_KVEC, &kvec, 1, value_len); @@ -126,7 +126,9 @@ int v9fs_fid_xattr_set(struct p9_fid *fid, const char *name, retval); else p9_client_write(fid, 0, &from, &retval); - p9_client_clunk(fid); + err = p9_client_clunk(fid); + if (!retval && err) + retval = err; return retval; } --
Re: [V9fs-developer] [PATCH v3] Fix a hard lockup case in the virtio transport
LGTM On 2018/7/19 15:17, jiangyiwen wrote: > When client has multiple threads that issue io requests > all the time, and the server has a very good performance, > it may cause cpu is running in the irq context for a long > time because it can check virtqueue has buf in the *while* > loop. > > So we should keep chan->lock in the whole loop. > > Signed-off-by: Yiwen Jiang Reviewed-by: Jun Piao > --- > net/9p/trans_virtio.c | 21 +++-- > 1 file changed, 11 insertions(+), 10 deletions(-) > > diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c > index 05006cb..3589864 100644 > --- a/net/9p/trans_virtio.c > +++ b/net/9p/trans_virtio.c > @@ -144,24 +144,25 @@ static void req_done(struct virtqueue *vq) > struct virtio_chan *chan = vq->vdev->priv; > unsigned int len; > struct p9_req_t *req; > + bool need_wakeup = false; > unsigned long flags; > > p9_debug(P9_DEBUG_TRANS, ": request done\n"); > > - while (1) { > - spin_lock_irqsave(&chan->lock, flags); > - req = virtqueue_get_buf(chan->vq, &len); > - if (req == NULL) { > - spin_unlock_irqrestore(&chan->lock, flags); > - break; > + spin_lock_irqsave(&chan->lock, flags); > + while ((req = virtqueue_get_buf(chan->vq, &len)) != NULL) { > + if (!chan->ring_bufs_avail) { > + chan->ring_bufs_avail = 1; > + need_wakeup = true; > } > - chan->ring_bufs_avail = 1; > - spin_unlock_irqrestore(&chan->lock, flags); > - /* Wakeup if anyone waiting for VirtIO ring space. */ > - wake_up(chan->vc_wq); > + > if (len) > p9_client_cb(chan->client, req, REQ_STATUS_RCVD); > } > + spin_unlock_irqrestore(&chan->lock, flags); > + /* Wakeup if anyone waiting for VirtIO ring space. */ > + if (need_wakeup) > + wake_up(chan->vc_wq); > } > > /** >
Re: [PATCH] net/9p/trans_virtio.c: replace mutex_lock with spin_lock to protect 'virtio_chan_list'
On 2018/7/19 11:36, Dominique Martinet wrote: > piaojun wrote on Thu, Jul 19, 2018: >>> piaojun wrote on Wed, Jul 18, 2018: >>> That's not a fast path operation, I don't mind changing things but I'd >>> like to understand why - these functions are only ever called at unmount >>> time or when something happens on the virtio bus (probe will happen on >>> probing on the pci bus and I'm not too sure on remove but probably pci >>> removal i.e. basically never?) >>> >>> I don't see why this wouldn't work, but I won't take this without a >>> (good?) reason. >>> >> virtio_9p_lock is responsable for protecting virtio_chan_list which has 3 >> operation: >> >> 1. Add a virtio chan to virtio_chan_list. This will happen when we insmod >> 9pnet_virtio.ko: >> p9_virtio_probe >> --list_add_tail(&chan->chan_list, &virtio_chan_list); >> >> 2. Remove a virtio chan. This will happen when remnod 9pnet_virtio.ko: >> p9_virtio_remove >> --list_del(&chan->chan_list); >> >> 3. Find a unused virtio chan when mount 9p: >> mount >> --p9_virtio_create >> --list_for_each_entry(chan, &virtio_chan_list, chan_list) >> >> Multi mount process will compete for virtio_9p_lock when finding unused >> virtio chan, in which case mutex lock will cause process sleep and wake >> up. I think this a waste of CPU time. So we could use spin lock to avoid >> this. > > Well, sure, that's theory; but how is that in practice? > I actually took the time to run some tests, setting up 20 virtio mount > points in qemu, and running this command with and without your patch: > # time sh -c 'for i in {1..20}; do > sh -c "for j in {1..100}; do > mount -t 9p d$i d.$i; > umount d.$i; > done" & > done; > wait' > > This is quick & dirty but basically, mounts and unmounts 100 times in a > loop all 20 mount points in parallel to stress that lock. > I get these times 5 times (one run per column), > without patch: > real 0m19.357s 0m19.626s 0m19.904s 0m19.926s > 0m21.321s > user 0m6.795s0m6.874s0m6.807s0m6.768s0m6.892s > sys 0m29.936s 0m31.196s 0m31.702s 0m31.914s > 0m30.791s > > With patch: > real 0m19.439s 0m19.849s 0m19.683s 0m19.600s > 0m20.689s > user 0m6.948s0m6.582s0m6.706s0m6.598s0m6.876s > sys 0m29.364s 0m30.898s 0m30.695s 0m31.311s > 0m33.391s > > I honestly can't say I'm convinced with a difference either way, the > variations look more like noise than anything to me. > > > More to the point, while these tests ran my dmesg buffer was filled with > errors like: > FS-Cache: Duplicate cookie detected > FS-Cache: O-cookie c=00368cdb [p=548b03c2 fl=222 nc=0 na=1] > FS-Cache: O-cookie d=4cebd15f n=029a0b83 > FS-Cache: O-key=[10] '34323935303838343536' > FS-Cache: N-cookie c=d4089478 [p=548b03c2 fl=2 nc=0 na=1] > FS-Cache: N-cookie d=4cebd15f n=959d4d37 > FS-Cache: N-key=[10] '34323935303838343536' > > or > (output mangled a bit) > > == > BUG: KASAN: use-after-free in p9_client_cb+0x14d/0x160 [9pnet] > Read of size 8 at addr 88003522a088 by task systemd-udevd/492 > > CPU: 1 PID: 492 Comm: systemd-udevd Tainted: G O 4.18.0-rc5+ #9 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS > ?-20180531_142017-buildhw-08.phx2.fedoraproject.org-1.fc28 0> > Call Trace: > > dump_stack+0x7b/0xad > print_address_description+0x6a/0x209 > ? p9_client_cb+0x14d/0x160 [9pnet] > kasan_report.cold.7+0x242/0x2fe > __asan_report_load8_noabort+0x19/0x20 > p9_client_cb+0x14d/0x160 [9pnet] > req_done+0x22f/0x280 [9pnet_virtio] > ? p9_mount_tag_show+0x120/0x120 [9pnet_virtio] > vring_interrupt+0x108/0x1b0 [virtio_ring] > ? vring_map_single.constprop.23+0x350/0x350 [virtio_ring] > __handle_irq_event_percpu+0xec/0x460 > handle_irq_event_percpu+0x71/0x140 > ? __handle_irq_event_percpu+0x460/0x460 > ? apic_ack_irq+0xa3/0xe0 > handle_irq_event+0xb9/0x14a > handle_edge_irq+0x1ea/0x7a0 > ? kasan_check_read+0x11/0x20 > handle_irq+0x48/0x60 > do_IRQ+0x67/0x140 > common_interrupt+0xf/0xf > > RIP: 0010:finish_task_switch+0x10e/0x630 > Code: e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 6d 04 00 00 41 c7 45 38 00 00 00 > 00 4c 89 e7 ff 14 25 28 f5 66 8e fb
Re: [PATCH] net/9p/trans_virtio.c: replace mutex_lock with spin_lock to protect 'virtio_chan_list'
Hi Dominique, On 2018/7/18 17:54, Dominique Martinet wrote: > piaojun wrote on Wed, Jul 18, 2018: >> spin_lock is more effective for short time protection than mutex_lock, as >> mutex lock may cause process sleep and wake up which consume much cpu >> time. > > That's not a fast path operation, I don't mind changing things but I'd > like to understand why - these functions are only ever called at unmount > time or when something happens on the virtio bus (probe will happen on > probing on the pci bus and I'm not too sure on remove but probably pci > removal i.e. basically never?) > > I don't see why this wouldn't work, but I won't take this without a > (good?) reason. > virtio_9p_lock is responsable for protecting virtio_chan_list which has 3 operation: 1. Add a virtio chan to virtio_chan_list. This will happen when we insmod 9pnet_virtio.ko: p9_virtio_probe --list_add_tail(&chan->chan_list, &virtio_chan_list); 2. Remove a virtio chan. This will happen when remnod 9pnet_virtio.ko: p9_virtio_remove --list_del(&chan->chan_list); 3. Find a unused virtio chan when mount 9p: mount --p9_virtio_create --list_for_each_entry(chan, &virtio_chan_list, chan_list) Multi mount process will compete for virtio_9p_lock when finding unused virtio chan, in which case mutex lock will cause process sleep and wake up. I think this a waste of CPU time. So we could use spin lock to avoid this. Thanks, Jun
[PATCH] net/9p/trans_virtio.c: replace mutex_lock with spin_lock to protect 'virtio_chan_list'
spin_lock is more effective for short time protection than mutex_lock, as mutex lock may cause process sleep and wake up which consume much cpu time. Signed-off-by: Jun Piao --- net/9p/trans_virtio.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 86077f7..7ec0dbf 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -53,8 +53,8 @@ #define VIRTQUEUE_NUM 128 -/* a single mutex to manage channel initialization and attachment */ -static DEFINE_MUTEX(virtio_9p_lock); +/* a single spinlock to manage channel initialization and attachment */ +static DEFINE_SPINLOCK(virtio_9p_lock); static DECLARE_WAIT_QUEUE_HEAD(vp_wq); static atomic_t vp_pinned = ATOMIC_INIT(0); @@ -120,10 +120,10 @@ static void p9_virtio_close(struct p9_client *client) { struct virtio_chan *chan = client->trans; - mutex_lock(&virtio_9p_lock); + spin_lock(&virtio_9p_lock); if (chan) chan->inuse = false; - mutex_unlock(&virtio_9p_lock); + spin_unlock(&virtio_9p_lock); } /** @@ -605,9 +605,9 @@ static int p9_virtio_probe(struct virtio_device *vdev) virtio_device_ready(vdev); - mutex_lock(&virtio_9p_lock); + spin_lock(&virtio_9p_lock); list_add_tail(&chan->chan_list, &virtio_chan_list); - mutex_unlock(&virtio_9p_lock); + spin_unlock(&virtio_9p_lock); /* Let udev rules use the new mount_tag attribute. */ kobject_uevent(&(vdev->dev.kobj), KOBJ_CHANGE); @@ -645,7 +645,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) int ret = -ENOENT; int found = 0; - mutex_lock(&virtio_9p_lock); + spin_lock(&virtio_9p_lock); list_for_each_entry(chan, &virtio_chan_list, chan_list) { if (!strncmp(devname, chan->tag, chan->tag_len) && strlen(devname) == chan->tag_len) { @@ -657,7 +657,7 @@ static int p9_virtio_probe(struct virtio_device *vdev) ret = -EBUSY; } } - mutex_unlock(&virtio_9p_lock); + spin_unlock(&virtio_9p_lock); if (!found) { pr_err("no channels available for device %s\n", devname); @@ -682,7 +682,7 @@ static void p9_virtio_remove(struct virtio_device *vdev) struct virtio_chan *chan = vdev->priv; unsigned long warning_time; - mutex_lock(&virtio_9p_lock); + spin_lock(&virtio_9p_lock); /* Remove self from list so we don't get new users. */ list_del(&chan->chan_list); @@ -690,17 +690,17 @@ static void p9_virtio_remove(struct virtio_device *vdev) /* Wait for existing users to close. */ while (chan->inuse) { - mutex_unlock(&virtio_9p_lock); + spin_unlock(&virtio_9p_lock); msleep(250); if (time_after(jiffies, warning_time + 10 * HZ)) { dev_emerg(&vdev->dev, "p9_virtio_remove: waiting for device in use.\n"); warning_time = jiffies; } - mutex_lock(&virtio_9p_lock); + spin_lock(&virtio_9p_lock); } - mutex_unlock(&virtio_9p_lock); + spin_unlock(&virtio_9p_lock); vdev->config->reset(vdev); vdev->config->del_vqs(vdev); --
[PATCH] net/9p/trans_virtio.c: fix some spell mistakes in comments
Fix spelling mistake in comments of p9_virtio_zc_request(). Signed-off-by: Jun Piao --- net/9p/trans_virtio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index c9f2717..86077f7 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -384,8 +384,8 @@ static int p9_get_mapped_pages(struct virtio_chan *chan, * p9_virtio_zc_request - issue a zero copy request * @client: client instance issuing the request * @req: request to be issued - * @uidata: user bffer that should be ued for zero copy read - * @uodata: user buffer that shoud be user for zero copy write + * @uidata: user buffer that should be used for zero copy read + * @uodata: user buffer that should be used for zero copy write * @inlen: read buffer size * @outlen: write buffer size * @in_hdr_len: reader header size, This is the size of response protocol data --
Re: [PATCH v2] net/9p/client.c: fix misuse of spin_lock_irqsave for p9_client lock
Hi Dominique, On 2018/7/17 19:56, Dominique Martinet wrote: > piaojun wrote on Tue, Jul 17, 2018: >> In p9_read_work(), we use spin_lock for client->lock, but misuse >> spin_lock_irqsave for it in p9_fid_create(). As p9_client lock won't be >> locked in irq context, so spin_lock is enough. And that will improve the >> performance. > > (I didn't say in v1, but the commit message sounds a bit odd to me, if > there is any other change to the patch could you please rephrase it a > bit?) Yes, I will rephrase it if there is any change to the patch. > >> Rebased on 9p-next. > > (Likewise, this should probably go below the '---' mark to not be in the > final commit message, 9p-next does not make sense to linux as a whole) > Thanks for your suggestion. >> >> Signed-off-by: Jun Piao > > Nitpicks aside this still looks good to me, I'm not expert in locking > but I believe the conn_cancel stuff in trans_fd is called either from > user context or workqueue which also is process context, while the rest > comes from user context only. > > There *are* common functions called from interrupt context (I see: > p9_tag_lookup, p9_client_cb, p9_parse_header) but luckily enough > none of these take the lock. > > I'll give this one a little bit of time for others to review before > picking it up, just in case, but will do if no-one complains. > >> --- >> net/9p/client.c | 18 -- >> net/9p/trans_fd.c | 7 +++ >> 2 files changed, 11 insertions(+), 14 deletions(-) >> >> diff --git a/net/9p/client.c b/net/9p/client.c >> index 30f9aee..f6897ee 100644 >> --- a/net/9p/client.c >> +++ b/net/9p/client.c >> @@ -275,14 +275,14 @@ static struct p9_fcall *p9_fcall_alloc(int alloc_msize) >> INIT_LIST_HEAD(&req->req_list); >> >> idr_preload(GFP_NOFS); >> -spin_lock_irq(&c->lock); >> +spin_lock(&c->lock); >> if (type == P9_TVERSION) >> tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1, >> GFP_NOWAIT); >> else >> tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT); >> req->tc->tag = tag; >> -spin_unlock_irq(&c->lock); >> +spin_unlock(&c->lock); >> idr_preload_end(); >> if (tag < 0) >> goto free; >> @@ -328,13 +328,12 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, >> u16 tag) >> */ >> static void p9_free_req(struct p9_client *c, struct p9_req_t *r) >> { >> -unsigned long flags; >> u16 tag = r->tc->tag; >> >> p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); >> -spin_lock_irqsave(&c->lock, flags); >> +spin_lock(&c->lock); >> idr_remove(&c->reqs, tag); >> -spin_unlock_irqrestore(&c->lock, flags); >> +spin_unlock(&c->lock); >> kfree(r->tc); >> kfree(r->rc); >> kmem_cache_free(p9_req_cache, r); >> @@ -846,10 +845,10 @@ static struct p9_fid *p9_fid_create(struct p9_client >> *clnt) >> fid->fid = 0; >> >> idr_preload(GFP_KERNEL); >> -spin_lock_irq(&clnt->lock); >> +spin_lock(&clnt->lock); >> ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1, >> GFP_NOWAIT); >> -spin_unlock_irq(&clnt->lock); >> +spin_unlock(&clnt->lock); >> idr_preload_end(); >> >> if (!ret) >> @@ -862,13 +861,12 @@ static struct p9_fid *p9_fid_create(struct p9_client >> *clnt) >> static void p9_fid_destroy(struct p9_fid *fid) >> { >> struct p9_client *clnt; >> -unsigned long flags; >> >> p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid); >> clnt = fid->clnt; >> -spin_lock_irqsave(&clnt->lock, flags); >> +spin_lock(&clnt->lock); >> idr_remove(&clnt->fids, fid->fid); >> -spin_unlock_irqrestore(&clnt->lock, flags); >> +spin_unlock(&clnt->lock); >> kfree(fid->rdir); >> kfree(fid); >> } >> diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c >> index bf459ee..94ef685 100644 >> --- a/net/9p/trans_fd.c >> +++ b/net/9p/trans_fd.c >> @@ -197,15 +197,14 @@ static void p9_mux_poll_stop(struct p9_conn *m) >> static void p9_conn_cancel(struct p9_conn *m, int err) >> { >> struct p9_req_t *req, *rtmp; >> -unsigned long flags; >> LIST_HEAD(cancel_list); >> >> p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err); >> >> -spin_lock_irqsave(&m->client->lock, flags); >> +spin_lock(&m->client->lock); >> >> if (m->err) { >> -spin_unlock_irqrestore(&m->client->lock, flags); >> +spin_unlock(&m->client->lock); >> return; >> } >> >> @@ -217,7 +216,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err) >> list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { >> list_move(&req->req_list, &cancel_list); >> } >> -spin_unlock_irqrestore(&m->client->lock, flags); >> +spin_unlock(&m->client->lock); >> >> list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { >> p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); >> -- >
Re: [V9fs-developer] [PATCH v2] net/9p: Fix a deadlock case in the virtio transport
LGTM On 2018/7/17 19:03, jiangyiwen wrote: > When client has multiple threads that issue io requests > all the time, and the server has a very good performance, > it may cause cpu is running in the irq context for a long > time because it can check virtqueue has buf in the *while* > loop. > > So we should keep chan->lock in the whole loop. > > Signed-off-by: Yiwen Jiang Reviewed-by: Jun Piao > --- > net/9p/trans_virtio.c | 17 ++--- > 1 file changed, 6 insertions(+), 11 deletions(-) > > diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c > index 05006cb..e5fea8b 100644 > --- a/net/9p/trans_virtio.c > +++ b/net/9p/trans_virtio.c > @@ -148,20 +148,15 @@ static void req_done(struct virtqueue *vq) > > p9_debug(P9_DEBUG_TRANS, ": request done\n"); > > - while (1) { > - spin_lock_irqsave(&chan->lock, flags); > - req = virtqueue_get_buf(chan->vq, &len); > - if (req == NULL) { > - spin_unlock_irqrestore(&chan->lock, flags); > - break; > - } > - chan->ring_bufs_avail = 1; > - spin_unlock_irqrestore(&chan->lock, flags); > - /* Wakeup if anyone waiting for VirtIO ring space. */ > - wake_up(chan->vc_wq); > + spin_lock_irqsave(&chan->lock, flags); > + while ((req = virtqueue_get_buf(chan->vq, &len)) != NULL) { > if (len) > p9_client_cb(chan->client, req, REQ_STATUS_RCVD); > } > + chan->ring_bufs_avail = 1; > + spin_unlock_irqrestore(&chan->lock, flags); > + /* Wakeup if anyone waiting for VirtIO ring space. */ > + wake_up(chan->vc_wq); > } > > /** >
[PATCH v2] net/9p/client.c: fix misuse of spin_lock_irqsave for p9_client lock
In p9_read_work(), we use spin_lock for client->lock, but misuse spin_lock_irqsave for it in p9_fid_create(). As p9_client lock won't be locked in irq context, so spin_lock is enough. And that will improve the performance. Rebased on 9p-next. Signed-off-by: Jun Piao --- net/9p/client.c | 18 -- net/9p/trans_fd.c | 7 +++ 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index 30f9aee..f6897ee 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -275,14 +275,14 @@ static struct p9_fcall *p9_fcall_alloc(int alloc_msize) INIT_LIST_HEAD(&req->req_list); idr_preload(GFP_NOFS); - spin_lock_irq(&c->lock); + spin_lock(&c->lock); if (type == P9_TVERSION) tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1, GFP_NOWAIT); else tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT); req->tc->tag = tag; - spin_unlock_irq(&c->lock); + spin_unlock(&c->lock); idr_preload_end(); if (tag < 0) goto free; @@ -328,13 +328,12 @@ struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) */ static void p9_free_req(struct p9_client *c, struct p9_req_t *r) { - unsigned long flags; u16 tag = r->tc->tag; p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); - spin_lock_irqsave(&c->lock, flags); + spin_lock(&c->lock); idr_remove(&c->reqs, tag); - spin_unlock_irqrestore(&c->lock, flags); + spin_unlock(&c->lock); kfree(r->tc); kfree(r->rc); kmem_cache_free(p9_req_cache, r); @@ -846,10 +845,10 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) fid->fid = 0; idr_preload(GFP_KERNEL); - spin_lock_irq(&clnt->lock); + spin_lock(&clnt->lock); ret = idr_alloc_u32(&clnt->fids, fid, &fid->fid, P9_NOFID - 1, GFP_NOWAIT); - spin_unlock_irq(&clnt->lock); + spin_unlock(&clnt->lock); idr_preload_end(); if (!ret) @@ -862,13 +861,12 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) static void p9_fid_destroy(struct p9_fid *fid) { struct p9_client *clnt; - unsigned long flags; p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid); clnt = fid->clnt; - spin_lock_irqsave(&clnt->lock, flags); + spin_lock(&clnt->lock); idr_remove(&clnt->fids, fid->fid); - spin_unlock_irqrestore(&clnt->lock, flags); + spin_unlock(&clnt->lock); kfree(fid->rdir); kfree(fid); } diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index bf459ee..94ef685 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -197,15 +197,14 @@ static void p9_mux_poll_stop(struct p9_conn *m) static void p9_conn_cancel(struct p9_conn *m, int err) { struct p9_req_t *req, *rtmp; - unsigned long flags; LIST_HEAD(cancel_list); p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err); - spin_lock_irqsave(&m->client->lock, flags); + spin_lock(&m->client->lock); if (m->err) { - spin_unlock_irqrestore(&m->client->lock, flags); + spin_unlock(&m->client->lock); return; } @@ -217,7 +216,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err) list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { list_move(&req->req_list, &cancel_list); } - spin_unlock_irqrestore(&m->client->lock, flags); + spin_unlock(&m->client->lock); list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); --
Re: [V9fs-developer] [PATCH] net/9p/client.c: fix misuse of spin_lock_irqsave for p9_client lock
Hi Dominique, On 2018/7/12 15:01, Dominique Martinet wrote: > piaojun wrote on Thu, Jul 12, 2018: >> In p9_read_work(), we use spin_lock for client->lock, but misuse >> spin_lock_irqsave for it in p9_fid_create(). As p9_client lock won't be >> locked in irq context, so spin_lock is enough. And that will improve the >> performance. > > Agreed on principle, see remark below > >> Signed-off-by: Jun Piao >> --- >> net/9p/client.c | 17 +++-- >> net/9p/trans_fd.c | 7 +++ >> 2 files changed, 10 insertions(+), 14 deletions(-) >> >> diff --git a/net/9p/client.c b/net/9p/client.c >> index 8bc8b3e..b05cbfc 100644 >> --- a/net/9p/client.c >> +++ b/net/9p/client.c >> @@ -260,7 +260,6 @@ static struct p9_fcall *p9_fcall_alloc(int alloc_msize) >> static struct p9_req_t * >> p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size) >> { >> -unsigned long flags; >> int row, col; >> struct p9_req_t *req; >> int alloc_msize = min(c->msize, max_size); >> @@ -270,7 +269,7 @@ static struct p9_fcall *p9_fcall_alloc(int alloc_msize) >> tag++; >> >> if (tag >= c->max_tag) { >> -spin_lock_irqsave(&c->lock, flags); >> +spin_lock(&c->lock); > > This code doesn't exist anymore with Matthew's idr rework, could you > submit that patch based on top of my 9p-next branch? > (unless you really want Andrew to take this for the next 4.18-rc, but > I'm not convinced this qualifies) OK, I will rebase my patch and resend later. Thanks, Jun > > Please see my "Current 9P patches - test branch" for details: > https://sourceforge.net/p/v9fs/mailman/message/36365359/ >
[PATCH] net/9p/client.c: fix misuse of spin_lock_irqsave for p9_client lock
In p9_read_work(), we use spin_lock for client->lock, but misuse spin_lock_irqsave for it in p9_fid_create(). As p9_client lock won't be locked in irq context, so spin_lock is enough. And that will improve the performance. Signed-off-by: Jun Piao --- net/9p/client.c | 17 +++-- net/9p/trans_fd.c | 7 +++ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index 8bc8b3e..b05cbfc 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -260,7 +260,6 @@ static struct p9_fcall *p9_fcall_alloc(int alloc_msize) static struct p9_req_t * p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size) { - unsigned long flags; int row, col; struct p9_req_t *req; int alloc_msize = min(c->msize, max_size); @@ -270,7 +269,7 @@ static struct p9_fcall *p9_fcall_alloc(int alloc_msize) tag++; if (tag >= c->max_tag) { - spin_lock_irqsave(&c->lock, flags); + spin_lock(&c->lock); /* check again since original check was outside of lock */ while (tag >= c->max_tag) { row = (tag / P9_ROW_MAXTAG); @@ -279,7 +278,7 @@ static struct p9_fcall *p9_fcall_alloc(int alloc_msize) if (!c->reqs[row]) { pr_err("Couldn't grow tag array\n"); - spin_unlock_irqrestore(&c->lock, flags); + spin_unlock(&c->lock); return ERR_PTR(-ENOMEM); } for (col = 0; col < P9_ROW_MAXTAG; col++) { @@ -288,7 +287,7 @@ static struct p9_fcall *p9_fcall_alloc(int alloc_msize) } c->max_tag += P9_ROW_MAXTAG; } - spin_unlock_irqrestore(&c->lock, flags); + spin_unlock(&c->lock); } row = tag / P9_ROW_MAXTAG; col = tag % P9_ROW_MAXTAG; @@ -909,7 +908,6 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) { int ret; struct p9_fid *fid; - unsigned long flags; p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt); fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL); @@ -928,9 +926,9 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) fid->uid = current_fsuid(); fid->clnt = clnt; fid->rdir = NULL; - spin_lock_irqsave(&clnt->lock, flags); + spin_lock(&clnt->lock); list_add(&fid->flist, &clnt->fidlist); - spin_unlock_irqrestore(&clnt->lock, flags); + spin_unlock(&clnt->lock); return fid; @@ -942,14 +940,13 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt) static void p9_fid_destroy(struct p9_fid *fid) { struct p9_client *clnt; - unsigned long flags; p9_debug(P9_DEBUG_FID, "fid %d\n", fid->fid); clnt = fid->clnt; p9_idpool_put(fid->fid, clnt->fidpool); - spin_lock_irqsave(&clnt->lock, flags); + spin_lock(&clnt->lock); list_del(&fid->flist); - spin_unlock_irqrestore(&clnt->lock, flags); + spin_unlock(&clnt->lock); kfree(fid->rdir); kfree(fid); } diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 588bf88..96e1e68 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -197,15 +197,14 @@ static void p9_mux_poll_stop(struct p9_conn *m) static void p9_conn_cancel(struct p9_conn *m, int err) { struct p9_req_t *req, *rtmp; - unsigned long flags; LIST_HEAD(cancel_list); p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err); - spin_lock_irqsave(&m->client->lock, flags); + spin_lock(&m->client->lock); if (m->err) { - spin_unlock_irqrestore(&m->client->lock, flags); + spin_unlock(&m->client->lock); return; } @@ -217,7 +216,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err) list_for_each_entry_safe(req, rtmp, &m->unsent_req_list, req_list) { list_move(&req->req_list, &cancel_list); } - spin_unlock_irqrestore(&m->client->lock, flags); + spin_unlock(&m->client->lock); list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); --
Re: [V9fs-developer] [PATCH v2 2/6] 9p: Change p9_fid_create calling convention
LGTM On 2018/7/12 5:02, Matthew Wilcox wrote: > Return NULL instead of ERR_PTR when we can't allocate a FID. The ENOSPC > return value was getting all the way back to userspace, and that's > confusing for a userspace program which isn't expecting read() to tell it > there's no space left on the filesystem. The best error we can return to > indicate a temporary failure caused by lack of client resources is ENOMEM. > > Maybe it would be better to sleep until a FID is available, but that's > not a change I'm comfortable making. > > Signed-off-by: Matthew Wilcox Reviewed-by: Jun Piao > --- > net/9p/client.c | 23 +-- > 1 file changed, 9 insertions(+), 14 deletions(-) > > diff --git a/net/9p/client.c b/net/9p/client.c > index 999eceb8af98..389a2904b7b3 100644 > --- a/net/9p/client.c > +++ b/net/9p/client.c > @@ -913,13 +913,11 @@ static struct p9_fid *p9_fid_create(struct p9_client > *clnt) > p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt); > fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL); > if (!fid) > - return ERR_PTR(-ENOMEM); > + return NULL; > > ret = p9_idpool_get(clnt->fidpool); > - if (ret < 0) { > - ret = -ENOSPC; > + if (ret < 0) > goto error; > - } > fid->fid = ret; > > memset(&fid->qid, 0, sizeof(struct p9_qid)); > @@ -935,7 +933,7 @@ static struct p9_fid *p9_fid_create(struct p9_client > *clnt) > > error: > kfree(fid); > - return ERR_PTR(ret); > + return NULL; > } > > static void p9_fid_destroy(struct p9_fid *fid) > @@ -1137,9 +1135,8 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, > struct p9_fid *afid, > p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n", >afid ? afid->fid : -1, uname, aname); > fid = p9_fid_create(clnt); > - if (IS_ERR(fid)) { > - err = PTR_ERR(fid); > - fid = NULL; > + if (!fid) { > + err = -ENOMEM; > goto error; > } > fid->uid = n_uname; > @@ -1188,9 +1185,8 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, > uint16_t nwname, > clnt = oldfid->clnt; > if (clone) { > fid = p9_fid_create(clnt); > - if (IS_ERR(fid)) { > - err = PTR_ERR(fid); > - fid = NULL; > + if (!fid) { > + err = -ENOMEM; > goto error; > } > > @@ -2018,9 +2014,8 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid > *file_fid, > err = 0; > clnt = file_fid->clnt; > attr_fid = p9_fid_create(clnt); > - if (IS_ERR(attr_fid)) { > - err = PTR_ERR(attr_fid); > - attr_fid = NULL; > + if (!attr_fid) { > + err = -ENOMEM; > goto error; > } > p9_debug(P9_DEBUG_9P, >
[PATCH] 9p/net/protocol.c: return -ENOMEM when kmalloc() failed
We should return -ENOMEM to upper user when kmalloc failed to indicate accurate errno. Signed-off-by: Jun Piao --- net/9p/protocol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 931ea00..4a1e1dd 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -156,7 +156,7 @@ static size_t pdu_write(struct p9_fcall *pdu, const void *data, size_t size) *sptr = kmalloc(len + 1, GFP_NOFS); if (*sptr == NULL) { - errcode = -EFAULT; + errcode = -ENOMEM; break; } if (pdu_read(pdu, *sptr, len)) { --
[PATCH] net/9p/client.c: add missing '\n' at the end of p9_debug()
In p9_client_getattr_dotl(), we should add '\n' at the end of printing log. Signed-off-by: Jun Piao --- net/9p/client.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/9p/client.c b/net/9p/client.c index 5c13431..8bc8b3e 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -1790,7 +1790,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, "<<< st_mtime_sec=%lld st_mtime_nsec=%lld\n" "<<< st_ctime_sec=%lld st_ctime_nsec=%lld\n" "<<< st_btime_sec=%lld st_btime_nsec=%lld\n" - "<<< st_gen=%lld st_data_version=%lld", + "<<< st_gen=%lld st_data_version=%lld\n", ret->st_result_mask, ret->qid.type, ret->qid.path, ret->qid.version, ret->st_mode, ret->st_nlink, from_kuid(&init_user_ns, ret->st_uid), --
[PATCH] net/9p/client.c: put refcount of trans_mod in error case in parse_opts()
>From my test, the second mount will fail after umounting successfully. The reason is that we put refcount of trans_mod in the correct case rather than the error case in parse_opts() at last. That will cause the refcount decrease to -1, and when we try to get trans_mod again in try_module_get(), we could only increase refcount to 0 which will cause failure as follows: parse_opts v9fs_get_trans_by_name try_module_get : return NULL to caller which cause error So we should put refcount of trans_mod in error case. Fixes: 9421c3e64137ec ("net/9p/client.c: fix potential refcnt problem of trans module") Signed-off-by: Jun Piao --- net/9p/client.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/9p/client.c b/net/9p/client.c index 18c5271..5c13431 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -225,7 +225,8 @@ static int parse_opts(char *opts, struct p9_client *clnt) } free_and_return: - v9fs_put_trans(clnt->trans_mod); + if (ret) + v9fs_put_trans(clnt->trans_mod); kfree(tmp_options); return ret; } --
Re: [V9fs-developer] [PATCH] fs: 9p: Adding new return type vm_fault_t
LGTM On 2018/7/2 23:49, Souptick Joarder wrote: > Use new return type vm_fault_t for page_mkwrite > handler. > > see commit 1c8f422059ae ("mm: change return type to > vm_fault_t") for reference. > > Signed-off-by: Souptick Joarder > Reviewed-by: Matthew Wilcox Acked-by: Jun Piao > --- > fs/9p/vfs_file.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c > index 03c9e32..5f2e48d 100644 > --- a/fs/9p/vfs_file.c > +++ b/fs/9p/vfs_file.c > @@ -533,7 +533,7 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, > loff_t end, > return retval; > } > > -static int > +static vm_fault_t > v9fs_vm_page_mkwrite(struct vm_fault *vmf) > { > struct v9fs_inode *v9inode; > -- > 1.9.1 > > > -- > Check out the vibrant tech community on one of the world's most > engaging tech sites, Slashdot.org! http://sdm.link/slashdot > ___ > V9fs-developer mailing list > v9fs-develo...@lists.sourceforge.net > https://lists.sourceforge.net/lists/listinfo/v9fs-developer >
Re: [Ocfs2-devel] [PATCH] Correct a comment error
Hi Changwei, On 2018/3/2 9:59, Changwei Ge wrote: > Hi Jun, > I think the comments for both two functions are OK. > No need to rework them. > As we know, ocfs2 lock name(lock id) are composed of several parts including > block number. I looked though the comments involved 'lockid', and found 'lockid' is a concept in dlm level, so ocfs2 level should not be aware of it. thanks, Jun > > Thanks, > Changw2ei > > On 2018/3/1 20:58, piaojun wrote: >> Hi Larry, >> >> There is the same mistake in ocfs2_reflink_inodes_lock(), could you help >> fixing them all? >> >> thanks, >> Jun >> >> On 2018/2/28 18:17, Larry Chen wrote: >>> The function ocfs2_double_lock tries to lock the inode with lower >>> blockid first, not lockid. >>> >>> Signed-off-by: Larry Chen >>> --- >>> fs/ocfs2/namei.c | 2 +- >>> 1 file changed, 1 insertion(+), 1 deletion(-) >>> >>> diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c >>> index c801eddc4bf3..30d454de35a8 100644 >>> --- a/fs/ocfs2/namei.c >>> +++ b/fs/ocfs2/namei.c >>> @@ -1133,7 +1133,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, >>> if (*bh2) >>> *bh2 = NULL; >>> >>> - /* we always want to lock the one with the lower lockid first. >>> + /* we always want to lock the one with the lower blockid first. >>> * and if they are nested, we lock ancestor first */ >>> if (oi1->ip_blkno != oi2->ip_blkno) { >>> inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, >>> >> >> ___ >> Ocfs2-devel mailing list >> ocfs2-de...@oss.oracle.com >> https://oss.oracle.com/mailman/listinfo/ocfs2-devel >> > . >
Re: [Ocfs2-devel] [PATCH] Correct a comment error
Hi Larry, There is the same mistake in ocfs2_reflink_inodes_lock(), could you help fixing them all? thanks, Jun On 2018/2/28 18:17, Larry Chen wrote: > The function ocfs2_double_lock tries to lock the inode with lower > blockid first, not lockid. > > Signed-off-by: Larry Chen > --- > fs/ocfs2/namei.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c > index c801eddc4bf3..30d454de35a8 100644 > --- a/fs/ocfs2/namei.c > +++ b/fs/ocfs2/namei.c > @@ -1133,7 +1133,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, > if (*bh2) > *bh2 = NULL; > > - /* we always want to lock the one with the lower lockid first. > + /* we always want to lock the one with the lower blockid first. >* and if they are nested, we lock ancestor first */ > if (oi1->ip_blkno != oi2->ip_blkno) { > inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, >
Re: [Ocfs2-devel] [PATCH v2] ocfs2: try a blocking lock before return AOP_TRUNCATED_PAGE
LGTM On 2017/12/28 15:48, Gang He wrote: > If we can't get inode lock immediately in the function > ocfs2_inode_lock_with_page() when reading a page, we should not > return directly here, since this will lead to a softlockup problem > when the kernel is configured with CONFIG_PREEMPT is not set. > The method is to get a blocking lock and immediately unlock before > returning, this can avoid CPU resource waste due to lots of retries, > and benefits fairness in getting lock among multiple nodes, increase > efficiency in case modifying the same file frequently from multiple > nodes. > The softlockup crash (when set /proc/sys/kernel/softlockup_panic to 1) > looks like, > Kernel panic - not syncing: softlockup: hung tasks > CPU: 0 PID: 885 Comm: multi_mmap Tainted: G L 4.12.14-6.1-default #1 > Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 > Call Trace: > > dump_stack+0x5c/0x82 > panic+0xd5/0x21e > watchdog_timer_fn+0x208/0x210 > ? watchdog_park_threads+0x70/0x70 > __hrtimer_run_queues+0xcc/0x200 > hrtimer_interrupt+0xa6/0x1f0 > smp_apic_timer_interrupt+0x34/0x50 > apic_timer_interrupt+0x96/0xa0 > > RIP: 0010:unlock_page+0x17/0x30 > RSP: :af154080bc88 EFLAGS: 0246 ORIG_RAX: ff10 > RAX: dead0100 RBX: f21e009f5300 RCX: 0004 > RDX: dead00ff RSI: 0202 RDI: f21e009f5300 > RBP: R08: R09: af154080bb00 > R10: af154080bc30 R11: 0040 R12: 993749a39518 > R13: R14: f21e009f5300 R15: f21e009f5300 > ocfs2_inode_lock_with_page+0x25/0x30 [ocfs2] > ocfs2_readpage+0x41/0x2d0 [ocfs2] > ? pagecache_get_page+0x30/0x200 > filemap_fault+0x12b/0x5c0 > ? recalc_sigpending+0x17/0x50 > ? __set_task_blocked+0x28/0x70 > ? __set_current_blocked+0x3d/0x60 > ocfs2_fault+0x29/0xb0 [ocfs2] > __do_fault+0x1a/0xa0 > __handle_mm_fault+0xbe8/0x1090 > handle_mm_fault+0xaa/0x1f0 > __do_page_fault+0x235/0x4b0 > trace_do_page_fault+0x3c/0x110 > async_page_fault+0x28/0x30 > RIP: 0033:0x7fa75ded638e > RSP: 002b:7ffd6657db18 EFLAGS: 00010287 > RAX: 55c7662fb700 RBX: 0001 RCX: 55c7662fb700 > RDX: 1770 RSI: 7fa75e909000 RDI: 55c7662fb700 > RBP: 0003 R08: 000e R09: > R10: 0483 R11: 7fa75ded61b0 R12: 7fa75e90a770 > R13: 000e R14: 1770 R15: > > About performance improvement, we can see the testing time is reduced, > and CPU utilization decreases, the detailed data is as follows. > I ran multi_mmap test case in ocfs2-test package in a three nodes cluster. > Before apply this patch, > PID USER PR NIVIRTRESSHR S %CPU %MEM TIME+ COMMAND > 2754 ocfs2te+ 20 0 170248 6980 4856 D 80.73 0.341 0:18.71 > multi_mmap > 1505 root rt 0 36 123060 97224 S 2.658 6.015 0:01.44 corosync > 5 root 20 0 0 0 0 S 1.329 0.000 0:00.19 > kworker/u8:0 >95 root 20 0 0 0 0 S 1.329 0.000 0:00.25 > kworker/u8:1 > 2728 root 20 0 0 0 0 S 0.997 0.000 0:00.24 > jbd2/sda1-33 > 2721 root 20 0 0 0 0 S 0.664 0.000 0:00.07 > ocfs2dc-3C8CFD4 > 2750 ocfs2te+ 20 0 142976 4652 3532 S 0.664 0.227 0:00.28 mpirun > > ocfs2test@tb-node2:~>multiple_run.sh -i ens3 -k ~/linux-4.4.21-69.tar.gz -o > ~/ocfs2mullog -C hacluster -s pcmk -n tb-node2,tb-node1,tb-node3 -d > /dev/sda1 -b 4096 -c 32768 -t multi_mmap /mnt/shared > Tests with "-b 4096 -C 32768" > Thu Dec 28 14:44:52 CST 2017 > multi_mmap..Passed. > Runtime 783 seconds. > > After apply this patch, > PID USER PR NIVIRTRESSHR S %CPU %MEM TIME+ COMMAND > 2508 ocfs2te+ 20 0 170248 6804 4680 R 54.00 0.333 0:55.37 > multi_mmap > 155 root 20 0 0 0 0 S 2.667 0.000 0:01.20 > kworker/u8:3 >95 root 20 0 0 0 0 S 2.000 0.000 0:01.58 > kworker/u8:1 > 2504 ocfs2te+ 20 0 142976 4604 3480 R 1.667 0.225 0:01.65 mpirun > 5 root 20 0 0 0 0 S 1.000 0.000 0:01.36 > kworker/u8:0 > 2482 root 20 0 0 0 0 S 1.000 0.000 0:00.86 > jbd2/sda1-33 > 299 root 0 -20 0 0 0 S 0.333 0.000 0:00.13 > kworker/2:1H > 335 root 0 -20 0 0 0 S 0.333 0.000 0:00.17 > kworker/1:1H > 535 root 20 0 12140 7268 1456 S 0.333 0.355 0:00.34 haveged > 1282 root rt 0 84 123108 97224 S 0.333 6.017 0:01.33 corosync > > ocfs2test@tb-node2:~>multiple_run.sh -i ens3 -k ~/linux-4.4.21-69.tar.gz -o > ~/ocfs2mullog -C hacluster -s pcmk -n tb-node2,tb-node1,tb-node3 -d > /dev/sda1 -b 4096 -c 32768 -t multi_mmap /mnt/shared > Tests with "-b 4096 -C 32768" > Thu Dec 28 15:04:12 CST 2017
Re: [Ocfs2-devel] [PATCH] ocfs2: try a blocking lock before return AOP_TRUNCATED_PAGE
Hi Gang, This patch looks good to me. thanks, Jun On 2017/12/28 10:58, Gang He wrote: > > > >> Hi Gang, >> >> You cleared my doubt. Should we handle the errno of ocfs2_inode_lock() >> or just use mlog_errno()? > Hi Jun, I think it is not necessary, since we just want to hold a while > before get the DLM lock, > we do not care about the result, since we will unlock immediately here. > In fact, this patch does NOT add new code, just revert the old patch > 1cce4df04f37, and add > more clear comments in the front of these two lines code. > > Thanks > Gang > >> >> thanks, >> Jun >> >> On 2017/12/28 10:11, Gang He wrote: >>> Hi Jun, >>> >>> >> Hi Gang, Thanks for your explaination, and I just have one more question. Could we use 'ocfs2_inode_lock' instead of 'ocfs2_inode_lock_full' to avoid -EAGAIN circularly? >>> No, please see the comments above the function >> ocfs2_inode_lock_with_page(), >>> there will be probably a deadlock between tasks acquiring DLM >>> locks while holding a page lock and the downconvert thread which >>> blocks dlm lock acquiry while acquiring page locks. >>> Then, the OCFS2_LOCK_NONBLOCK flag was introduced as a workaround to >>> avoid this case. >>> >>> Thanks >>> Gang >>> thanks, Jun On 2017/12/27 18:37, Gang He wrote: > Hi Jun, > > >> Hi Gang, >> >> Do you mean that too many retrys in loop cast losts of CPU-time and >> block page-fault interrupt? We should not add any delay in >> ocfs2_fault(), right? And I still feel a little confused why your >> method can solve this problem. > You can see the related code in function filemap_fault(), if ocfs2 fails > to read a page since > it can not get a inode lock with non-block mode, the VFS layer code will invoke ocfs2 > read page call back function circularly, this will lead to a softlockup problem (like the below back trace). > So, we should get a blocking lock to let the dlm lock to this node and > also can avoid CPU loop, > second, base on my testing, the patch also can improve the efficiency in case modifying the same > file frequently from multiple nodes, since the lock acquisition chance is more fair. > In fact, the code was modified by a patch 1cce4df04f37 ("ocfs2: do not lock/unlock() inode DLM lock"), > before that patch, the code is the same, this patch can be considered to revert that patch, except adding more > clear comments. > > Thanks > Gang > > >> >> thanks, >> Jun >> >> On 2017/12/27 17:29, Gang He wrote: >>> If we can't get inode lock immediately in the function >>> ocfs2_inode_lock_with_page() when reading a page, we should not >>> return directly here, since this will lead to a softlockup problem. >>> The method is to get a blocking lock and immediately unlock before >>> returning, this can avoid CPU resource waste due to lots of retries, >>> and benefits fairness in getting lock among multiple nodes, increase >>> efficiency in case modifying the same file frequently from multiple >>> nodes. >>> The softlockup problem looks like, >>> Kernel panic - not syncing: softlockup: hung tasks >>> CPU: 0 PID: 885 Comm: multi_mmap Tainted: G L 4.12.14-6.1-default #1 >>> Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 >>> Call Trace: >>> >>> dump_stack+0x5c/0x82 >>> panic+0xd5/0x21e >>> watchdog_timer_fn+0x208/0x210 >>> ? watchdog_park_threads+0x70/0x70 >>> __hrtimer_run_queues+0xcc/0x200 >>> hrtimer_interrupt+0xa6/0x1f0 >>> smp_apic_timer_interrupt+0x34/0x50 >>> apic_timer_interrupt+0x96/0xa0 >>> >>> RIP: 0010:unlock_page+0x17/0x30 >>> RSP: :af154080bc88 EFLAGS: 0246 ORIG_RAX: ff10 >>> RAX: dead0100 RBX: f21e009f5300 RCX: 0004 >>> RDX: dead00ff RSI: 0202 RDI: f21e009f5300 >>> RBP: R08: R09: af154080bb00 >>> R10: af154080bc30 R11: 0040 R12: 993749a39518 >>> R13: R14: f21e009f5300 R15: f21e009f5300 >>> ocfs2_inode_lock_with_page+0x25/0x30 [ocfs2] >>> ocfs2_readpage+0x41/0x2d0 [ocfs2] >>> ? pagecache_get_page+0x30/0x200 >>> filemap_fault+0x12b/0x5c0 >>> ? recalc_sigpending+0x17/0x50 >>> ? __set_task_blocked+0x28/0x70 >>> ? __set_current_blocked+0x3d/0x60 >>> ocfs2_fault+0x29/0xb0 [ocfs2] >>> __do_fault+0x1a/0xa0 >>> __handle_mm_fault+0xbe8/0x1090 >>> handle_mm_fault+0xaa/0x1f0 >>> __do_page_fault+0x235/0x4b0 >>> trace_do_page_fault+0x3c/0x110 >>> async_page_fault+0x28/0x30 >>> RIP: 0033:0x7fa75ded638e >>> RSP: 002b:7ffd6657db18 EFLAGS: 00010287 >>> RAX: 55c7662fb700 RBX: 00
Re: [Ocfs2-devel] [PATCH] ocfs2: try a blocking lock before return AOP_TRUNCATED_PAGE
Hi Gang, You cleared my doubt. Should we handle the errno of ocfs2_inode_lock() or just use mlog_errno()? thanks, Jun On 2017/12/28 10:11, Gang He wrote: > Hi Jun, > > >> Hi Gang, >> >> Thanks for your explaination, and I just have one more question. Could >> we use 'ocfs2_inode_lock' instead of 'ocfs2_inode_lock_full' to avoid >> -EAGAIN circularly? > No, please see the comments above the function ocfs2_inode_lock_with_page(), > there will be probably a deadlock between tasks acquiring DLM > locks while holding a page lock and the downconvert thread which > blocks dlm lock acquiry while acquiring page locks. > Then, the OCFS2_LOCK_NONBLOCK flag was introduced as a workaround to > avoid this case. > > Thanks > Gang > >> >> thanks, >> Jun >> >> On 2017/12/27 18:37, Gang He wrote: >>> Hi Jun, >>> >>> >> Hi Gang, Do you mean that too many retrys in loop cast losts of CPU-time and block page-fault interrupt? We should not add any delay in ocfs2_fault(), right? And I still feel a little confused why your method can solve this problem. >>> You can see the related code in function filemap_fault(), if ocfs2 fails to >> read a page since >>> it can not get a inode lock with non-block mode, the VFS layer code will >> invoke ocfs2 >>> read page call back function circularly, this will lead to a softlockup >> problem (like the below back trace). >>> So, we should get a blocking lock to let the dlm lock to this node and also >> can avoid CPU loop, >>> second, base on my testing, the patch also can improve the efficiency in >> case modifying the same >>> file frequently from multiple nodes, since the lock acquisition chance is >> more fair. >>> In fact, the code was modified by a patch 1cce4df04f37 ("ocfs2: do not >> lock/unlock() inode DLM lock"), >>> before that patch, the code is the same, this patch can be considered to >> revert that patch, except adding more >>> clear comments. >>> >>> Thanks >>> Gang >>> >>> thanks, Jun On 2017/12/27 17:29, Gang He wrote: > If we can't get inode lock immediately in the function > ocfs2_inode_lock_with_page() when reading a page, we should not > return directly here, since this will lead to a softlockup problem. > The method is to get a blocking lock and immediately unlock before > returning, this can avoid CPU resource waste due to lots of retries, > and benefits fairness in getting lock among multiple nodes, increase > efficiency in case modifying the same file frequently from multiple > nodes. > The softlockup problem looks like, > Kernel panic - not syncing: softlockup: hung tasks > CPU: 0 PID: 885 Comm: multi_mmap Tainted: G L 4.12.14-6.1-default #1 > Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 > Call Trace: > > dump_stack+0x5c/0x82 > panic+0xd5/0x21e > watchdog_timer_fn+0x208/0x210 > ? watchdog_park_threads+0x70/0x70 > __hrtimer_run_queues+0xcc/0x200 > hrtimer_interrupt+0xa6/0x1f0 > smp_apic_timer_interrupt+0x34/0x50 > apic_timer_interrupt+0x96/0xa0 > > RIP: 0010:unlock_page+0x17/0x30 > RSP: :af154080bc88 EFLAGS: 0246 ORIG_RAX: ff10 > RAX: dead0100 RBX: f21e009f5300 RCX: 0004 > RDX: dead00ff RSI: 0202 RDI: f21e009f5300 > RBP: R08: R09: af154080bb00 > R10: af154080bc30 R11: 0040 R12: 993749a39518 > R13: R14: f21e009f5300 R15: f21e009f5300 > ocfs2_inode_lock_with_page+0x25/0x30 [ocfs2] > ocfs2_readpage+0x41/0x2d0 [ocfs2] > ? pagecache_get_page+0x30/0x200 > filemap_fault+0x12b/0x5c0 > ? recalc_sigpending+0x17/0x50 > ? __set_task_blocked+0x28/0x70 > ? __set_current_blocked+0x3d/0x60 > ocfs2_fault+0x29/0xb0 [ocfs2] > __do_fault+0x1a/0xa0 > __handle_mm_fault+0xbe8/0x1090 > handle_mm_fault+0xaa/0x1f0 > __do_page_fault+0x235/0x4b0 > trace_do_page_fault+0x3c/0x110 > async_page_fault+0x28/0x30 > RIP: 0033:0x7fa75ded638e > RSP: 002b:7ffd6657db18 EFLAGS: 00010287 > RAX: 55c7662fb700 RBX: 0001 RCX: 55c7662fb700 > RDX: 1770 RSI: 7fa75e909000 RDI: 55c7662fb700 > RBP: 0003 R08: 000e R09: > R10: 0483 R11: 7fa75ded61b0 R12: 7fa75e90a770 > R13: 000e R14: 1770 R15: > > Fixes: 1cce4df04f37 ("ocfs2: do not lock/unlock() inode DLM lock") > Signed-off-by: Gang He > --- > fs/ocfs2/dlmglue.c | 9 + > 1 file changed, 9 insertions(+) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 4689940..5193218 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -2486,6 +2486,15 @@ i
Re: [Ocfs2-devel] [PATCH] ocfs2: try a blocking lock before return AOP_TRUNCATED_PAGE
Hi Gang, Thanks for your explaination, and I just have one more question. Could we use 'ocfs2_inode_lock' instead of 'ocfs2_inode_lock_full' to avoid -EAGAIN circularly? thanks, Jun On 2017/12/27 18:37, Gang He wrote: > Hi Jun, > > >> Hi Gang, >> >> Do you mean that too many retrys in loop cast losts of CPU-time and >> block page-fault interrupt? We should not add any delay in >> ocfs2_fault(), right? And I still feel a little confused why your >> method can solve this problem. > You can see the related code in function filemap_fault(), if ocfs2 fails to > read a page since > it can not get a inode lock with non-block mode, the VFS layer code will > invoke ocfs2 > read page call back function circularly, this will lead to a softlockup > problem (like the below back trace). > So, we should get a blocking lock to let the dlm lock to this node and also > can avoid CPU loop, > second, base on my testing, the patch also can improve the efficiency in case > modifying the same > file frequently from multiple nodes, since the lock acquisition chance is > more fair. > In fact, the code was modified by a patch 1cce4df04f37 ("ocfs2: do not > lock/unlock() inode DLM lock"), > before that patch, the code is the same, this patch can be considered to > revert that patch, except adding more > clear comments. > > Thanks > Gang > > >> >> thanks, >> Jun >> >> On 2017/12/27 17:29, Gang He wrote: >>> If we can't get inode lock immediately in the function >>> ocfs2_inode_lock_with_page() when reading a page, we should not >>> return directly here, since this will lead to a softlockup problem. >>> The method is to get a blocking lock and immediately unlock before >>> returning, this can avoid CPU resource waste due to lots of retries, >>> and benefits fairness in getting lock among multiple nodes, increase >>> efficiency in case modifying the same file frequently from multiple >>> nodes. >>> The softlockup problem looks like, >>> Kernel panic - not syncing: softlockup: hung tasks >>> CPU: 0 PID: 885 Comm: multi_mmap Tainted: G L 4.12.14-6.1-default #1 >>> Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 >>> Call Trace: >>> >>> dump_stack+0x5c/0x82 >>> panic+0xd5/0x21e >>> watchdog_timer_fn+0x208/0x210 >>> ? watchdog_park_threads+0x70/0x70 >>> __hrtimer_run_queues+0xcc/0x200 >>> hrtimer_interrupt+0xa6/0x1f0 >>> smp_apic_timer_interrupt+0x34/0x50 >>> apic_timer_interrupt+0x96/0xa0 >>> >>> RIP: 0010:unlock_page+0x17/0x30 >>> RSP: :af154080bc88 EFLAGS: 0246 ORIG_RAX: ff10 >>> RAX: dead0100 RBX: f21e009f5300 RCX: 0004 >>> RDX: dead00ff RSI: 0202 RDI: f21e009f5300 >>> RBP: R08: R09: af154080bb00 >>> R10: af154080bc30 R11: 0040 R12: 993749a39518 >>> R13: R14: f21e009f5300 R15: f21e009f5300 >>> ocfs2_inode_lock_with_page+0x25/0x30 [ocfs2] >>> ocfs2_readpage+0x41/0x2d0 [ocfs2] >>> ? pagecache_get_page+0x30/0x200 >>> filemap_fault+0x12b/0x5c0 >>> ? recalc_sigpending+0x17/0x50 >>> ? __set_task_blocked+0x28/0x70 >>> ? __set_current_blocked+0x3d/0x60 >>> ocfs2_fault+0x29/0xb0 [ocfs2] >>> __do_fault+0x1a/0xa0 >>> __handle_mm_fault+0xbe8/0x1090 >>> handle_mm_fault+0xaa/0x1f0 >>> __do_page_fault+0x235/0x4b0 >>> trace_do_page_fault+0x3c/0x110 >>> async_page_fault+0x28/0x30 >>> RIP: 0033:0x7fa75ded638e >>> RSP: 002b:7ffd6657db18 EFLAGS: 00010287 >>> RAX: 55c7662fb700 RBX: 0001 RCX: 55c7662fb700 >>> RDX: 1770 RSI: 7fa75e909000 RDI: 55c7662fb700 >>> RBP: 0003 R08: 000e R09: >>> R10: 0483 R11: 7fa75ded61b0 R12: 7fa75e90a770 >>> R13: 000e R14: 1770 R15: >>> >>> Fixes: 1cce4df04f37 ("ocfs2: do not lock/unlock() inode DLM lock") >>> Signed-off-by: Gang He >>> --- >>> fs/ocfs2/dlmglue.c | 9 + >>> 1 file changed, 9 insertions(+) >>> >>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c >>> index 4689940..5193218 100644 >>> --- a/fs/ocfs2/dlmglue.c >>> +++ b/fs/ocfs2/dlmglue.c >>> @@ -2486,6 +2486,15 @@ int ocfs2_inode_lock_with_page(struct inode *inode, >>> ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); >>> if (ret == -EAGAIN) { >>> unlock_page(page); >>> + /* >>> +* If we can't get inode lock immediately, we should not return >>> +* directly here, since this will lead to a softlockup problem. >>> +* The method is to get a blocking lock and immediately unlock >>> +* before returning, this can avoid CPU resource waste due to >>> +* lots of retries, and benefits fairness in getting lock. >>> +*/ >>> + if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) >>> + ocfs2_inode_unlock(inode, ex); >>> ret = AOP_TRU
Re: [Ocfs2-devel] [PATCH] ocfs2: try a blocking lock before return AOP_TRUNCATED_PAGE
Hi Gang, Do you mean that too many retrys in loop cast losts of CPU-time and block page-fault interrupt? We should not add any delay in ocfs2_fault(), right? And I still feel a little confused why your method can solve this problem. thanks, Jun On 2017/12/27 17:29, Gang He wrote: > If we can't get inode lock immediately in the function > ocfs2_inode_lock_with_page() when reading a page, we should not > return directly here, since this will lead to a softlockup problem. > The method is to get a blocking lock and immediately unlock before > returning, this can avoid CPU resource waste due to lots of retries, > and benefits fairness in getting lock among multiple nodes, increase > efficiency in case modifying the same file frequently from multiple > nodes. > The softlockup problem looks like, > Kernel panic - not syncing: softlockup: hung tasks > CPU: 0 PID: 885 Comm: multi_mmap Tainted: G L 4.12.14-6.1-default #1 > Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 > Call Trace: > > dump_stack+0x5c/0x82 > panic+0xd5/0x21e > watchdog_timer_fn+0x208/0x210 > ? watchdog_park_threads+0x70/0x70 > __hrtimer_run_queues+0xcc/0x200 > hrtimer_interrupt+0xa6/0x1f0 > smp_apic_timer_interrupt+0x34/0x50 > apic_timer_interrupt+0x96/0xa0 > > RIP: 0010:unlock_page+0x17/0x30 > RSP: :af154080bc88 EFLAGS: 0246 ORIG_RAX: ff10 > RAX: dead0100 RBX: f21e009f5300 RCX: 0004 > RDX: dead00ff RSI: 0202 RDI: f21e009f5300 > RBP: R08: R09: af154080bb00 > R10: af154080bc30 R11: 0040 R12: 993749a39518 > R13: R14: f21e009f5300 R15: f21e009f5300 > ocfs2_inode_lock_with_page+0x25/0x30 [ocfs2] > ocfs2_readpage+0x41/0x2d0 [ocfs2] > ? pagecache_get_page+0x30/0x200 > filemap_fault+0x12b/0x5c0 > ? recalc_sigpending+0x17/0x50 > ? __set_task_blocked+0x28/0x70 > ? __set_current_blocked+0x3d/0x60 > ocfs2_fault+0x29/0xb0 [ocfs2] > __do_fault+0x1a/0xa0 > __handle_mm_fault+0xbe8/0x1090 > handle_mm_fault+0xaa/0x1f0 > __do_page_fault+0x235/0x4b0 > trace_do_page_fault+0x3c/0x110 > async_page_fault+0x28/0x30 > RIP: 0033:0x7fa75ded638e > RSP: 002b:7ffd6657db18 EFLAGS: 00010287 > RAX: 55c7662fb700 RBX: 0001 RCX: 55c7662fb700 > RDX: 1770 RSI: 7fa75e909000 RDI: 55c7662fb700 > RBP: 0003 R08: 000e R09: > R10: 0483 R11: 7fa75ded61b0 R12: 7fa75e90a770 > R13: 000e R14: 1770 R15: > > Fixes: 1cce4df04f37 ("ocfs2: do not lock/unlock() inode DLM lock") > Signed-off-by: Gang He > --- > fs/ocfs2/dlmglue.c | 9 + > 1 file changed, 9 insertions(+) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 4689940..5193218 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -2486,6 +2486,15 @@ int ocfs2_inode_lock_with_page(struct inode *inode, > ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); > if (ret == -EAGAIN) { > unlock_page(page); > + /* > + * If we can't get inode lock immediately, we should not return > + * directly here, since this will lead to a softlockup problem. > + * The method is to get a blocking lock and immediately unlock > + * before returning, this can avoid CPU resource waste due to > + * lots of retries, and benefits fairness in getting lock. > + */ > + if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) > + ocfs2_inode_unlock(inode, ex); > ret = AOP_TRUNCATED_PAGE; > } > >
Re: [Ocfs2-devel] [PATCH v2 1/3] ocfs2: add ocfs2_try_rw_lock and ocfs2_try_inode_lock
On 2017/11/29 16:36, Gang He wrote: > Add ocfs2_try_rw_lock and ocfs2_try_inode_lock functions, which > will be used in non-block IO scenarios. > > Signed-off-by: Gang He Reviewed-by: Jun Piao > --- > fs/ocfs2/dlmglue.c | 21 + > fs/ocfs2/dlmglue.h | 4 > 2 files changed, 25 insertions(+) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 4689940..a68efa3 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -1742,6 +1742,27 @@ int ocfs2_rw_lock(struct inode *inode, int write) > return status; > } > > +int ocfs2_try_rw_lock(struct inode *inode, int write) > +{ > + int status, level; > + struct ocfs2_lock_res *lockres; > + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); > + > + mlog(0, "inode %llu try to take %s RW lock\n", > + (unsigned long long)OCFS2_I(inode)->ip_blkno, > + write ? "EXMODE" : "PRMODE"); > + > + if (ocfs2_mount_local(osb)) > + return 0; > + > + lockres = &OCFS2_I(inode)->ip_rw_lockres; > + > + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; > + > + status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0); > + return status; > +} > + > void ocfs2_rw_unlock(struct inode *inode, int write) > { > int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; > diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h > index a7fc18b..05910fc 100644 > --- a/fs/ocfs2/dlmglue.h > +++ b/fs/ocfs2/dlmglue.h > @@ -116,6 +116,7 @@ void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res > *lockres, > int ocfs2_create_new_inode_locks(struct inode *inode); > int ocfs2_drop_inode_locks(struct inode *inode); > int ocfs2_rw_lock(struct inode *inode, int write); > +int ocfs2_try_rw_lock(struct inode *inode, int write); > void ocfs2_rw_unlock(struct inode *inode, int write); > int ocfs2_open_lock(struct inode *inode); > int ocfs2_try_open_lock(struct inode *inode, int write); > @@ -140,6 +141,9 @@ int ocfs2_inode_lock_with_page(struct inode *inode, > /* 99% of the time we don't want to supply any additional flags -- > * those are for very specific cases only. */ > #define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, > OI_LS_NORMAL) > +#define ocfs2_try_inode_lock(i, b, e)\ > + ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\ > + OI_LS_NORMAL) > void ocfs2_inode_unlock(struct inode *inode, > int ex); > int ocfs2_super_lock(struct ocfs2_super *osb, >
Re: [Ocfs2-devel] [PATCH 2/3] ocfs2: add ocfs2_overwrite_io function
Hi Gang, If ocfs2_overwrite_io is only called in 'nowait' scenarios, I wonder if we can discard 'int wait' just as ext4 does: static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len); thans, Jun On 2017/11/27 17:46, Gang He wrote: > Add ocfs2_overwrite_io function, which is used to judge if > overwrite allocated blocks, otherwise, the write will bring extra > block allocation overhead. > > Signed-off-by: Gang He > --- > fs/ocfs2/extent_map.c | 67 > +++ > fs/ocfs2/extent_map.h | 3 +++ > 2 files changed, 70 insertions(+) > > diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c > index e4719e0..98bf325 100644 > --- a/fs/ocfs2/extent_map.c > +++ b/fs/ocfs2/extent_map.c > @@ -832,6 +832,73 @@ int ocfs2_fiemap(struct inode *inode, struct > fiemap_extent_info *fieinfo, > return ret; > } > > +/* Is IO overwriting allocated blocks? */ > +int ocfs2_overwrite_io(struct inode *inode, u64 map_start, u64 map_len, > +int wait) > +{ > + int ret = 0, is_last; > + u32 mapping_end, cpos; > + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); > + struct buffer_head *di_bh = NULL; > + struct ocfs2_extent_rec rec; > + > + if (wait) > + ret = ocfs2_inode_lock(inode, &di_bh, 0); > + else > + ret = ocfs2_try_inode_lock(inode, &di_bh, 0); > + if (ret) > + goto out; > + > + if (wait) > + down_read(&OCFS2_I(inode)->ip_alloc_sem); > + else { > + if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { > + ret = -EAGAIN; > + goto out_unlock1; > + } > + } > + > + if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) && > +((map_start + map_len) <= i_size_read(inode))) > + goto out_unlock2; > + > + cpos = map_start >> osb->s_clustersize_bits; > + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, > +map_start + map_len); > + is_last = 0; > + while (cpos < mapping_end && !is_last) { > + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, > + NULL, &rec, &is_last); > + if (ret) { > + mlog_errno(ret); > + goto out_unlock2; > + } > + > + if (rec.e_blkno == 0ULL) > + break; > + > + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) > + break; > + > + cpos = le32_to_cpu(rec.e_cpos) + > + le16_to_cpu(rec.e_leaf_clusters); > + } > + > + if (cpos < mapping_end) > + ret = 1; > + > +out_unlock2: > + brelse(di_bh); > + > + up_read(&OCFS2_I(inode)->ip_alloc_sem); > + > +out_unlock1: > + ocfs2_inode_unlock(inode, 0); > + > +out: > + return (ret ? 0 : 1); > +} > + > int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int > whence) > { > struct inode *inode = file->f_mapping->host; > diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h > index 67ea57d..fd9e86a 100644 > --- a/fs/ocfs2/extent_map.h > +++ b/fs/ocfs2/extent_map.h > @@ -53,6 +53,9 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 > v_blkno, u64 *p_blkno, > int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, >u64 map_start, u64 map_len); > > +int ocfs2_overwrite_io(struct inode *inode, u64 map_start, u64 map_len, > +int wait); > + > int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int > origin); > > int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, >
Re: [Ocfs2-devel] [PATCH 1/3] ocfs2: add ocfs2_try_rw_lock and ocfs2_try_inode_lock
Hi Gang, On 2017/11/27 17:46, Gang He wrote: > Add ocfs2_try_rw_lock and ocfs2_try_inode_lock functions, which > will be used in non-block IO scenarios. > > Signed-off-by: Gang He > --- > fs/ocfs2/dlmglue.c | 22 ++ > fs/ocfs2/dlmglue.h | 4 > 2 files changed, 26 insertions(+) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 4689940..5cfbd04 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -1742,6 +1742,28 @@ int ocfs2_rw_lock(struct inode *inode, int write) > return status; > } > > +int ocfs2_try_rw_lock(struct inode *inode, int write) > +{ > + int status, level; > + struct ocfs2_lock_res *lockres; > + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); > + > + mlog(0, "inode %llu try to take %s RW lock\n", > + (unsigned long long)OCFS2_I(inode)->ip_blkno, > + write ? "EXMODE" : "PRMODE"); > + > + if (ocfs2_mount_local(osb)) > + return 0; > + > + lockres = &OCFS2_I(inode)->ip_rw_lockres; > + > + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; > + > + status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, > + DLM_LKF_NOQUEUE, 0); we'd better use 'osb' instead of 'OCFS2_SB(inode->i_sb)'. > + return status; > +} > + > void ocfs2_rw_unlock(struct inode *inode, int write) > { > int level = write ? DLM_LOCK_EX : DLM_LOCK_PR; > diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h > index a7fc18b..05910fc 100644 > --- a/fs/ocfs2/dlmglue.h > +++ b/fs/ocfs2/dlmglue.h > @@ -116,6 +116,7 @@ void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res > *lockres, > int ocfs2_create_new_inode_locks(struct inode *inode); > int ocfs2_drop_inode_locks(struct inode *inode); > int ocfs2_rw_lock(struct inode *inode, int write); > +int ocfs2_try_rw_lock(struct inode *inode, int write); > void ocfs2_rw_unlock(struct inode *inode, int write); > int ocfs2_open_lock(struct inode *inode); > int ocfs2_try_open_lock(struct inode *inode, int write); > @@ -140,6 +141,9 @@ int ocfs2_inode_lock_with_page(struct inode *inode, > /* 99% of the time we don't want to supply any additional flags -- > * those are for very specific cases only. */ > #define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, > OI_LS_NORMAL) > +#define ocfs2_try_inode_lock(i, b, e)\ > + ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\ > + OI_LS_NORMAL) > void ocfs2_inode_unlock(struct inode *inode, > int ex); > int ocfs2_super_lock(struct ocfs2_super *osb, >
[PATCH] ocfs2: free 'dummy_sc' in sc_fop_release() in case of memory leak
'sd->dbg_sock' is malloc in sc_common_open(), but not freed at the end of sc_fop_release(). Signed-off-by: Jun Piao --- fs/ocfs2/cluster/netdebug.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 564c504..74a21f6 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -426,6 +426,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) struct o2net_sock_container *dummy_sc = sd->dbg_sock; o2net_debug_del_sc(dummy_sc); + kfree(dummy_sc); return seq_release_private(inode, file); } --