[Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h
There's no point in sharing the internal structure of lock value blocks with user space. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.h | 1 + fs/gfs2/incore.h | 1 + fs/gfs2/rgrp.c | 10 ++ include/uapi/linux/gfs2_ondisk.h | 10 -- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index b8adaf80e4c5..d2f2dba05a94 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -306,4 +306,5 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object) spin_unlock(>gl_lockref.lock); } + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b5d9c11f4901..5155389e9b5c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -33,6 +33,7 @@ struct gfs2_trans; struct gfs2_jdesc; struct gfs2_sbd; struct lm_lockops; +struct gfs2_rgrp_lvb; typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 2466bb44a23c..1165627274cf 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -46,6 +46,16 @@ #define LBITSKIP00 (0xUL) #endif +struct gfs2_rgrp_lvb { + __be32 rl_magic; + __be32 rl_flags; + __be32 rl_free; + __be32 rl_dinodes; + __be64 rl_igeneration; + __be32 rl_unlinked; + __be32 __pad; +}; + /* * These routines are used by the resource group routines (rgrp.c) * to keep track of block allocation. Each block is represented by two diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h index 2dc10a034de1..4e9a80941bec 100644 --- a/include/uapi/linux/gfs2_ondisk.h +++ b/include/uapi/linux/gfs2_ondisk.h @@ -171,16 +171,6 @@ struct gfs2_rindex { #define GFS2_RGF_NOALLOC 0x0008 #define GFS2_RGF_TRIMMED 0x0010 -struct gfs2_rgrp_lvb { - __be32 rl_magic; - __be32 rl_flags; - __be32 rl_free; - __be32 rl_dinodes; - __be64 rl_igeneration; - __be32 rl_unlinked; - __be32 __pad; -}; - struct gfs2_rgrp { struct gfs2_meta_header rg_header; -- 2.20.1
[Cluster-devel] [PATCH] gfs2: Avoid access time trashing in gfs2_inode_lookup
In gfs2_inode_lookup, we initialize inode->i_atime to the lowest possibly value after gfs2_inode_refresh may already have been called. This should be the other way around, but we didn't notice because usually the inode type is known from the directory entry and so gfs2_inode_lookup won't call gfs2_inode_refresh. In addition, only initialize ip->i_no_formal_ino from no_formal_ino when actually needed. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index dafef10b91f1..2716d56ed0a0 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -136,7 +136,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); - ip->i_no_formal_ino = no_formal_ino; error = gfs2_glock_get(sdp, no_addr, _inode_glops, CREATE, >i_gl); if (unlikely(error)) @@ -175,21 +174,22 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, gfs2_glock_put(io_gl); io_gl = NULL; + /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ + inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); + inode->i_atime.tv_nsec = 0; + if (type == DT_UNKNOWN) { /* Inode glock must be locked already */ error = gfs2_inode_refresh(GFS2_I(inode)); if (error) goto fail_refresh; } else { + ip->i_no_formal_ino = no_formal_ino; inode->i_mode = DT2IF(type); } gfs2_set_iop(inode); - /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ - inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); - inode->i_atime.tv_nsec = 0; - unlock_new_inode(inode); } -- 2.20.1
Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h
Hi, On 15/01/2020 08:49, Andreas Gruenbacher wrote: There's no point in sharing the internal structure of lock value blocks with user space. The reason that is in ondisk is that changing that structure is something that needs to follow the same rules as changing the on disk structures. So it is there as a reminder of that, Steve. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.h | 1 + fs/gfs2/incore.h | 1 + fs/gfs2/rgrp.c | 10 ++ include/uapi/linux/gfs2_ondisk.h | 10 -- 4 files changed, 12 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index b8adaf80e4c5..d2f2dba05a94 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -306,4 +306,5 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object) spin_unlock(>gl_lockref.lock); } + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b5d9c11f4901..5155389e9b5c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -33,6 +33,7 @@ struct gfs2_trans; struct gfs2_jdesc; struct gfs2_sbd; struct lm_lockops; +struct gfs2_rgrp_lvb; typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 2466bb44a23c..1165627274cf 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -46,6 +46,16 @@ #define LBITSKIP00 (0xUL) #endif +struct gfs2_rgrp_lvb { + __be32 rl_magic; + __be32 rl_flags; + __be32 rl_free; + __be32 rl_dinodes; + __be64 rl_igeneration; + __be32 rl_unlinked; + __be32 __pad; +}; + /* * These routines are used by the resource group routines (rgrp.c) * to keep track of block allocation. Each block is represented by two diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h index 2dc10a034de1..4e9a80941bec 100644 --- a/include/uapi/linux/gfs2_ondisk.h +++ b/include/uapi/linux/gfs2_ondisk.h @@ -171,16 +171,6 @@ struct gfs2_rindex { #define GFS2_RGF_NOALLOC 0x0008 #define GFS2_RGF_TRIMMED 0x0010 -struct gfs2_rgrp_lvb { - __be32 rl_magic; - __be32 rl_flags; - __be32 rl_free; - __be32 rl_dinodes; - __be64 rl_igeneration; - __be32 rl_unlinked; - __be32 __pad; -}; - struct gfs2_rgrp { struct gfs2_meta_header rg_header;
Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h
On Wed, Jan 15, 2020 at 9:58 AM Steven Whitehouse wrote: > On 15/01/2020 08:49, Andreas Gruenbacher wrote: > > There's no point in sharing the internal structure of lock value blocks > > with user space. > > The reason that is in ondisk is that changing that structure is > something that needs to follow the same rules as changing the on disk > structures. So it is there as a reminder of that, I can see a point in that. The reason I've posted this is because Bob was complaining that changes to include/uapi/linux/gfs2_ondisk.h break his out-of-tree module build process. (One of the patches I'm working on adds an inode LVB.) The same would be true of on-disk format changes as well of course, and those definitely need to be shared with user space. I'm not usually building gfs2 out of tree, so I'm indifferent to this change. Thanks, Andreas
Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h
Hi, On 15/01/2020 09:24, Andreas Gruenbacher wrote: On Wed, Jan 15, 2020 at 9:58 AM Steven Whitehouse wrote: On 15/01/2020 08:49, Andreas Gruenbacher wrote: There's no point in sharing the internal structure of lock value blocks with user space. The reason that is in ondisk is that changing that structure is something that needs to follow the same rules as changing the on disk structures. So it is there as a reminder of that, I can see a point in that. The reason I've posted this is because Bob was complaining that changes to include/uapi/linux/gfs2_ondisk.h break his out-of-tree module build process. (One of the patches I'm working on adds an inode LVB.) The same would be true of on-disk format changes as well of course, and those definitely need to be shared with user space. I'm not usually building gfs2 out of tree, so I'm indifferent to this change. Thanks, Andreas Why would we need to be able to build gfs2 (at least I assume it is gfs2) out of tree anyway? Steve.
[Cluster-devel] [PATCH] gfs2: Avoid access time trashing in gfs2_inode_lookup
In gfs2_inode_lookup, we initialize inode->i_atime to the lowest possibly value after gfs2_inode_refresh may already have been called. This should be the other way around, but we didn't notice because usually the inode type is known from the directory entry and so gfs2_inode_lookup won't call gfs2_inode_refresh. In addition, only initialize ip->i_no_formal_ino from no_formal_ino when actually needed. Signed-off-by: Andreas Gruenbacher --- fs/gfs2/inode.c | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index dafef10b91f1..2716d56ed0a0 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -136,7 +136,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, if (inode->i_state & I_NEW) { struct gfs2_sbd *sdp = GFS2_SB(inode); - ip->i_no_formal_ino = no_formal_ino; error = gfs2_glock_get(sdp, no_addr, _inode_glops, CREATE, >i_gl); if (unlikely(error)) @@ -175,21 +174,22 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, gfs2_glock_put(io_gl); io_gl = NULL; + /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ + inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); + inode->i_atime.tv_nsec = 0; + if (type == DT_UNKNOWN) { /* Inode glock must be locked already */ error = gfs2_inode_refresh(GFS2_I(inode)); if (error) goto fail_refresh; } else { + ip->i_no_formal_ino = no_formal_ino; inode->i_mode = DT2IF(type); } gfs2_set_iop(inode); - /* Lowest possible timestamp; will be overwritten in gfs2_dinode_in. */ - inode->i_atime.tv_sec = 1LL << (8 * sizeof(inode->i_atime.tv_sec) - 1); - inode->i_atime.tv_nsec = 0; - unlock_new_inode(inode); } -- 2.20.1
Re: [Cluster-devel] [PATCH] gfs2: Avoid access time trashing in gfs2_inode_lookup
Oops, sorry for the duplicate post. Andreas
Re: [Cluster-devel] RFC: hold i_rwsem until aio completes
On Tue, Jan 14, 2020 at 05:12:13PM +0100, Christoph Hellwig wrote: > Hi all, > > Asynchronous read/write operations currently use a rather magic locking > scheme, were access to file data is normally protected using a rw_semaphore, > but if we are doing aio where the syscall returns to userspace before the > I/O has completed we also use an atomic_t to track the outstanding aio > ops. This scheme has lead to lots of subtle bugs in file systems where > didn't wait to the count to reach zero, and due to its adhoc nature also > means we have to serialize direct I/O writes that are smaller than the > file system block size. I've seen similar locking patterns quite a lot, enough I've thought about having a dedicated locking primitive to do it. It really wants to be a rwsem, but as here the rwsem rules don't allow it. The common pattern I'm looking at looks something like this: 'try begin read'() // aka down_read_trylock() /* The lockdep release hackery you describe, the rwsem remains read locked */ 'exit reader'() .. delegate unlock to work queue, timer, irq, etc .. in the new context: 're_enter reader'() // Get our lockdep tracking back 'end reader'() // aka up_read() vs a typical write side: 'begin write'() // aka down_write() /* There is no reason to unlock it before kfree of the rwsem memory. Somehow the user prevents any new down_read_trylock()'s */ 'abandon writer'() // The object will be kfree'd with a locked writer kfree() The typical goal is to provide an object destruction path that can serialize and fence all readers wherever they may be before proceeding to some synchronous destruction. Usually this gets open coded with some atomic/kref/refcount and a completion or wait queue. Often implemented wrongly, lacking the write favoring bias in the rwsem, and lacking any lockdep tracking on the naked completion. Not to discourage your patch, but to ask if we can make the solution more broadly applicable? Thanks, Jason
Re: [Cluster-devel] RFC: hold i_rwsem until aio completes
On Wed, Jan 15, 2020 at 09:24:28AM -0400, Jason Gunthorpe wrote: > I was interested because you are talking about allowing the read/write side > of a rw sem to be held across a return to user space/etc, which is the > same basic problem. No it is not; allowing the lock to be held across userspace doesn't change the owner. This is a crucial difference, PI depends on there being a distinct owner. That said, allowing the lock to be held across userspace still breaks PI in that it completely wrecks the ability to analyze the critical section.
Re: [Cluster-devel] RFC: hold i_rwsem until aio completes
On Wed, Jan 15, 2020 at 03:33:47PM +0100, Peter Zijlstra wrote: > On Wed, Jan 15, 2020 at 09:24:28AM -0400, Jason Gunthorpe wrote: > > > I was interested because you are talking about allowing the read/write side > > of a rw sem to be held across a return to user space/etc, which is the > > same basic problem. > > No it is not; allowing the lock to be held across userspace doesn't > change the owner. This is a crucial difference, PI depends on there > being a distinct owner. That said, allowing the lock to be held across > userspace still breaks PI in that it completely wrecks the ability to > analyze the critical section. I'm not sure what you are contrasting? I was remarking that I see many places open code a rwsem using an atomic and a completion specifically because they need to do the things Christoph identified: > (1) no unlocking by another process than the one that acquired it > (2) no return to userspace with locks held As an example flow: obtain the read side lock, schedual a work queue, return to user space, and unlock the read side from the work queue. If we can make some general primative that addresses this then maybe those open coded places can convert as well? Regards, Jason
Re: [Cluster-devel] RFC: hold i_rwsem until aio completes
On Wed, Jan 15, 2020 at 07:56:14AM +0100, Christoph Hellwig wrote: > On Tue, Jan 14, 2020 at 03:27:00PM -0400, Jason Gunthorpe wrote: > > I've seen similar locking patterns quite a lot, enough I've thought > > about having a dedicated locking primitive to do it. It really wants > > to be a rwsem, but as here the rwsem rules don't allow it. > > > > The common pattern I'm looking at looks something like this: > > > > 'try begin read'() // aka down_read_trylock() > > > > /* The lockdep release hackery you describe, > > the rwsem remains read locked */ > > 'exit reader'() > > > > .. delegate unlock to work queue, timer, irq, etc .. > > > > in the new context: > > > > 're_enter reader'() // Get our lockdep tracking back > > > > 'end reader'() // aka up_read() > > > > vs a typical write side: > > > > 'begin write'() // aka down_write() > > > > /* There is no reason to unlock it before kfree of the rwsem memory. > > Somehow the user prevents any new down_read_trylock()'s */ > > 'abandon writer'() // The object will be kfree'd with a locked writer > > kfree() > > > > The typical goal is to provide an object destruction path that can > > serialize and fence all readers wherever they may be before proceeding > > to some synchronous destruction. > > > > Usually this gets open coded with some atomic/kref/refcount and a > > completion or wait queue. Often implemented wrongly, lacking the write > > favoring bias in the rwsem, and lacking any lockdep tracking on the > > naked completion. > > > > Not to discourage your patch, but to ask if we can make the solution > > more broadly applicable? > > Your requirement seems a little different, and in fact in many ways > similar to the percpu_ref primitive. I was interested because you are talking about allowing the read/write side of a rw sem to be held across a return to user space/etc, which is the same basic problem. precpu refcount looks more like a typical refcount with a release that is called by whatever context does the final put. The point above is to basically move the release of a refcount into a synchrnous path by introducing some barrier to wait for the refcount to go to zero. In the above the barrier is the down_write() as it is really closer to a rwsem than a refcount. Thanks, Jason
Re: [Cluster-devel] [PATCH] gfs2: Avoid access time trashing in gfs2_inode_lookup
- Original Message - > In gfs2_inode_lookup, we initialize inode->i_atime to the lowest > possibly value after gfs2_inode_refresh may already have been called. > This should be the other way around, but we didn't notice because > usually the inode type is known from the directory entry and so > gfs2_inode_lookup won't call gfs2_inode_refresh. > > In addition, only initialize ip->i_no_formal_ino from no_formal_ino when > actually needed. > > Signed-off-by: Andreas Gruenbacher > --- Hi, The patch looks good, but can we please change the description from: "trashing" to: "thrashing"? Reviewed-by: Bob Peterson Thanks. Bob Peterson Red Hat File Systems
Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h
On 15/01/2020 13:19, Bob Peterson wrote: - Original Message - Hi, On 15/01/2020 09:24, Andreas Gruenbacher wrote: On Wed, Jan 15, 2020 at 9:58 AM Steven Whitehouse wrote: On 15/01/2020 08:49, Andreas Gruenbacher wrote: There's no point in sharing the internal structure of lock value blocks with user space. The reason that is in ondisk is that changing that structure is something that needs to follow the same rules as changing the on disk structures. So it is there as a reminder of that, I can see a point in that. The reason I've posted this is because Bob was complaining that changes to include/uapi/linux/gfs2_ondisk.h break his out-of-tree module build process. (One of the patches I'm working on adds an inode LVB.) The same would be true of on-disk format changes as well of course, and those definitely need to be shared with user space. I'm not usually building gfs2 out of tree, so I'm indifferent to this change. Thanks, Andreas Why would we need to be able to build gfs2 (at least I assume it is gfs2) out of tree anyway? Steve. Simply for productivity. The difference is this procedure, which literally takes 10 seconds, if done simultaneously on all nodes using something like cssh: make -C /usr/src/kernels/4.18.0-165.el8.x86_64 modules M=$PWD I'd be concerned about this generating "chimera" modules that produce invalid test results. rmmod gfs2 insmod gfs2.ko Compared to a procedure like this, which takes at least 30 minutes: make (a new kernel .src.rpm) scp or rsync the .src.rpm to a build machine cd ~/rpmbuild/ rpm --force -i --nodeps /home/bob/*kernel-4.18.0*.src.rpm &> /dev/null echo $? rpmbuild --target=x86_64 -ba SPECS/kernel.spec ( -or- submit a "real" kernel build) then wait for the kernel build Pull down all necessary kernel rpms scp to all the nodes in the cluster rpm --force -i --nodeps /sbin/reboot all the nodes in the cluster wait for all the nodes to reboot, the cluster to stabilize, etc. Isn't the next-best alternative just building the modules in-tree and copying them to the test machines? I'm not sure I understand the complication. Perhaps we need cluster_install and cluster_modules_install rules in the build system :) Andy
Re: [Cluster-devel] RFC: hold i_rwsem until aio completes
On Wed, Jan 15, 2020 at 09:24:28AM -0400, Jason Gunthorpe wrote: > > Your requirement seems a little different, and in fact in many ways > > similar to the percpu_ref primitive. > > I was interested because you are talking about allowing the read/write side > of a rw sem to be held across a return to user space/etc, which is the > same basic problem. > > precpu refcount looks more like a typical refcount with a release that > is called by whatever context does the final put. The point above is > to basically move the release of a refcount into a synchrnous path by > introducing some barrier to wait for the refcount to go to zero. In > the above the barrier is the down_write() as it is really closer to a > rwsem than a refcount. No, percpu_ref is a little different than the name suggests, as it has a magic initial reference, and then the other short term reference. To actually tear it down now just a normal put of the reference is needed, but an explicit percpu_ref_kill or percpu_ref_kill_and_confirm. Various callers (including all I added) would like that operation to be synchronous and currently hack that up, so a version of the percpu_ref that actually waits for the other references to away like we hacked up various places seems to exactly suit your requirements.
[Cluster-devel] [PATCH 2/2] gfs2: fix O_SYNC write handling
Don't ignore the return value from generic_write_sync for the direct to buffered I/O callback case when written is non-zero. Also don't bother to call generic_write_sync for the pure direct I/O case, as iomap_dio_rw already takes care of that. Signed-off-by: Christoph Hellwig --- fs/gfs2/file.c | 51 +- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 21d032c4b077..86c0e61407b6 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -847,7 +847,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct gfs2_inode *ip = GFS2_I(inode); - ssize_t written = 0, ret; + ssize_t ret = 0; ret = gfs2_rsqa_alloc(ip); if (ret) @@ -882,52 +882,51 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t pos, endbyte; ssize_t buffered; - written = gfs2_file_direct_write(iocb, from); - if (written < 0 || !iov_iter_count(from)) + ret = gfs2_file_direct_write(iocb, from); + if (ret < 0 || !iov_iter_count(from)) goto out_unlock; current->backing_dev_info = inode_to_bdi(inode); - ret = iomap_file_buffered_write(iocb, from, _iomap_ops); + buffered = iomap_file_buffered_write(iocb, from, +_iomap_ops); current->backing_dev_info = NULL; - if (unlikely(ret < 0)) + if (unlikely(buffered <= 0)) { + if (buffered < 0) + ret = buffered; goto out_unlock; - buffered = ret; + } /* * We need to ensure that the page cache pages are written to * disk and invalidated to preserve the expected O_DIRECT -* semantics. +* semantics. If the writeback or invalidate fails only report +* the direct I/O range as we don't know if the buffered pages +* made it to disk. */ pos = iocb->ki_pos; endbyte = pos + buffered - 1; ret = filemap_write_and_wait_range(mapping, pos, endbyte); - if (!ret) { - iocb->ki_pos += buffered; - written += buffered; - invalidate_mapping_pages(mapping, -pos >> PAGE_SHIFT, -endbyte >> PAGE_SHIFT); - } else { - /* -* We don't know how much we wrote, so just return -* the number of bytes which were direct-written -*/ - } + if (ret) + goto out_unlock; + + invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, +endbyte >> PAGE_SHIFT); + ret += buffered; } else { current->backing_dev_info = inode_to_bdi(inode); ret = iomap_file_buffered_write(iocb, from, _iomap_ops); current->backing_dev_info = NULL; - if (likely(ret > 0)) - iocb->ki_pos += ret; + if (unlikely(ret <= 0)) + goto out_unlock; } + iocb->ki_pos += ret; + inode_unlock(inode); + return generic_write_sync(iocb, ret); + out_unlock: inode_unlock(inode); - if (likely(ret > 0)) { - /* Handle various SYNC-type writes */ - ret = generic_write_sync(iocb, ret); - } - return written ? written : ret; + return ret; } static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, -- 2.24.1
[Cluster-devel] RFC: gfs2 O_SYNC fix
Hi gfs2 maintainers, can you take a look at this completely untested series? I found some O_SYNC handling issues during code inspection for the direct I/O locking revamp.
Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h
On 15/01/2020 08:58, Steven Whitehouse wrote: Hi, On 15/01/2020 08:49, Andreas Gruenbacher wrote: There's no point in sharing the internal structure of lock value blocks with user space. The reason that is in ondisk is that changing that structure is something that needs to follow the same rules as changing the on disk structures. So it is there as a reminder of that, Perhaps some eye-catching code comments would suffice? Defining it in a uapi header does sort-of communicate that it's ok to use in userspace, whereas just having the structs in close proximity doesn't really communicate that they should be updated in sync. Andy Signed-off-by: Andreas Gruenbacher --- fs/gfs2/glock.h | 1 + fs/gfs2/incore.h | 1 + fs/gfs2/rgrp.c | 10 ++ include/uapi/linux/gfs2_ondisk.h | 10 -- 4 files changed, 12 insertions(+), 10 deletions(-) Steve. diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index b8adaf80e4c5..d2f2dba05a94 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -306,4 +306,5 @@ static inline void glock_clear_object(struct gfs2_glock *gl, void *object) spin_unlock(>gl_lockref.lock); } + #endif /* __GLOCK_DOT_H__ */ diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index b5d9c11f4901..5155389e9b5c 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -33,6 +33,7 @@ struct gfs2_trans; struct gfs2_jdesc; struct gfs2_sbd; struct lm_lockops; +struct gfs2_rgrp_lvb; typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 2466bb44a23c..1165627274cf 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -46,6 +46,16 @@ #define LBITSKIP00 (0xUL) #endif +struct gfs2_rgrp_lvb { + __be32 rl_magic; + __be32 rl_flags; + __be32 rl_free; + __be32 rl_dinodes; + __be64 rl_igeneration; + __be32 rl_unlinked; + __be32 __pad; +}; + /* * These routines are used by the resource group routines (rgrp.c) * to keep track of block allocation. Each block is represented by two diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h index 2dc10a034de1..4e9a80941bec 100644 --- a/include/uapi/linux/gfs2_ondisk.h +++ b/include/uapi/linux/gfs2_ondisk.h @@ -171,16 +171,6 @@ struct gfs2_rindex { #define GFS2_RGF_NOALLOC 0x0008 #define GFS2_RGF_TRIMMED 0x0010 -struct gfs2_rgrp_lvb { - __be32 rl_magic; - __be32 rl_flags; - __be32 rl_free; - __be32 rl_dinodes; - __be64 rl_igeneration; - __be32 rl_unlinked; - __be32 __pad; -}; - struct gfs2_rgrp { struct gfs2_meta_header rg_header;
[Cluster-devel] [PATCH 1/2] gfs2: move setting current->backing_dev_info
Only set current->backing_dev_info just around the buffered write calls to prepare for the next fix. Signed-off-by: Christoph Hellwig --- fs/gfs2/file.c | 21 ++--- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 9d58295ccf7a..21d032c4b077 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -867,18 +867,15 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) - goto out; - - /* We can write back this queue in page reclaim */ - current->backing_dev_info = inode_to_bdi(inode); + goto out_unlock; ret = file_remove_privs(file); if (ret) - goto out2; + goto out_unlock; ret = file_update_time(file); if (ret) - goto out2; + goto out_unlock; if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; @@ -887,11 +884,13 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) written = gfs2_file_direct_write(iocb, from); if (written < 0 || !iov_iter_count(from)) - goto out2; + goto out_unlock; + current->backing_dev_info = inode_to_bdi(inode); ret = iomap_file_buffered_write(iocb, from, _iomap_ops); + current->backing_dev_info = NULL; if (unlikely(ret < 0)) - goto out2; + goto out_unlock; buffered = ret; /* @@ -915,14 +914,14 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) */ } } else { + current->backing_dev_info = inode_to_bdi(inode); ret = iomap_file_buffered_write(iocb, from, _iomap_ops); + current->backing_dev_info = NULL; if (likely(ret > 0)) iocb->ki_pos += ret; } -out2: - current->backing_dev_info = NULL; -out: +out_unlock: inode_unlock(inode); if (likely(ret > 0)) { /* Handle various SYNC-type writes */ -- 2.24.1
Re: [Cluster-devel] RFC: hold i_rwsem until aio completes
On Wed, Jan 15, 2020 at 04:36:14PM +0100, Christoph Hellwig wrote: > synchronous and currently hack that up, so a version of the percpu_ref > that actually waits for the other references to away like we hacked > up various places seems to exactly suit your requirements. Ah, yes, sounds like a similar goal, many cases I'm thinking about also hack up a kref to trigger a completion to make it synchronous. Jason
Re: [Cluster-devel] RFC: hold i_rwsem until aio completes
On 1/15/20 9:49 AM, Jason Gunthorpe wrote: > On Wed, Jan 15, 2020 at 03:33:47PM +0100, Peter Zijlstra wrote: >> On Wed, Jan 15, 2020 at 09:24:28AM -0400, Jason Gunthorpe wrote: >> >>> I was interested because you are talking about allowing the read/write side >>> of a rw sem to be held across a return to user space/etc, which is the >>> same basic problem. >> No it is not; allowing the lock to be held across userspace doesn't >> change the owner. This is a crucial difference, PI depends on there >> being a distinct owner. That said, allowing the lock to be held across >> userspace still breaks PI in that it completely wrecks the ability to >> analyze the critical section. > I'm not sure what you are contrasting? > > I was remarking that I see many places open code a rwsem using an > atomic and a completion specifically because they need to do the > things Christoph identified: > >> (1) no unlocking by another process than the one that acquired it >> (2) no return to userspace with locks held > As an example flow: obtain the read side lock, schedual a work queue, > return to user space, and unlock the read side from the work queue. We currently have down_read_non_owner() and up_read_non_owner() that perform the lock and unlock without lockdep tracking. Of course, that is a hack and their use must be carefully scrutinized to make sure that there is no deadlock or other potentially locking issues. Cheers, Longman
Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h
- Original Message - > Hi, > > On 15/01/2020 09:24, Andreas Gruenbacher wrote: > > On Wed, Jan 15, 2020 at 9:58 AM Steven Whitehouse > > wrote: > >> On 15/01/2020 08:49, Andreas Gruenbacher wrote: > >>> There's no point in sharing the internal structure of lock value blocks > >>> with user space. > >> The reason that is in ondisk is that changing that structure is > >> something that needs to follow the same rules as changing the on disk > >> structures. So it is there as a reminder of that, > > I can see a point in that. The reason I've posted this is because Bob > > was complaining that changes to include/uapi/linux/gfs2_ondisk.h break > > his out-of-tree module build process. (One of the patches I'm working > > on adds an inode LVB.) The same would be true of on-disk format > > changes as well of course, and those definitely need to be shared with > > user space. I'm not usually building gfs2 out of tree, so I'm > > indifferent to this change. > > > > Thanks, > > Andreas > > > Why would we need to be able to build gfs2 (at least I assume it is > gfs2) out of tree anyway? > > Steve. Simply for productivity. The difference is this procedure, which literally takes 10 seconds, if done simultaneously on all nodes using something like cssh: make -C /usr/src/kernels/4.18.0-165.el8.x86_64 modules M=$PWD rmmod gfs2 insmod gfs2.ko Compared to a procedure like this, which takes at least 30 minutes: make (a new kernel .src.rpm) scp or rsync the .src.rpm to a build machine cd ~/rpmbuild/ rpm --force -i --nodeps /home/bob/*kernel-4.18.0*.src.rpm &> /dev/null echo $? rpmbuild --target=x86_64 -ba SPECS/kernel.spec ( -or- submit a "real" kernel build) then wait for the kernel build Pull down all necessary kernel rpms scp to all the nodes in the cluster rpm --force -i --nodeps /sbin/reboot all the nodes in the cluster wait for all the nodes to reboot, the cluster to stabilize, etc. Regards, Bob Peterson Red Hat File Systems