[Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h

2020-01-15 Thread Andreas Gruenbacher
There's no point in sharing the internal structure of lock value blocks
with user space.

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/glock.h  |  1 +
 fs/gfs2/incore.h |  1 +
 fs/gfs2/rgrp.c   | 10 ++
 include/uapi/linux/gfs2_ondisk.h | 10 --
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index b8adaf80e4c5..d2f2dba05a94 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -306,4 +306,5 @@ static inline void glock_clear_object(struct gfs2_glock 
*gl, void *object)
spin_unlock(>gl_lockref.lock);
 }
 
+
 #endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b5d9c11f4901..5155389e9b5c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -33,6 +33,7 @@ struct gfs2_trans;
 struct gfs2_jdesc;
 struct gfs2_sbd;
 struct lm_lockops;
+struct gfs2_rgrp_lvb;
 
 typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2466bb44a23c..1165627274cf 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -46,6 +46,16 @@
 #define LBITSKIP00 (0xUL)
 #endif
 
+struct gfs2_rgrp_lvb {
+   __be32 rl_magic;
+   __be32 rl_flags;
+   __be32 rl_free;
+   __be32 rl_dinodes;
+   __be64 rl_igeneration;
+   __be32 rl_unlinked;
+   __be32 __pad;
+};
+
 /*
  * These routines are used by the resource group routines (rgrp.c)
  * to keep track of block allocation.  Each block is represented by two
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 2dc10a034de1..4e9a80941bec 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -171,16 +171,6 @@ struct gfs2_rindex {
 #define GFS2_RGF_NOALLOC   0x0008
 #define GFS2_RGF_TRIMMED   0x0010
 
-struct gfs2_rgrp_lvb {
-   __be32 rl_magic;
-   __be32 rl_flags;
-   __be32 rl_free;
-   __be32 rl_dinodes;
-   __be64 rl_igeneration;
-   __be32 rl_unlinked;
-   __be32 __pad;
-};
-
 struct gfs2_rgrp {
struct gfs2_meta_header rg_header;
 
-- 
2.20.1



[Cluster-devel] [PATCH] gfs2: Avoid access time trashing in gfs2_inode_lookup

2020-01-15 Thread Andreas Gruenbacher
In gfs2_inode_lookup, we initialize inode->i_atime to the lowest
possibly value after gfs2_inode_refresh may already have been called.
This should be the other way around, but we didn't notice because
usually the inode type is known from the directory entry and so
gfs2_inode_lookup won't call gfs2_inode_refresh.

In addition, only initialize ip->i_no_formal_ino from no_formal_ino when
actually needed.

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/inode.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index dafef10b91f1..2716d56ed0a0 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -136,7 +136,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, 
unsigned int type,
 
if (inode->i_state & I_NEW) {
struct gfs2_sbd *sdp = GFS2_SB(inode);
-   ip->i_no_formal_ino = no_formal_ino;
 
error = gfs2_glock_get(sdp, no_addr, _inode_glops, CREATE, 
>i_gl);
if (unlikely(error))
@@ -175,21 +174,22 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, 
unsigned int type,
gfs2_glock_put(io_gl);
io_gl = NULL;
 
+   /* Lowest possible timestamp; will be overwritten in 
gfs2_dinode_in. */
+   inode->i_atime.tv_sec = 1LL << (8 * 
sizeof(inode->i_atime.tv_sec) - 1);
+   inode->i_atime.tv_nsec = 0;
+
if (type == DT_UNKNOWN) {
/* Inode glock must be locked already */
error = gfs2_inode_refresh(GFS2_I(inode));
if (error)
goto fail_refresh;
} else {
+   ip->i_no_formal_ino = no_formal_ino;
inode->i_mode = DT2IF(type);
}
 
gfs2_set_iop(inode);
 
-   /* Lowest possible timestamp; will be overwritten in 
gfs2_dinode_in. */
-   inode->i_atime.tv_sec = 1LL << (8 * 
sizeof(inode->i_atime.tv_sec) - 1);
-   inode->i_atime.tv_nsec = 0;
-
unlock_new_inode(inode);
}
 
-- 
2.20.1



Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h

2020-01-15 Thread Steven Whitehouse

Hi,

On 15/01/2020 08:49, Andreas Gruenbacher wrote:

There's no point in sharing the internal structure of lock value blocks
with user space.


The reason that is in ondisk is that changing that structure is 
something that needs to follow the same rules as changing the on disk 
structures. So it is there as a reminder of that,


Steve.




Signed-off-by: Andreas Gruenbacher 
---
  fs/gfs2/glock.h  |  1 +
  fs/gfs2/incore.h |  1 +
  fs/gfs2/rgrp.c   | 10 ++
  include/uapi/linux/gfs2_ondisk.h | 10 --
  4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index b8adaf80e4c5..d2f2dba05a94 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -306,4 +306,5 @@ static inline void glock_clear_object(struct gfs2_glock 
*gl, void *object)
spin_unlock(>gl_lockref.lock);
  }
  
+

  #endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b5d9c11f4901..5155389e9b5c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -33,6 +33,7 @@ struct gfs2_trans;
  struct gfs2_jdesc;
  struct gfs2_sbd;
  struct lm_lockops;
+struct gfs2_rgrp_lvb;
  
  typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int ret);
  
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c

index 2466bb44a23c..1165627274cf 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -46,6 +46,16 @@
  #define LBITSKIP00 (0xUL)
  #endif
  
+struct gfs2_rgrp_lvb {

+   __be32 rl_magic;
+   __be32 rl_flags;
+   __be32 rl_free;
+   __be32 rl_dinodes;
+   __be64 rl_igeneration;
+   __be32 rl_unlinked;
+   __be32 __pad;
+};
+
  /*
   * These routines are used by the resource group routines (rgrp.c)
   * to keep track of block allocation.  Each block is represented by two
diff --git a/include/uapi/linux/gfs2_ondisk.h b/include/uapi/linux/gfs2_ondisk.h
index 2dc10a034de1..4e9a80941bec 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -171,16 +171,6 @@ struct gfs2_rindex {
  #define GFS2_RGF_NOALLOC  0x0008
  #define GFS2_RGF_TRIMMED  0x0010
  
-struct gfs2_rgrp_lvb {

-   __be32 rl_magic;
-   __be32 rl_flags;
-   __be32 rl_free;
-   __be32 rl_dinodes;
-   __be64 rl_igeneration;
-   __be32 rl_unlinked;
-   __be32 __pad;
-};
-
  struct gfs2_rgrp {
struct gfs2_meta_header rg_header;
  




Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h

2020-01-15 Thread Andreas Gruenbacher
On Wed, Jan 15, 2020 at 9:58 AM Steven Whitehouse  wrote:
> On 15/01/2020 08:49, Andreas Gruenbacher wrote:
> > There's no point in sharing the internal structure of lock value blocks
> > with user space.
>
> The reason that is in ondisk is that changing that structure is
> something that needs to follow the same rules as changing the on disk
> structures. So it is there as a reminder of that,

I can see a point in that. The reason I've posted this is because Bob
was complaining that changes to include/uapi/linux/gfs2_ondisk.h break
his out-of-tree module build process. (One of the patches I'm working
on adds an inode LVB.) The same would be true of on-disk format
changes as well of course, and those definitely need to be shared with
user space. I'm not usually building gfs2 out of tree, so I'm
indifferent to this change.

Thanks,
Andreas




Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h

2020-01-15 Thread Steven Whitehouse

Hi,

On 15/01/2020 09:24, Andreas Gruenbacher wrote:

On Wed, Jan 15, 2020 at 9:58 AM Steven Whitehouse  wrote:

On 15/01/2020 08:49, Andreas Gruenbacher wrote:

There's no point in sharing the internal structure of lock value blocks
with user space.

The reason that is in ondisk is that changing that structure is
something that needs to follow the same rules as changing the on disk
structures. So it is there as a reminder of that,

I can see a point in that. The reason I've posted this is because Bob
was complaining that changes to include/uapi/linux/gfs2_ondisk.h break
his out-of-tree module build process. (One of the patches I'm working
on adds an inode LVB.) The same would be true of on-disk format
changes as well of course, and those definitely need to be shared with
user space. I'm not usually building gfs2 out of tree, so I'm
indifferent to this change.

Thanks,
Andreas

Why would we need to be able to build gfs2 (at least I assume it is 
gfs2) out of tree anyway?


Steve.




[Cluster-devel] [PATCH] gfs2: Avoid access time trashing in gfs2_inode_lookup

2020-01-15 Thread Andreas Gruenbacher
In gfs2_inode_lookup, we initialize inode->i_atime to the lowest
possibly value after gfs2_inode_refresh may already have been called.
This should be the other way around, but we didn't notice because
usually the inode type is known from the directory entry and so
gfs2_inode_lookup won't call gfs2_inode_refresh.

In addition, only initialize ip->i_no_formal_ino from no_formal_ino when
actually needed.

Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/inode.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index dafef10b91f1..2716d56ed0a0 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -136,7 +136,6 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, 
unsigned int type,
 
if (inode->i_state & I_NEW) {
struct gfs2_sbd *sdp = GFS2_SB(inode);
-   ip->i_no_formal_ino = no_formal_ino;
 
error = gfs2_glock_get(sdp, no_addr, _inode_glops, CREATE, 
>i_gl);
if (unlikely(error))
@@ -175,21 +174,22 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, 
unsigned int type,
gfs2_glock_put(io_gl);
io_gl = NULL;
 
+   /* Lowest possible timestamp; will be overwritten in 
gfs2_dinode_in. */
+   inode->i_atime.tv_sec = 1LL << (8 * 
sizeof(inode->i_atime.tv_sec) - 1);
+   inode->i_atime.tv_nsec = 0;
+
if (type == DT_UNKNOWN) {
/* Inode glock must be locked already */
error = gfs2_inode_refresh(GFS2_I(inode));
if (error)
goto fail_refresh;
} else {
+   ip->i_no_formal_ino = no_formal_ino;
inode->i_mode = DT2IF(type);
}
 
gfs2_set_iop(inode);
 
-   /* Lowest possible timestamp; will be overwritten in 
gfs2_dinode_in. */
-   inode->i_atime.tv_sec = 1LL << (8 * 
sizeof(inode->i_atime.tv_sec) - 1);
-   inode->i_atime.tv_nsec = 0;
-
unlock_new_inode(inode);
}
 
-- 
2.20.1



Re: [Cluster-devel] [PATCH] gfs2: Avoid access time trashing in gfs2_inode_lookup

2020-01-15 Thread Andreas Gruenbacher
Oops, sorry for the duplicate post.

Andreas




Re: [Cluster-devel] RFC: hold i_rwsem until aio completes

2020-01-15 Thread Jason Gunthorpe
On Tue, Jan 14, 2020 at 05:12:13PM +0100, Christoph Hellwig wrote:
> Hi all,
> 
> Asynchronous read/write operations currently use a rather magic locking
> scheme, were access to file data is normally protected using a rw_semaphore,
> but if we are doing aio where the syscall returns to userspace before the
> I/O has completed we also use an atomic_t to track the outstanding aio
> ops.  This scheme has lead to lots of subtle bugs in file systems where
> didn't wait to the count to reach zero, and due to its adhoc nature also
> means we have to serialize direct I/O writes that are smaller than the
> file system block size.

I've seen similar locking patterns quite a lot, enough I've thought
about having a dedicated locking primitive to do it. It really wants
to be a rwsem, but as here the rwsem rules don't allow it.

The common pattern I'm looking at looks something like this:

 'try begin read'() // aka down_read_trylock()

  /* The lockdep release hackery you describe,
 the rwsem remains read locked */
 'exit reader'()

 .. delegate unlock to work queue, timer, irq, etc ..

in the new context:

 're_enter reader'() // Get our lockdep tracking back

 'end reader'() // aka up_read()

vs a typical write side:

 'begin write'() // aka down_write()

 /* There is no reason to unlock it before kfree of the rwsem memory.
Somehow the user prevents any new down_read_trylock()'s */
 'abandon writer'() // The object will be kfree'd with a locked writer
 kfree()

The typical goal is to provide an object destruction path that can
serialize and fence all readers wherever they may be before proceeding
to some synchronous destruction.

Usually this gets open coded with some atomic/kref/refcount and a
completion or wait queue. Often implemented wrongly, lacking the write
favoring bias in the rwsem, and lacking any lockdep tracking on the
naked completion.

Not to discourage your patch, but to ask if we can make the solution
more broadly applicable?

Thanks,
Jason




Re: [Cluster-devel] RFC: hold i_rwsem until aio completes

2020-01-15 Thread Peter Zijlstra
On Wed, Jan 15, 2020 at 09:24:28AM -0400, Jason Gunthorpe wrote:

> I was interested because you are talking about allowing the read/write side
> of a rw sem to be held across a return to user space/etc, which is the
> same basic problem.

No it is not; allowing the lock to be held across userspace doesn't
change the owner. This is a crucial difference, PI depends on there
being a distinct owner. That said, allowing the lock to be held across
userspace still breaks PI in that it completely wrecks the ability to
analyze the critical section.




Re: [Cluster-devel] RFC: hold i_rwsem until aio completes

2020-01-15 Thread Jason Gunthorpe
On Wed, Jan 15, 2020 at 03:33:47PM +0100, Peter Zijlstra wrote:
> On Wed, Jan 15, 2020 at 09:24:28AM -0400, Jason Gunthorpe wrote:
> 
> > I was interested because you are talking about allowing the read/write side
> > of a rw sem to be held across a return to user space/etc, which is the
> > same basic problem.
> 
> No it is not; allowing the lock to be held across userspace doesn't
> change the owner. This is a crucial difference, PI depends on there
> being a distinct owner. That said, allowing the lock to be held across
> userspace still breaks PI in that it completely wrecks the ability to
> analyze the critical section.

I'm not sure what you are contrasting?

I was remarking that I see many places open code a rwsem using an
atomic and a completion specifically because they need to do the
things Christoph identified:

> (1) no unlocking by another process than the one that acquired it
> (2) no return to userspace with locks held

As an example flow: obtain the read side lock, schedual a work queue,
return to user space, and unlock the read side from the work queue.

If we can make some general primative that addresses this then maybe
those open coded places can convert as well?

Regards,
Jason




Re: [Cluster-devel] RFC: hold i_rwsem until aio completes

2020-01-15 Thread Jason Gunthorpe
On Wed, Jan 15, 2020 at 07:56:14AM +0100, Christoph Hellwig wrote:
> On Tue, Jan 14, 2020 at 03:27:00PM -0400, Jason Gunthorpe wrote:
> > I've seen similar locking patterns quite a lot, enough I've thought
> > about having a dedicated locking primitive to do it. It really wants
> > to be a rwsem, but as here the rwsem rules don't allow it.
> > 
> > The common pattern I'm looking at looks something like this:
> > 
> >  'try begin read'() // aka down_read_trylock()
> > 
> >   /* The lockdep release hackery you describe,
> >  the rwsem remains read locked */
> >  'exit reader'()
> > 
> >  .. delegate unlock to work queue, timer, irq, etc ..
> > 
> > in the new context:
> > 
> >  're_enter reader'() // Get our lockdep tracking back
> > 
> >  'end reader'() // aka up_read()
> > 
> > vs a typical write side:
> > 
> >  'begin write'() // aka down_write()
> > 
> >  /* There is no reason to unlock it before kfree of the rwsem memory.
> > Somehow the user prevents any new down_read_trylock()'s */
> >  'abandon writer'() // The object will be kfree'd with a locked writer
> >  kfree()
> > 
> > The typical goal is to provide an object destruction path that can
> > serialize and fence all readers wherever they may be before proceeding
> > to some synchronous destruction.
> > 
> > Usually this gets open coded with some atomic/kref/refcount and a
> > completion or wait queue. Often implemented wrongly, lacking the write
> > favoring bias in the rwsem, and lacking any lockdep tracking on the
> > naked completion.
> > 
> > Not to discourage your patch, but to ask if we can make the solution
> > more broadly applicable?
> 
> Your requirement seems a little different, and in fact in many ways
> similar to the percpu_ref primitive.

I was interested because you are talking about allowing the read/write side
of a rw sem to be held across a return to user space/etc, which is the
same basic problem.

precpu refcount looks more like a typical refcount with a release that
is called by whatever context does the final put. The point above is
to basically move the release of a refcount into a synchrnous path by
introducing some barrier to wait for the refcount to go to zero. In
the above the barrier is the down_write() as it is really closer to a
rwsem than a refcount.

Thanks,
Jason




Re: [Cluster-devel] [PATCH] gfs2: Avoid access time trashing in gfs2_inode_lookup

2020-01-15 Thread Bob Peterson
- Original Message -
> In gfs2_inode_lookup, we initialize inode->i_atime to the lowest
> possibly value after gfs2_inode_refresh may already have been called.
> This should be the other way around, but we didn't notice because
> usually the inode type is known from the directory entry and so
> gfs2_inode_lookup won't call gfs2_inode_refresh.
> 
> In addition, only initialize ip->i_no_formal_ino from no_formal_ino when
> actually needed.
> 
> Signed-off-by: Andreas Gruenbacher 
> ---

Hi,

The patch looks good, but can we please change the description from:
"trashing"
to:
"thrashing"?

Reviewed-by: Bob Peterson 
Thanks.

Bob Peterson
Red Hat File Systems



Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h

2020-01-15 Thread Andrew Price

On 15/01/2020 13:19, Bob Peterson wrote:

- Original Message -

Hi,

On 15/01/2020 09:24, Andreas Gruenbacher wrote:

On Wed, Jan 15, 2020 at 9:58 AM Steven Whitehouse 
wrote:

On 15/01/2020 08:49, Andreas Gruenbacher wrote:

There's no point in sharing the internal structure of lock value blocks
with user space.

The reason that is in ondisk is that changing that structure is
something that needs to follow the same rules as changing the on disk
structures. So it is there as a reminder of that,

I can see a point in that. The reason I've posted this is because Bob
was complaining that changes to include/uapi/linux/gfs2_ondisk.h break
his out-of-tree module build process. (One of the patches I'm working
on adds an inode LVB.) The same would be true of on-disk format
changes as well of course, and those definitely need to be shared with
user space. I'm not usually building gfs2 out of tree, so I'm
indifferent to this change.

Thanks,
Andreas


Why would we need to be able to build gfs2 (at least I assume it is
gfs2) out of tree anyway?

Steve.


Simply for productivity. The difference is this procedure, which literally 
takes 10 seconds,
if done simultaneously on all nodes using something like cssh:

make -C /usr/src/kernels/4.18.0-165.el8.x86_64 modules M=$PWD


I'd be concerned about this generating "chimera" modules that produce 
invalid test results.



rmmod gfs2
insmod gfs2.ko

Compared to a procedure like this, which takes at least 30 minutes:

make (a new kernel .src.rpm)
scp or rsync the .src.rpm to a build machine
cd ~/rpmbuild/
rpm --force -i --nodeps /home/bob/*kernel-4.18.0*.src.rpm &> /dev/null
echo $?
rpmbuild --target=x86_64 -ba SPECS/kernel.spec
( -or- submit a "real" kernel build)
then wait for the kernel build
Pull down all necessary kernel rpms
scp  to all the nodes in the cluster
rpm --force -i --nodeps 
/sbin/reboot all the nodes in the cluster
wait for all the nodes to reboot, the cluster to stabilize, etc.


Isn't the next-best alternative just building the modules in-tree and 
copying them to the test machines? I'm not sure I understand the 
complication.


Perhaps we need cluster_install and cluster_modules_install rules in the 
build system :)


Andy



Re: [Cluster-devel] RFC: hold i_rwsem until aio completes

2020-01-15 Thread Christoph Hellwig
On Wed, Jan 15, 2020 at 09:24:28AM -0400, Jason Gunthorpe wrote:
> > Your requirement seems a little different, and in fact in many ways
> > similar to the percpu_ref primitive.
> 
> I was interested because you are talking about allowing the read/write side
> of a rw sem to be held across a return to user space/etc, which is the
> same basic problem.
> 
> precpu refcount looks more like a typical refcount with a release that
> is called by whatever context does the final put. The point above is
> to basically move the release of a refcount into a synchrnous path by
> introducing some barrier to wait for the refcount to go to zero. In
> the above the barrier is the down_write() as it is really closer to a
> rwsem than a refcount.

No, percpu_ref is a little different than the name suggests, as it has
a magic initial reference, and then the other short term reference.  To
actually tear it down now just a normal put of the reference is needed,
but an explicit percpu_ref_kill or percpu_ref_kill_and_confirm.  Various
callers (including all I added) would like that operation to be
synchronous and currently hack that up, so a version of the percpu_ref
that actually waits for the other references to away like we hacked
up various places seems to exactly suit your requirements.




[Cluster-devel] [PATCH 2/2] gfs2: fix O_SYNC write handling

2020-01-15 Thread Christoph Hellwig
Don't ignore the return value from generic_write_sync for the direct to
buffered I/O callback case when written is non-zero.  Also don't bother
to call generic_write_sync for the pure direct I/O case, as iomap_dio_rw
already takes care of that.

Signed-off-by: Christoph Hellwig 
---
 fs/gfs2/file.c | 51 +-
 1 file changed, 25 insertions(+), 26 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 21d032c4b077..86c0e61407b6 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -847,7 +847,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct gfs2_inode *ip = GFS2_I(inode);
-   ssize_t written = 0, ret;
+   ssize_t ret = 0;
 
ret = gfs2_rsqa_alloc(ip);
if (ret)
@@ -882,52 +882,51 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
loff_t pos, endbyte;
ssize_t buffered;
 
-   written = gfs2_file_direct_write(iocb, from);
-   if (written < 0 || !iov_iter_count(from))
+   ret = gfs2_file_direct_write(iocb, from);
+   if (ret < 0 || !iov_iter_count(from))
goto out_unlock;
 
current->backing_dev_info = inode_to_bdi(inode);
-   ret = iomap_file_buffered_write(iocb, from, _iomap_ops);
+   buffered = iomap_file_buffered_write(iocb, from,
+_iomap_ops);
current->backing_dev_info = NULL;
-   if (unlikely(ret < 0))
+   if (unlikely(buffered <= 0)) {
+   if (buffered < 0)
+   ret = buffered;
goto out_unlock;
-   buffered = ret;
+   }
 
/*
 * We need to ensure that the page cache pages are written to
 * disk and invalidated to preserve the expected O_DIRECT
-* semantics.
+* semantics.  If the writeback or invalidate fails only report
+* the direct I/O range as we don't know if the buffered pages
+* made it to disk.
 */
pos = iocb->ki_pos;
endbyte = pos + buffered - 1;
ret = filemap_write_and_wait_range(mapping, pos, endbyte);
-   if (!ret) {
-   iocb->ki_pos += buffered;
-   written += buffered;
-   invalidate_mapping_pages(mapping,
-pos >> PAGE_SHIFT,
-endbyte >> PAGE_SHIFT);
-   } else {
-   /*
-* We don't know how much we wrote, so just return
-* the number of bytes which were direct-written
-*/
-   }
+   if (ret)
+   goto out_unlock;
+
+   invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT,
+endbyte >> PAGE_SHIFT);
+   ret += buffered;
} else {
current->backing_dev_info = inode_to_bdi(inode);
ret = iomap_file_buffered_write(iocb, from, _iomap_ops);
current->backing_dev_info = NULL;
-   if (likely(ret > 0))
-   iocb->ki_pos += ret;
+   if (unlikely(ret <= 0))
+   goto out_unlock;
}
 
+   iocb->ki_pos += ret;
+   inode_unlock(inode);
+   return generic_write_sync(iocb, ret);
+
 out_unlock:
inode_unlock(inode);
-   if (likely(ret > 0)) {
-   /* Handle various SYNC-type writes */
-   ret = generic_write_sync(iocb, ret);
-   }
-   return written ? written : ret;
+   return ret;
 }
 
 static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
-- 
2.24.1




[Cluster-devel] RFC: gfs2 O_SYNC fix

2020-01-15 Thread Christoph Hellwig
Hi gfs2 maintainers,

can you take a look at this completely untested series?  I found some
O_SYNC handling issues during code inspection for the direct I/O
locking revamp.





Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h

2020-01-15 Thread Andrew Price

On 15/01/2020 08:58, Steven Whitehouse wrote:

Hi,

On 15/01/2020 08:49, Andreas Gruenbacher wrote:

There's no point in sharing the internal structure of lock value blocks
with user space.


The reason that is in ondisk is that changing that structure is 
something that needs to follow the same rules as changing the on disk 
structures. So it is there as a reminder of that,


Perhaps some eye-catching code comments would suffice? Defining it in a 
uapi header does sort-of communicate that it's ok to use in userspace, 
whereas just having the structs in close proximity doesn't really 
communicate that they should be updated in sync.


Andy





Signed-off-by: Andreas Gruenbacher 
---
  fs/gfs2/glock.h  |  1 +
  fs/gfs2/incore.h |  1 +
  fs/gfs2/rgrp.c   | 10 ++
  include/uapi/linux/gfs2_ondisk.h | 10 --
  4 files changed, 12 insertions(+), 10 deletions(-)





Steve.


diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index b8adaf80e4c5..d2f2dba05a94 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -306,4 +306,5 @@ static inline void glock_clear_object(struct 
gfs2_glock *gl, void *object)

  spin_unlock(>gl_lockref.lock);
  }
+
  #endif /* __GLOCK_DOT_H__ */
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b5d9c11f4901..5155389e9b5c 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -33,6 +33,7 @@ struct gfs2_trans;
  struct gfs2_jdesc;
  struct gfs2_sbd;
  struct lm_lockops;
+struct gfs2_rgrp_lvb;
  typedef void (*gfs2_glop_bh_t) (struct gfs2_glock *gl, unsigned int 
ret);

diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 2466bb44a23c..1165627274cf 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -46,6 +46,16 @@
  #define LBITSKIP00 (0xUL)
  #endif
+struct gfs2_rgrp_lvb {
+    __be32 rl_magic;
+    __be32 rl_flags;
+    __be32 rl_free;
+    __be32 rl_dinodes;
+    __be64 rl_igeneration;
+    __be32 rl_unlinked;
+    __be32 __pad;
+};
+
  /*
   * These routines are used by the resource group routines (rgrp.c)
   * to keep track of block allocation.  Each block is represented by two
diff --git a/include/uapi/linux/gfs2_ondisk.h 
b/include/uapi/linux/gfs2_ondisk.h

index 2dc10a034de1..4e9a80941bec 100644
--- a/include/uapi/linux/gfs2_ondisk.h
+++ b/include/uapi/linux/gfs2_ondisk.h
@@ -171,16 +171,6 @@ struct gfs2_rindex {
  #define GFS2_RGF_NOALLOC    0x0008
  #define GFS2_RGF_TRIMMED    0x0010
-struct gfs2_rgrp_lvb {
-    __be32 rl_magic;
-    __be32 rl_flags;
-    __be32 rl_free;
-    __be32 rl_dinodes;
-    __be64 rl_igeneration;
-    __be32 rl_unlinked;
-    __be32 __pad;
-};
-
  struct gfs2_rgrp {
  struct gfs2_meta_header rg_header;






[Cluster-devel] [PATCH 1/2] gfs2: move setting current->backing_dev_info

2020-01-15 Thread Christoph Hellwig
Only set current->backing_dev_info just around the buffered write calls
to prepare for the next fix.

Signed-off-by: Christoph Hellwig 
---
 fs/gfs2/file.c | 21 ++---
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 9d58295ccf7a..21d032c4b077 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -867,18 +867,15 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
inode_lock(inode);
ret = generic_write_checks(iocb, from);
if (ret <= 0)
-   goto out;
-
-   /* We can write back this queue in page reclaim */
-   current->backing_dev_info = inode_to_bdi(inode);
+   goto out_unlock;
 
ret = file_remove_privs(file);
if (ret)
-   goto out2;
+   goto out_unlock;
 
ret = file_update_time(file);
if (ret)
-   goto out2;
+   goto out_unlock;
 
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
@@ -887,11 +884,13 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
 
written = gfs2_file_direct_write(iocb, from);
if (written < 0 || !iov_iter_count(from))
-   goto out2;
+   goto out_unlock;
 
+   current->backing_dev_info = inode_to_bdi(inode);
ret = iomap_file_buffered_write(iocb, from, _iomap_ops);
+   current->backing_dev_info = NULL;
if (unlikely(ret < 0))
-   goto out2;
+   goto out_unlock;
buffered = ret;
 
/*
@@ -915,14 +914,14 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
 */
}
} else {
+   current->backing_dev_info = inode_to_bdi(inode);
ret = iomap_file_buffered_write(iocb, from, _iomap_ops);
+   current->backing_dev_info = NULL;
if (likely(ret > 0))
iocb->ki_pos += ret;
}
 
-out2:
-   current->backing_dev_info = NULL;
-out:
+out_unlock:
inode_unlock(inode);
if (likely(ret > 0)) {
/* Handle various SYNC-type writes */
-- 
2.24.1




Re: [Cluster-devel] RFC: hold i_rwsem until aio completes

2020-01-15 Thread Jason Gunthorpe
On Wed, Jan 15, 2020 at 04:36:14PM +0100, Christoph Hellwig wrote:

> synchronous and currently hack that up, so a version of the percpu_ref
> that actually waits for the other references to away like we hacked
> up various places seems to exactly suit your requirements.

Ah, yes, sounds like a similar goal, many cases I'm thinking about
also hack up a kref to trigger a completion to make it
synchronous.

Jason




Re: [Cluster-devel] RFC: hold i_rwsem until aio completes

2020-01-15 Thread Waiman Long
On 1/15/20 9:49 AM, Jason Gunthorpe wrote:
> On Wed, Jan 15, 2020 at 03:33:47PM +0100, Peter Zijlstra wrote:
>> On Wed, Jan 15, 2020 at 09:24:28AM -0400, Jason Gunthorpe wrote:
>>
>>> I was interested because you are talking about allowing the read/write side
>>> of a rw sem to be held across a return to user space/etc, which is the
>>> same basic problem.
>> No it is not; allowing the lock to be held across userspace doesn't
>> change the owner. This is a crucial difference, PI depends on there
>> being a distinct owner. That said, allowing the lock to be held across
>> userspace still breaks PI in that it completely wrecks the ability to
>> analyze the critical section.
> I'm not sure what you are contrasting?
>
> I was remarking that I see many places open code a rwsem using an
> atomic and a completion specifically because they need to do the
> things Christoph identified:
>
>> (1) no unlocking by another process than the one that acquired it
>> (2) no return to userspace with locks held
> As an example flow: obtain the read side lock, schedual a work queue,
> return to user space, and unlock the read side from the work queue.

We currently have down_read_non_owner() and up_read_non_owner() that
perform the lock and unlock without lockdep tracking. Of course, that is
a hack and their use must be carefully scrutinized to make sure that
there is no deadlock or other potentially locking issues.

Cheers,
Longman



Re: [Cluster-devel] [PATCH] Move struct gfs2_rgrp_lvb out of gfs2_ondisk.h

2020-01-15 Thread Bob Peterson
- Original Message -
> Hi,
> 
> On 15/01/2020 09:24, Andreas Gruenbacher wrote:
> > On Wed, Jan 15, 2020 at 9:58 AM Steven Whitehouse 
> > wrote:
> >> On 15/01/2020 08:49, Andreas Gruenbacher wrote:
> >>> There's no point in sharing the internal structure of lock value blocks
> >>> with user space.
> >> The reason that is in ondisk is that changing that structure is
> >> something that needs to follow the same rules as changing the on disk
> >> structures. So it is there as a reminder of that,
> > I can see a point in that. The reason I've posted this is because Bob
> > was complaining that changes to include/uapi/linux/gfs2_ondisk.h break
> > his out-of-tree module build process. (One of the patches I'm working
> > on adds an inode LVB.) The same would be true of on-disk format
> > changes as well of course, and those definitely need to be shared with
> > user space. I'm not usually building gfs2 out of tree, so I'm
> > indifferent to this change.
> >
> > Thanks,
> > Andreas
> >
> Why would we need to be able to build gfs2 (at least I assume it is
> gfs2) out of tree anyway?
> 
> Steve.

Simply for productivity. The difference is this procedure, which literally 
takes 10 seconds,
if done simultaneously on all nodes using something like cssh:

make -C /usr/src/kernels/4.18.0-165.el8.x86_64 modules M=$PWD
rmmod gfs2
insmod gfs2.ko

Compared to a procedure like this, which takes at least 30 minutes:

make (a new kernel .src.rpm)
scp or rsync the .src.rpm to a build machine
cd ~/rpmbuild/
rpm --force -i --nodeps /home/bob/*kernel-4.18.0*.src.rpm &> /dev/null
echo $?
rpmbuild --target=x86_64 -ba SPECS/kernel.spec
( -or- submit a "real" kernel build)
then wait for the kernel build
Pull down all necessary kernel rpms
scp  to all the nodes in the cluster
rpm --force -i --nodeps 
/sbin/reboot all the nodes in the cluster
wait for all the nodes to reboot, the cluster to stabilize, etc.

Regards,

Bob Peterson
Red Hat File Systems