Re: [PATCH 2/5] vfs: create a generic checking function for FS_IOC_FSSETXATTR

2019-06-25 Thread Al Viro
On Tue, Jun 25, 2019 at 07:32:18PM -0700, Darrick J. Wong wrote:
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -373,10 +373,9 @@ static int check_xflags(unsigned int flags)
>  static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
>  {
>   struct btrfs_inode *binode = BTRFS_I(file_inode(file));
> - struct fsxattr fa;
> -
> - memset(, 0, sizeof(fa));
> - fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
> + struct fsxattr fa = {
> + .fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags),
> + };

Umm...  Sure, there's no padding, but still - you are going to copy that thing
to userland...  How about

static inline void simple_fill_fsxattr(struct fsxattr *fa, unsigned xflags)
{
memset(fa, 0, sizeof(*fa));
fa->fsx_xflags = xflags;
}

and let the compiler optimize the crap out?


Re: [PATCH 1/5] vfs: create a generic checking and prep function for FS_IOC_SETFLAGS

2019-06-25 Thread Al Viro
On Tue, Jun 25, 2019 at 07:32:10PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong 
> 
> Create a generic function to check incoming FS_IOC_SETFLAGS flag values
> and later prepare the inode for updates so that we can standardize the
> implementations that follow ext4's flag values.

Change in efivarfs behaviour deserves a note.  I'm not saying it's wrong,
but behaviour is changed there - no-op FS_IOC_SETFLAGS used to fail
without CAP_LINUX_IMMUTABLE...


[PATCH 5/5] vfs: only allow FSSETXATTR to set DAX flag on files and dirs

2019-06-25 Thread Darrick J. Wong
From: Darrick J. Wong 

The DAX flag only applies to files and directories, so don't let it get
set for other types of files.

Signed-off-by: Darrick J. Wong 
---
 fs/inode.c |8 
 1 file changed, 8 insertions(+)


diff --git a/fs/inode.c b/fs/inode.c
index 670d5408d022..f08711b34341 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2259,6 +2259,14 @@ int vfs_ioc_fssetxattr_check(struct inode *inode, const 
struct fsxattr *old_fa,
!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
return -EINVAL;
 
+   /*
+* It is only valid to set the DAX flag on regular files and
+* directories on filesystems.
+*/
+   if ((fa->fsx_xflags & FS_XFLAG_DAX) &&
+   !(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)))
+   return -EINVAL;
+
/* Extent size hints of zero turn off the flags. */
if (fa->fsx_extsize == 0)
fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);



[Cluster-devel] [PATCH 2/5] vfs: create a generic checking function for FS_IOC_FSSETXATTR

2019-06-25 Thread Darrick J. Wong
From: Darrick J. Wong 

Create a generic checking function for the incoming FS_IOC_FSSETXATTR
fsxattr values so that we can standardize some of the implementation
behaviors.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Jan Kara 
---
 fs/btrfs/ioctl.c   |   18 ++---
 fs/ext4/ioctl.c|   27 +---
 fs/f2fs/file.c |   28 ++---
 fs/inode.c |   23 +
 fs/xfs/xfs_ioctl.c |   70 ++--
 include/linux/fs.h |3 ++
 6 files changed, 112 insertions(+), 57 deletions(-)


diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d3d9b4abb09b..0f5af7c5f66b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -373,10 +373,9 @@ static int check_xflags(unsigned int flags)
 static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
 {
struct btrfs_inode *binode = BTRFS_I(file_inode(file));
-   struct fsxattr fa;
-
-   memset(, 0, sizeof(fa));
-   fa.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
+   struct fsxattr fa = {
+   .fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags),
+   };
 
if (copy_to_user(arg, , sizeof(fa)))
return -EFAULT;
@@ -391,6 +390,9 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void 
__user *arg)
struct btrfs_root *root = binode->root;
struct btrfs_trans_handle *trans;
struct fsxattr fa;
+   struct fsxattr old_fa = {
+   .fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags),
+   };
unsigned old_flags;
unsigned old_i_flags;
int ret = 0;
@@ -421,13 +423,9 @@ static int btrfs_ioctl_fssetxattr(struct file *file, void 
__user *arg)
old_flags = binode->flags;
old_i_flags = inode->i_flags;
 
-   /* We need the capabilities to change append-only or immutable inode */
-   if (((old_flags & (BTRFS_INODE_APPEND | BTRFS_INODE_IMMUTABLE)) ||
-(fa.fsx_xflags & (FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE))) &&
-   !capable(CAP_LINUX_IMMUTABLE)) {
-   ret = -EPERM;
+   ret = vfs_ioc_fssetxattr_check(inode, _fa, );
+   if (ret)
goto out_unlock;
-   }
 
if (fa.fsx_xflags & FS_XFLAG_SYNC)
binode->flags |= BTRFS_INODE_SYNC;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 272b6e44191b..ebcc173d1e7d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -721,6 +721,17 @@ static int ext4_ioctl_check_project(struct inode *inode, 
struct fsxattr *fa)
return 0;
 }
 
+static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
+{
+   struct ext4_inode_info *ei = EXT4_I(inode);
+
+   fa->fsx_xflags = ext4_iflags_to_xflags(ei->i_flags &
+  EXT4_FL_USER_VISIBLE);
+
+   if (ext4_has_feature_project(inode->i_sb))
+   fa->fsx_projid = from_kprojid(_user_ns, ei->i_projid);
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
struct inode *inode = file_inode(filp);
@@ -1087,15 +1098,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, 
unsigned long arg)
 
case EXT4_IOC_FSGETXATTR:
{
-   struct fsxattr fa;
-
-   memset(, 0, sizeof(struct fsxattr));
-   fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & 
EXT4_FL_USER_VISIBLE);
+   struct fsxattr fa = { 0 };
 
-   if (ext4_has_feature_project(inode->i_sb)) {
-   fa.fsx_projid = (__u32)from_kprojid(_user_ns,
-   EXT4_I(inode)->i_projid);
-   }
+   ext4_fill_fsxattr(inode, );
 
if (copy_to_user((struct fsxattr __user *)arg,
 , sizeof(fa)))
@@ -1104,7 +1109,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, 
unsigned long arg)
}
case EXT4_IOC_FSSETXATTR:
{
-   struct fsxattr fa;
+   struct fsxattr fa, old_fa = { 0 };
int err;
 
if (copy_from_user(, (struct fsxattr __user *)arg,
@@ -1127,7 +1132,11 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, 
unsigned long arg)
return err;
 
inode_lock(inode);
+   ext4_fill_fsxattr(inode, _fa);
err = ext4_ioctl_check_project(inode, );
+   if (err)
+   goto out;
+   err = vfs_ioc_fssetxattr_check(inode, _fa, );
if (err)
goto out;
flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 845ae6f43ebc..555b970f7945 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2773,19 +2773,24 @@ static inline unsigned long f2fs_xflags_to_iflags(__u32 
xflags)
return iflags;
 }
 
-static int f2fs_ioc_fsgetxattr(struct file *filp, unsigned long arg)

[PATCH 4/5] vfs: teach vfs_ioc_fssetxattr_check to check extent size hints

2019-06-25 Thread Darrick J. Wong
From: Darrick J. Wong 

Move the extent size hint checks that aren't xfs-specific to the vfs.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Jan Kara 
---
 fs/inode.c |   18 +
 fs/xfs/xfs_ioctl.c |   70 ++--
 2 files changed, 47 insertions(+), 41 deletions(-)


diff --git a/fs/inode.c b/fs/inode.c
index c4f8fb16f633..670d5408d022 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2247,6 +2247,24 @@ int vfs_ioc_fssetxattr_check(struct inode *inode, const 
struct fsxattr *old_fa,
return -EINVAL;
}
 
+   /* Check extent size hints. */
+   if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(inode->i_mode))
+   return -EINVAL;
+
+   if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
+   !S_ISDIR(inode->i_mode))
+   return -EINVAL;
+
+   if ((fa->fsx_xflags & FS_XFLAG_COWEXTSIZE) &&
+   !S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+   return -EINVAL;
+
+   /* Extent size hints of zero turn off the flags. */
+   if (fa->fsx_extsize == 0)
+   fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
+   if (fa->fsx_cowextsize == 0)
+   fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+
return 0;
 }
 EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d8c02b9905a7..011657bd50ca 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1201,39 +1201,31 @@ xfs_ioctl_setattr_check_extsize(
struct fsxattr  *fa)
 {
struct xfs_mount*mp = ip->i_mount;
-
-   if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(VFS_I(ip)->i_mode))
-   return -EINVAL;
-
-   if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
-   !S_ISDIR(VFS_I(ip)->i_mode))
-   return -EINVAL;
+   xfs_extlen_tsize;
+   xfs_fsblock_t   extsize_fsb;
 
if (S_ISREG(VFS_I(ip)->i_mode) && ip->i_d.di_nextents &&
((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
return -EINVAL;
 
-   if (fa->fsx_extsize != 0) {
-   xfs_extlen_tsize;
-   xfs_fsblock_t   extsize_fsb;
-
-   extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
-   if (extsize_fsb > MAXEXTLEN)
-   return -EINVAL;
+   if (fa->fsx_extsize == 0)
+   return 0;
 
-   if (XFS_IS_REALTIME_INODE(ip) ||
-   (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
-   size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
-   } else {
-   size = mp->m_sb.sb_blocksize;
-   if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
-   return -EINVAL;
-   }
+   extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+   if (extsize_fsb > MAXEXTLEN)
+   return -EINVAL;
 
-   if (fa->fsx_extsize % size)
+   if (XFS_IS_REALTIME_INODE(ip) ||
+   (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
+   size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+   } else {
+   size = mp->m_sb.sb_blocksize;
+   if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
return -EINVAL;
-   } else
-   fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
+   }
+
+   if (fa->fsx_extsize % size)
+   return -EINVAL;
 
return 0;
 }
@@ -1259,6 +1251,8 @@ xfs_ioctl_setattr_check_cowextsize(
struct fsxattr  *fa)
 {
struct xfs_mount*mp = ip->i_mount;
+   xfs_extlen_tsize;
+   xfs_fsblock_t   cowextsize_fsb;
 
if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
return 0;
@@ -1267,25 +1261,19 @@ xfs_ioctl_setattr_check_cowextsize(
ip->i_d.di_version != 3)
return -EINVAL;
 
-   if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode))
-   return -EINVAL;
-
-   if (fa->fsx_cowextsize != 0) {
-   xfs_extlen_tsize;
-   xfs_fsblock_t   cowextsize_fsb;
+   if (fa->fsx_cowextsize == 0)
+   return 0;
 
-   cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
-   if (cowextsize_fsb > MAXEXTLEN)
-   return -EINVAL;
+   cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
+   if (cowextsize_fsb > MAXEXTLEN)
+   return -EINVAL;
 
-   size = mp->m_sb.sb_blocksize;
-   if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
-   return -EINVAL;
+   size = mp->m_sb.sb_blocksize;
+   if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
+   return -EINVAL;
 
-   if (fa->fsx_cowextsize % size)
-   return -EINVAL;
-

[PATCH 1/5] vfs: create a generic checking and prep function for FS_IOC_SETFLAGS

2019-06-25 Thread Darrick J. Wong
From: Darrick J. Wong 

Create a generic function to check incoming FS_IOC_SETFLAGS flag values
and later prepare the inode for updates so that we can standardize the
implementations that follow ext4's flag values.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Jan Kara 
Reviewed-by: Christoph Hellwig 
Acked-by: David Sterba 
---
 fs/btrfs/ioctl.c|   13 +
 fs/efivarfs/file.c  |   26 +-
 fs/ext2/ioctl.c |   16 
 fs/ext4/ioctl.c |   13 +++--
 fs/f2fs/file.c  |7 ---
 fs/gfs2/file.c  |   42 +-
 fs/hfsplus/ioctl.c  |   21 -
 fs/inode.c  |   24 
 fs/jfs/ioctl.c  |   22 +++---
 fs/nilfs2/ioctl.c   |9 ++---
 fs/ocfs2/ioctl.c|   13 +++--
 fs/orangefs/file.c  |   35 ++-
 fs/reiserfs/ioctl.c |   10 --
 fs/ubifs/ioctl.c|   13 +++--
 include/linux/fs.h  |3 +++
 15 files changed, 146 insertions(+), 121 deletions(-)


diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 6dafa857bbb9..d3d9b4abb09b 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -187,7 +187,7 @@ static int btrfs_ioctl_setflags(struct file *file, void 
__user *arg)
struct btrfs_inode *binode = BTRFS_I(inode);
struct btrfs_root *root = binode->root;
struct btrfs_trans_handle *trans;
-   unsigned int fsflags;
+   unsigned int fsflags, old_fsflags;
int ret;
const char *comp = NULL;
u32 binode_flags = binode->flags;
@@ -212,13 +212,10 @@ static int btrfs_ioctl_setflags(struct file *file, void 
__user *arg)
inode_lock(inode);
 
fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
-   if ((fsflags ^ btrfs_inode_flags_to_fsflags(binode->flags)) &
-   (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
-   if (!capable(CAP_LINUX_IMMUTABLE)) {
-   ret = -EPERM;
-   goto out_unlock;
-   }
-   }
+   old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+   ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
+   if (ret)
+   goto out_unlock;
 
if (fsflags & FS_SYNC_FL)
binode_flags |= BTRFS_INODE_SYNC;
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 8e568428c88b..a3cc10b1bfe1 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -110,16 +110,22 @@ static ssize_t efivarfs_file_read(struct file *file, char 
__user *userbuf,
return size;
 }
 
-static int
-efivarfs_ioc_getxflags(struct file *file, void __user *arg)
+static inline unsigned int efivarfs_getflags(struct inode *inode)
 {
-   struct inode *inode = file->f_mapping->host;
unsigned int i_flags;
unsigned int flags = 0;
 
i_flags = inode->i_flags;
if (i_flags & S_IMMUTABLE)
flags |= FS_IMMUTABLE_FL;
+   return flags;
+}
+
+static int
+efivarfs_ioc_getxflags(struct file *file, void __user *arg)
+{
+   struct inode *inode = file->f_mapping->host;
+   unsigned int flags = efivarfs_getflags(inode);
 
if (copy_to_user(arg, , sizeof(flags)))
return -EFAULT;
@@ -132,6 +138,7 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg)
struct inode *inode = file->f_mapping->host;
unsigned int flags;
unsigned int i_flags = 0;
+   unsigned int oldflags = efivarfs_getflags(inode);
int error;
 
if (!inode_owner_or_capable(inode))
@@ -143,9 +150,6 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg)
if (flags & ~FS_IMMUTABLE_FL)
return -EOPNOTSUPP;
 
-   if (!capable(CAP_LINUX_IMMUTABLE))
-   return -EPERM;
-
if (flags & FS_IMMUTABLE_FL)
i_flags |= S_IMMUTABLE;
 
@@ -155,12 +159,16 @@ efivarfs_ioc_setxflags(struct file *file, void __user 
*arg)
return error;
 
inode_lock(inode);
+
+   error = vfs_ioc_setflags_prepare(inode, oldflags, flags);
+   if (error)
+   goto out;
+
inode_set_flags(inode, i_flags, S_IMMUTABLE);
+out:
inode_unlock(inode);
-
mnt_drop_write_file(file);
-
-   return 0;
+   return error;
 }
 
 static long
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 0367c0039e68..1b853fb0b163 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -60,18 +60,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, 
unsigned long arg)
}
oldflags = ei->i_flags;
 
-   /*
-* The IMMUTABLE and APPEND_ONLY flags can only be changed by
-* the relevant capability.
-*
-* This test looks nicer. Thanks to Pauline Middelink
-*/
-   if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
-   

[PATCH v3 0/5] vfs: clean up SETFLAGS and FSSETXATTR option processing

2019-06-25 Thread Darrick J. Wong
Hi all,

The FS_IOC_SETFLAGS and FS_IOC_FSSETXATTR ioctls were promoted from ext4
and XFS, respectively, into the VFS.  However, we didn't promote any of
the parameter checking code from those filesystems, which lead to a mess
where each filesystem open-codes whatever parameter checks they want and
the behavior across filesystems is no longer consistent.

Therefore, create some generic checking functions in the VFS and remove
all the open-coded pieces in each filesystem.  This preserves the
current behavior where a filesystem can choose to ignore fields it
doesn't understand.

If you're going to start using this mess, you probably ought to just
pull from my git trees, which are linked below.

This has been lightly tested with fstests.  Enjoy!
Comments and questions are, as always, welcome.

--D

kernel git tree:
https://git.kernel.org/cgit/linux/kernel/git/djwong/xfs-linux.git/log/?h=file-ioctl-cleanups


[PATCH 3/5] vfs: teach vfs_ioc_fssetxattr_check to check project id info

2019-06-25 Thread Darrick J. Wong
From: Darrick J. Wong 

Standardize the project id checks for FSSETXATTR.

Signed-off-by: Darrick J. Wong 
Reviewed-by: Jan Kara 
---
 fs/ext4/ioctl.c|   27 ---
 fs/f2fs/file.c |   27 ---
 fs/inode.c |   13 +
 fs/xfs/xfs_ioctl.c |   15 ---
 4 files changed, 13 insertions(+), 69 deletions(-)


diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index ebcc173d1e7d..1e88c3af9a8d 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -697,30 +697,6 @@ static long ext4_ioctl_group_add(struct file *file,
return err;
 }
 
-static int ext4_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
-{
-   /*
-* Project Quota ID state is only allowed to change from within the init
-* namespace. Enforce that restriction only if we are trying to change
-* the quota ID state. Everything else is allowed in user namespaces.
-*/
-   if (current_user_ns() == _user_ns)
-   return 0;
-
-   if (__kprojid_val(EXT4_I(inode)->i_projid) != fa->fsx_projid)
-   return -EINVAL;
-
-   if (ext4_test_inode_flag(inode, EXT4_INODE_PROJINHERIT)) {
-   if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
-   return -EINVAL;
-   } else {
-   if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
-   return -EINVAL;
-   }
-
-   return 0;
-}
-
 static void ext4_fill_fsxattr(struct inode *inode, struct fsxattr *fa)
 {
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -1133,9 +1109,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, 
unsigned long arg)
 
inode_lock(inode);
ext4_fill_fsxattr(inode, _fa);
-   err = ext4_ioctl_check_project(inode, );
-   if (err)
-   goto out;
err = vfs_ioc_fssetxattr_check(inode, _fa, );
if (err)
goto out;
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 555b970f7945..d6ed319388d6 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2797,30 +2797,6 @@ static int f2fs_ioc_fsgetxattr(struct file *filp, 
unsigned long arg)
return 0;
 }
 
-static int f2fs_ioctl_check_project(struct inode *inode, struct fsxattr *fa)
-{
-   /*
-* Project Quota ID state is only allowed to change from within the init
-* namespace. Enforce that restriction only if we are trying to change
-* the quota ID state. Everything else is allowed in user namespaces.
-*/
-   if (current_user_ns() == _user_ns)
-   return 0;
-
-   if (__kprojid_val(F2FS_I(inode)->i_projid) != fa->fsx_projid)
-   return -EINVAL;
-
-   if (F2FS_I(inode)->i_flags & F2FS_PROJINHERIT_FL) {
-   if (!(fa->fsx_xflags & FS_XFLAG_PROJINHERIT))
-   return -EINVAL;
-   } else {
-   if (fa->fsx_xflags & FS_XFLAG_PROJINHERIT)
-   return -EINVAL;
-   }
-
-   return 0;
-}
-
 static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg)
 {
struct inode *inode = file_inode(filp);
@@ -2848,9 +2824,6 @@ static int f2fs_ioc_fssetxattr(struct file *filp, 
unsigned long arg)
return err;
 
inode_lock(inode);
-   err = f2fs_ioctl_check_project(inode, );
-   if (err)
-   goto out;
 
f2fs_fill_fsxattr(inode, _fa);
err = vfs_ioc_fssetxattr_check(inode, _fa, );
diff --git a/fs/inode.c b/fs/inode.c
index fdd6c5d3e48d..c4f8fb16f633 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -2234,6 +2234,19 @@ int vfs_ioc_fssetxattr_check(struct inode *inode, const 
struct fsxattr *old_fa,
!capable(CAP_LINUX_IMMUTABLE))
return -EPERM;
 
+   /*
+* Project Quota ID state is only allowed to change from within the init
+* namespace. Enforce that restriction only if we are trying to change
+* the quota ID state. Everything else is allowed in user namespaces.
+*/
+   if (current_user_ns() != _user_ns) {
+   if (old_fa->fsx_projid != fa->fsx_projid)
+   return -EINVAL;
+   if ((old_fa->fsx_xflags ^ fa->fsx_xflags) &
+   FS_XFLAG_PROJINHERIT)
+   return -EINVAL;
+   }
+
return 0;
 }
 EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 08ec1e458865..d8c02b9905a7 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1299,21 +1299,6 @@ xfs_ioctl_setattr_check_projid(
if (fa->fsx_projid > (uint16_t)-1 &&
!xfs_sb_version_hasprojid32bit(>i_mount->m_sb))
return -EINVAL;
-
-   /*
-* Project Quota ID state is only allowed to change from within the init
-* namespace. Enforce that restriction only if we are trying to change
-* the quota ID state. Everything 

Re: [Cluster-devel] [PATCH] fs: Move mark_inode_dirty out of __generic_write_end

2019-06-25 Thread Andreas Gruenbacher
On Tue, 25 Jun 2019 at 12:50, Christoph Hellwig  wrote:
>
> > That seems way more complicated.  I'd much rather go with something
> > like may patch plus maybe a big fat comment explaining that persisting
> > the size update is the file systems job.  Note that a lot of the modern
> > file systems don't use the VFS inode tracking for that, besides XFS
> > that includes at least btrfs and ocfs2 as well.
>
> I'd suggest something like this as the baseline:
>
> http://git.infradead.org/users/hch/xfs.git/shortlog/refs/heads/iomap-i_size

Alright, can we change this as follows?

[Also, I'm not really sure why we check for (pos + ret > inode->i_size)
when we have already read inode->i_size into old_size.]

Thanks,
Andreas

--

iomap: don't mark the inode dirty in iomap_write_end

Marking the inode dirty for each page copied into the page cache
can be very ineffcient for file systems that use the VFS dirty
inode tracking, and is completely pointless for those that don't
use the VFS dirty tracking.

Signed-off-by: Christoph Hellwig 
Signed-off-by: Andreas Gruenbacher 
---
 fs/gfs2/bmap.c|  2 ++
 fs/iomap.c| 15 ++-
 include/linux/iomap.h |  1 +
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 93ea1d5..f4b895f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1182,6 +1182,8 @@ static int gfs2_iomap_end(struct inode *inode, loff_t 
pos, loff_t length,
 
if (ip->i_qadata && ip->i_qadata->qa_qd_num)
gfs2_quota_unlock(ip);
+   if (iomap->flags & IOMAP_F_SIZE_CHANGED)
+   mark_inode_dirty(inode);
gfs2_write_unlock(inode);
 
 out:
diff --git a/fs/iomap.c b/fs/iomap.c
index 12654c2..cd24bd1 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -777,6 +777,7 @@ struct iomap_readpage_ctx {
unsigned copied, struct page *page, struct iomap *iomap)
 {
const struct iomap_page_ops *page_ops = iomap->page_ops;
+   loff_t old_size = inode->i_size;
int ret;
 
if (iomap->type == IOMAP_INLINE) {
@@ -788,7 +789,19 @@ struct iomap_readpage_ctx {
ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
}
 
-   __generic_write_end(inode, pos, ret, page);
+   /*
+* Update the in-memory inode size after copying the data into the page
+* cache.  It's up to the file system to write the updated size to disk,
+* preferably after I/O completion so that no stale data is exposed.
+*/
+   if (pos + ret > inode->i_size) {
+   i_size_write(inode, pos + ret);
+   iomap->flags |= IOMAP_F_SIZE_CHANGED;
+   }
+   unlock_page(page);
+
+   if (old_size < pos)
+   pagecache_isize_extended(inode, old_size, pos);
if (page_ops && page_ops->page_done)
page_ops->page_done(inode, pos, copied, page, iomap);
put_page(page);
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index 2103b94..1df9ea1 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -35,6 +35,7 @@
 #define IOMAP_F_NEW0x01/* blocks have been newly allocated */
 #define IOMAP_F_DIRTY  0x02/* uncommitted metadata */
 #define IOMAP_F_BUFFER_HEAD0x04/* file system requires buffer heads */
+#define IOMAP_F_SIZE_CHANGED   0x08/* file size has changed */
 
 /*
  * Flags that only need to be reported for IOMAP_REPORT requests:
-- 
1.8.3.1



Re: [PATCH 1/4] vfs: create a generic checking function for FS_IOC_SETFLAGS

2019-06-25 Thread Darrick J. Wong
On Tue, Jun 25, 2019 at 07:12:54PM +0200, David Sterba wrote:
> On Fri, Jun 21, 2019 at 04:56:21PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong 
> > 
> > Create a generic checking function for the incoming FS_IOC_SETFLAGS flag
> > values so that we can standardize the implementations that follow ext4's
> > flag values.
> 
> I checked a few samples what's the type of the flags, there are unsigned
> types while the proposed VFS functions take signed type.
> 
> > +int vfs_ioc_setflags_check(struct inode *inode, int oldflags, int flags);
> 
> Specifically ext4 uses unsigned type and his was the original API that
> got copied so I'd think that it should unsigned everywhere.

Yeah, I'll change it.

> >  fs/btrfs/ioctl.c|   13 +
> 
> For the btrfs bits
> 
> Acked-by: David Sterba 
> 
> and besides the signedness, the rest of the changes look good to me.

Thanks for the look around!  I'll have a new revision with all changes
out by the end of the day. :)

--D


Re: [PATCH 2/4] vfs: create a generic checking function for FS_IOC_FSSETXATTR

2019-06-25 Thread David Sterba
On Tue, Jun 25, 2019 at 10:16:16AM -0700, Darrick J. Wong wrote:
> On Tue, Jun 25, 2019 at 07:02:48PM +0200, David Sterba wrote:
> > On Tue, Jun 25, 2019 at 03:57:25AM -0700, Christoph Hellwig wrote:
> > > On Fri, Jun 21, 2019 at 04:56:29PM -0700, Darrick J. Wong wrote:
> > > > From: Darrick J. Wong 
> > > > 
> > > > Create a generic checking function for the incoming FS_IOC_FSSETXATTR
> > > > fsxattr values so that we can standardize some of the implementation
> > > > behaviors.
> > > > 
> > > > Signed-off-by: Darrick J. Wong 
> > > > Reviewed-by: Jan Kara 
> > > > ---
> > > >  fs/btrfs/ioctl.c   |   21 +---
> > > >  fs/ext4/ioctl.c|   27 ++--
> > > >  fs/f2fs/file.c |   26 ++-
> > > >  fs/inode.c |   17 +
> > > >  fs/xfs/xfs_ioctl.c |   70 
> > > > ++--
> > > >  include/linux/fs.h |3 ++
> > > >  6 files changed, 111 insertions(+), 53 deletions(-)
> > > > 
> > > > 
> > > > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> > > > index f408aa93b0cf..7ddda5b4b6a6 100644
> > > > --- a/fs/btrfs/ioctl.c
> > > > +++ b/fs/btrfs/ioctl.c
> > > > @@ -366,6 +366,13 @@ static int check_xflags(unsigned int flags)
> > > > return 0;
> > > >  }
> > > >  
> > > > +static void __btrfs_ioctl_fsgetxattr(struct btrfs_inode *binode,
> > > > +struct fsxattr *fa)
> > > > +{
> > > > +   memset(fa, 0, sizeof(*fa));
> > > > +   fa->fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
> > > 
> > > Is there really much of a point in this helper? Epeciall as
> > > the zeroing could easily be done in the variable declaration
> > > line using
> > > 
> > >   struct fsxattr fa = { };
> > 
> > Agreed, not counting the initialization the wrapper is merely another
> > name for btrfs_inode_flags_to_xflags. I also find it slightly confusing
> > that __btrfs_ioctl_fsgetxattr name is too close to the ioctl callback
> > implementation btrfs_ioctl_fsgetxattr but only does some initialization.
> 
> Ok; it's easily enough changed to:
> 
>   struct fsxattr old_fa = {
>   .fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags),
>   };

Works for me, thanks.


Re: [PATCH 2/4] vfs: create a generic checking function for FS_IOC_FSSETXATTR

2019-06-25 Thread Darrick J. Wong
On Tue, Jun 25, 2019 at 07:02:48PM +0200, David Sterba wrote:
> On Tue, Jun 25, 2019 at 03:57:25AM -0700, Christoph Hellwig wrote:
> > On Fri, Jun 21, 2019 at 04:56:29PM -0700, Darrick J. Wong wrote:
> > > From: Darrick J. Wong 
> > > 
> > > Create a generic checking function for the incoming FS_IOC_FSSETXATTR
> > > fsxattr values so that we can standardize some of the implementation
> > > behaviors.
> > > 
> > > Signed-off-by: Darrick J. Wong 
> > > Reviewed-by: Jan Kara 
> > > ---
> > >  fs/btrfs/ioctl.c   |   21 +---
> > >  fs/ext4/ioctl.c|   27 ++--
> > >  fs/f2fs/file.c |   26 ++-
> > >  fs/inode.c |   17 +
> > >  fs/xfs/xfs_ioctl.c |   70 
> > > ++--
> > >  include/linux/fs.h |3 ++
> > >  6 files changed, 111 insertions(+), 53 deletions(-)
> > > 
> > > 
> > > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> > > index f408aa93b0cf..7ddda5b4b6a6 100644
> > > --- a/fs/btrfs/ioctl.c
> > > +++ b/fs/btrfs/ioctl.c
> > > @@ -366,6 +366,13 @@ static int check_xflags(unsigned int flags)
> > >   return 0;
> > >  }
> > >  
> > > +static void __btrfs_ioctl_fsgetxattr(struct btrfs_inode *binode,
> > > +  struct fsxattr *fa)
> > > +{
> > > + memset(fa, 0, sizeof(*fa));
> > > + fa->fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
> > 
> > Is there really much of a point in this helper? Epeciall as
> > the zeroing could easily be done in the variable declaration
> > line using
> > 
> > struct fsxattr fa = { };
> 
> Agreed, not counting the initialization the wrapper is merely another
> name for btrfs_inode_flags_to_xflags. I also find it slightly confusing
> that __btrfs_ioctl_fsgetxattr name is too close to the ioctl callback
> implementation btrfs_ioctl_fsgetxattr but only does some initialization.

Ok; it's easily enough changed to:

struct fsxattr old_fa = {
.fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags),
};

--D


Re: [PATCH 1/4] vfs: create a generic checking function for FS_IOC_SETFLAGS

2019-06-25 Thread David Sterba
On Fri, Jun 21, 2019 at 04:56:21PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong 
> 
> Create a generic checking function for the incoming FS_IOC_SETFLAGS flag
> values so that we can standardize the implementations that follow ext4's
> flag values.

I checked a few samples what's the type of the flags, there are unsigned
types while the proposed VFS functions take signed type.

> +int vfs_ioc_setflags_check(struct inode *inode, int oldflags, int flags);

Specifically ext4 uses unsigned type and his was the original API that
got copied so I'd think that it should unsigned everywhere.

>  fs/btrfs/ioctl.c|   13 +

For the btrfs bits

Acked-by: David Sterba 

and besides the signedness, the rest of the changes look good to me.


Re: [PATCH 2/4] vfs: create a generic checking function for FS_IOC_FSSETXATTR

2019-06-25 Thread David Sterba
On Tue, Jun 25, 2019 at 03:57:25AM -0700, Christoph Hellwig wrote:
> On Fri, Jun 21, 2019 at 04:56:29PM -0700, Darrick J. Wong wrote:
> > From: Darrick J. Wong 
> > 
> > Create a generic checking function for the incoming FS_IOC_FSSETXATTR
> > fsxattr values so that we can standardize some of the implementation
> > behaviors.
> > 
> > Signed-off-by: Darrick J. Wong 
> > Reviewed-by: Jan Kara 
> > ---
> >  fs/btrfs/ioctl.c   |   21 +---
> >  fs/ext4/ioctl.c|   27 ++--
> >  fs/f2fs/file.c |   26 ++-
> >  fs/inode.c |   17 +
> >  fs/xfs/xfs_ioctl.c |   70 
> > ++--
> >  include/linux/fs.h |3 ++
> >  6 files changed, 111 insertions(+), 53 deletions(-)
> > 
> > 
> > diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> > index f408aa93b0cf..7ddda5b4b6a6 100644
> > --- a/fs/btrfs/ioctl.c
> > +++ b/fs/btrfs/ioctl.c
> > @@ -366,6 +366,13 @@ static int check_xflags(unsigned int flags)
> > return 0;
> >  }
> >  
> > +static void __btrfs_ioctl_fsgetxattr(struct btrfs_inode *binode,
> > +struct fsxattr *fa)
> > +{
> > +   memset(fa, 0, sizeof(*fa));
> > +   fa->fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);
> 
> Is there really much of a point in this helper? Epeciall as
> the zeroing could easily be done in the variable declaration
> line using
> 
>   struct fsxattr fa = { };

Agreed, not counting the initialization the wrapper is merely another
name for btrfs_inode_flags_to_xflags. I also find it slightly confusing
that __btrfs_ioctl_fsgetxattr name is too close to the ioctl callback
implementation btrfs_ioctl_fsgetxattr but only does some initialization.


Re: [Cluster-devel] [PATCH] fs: Move mark_inode_dirty out of __generic_write_end

2019-06-25 Thread Andreas Gruenbacher
On Tue, 25 Jun 2019 at 11:57, Christoph Hellwig  wrote:
> On Mon, Jun 24, 2019 at 08:22:43PM +0200, Andreas Gruenbacher wrote:
> > That would work, but I don't like how this leaves us with a vfs function
> > that updates i_size without bothering to dirty the inode very much.
>
> This isn't a VFS function, it is a helper library.

Well, let's call it that if this suits you better.

> > How about if we move the __generic_write_end call into the page_done
> > callback and leave special handling to the filesystem code if needed
> > instead?  The below patch seems to work for gfs2.
>
> That seems way more complicated.  I'd much rather go with something
> like may patch plus maybe a big fat comment explaining that persisting
> the size update is the file systems job.  Note that a lot of the modern
> file systems don't use the VFS inode tracking for that, besides XFS
> that includes at least btrfs and ocfs2 as well.

Andreas



Re: [Cluster-devel] [RFC][PATCH] wake_up_var() memory ordering

2019-06-25 Thread Peter Zijlstra
On Tue, Jun 25, 2019 at 02:12:22PM +0200, Andreas Gruenbacher wrote:

> > Only if we do as David suggested and make clean_and_wake_up_bit()
> > provide the RELEASE barrier.
> 
> (It's clear_and_wake_up_bit, not clean_and_wake_up_bit.)

Yes, typing hard.

> > That is, currently clear_and_wake_up_bit() is
> >
> > clear_bit()
> > smp_mb__after_atomic();
> > wake_up_bit();
> >

> Now I'm confused because clear_and_wake_up_bit() in mainline does use
> clear_bit_unlock(), so it's the exact opposite of what you just said.

Argh; clearly I couldn't read. And then yes, you're right.



Re: [Cluster-devel] [RFC][PATCH] wake_up_var() memory ordering

2019-06-25 Thread Andreas Gruenbacher
On Tue, 25 Jun 2019 at 12:36, Peter Zijlstra  wrote:
> On Tue, Jun 25, 2019 at 11:19:35AM +0200, Andreas Gruenbacher wrote:
> > > diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
> > > index cf4c767005b1..29ea5da7 100644
> > > --- a/fs/gfs2/glops.c
> > > +++ b/fs/gfs2/glops.c
> > > @@ -227,6 +227,7 @@ static void gfs2_clear_glop_pending(struct gfs2_inode 
> > > *ip)
> > > return;
> > >
> > > clear_bit_unlock(GIF_GLOP_PENDING, >i_flags);
> > > +   smp_mb__after_atomic();
> > > wake_up_bit(>i_flags, GIF_GLOP_PENDING);
> >
> > This should become clear_and_wake_up_bit as well, right? There are
> > several more instances of the same pattern.
>
> Only if we do as David suggested and make clean_and_wake_up_bit()
> provide the RELEASE barrier.

(It's clear_and_wake_up_bit, not clean_and_wake_up_bit.)

> That is, currently clear_and_wake_up_bit() is
>
> clear_bit()
> smp_mb__after_atomic();
> wake_up_bit();
>
> But the above is:
>
> clear_bit_unlock();
> smp_mb__after_atomic();
> wake_up_bit()
>
> the difference is that _unlock() uses RELEASE semantics, where
> clear_bit() does not.
>
> The difference is illustrated with something like:
>
> cond = true;
> clear_bit()
> smp_mb__after_atomic();
> wake_up_bit();
>
> In this case, a remote CPU can first observe the clear_bit() and then
> the 'cond = true' store. When we use clear_bit_unlock() this is not
> possible, because the RELEASE barrier ensures that everything before,
> stays before.

Now I'm confused because clear_and_wake_up_bit() in mainline does use
clear_bit_unlock(), so it's the exact opposite of what you just said.

Thanks,
Andreas



Re: [PATCH 4/4] vfs: teach vfs_ioc_fssetxattr_check to check extent size hints

2019-06-25 Thread Christoph Hellwig
On Fri, Jun 21, 2019 at 04:56:45PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong 
> 
> Move the extent size hint checks that aren't xfs-specific to the vfs.
> 
> Signed-off-by: Darrick J. Wong 
> Reviewed-by: Jan Kara 

Looks good,

Reviewed-by: Christoph Hellwig 


Re: [PATCH 3/4] fs: teach vfs_ioc_fssetxattr_check to check project id info

2019-06-25 Thread Christoph Hellwig
On Fri, Jun 21, 2019 at 04:56:37PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong 
> 
> Standardize the project id checks for FSSETXATTR.
> 
> Signed-off-by: Darrick J. Wong 
> Reviewed-by: Jan Kara 

Looks good,

Reviewed-by: Christoph Hellwig 


Re: [PATCH 2/4] vfs: create a generic checking function for FS_IOC_FSSETXATTR

2019-06-25 Thread Christoph Hellwig
On Fri, Jun 21, 2019 at 04:56:29PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong 
> 
> Create a generic checking function for the incoming FS_IOC_FSSETXATTR
> fsxattr values so that we can standardize some of the implementation
> behaviors.
> 
> Signed-off-by: Darrick J. Wong 
> Reviewed-by: Jan Kara 
> ---
>  fs/btrfs/ioctl.c   |   21 +---
>  fs/ext4/ioctl.c|   27 ++--
>  fs/f2fs/file.c |   26 ++-
>  fs/inode.c |   17 +
>  fs/xfs/xfs_ioctl.c |   70 
> ++--
>  include/linux/fs.h |3 ++
>  6 files changed, 111 insertions(+), 53 deletions(-)
> 
> 
> diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
> index f408aa93b0cf..7ddda5b4b6a6 100644
> --- a/fs/btrfs/ioctl.c
> +++ b/fs/btrfs/ioctl.c
> @@ -366,6 +366,13 @@ static int check_xflags(unsigned int flags)
>   return 0;
>  }
>  
> +static void __btrfs_ioctl_fsgetxattr(struct btrfs_inode *binode,
> +  struct fsxattr *fa)
> +{
> + memset(fa, 0, sizeof(*fa));
> + fa->fsx_xflags = btrfs_inode_flags_to_xflags(binode->flags);

Is there really much of a point in this helper? Epeciall as
the zeroing could easily be done in the variable declaration
line using

struct fsxattr fa = { };

> + memset(fa, 0, sizeof(struct fsxattr));
> + fa->fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & 
> EXT4_FL_USER_VISIBLE);

Overly lone line.

> + if (ext4_has_feature_project(inode->i_sb)) {
> + fa->fsx_projid = (__u32)from_kprojid(_user_ns,
> + ei->i_projid);

The cast here looks bogus.  Same comment for f2fs.


Re: [PATCH 1/4] vfs: create a generic checking function for FS_IOC_SETFLAGS

2019-06-25 Thread Christoph Hellwig
On Fri, Jun 21, 2019 at 04:56:21PM -0700, Darrick J. Wong wrote:
> From: Darrick J. Wong 
> 
> Create a generic checking function for the incoming FS_IOC_SETFLAGS flag
> values so that we can standardize the implementations that follow ext4's
> flag values.
> 
> Signed-off-by: Darrick J. Wong 
> Reviewed-by: Jan Kara 

Looks fine:

Reviewed-by: Christoph Hellwig 


Re: [Cluster-devel] [PATCH] fs: Move mark_inode_dirty out of __generic_write_end

2019-06-25 Thread Christoph Hellwig
> That seems way more complicated.  I'd much rather go with something
> like may patch plus maybe a big fat comment explaining that persisting
> the size update is the file systems job.  Note that a lot of the modern
> file systems don't use the VFS inode tracking for that, besides XFS
> that includes at least btrfs and ocfs2 as well.

I'd suggest something like this as the baseline:

http://git.infradead.org/users/hch/xfs.git/shortlog/refs/heads/iomap-i_size



Re: [Cluster-devel] [RFC][PATCH] wake_up_var() memory ordering

2019-06-25 Thread Andreas Gruenbacher
Peter,

thanks for the fixes.

On Mon, 24 Jun 2019 at 18:53, Peter Zijlstra  wrote:
> Hi all,
>
> I tried using wake_up_var() today and accidentally noticed that it
> didn't imply an smp_mb() and specifically requires it through
> wake_up_bit() / waitqueue_active().
>
> Now, wake_up_bit() doesn't imply the barrier because it is assumed to be
> used with the atomic bitops API which either implies (test_and_clear) or
> only needs smp_mb__after_atomic(), which is 'much' cheaper than an
> unconditional smp_mb().
>
> Still, while auditing all that, I found a whole bunch of things that
> could be improved. There were missing barriers, superfluous barriers and
> a whole bunch of sites that could use clear_and_wake_up_bit().
>
> So this fixes all wake_up_bit() usage without actually changing
> semantics of it (which are unfortunate but understandable).

thanks for the fixes.

> This does
> however change the semantics of wake_up_var(); even though wake_up_var()
> is most often used with atomics and then the additional smp_mb() is most
> often superfluous :/
>
> There isn't really a good option here, comments (other than I need to
> split this up)?
>
>
> ---
>  drivers/bluetooth/btmtksdio.c   |  5 +
>  drivers/bluetooth/btmtkuart.c   |  5 +
>  drivers/bluetooth/hci_mrvl.c|  8 ++--
>  drivers/gpu/drm/i915/i915_reset.c   |  6 ++
>  drivers/md/dm-bufio.c   | 10 ++
>  drivers/media/usb/dvb-usb-v2/dvb_usb_core.c | 15 ---
>  fs/afs/fs_probe.c   |  1 +
>  fs/afs/server.c |  1 +
>  fs/afs/vl_probe.c   |  1 +
>  fs/afs/volume.c |  1 +
>  fs/aio.c|  4 +---
>  fs/block_dev.c  |  1 +
>  fs/btrfs/extent_io.c|  4 +---
>  fs/cachefiles/namei.c   |  1 +
>  fs/cifs/connect.c   |  3 +--
>  fs/cifs/misc.c  | 15 +--
>  fs/fscache/cookie.c |  2 ++
>  fs/fscache/object.c |  2 ++
>  fs/fscache/page.c   |  3 +++
>  fs/gfs2/glock.c |  8 ++--
>  fs/gfs2/glops.c |  1 +
>  fs/gfs2/lock_dlm.c  |  8 ++--
>  fs/gfs2/recovery.c  |  4 +---
>  fs/gfs2/super.c |  1 +
>  fs/gfs2/sys.c   |  4 +---
>  fs/nfs/nfs4state.c  |  4 +---
>  fs/nfs/pnfs_nfs.c   |  4 +---
>  fs/nfsd/nfs4recover.c   |  4 +---
>  fs/orangefs/file.c  |  2 +-
>  kernel/sched/wait_bit.c |  1 +
>  net/bluetooth/hci_event.c   |  5 +
>  net/rds/ib_recv.c   |  1 +
>  security/keys/gc.c  |  5 ++---
>  33 files changed, 50 insertions(+), 90 deletions(-)
>
> diff --git a/drivers/bluetooth/btmtksdio.c b/drivers/bluetooth/btmtksdio.c
> index 813338288453..27523cfeac9a 100644
> --- a/drivers/bluetooth/btmtksdio.c
> +++ b/drivers/bluetooth/btmtksdio.c
> @@ -356,11 +356,8 @@ static int btmtksdio_recv_event(struct hci_dev *hdev, 
> struct sk_buff *skb)
>
> if (hdr->evt == HCI_EV_VENDOR) {
> if (test_and_clear_bit(BTMTKSDIO_TX_WAIT_VND_EVT,
> -  >tx_state)) {
> -   /* Barrier to sync with other CPUs */
> -   smp_mb__after_atomic();
> +  >tx_state))
> wake_up_bit(>tx_state, 
> BTMTKSDIO_TX_WAIT_VND_EVT);
> -   }
> }
>
> return 0;
> diff --git a/drivers/bluetooth/btmtkuart.c b/drivers/bluetooth/btmtkuart.c
> index f5dbeec8e274..7fe324df3799 100644
> --- a/drivers/bluetooth/btmtkuart.c
> +++ b/drivers/bluetooth/btmtkuart.c
> @@ -340,11 +340,8 @@ static int btmtkuart_recv_event(struct hci_dev *hdev, 
> struct sk_buff *skb)
>
> if (hdr->evt == HCI_EV_VENDOR) {
> if (test_and_clear_bit(BTMTKUART_TX_WAIT_VND_EVT,
> -  >tx_state)) {
> -   /* Barrier to sync with other CPUs */
> -   smp_mb__after_atomic();
> +  >tx_state))
> wake_up_bit(>tx_state, 
> BTMTKUART_TX_WAIT_VND_EVT);
> -   }
> }
>
> return 0;
> diff --git a/drivers/bluetooth/hci_mrvl.c b/drivers/bluetooth/hci_mrvl.c
> index 50212ac629e3..f03294d39d08 100644
> --- a/drivers/bluetooth/hci_mrvl.c
> +++ b/drivers/bluetooth/hci_mrvl.c
> @@ -157,9 +157,7 @@ static int mrvl_recv_fw_req(struct hci_dev *hdev, struct 
> sk_buff *skb)
>
> mrvl->tx_len = le16_to_cpu(pkt->lhs);
>
> -   

Re: [Cluster-devel] [RFC][PATCH] wake_up_var() memory ordering

2019-06-25 Thread Peter Zijlstra


(sorry for cross-posting to moderated lists btw, I've since
 acquired a patch to get_maintainers.pl that wil exclude them
 in the future)

On Tue, Jun 25, 2019 at 08:51:01AM +0100, David Howells wrote:
> Peter Zijlstra  wrote:
> 
> > I tried using wake_up_var() today and accidentally noticed that it
> > didn't imply an smp_mb() and specifically requires it through
> > wake_up_bit() / waitqueue_active().
> 
> Thinking about it again, I'm not sure why you need to add the barrier when
> wake_up() (which this is a wrapper around) is required to impose a barrier at
> the front if there's anything to wake up (ie. the wait queue isn't empty).
> 
> If this is insufficient, does it make sense just to have wake_up*() functions
> do an unconditional release or full barrier right at the front, rather than it
> being conditional on something being woken up?

The curprit is __wake_up_bit()'s usage of waitqueue_active(); it is this
latter (see its comment) that requires the smp_mb().

wake_up_bit() and wake_up_var() are wrappers around __wake_up_bit().

Without this barrier it is possible for the waitqueue_active() load to
be hoisted over the cond=true store and the remote end can miss the
store and we can miss its enqueue and we'll all miss a wakeup and get
stuck.

Adding an smp_mb() (or use wq_has_sleeper()) in __wake_up_bit() would be
nice, but I fear some people will complain about overhead, esp. since
about half the sites don't need the barrier due to being behind
test_and_clear_bit() and the other half using smp_mb__after_atomic()
after some clear_bit*() variant.

There's a few sites that seem to open-code
wait_var_event()/wake_up_var() and those actually need the full
smp_mb(), but then maybe they should be converted to var instread of bit
anyway.

> > @@ -619,9 +614,7 @@ static int dvb_usb_fe_sleep(struct dvb_frontend *fe)
> >  err:
> > if (!adap->suspend_resume_active) {
> > adap->active_fe = -1;
> 
> I'm wondering if there's a missing barrier here.  Should the clear_bit() on
> the next line be clear_bit_unlock() or clear_bit_release()?

That looks reasonable, but I'd like to hear from the DVB folks on that.

> > -   clear_bit(ADAP_SLEEP, >state_bits);
> > -   smp_mb__after_atomic();
> > -   wake_up_bit(>state_bits, ADAP_SLEEP);
> > +   clear_and_wake_up_bit(ADAP_SLEEP, >state_bits);
> > }
> >  
> > dev_dbg(>udev->dev, "%s: ret=%d\n", __func__, ret);
> > diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
> > index cfe62b154f68..377ee07d5f76 100644
> > --- a/fs/afs/fs_probe.c
> > +++ b/fs/afs/fs_probe.c
> > @@ -18,6 +18,7 @@ static bool afs_fs_probe_done(struct afs_server *server)
> >  
> > wake_up_var(>probe_outstanding);
> > clear_bit_unlock(AFS_SERVER_FL_PROBING, >flags);
> > +   smp_mb__after_atomic();
> > wake_up_bit(>flags, AFS_SERVER_FL_PROBING);
> > return true;
> >  }
> 
> Looking at this and the dvb one, does it make sense to stick the release
> semantics of clear_bit_unlock() into clear_and_wake_up_bit()?

I was thinking of adding another helper, maybe unlock_and_wake_up_bit()
that included that extra barrier, but maybe making it unconditional
isn't the worst idea.

> Also, should clear_bit_unlock() be renamed to clear_bit_release() (and
> similarly test_and_set_bit_lock() -> test_and_set_bit_acquire()) if we seem to
> be trying to standardise on that terminology.

That definitely makes sense to me, there's only 157 clear_bit_unlock()
and 76 test_and_set_bit_lock() users (note the asymetry of that).



Re: [Cluster-devel] [RFC][PATCH] wake_up_var() memory ordering

2019-06-25 Thread David Howells
Peter Zijlstra  wrote:

> I tried using wake_up_var() today and accidentally noticed that it
> didn't imply an smp_mb() and specifically requires it through
> wake_up_bit() / waitqueue_active().

Thinking about it again, I'm not sure why you need to add the barrier when
wake_up() (which this is a wrapper around) is required to impose a barrier at
the front if there's anything to wake up (ie. the wait queue isn't empty).

If this is insufficient, does it make sense just to have wake_up*() functions
do an unconditional release or full barrier right at the front, rather than it
being conditional on something being woken up?

> @@ -619,9 +614,7 @@ static int dvb_usb_fe_sleep(struct dvb_frontend *fe)
>  err:
>   if (!adap->suspend_resume_active) {
>   adap->active_fe = -1;

I'm wondering if there's a missing barrier here.  Should the clear_bit() on
the next line be clear_bit_unlock() or clear_bit_release()?

> - clear_bit(ADAP_SLEEP, >state_bits);
> - smp_mb__after_atomic();
> - wake_up_bit(>state_bits, ADAP_SLEEP);
> + clear_and_wake_up_bit(ADAP_SLEEP, >state_bits);
>   }
>  
>   dev_dbg(>udev->dev, "%s: ret=%d\n", __func__, ret);
> diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
> index cfe62b154f68..377ee07d5f76 100644
> --- a/fs/afs/fs_probe.c
> +++ b/fs/afs/fs_probe.c
> @@ -18,6 +18,7 @@ static bool afs_fs_probe_done(struct afs_server *server)
>  
>   wake_up_var(>probe_outstanding);
>   clear_bit_unlock(AFS_SERVER_FL_PROBING, >flags);
> + smp_mb__after_atomic();
>   wake_up_bit(>flags, AFS_SERVER_FL_PROBING);
>   return true;
>  }

Looking at this and the dvb one, does it make sense to stick the release
semantics of clear_bit_unlock() into clear_and_wake_up_bit()?

Also, should clear_bit_unlock() be renamed to clear_bit_release() (and
similarly test_and_set_bit_lock() -> test_and_set_bit_acquire()) if we seem to
be trying to standardise on that terminology.

David



[Cluster-devel] [RFC][PATCH] wake_up_var() memory ordering

2019-06-25 Thread Peter Zijlstra
Hi all,

I tried using wake_up_var() today and accidentally noticed that it
didn't imply an smp_mb() and specifically requires it through
wake_up_bit() / waitqueue_active().

Now, wake_up_bit() doesn't imply the barrier because it is assumed to be
used with the atomic bitops API which either implies (test_and_clear) or
only needs smp_mb__after_atomic(), which is 'much' cheaper than an
unconditional smp_mb().

Still, while auditing all that, I found a whole bunch of things that
could be improved. There were missing barriers, superfluous barriers and
a whole bunch of sites that could use clear_and_wake_up_bit().

So this fixes all wake_up_bit() usage without actually changing
semantics of it (which are unfortunate but understandable). This does
however change the semantics of wake_up_var(); even though wake_up_var()
is most often used with atomics and then the additional smp_mb() is most
often superfluous :/

There isn't really a good option here, comments (other than I need to
split this up)?


---
 drivers/bluetooth/btmtksdio.c   |  5 +
 drivers/bluetooth/btmtkuart.c   |  5 +
 drivers/bluetooth/hci_mrvl.c|  8 ++--
 drivers/gpu/drm/i915/i915_reset.c   |  6 ++
 drivers/md/dm-bufio.c   | 10 ++
 drivers/media/usb/dvb-usb-v2/dvb_usb_core.c | 15 ---
 fs/afs/fs_probe.c   |  1 +
 fs/afs/server.c |  1 +
 fs/afs/vl_probe.c   |  1 +
 fs/afs/volume.c |  1 +
 fs/aio.c|  4 +---
 fs/block_dev.c  |  1 +
 fs/btrfs/extent_io.c|  4 +---
 fs/cachefiles/namei.c   |  1 +
 fs/cifs/connect.c   |  3 +--
 fs/cifs/misc.c  | 15 +--
 fs/fscache/cookie.c |  2 ++
 fs/fscache/object.c |  2 ++
 fs/fscache/page.c   |  3 +++
 fs/gfs2/glock.c |  8 ++--
 fs/gfs2/glops.c |  1 +
 fs/gfs2/lock_dlm.c  |  8 ++--
 fs/gfs2/recovery.c  |  4 +---
 fs/gfs2/super.c |  1 +
 fs/gfs2/sys.c   |  4 +---
 fs/nfs/nfs4state.c  |  4 +---
 fs/nfs/pnfs_nfs.c   |  4 +---
 fs/nfsd/nfs4recover.c   |  4 +---
 fs/orangefs/file.c  |  2 +-
 kernel/sched/wait_bit.c |  1 +
 net/bluetooth/hci_event.c   |  5 +
 net/rds/ib_recv.c   |  1 +
 security/keys/gc.c  |  5 ++---
 33 files changed, 50 insertions(+), 90 deletions(-)

diff --git a/drivers/bluetooth/btmtksdio.c b/drivers/bluetooth/btmtksdio.c
index 813338288453..27523cfeac9a 100644
--- a/drivers/bluetooth/btmtksdio.c
+++ b/drivers/bluetooth/btmtksdio.c
@@ -356,11 +356,8 @@ static int btmtksdio_recv_event(struct hci_dev *hdev, 
struct sk_buff *skb)
 
if (hdr->evt == HCI_EV_VENDOR) {
if (test_and_clear_bit(BTMTKSDIO_TX_WAIT_VND_EVT,
-  >tx_state)) {
-   /* Barrier to sync with other CPUs */
-   smp_mb__after_atomic();
+  >tx_state))
wake_up_bit(>tx_state, BTMTKSDIO_TX_WAIT_VND_EVT);
-   }
}
 
return 0;
diff --git a/drivers/bluetooth/btmtkuart.c b/drivers/bluetooth/btmtkuart.c
index f5dbeec8e274..7fe324df3799 100644
--- a/drivers/bluetooth/btmtkuart.c
+++ b/drivers/bluetooth/btmtkuart.c
@@ -340,11 +340,8 @@ static int btmtkuart_recv_event(struct hci_dev *hdev, 
struct sk_buff *skb)
 
if (hdr->evt == HCI_EV_VENDOR) {
if (test_and_clear_bit(BTMTKUART_TX_WAIT_VND_EVT,
-  >tx_state)) {
-   /* Barrier to sync with other CPUs */
-   smp_mb__after_atomic();
+  >tx_state))
wake_up_bit(>tx_state, BTMTKUART_TX_WAIT_VND_EVT);
-   }
}
 
return 0;
diff --git a/drivers/bluetooth/hci_mrvl.c b/drivers/bluetooth/hci_mrvl.c
index 50212ac629e3..f03294d39d08 100644
--- a/drivers/bluetooth/hci_mrvl.c
+++ b/drivers/bluetooth/hci_mrvl.c
@@ -157,9 +157,7 @@ static int mrvl_recv_fw_req(struct hci_dev *hdev, struct 
sk_buff *skb)
 
mrvl->tx_len = le16_to_cpu(pkt->lhs);
 
-   clear_bit(STATE_FW_REQ_PENDING, >flags);
-   smp_mb__after_atomic();
-   wake_up_bit(>flags, STATE_FW_REQ_PENDING);
+   clear_and_wake_up_bit(STATE_FW_REQ_PENDING, >flags);
 
 done:
kfree_skb(skb);
@@ -192,9 +190,7 @@ static int mrvl_recv_chip_ver(struct hci_dev *hdev, struct 
sk_buff *skb)