Re: [Cluster-devel] [PATCH v6.4-rc2 5/5] fs: dlm: avoid F_SETLKW plock op lookup collisions

2023-05-19 Thread Alexander Aring
Hi,

On Fri, May 19, 2023 at 11:21 AM Alexander Aring  wrote:
>
> This patch fixes possible plock op collisions when using F_SETLKW lock
> requests and fsid, number and owner are not enough to identify a result
> for a pending request. The ltp testcases [0] and [1] are examples where
> this is not enough in the case of using classic posix locks with threads
> and open filedescriptor posix locks.
>
> The idea to fix the issue here is to split recv_list, which contains
> plock ops expecting a result from user space, into a F_SETLKW op
> recv_setlkw_list and, for all other commands, recv_list. Due to DLM user
> space behavior, e.g. dlm_controld, a request and writing a result back can
> only happen in an ordered way. That means a lookup and iterating over
> the recv_list is not required. To place the right plock op as the first
> entry of recv_list a change to list_move_tail() was made.
>
> This behaviour is not possible for F_SETLKW as multiple waiters can
> get a result back in a random order. To avoid collisions in cases
> like [0] or [1] this patch adds more fields to compare the plock
> operations when the lock request is the same. This is also being done in
> NFS to find a result for an asynchronous F_SETLKW lock request [2][3]. We
> still can't find the exact lock request for a specific result if the
> lock request is the same, but if this is the case we don't care about the
> order in which the identical lock requests get their results back to
> grant the lock.
>
> [0] 
> https://gitlab.com/netcoder/ltp/-/blob/dlm_fcntl_owner_testcase/testcases/kernel/syscalls/fcntl/fcntl40.c
> [1] 
> https://gitlab.com/netcoder/ltp/-/blob/dlm_fcntl_owner_testcase/testcases/kernel/syscalls/fcntl/fcntl41.c
> [2] 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/lockd/lockd.h?h=v6.4-rc1#n373
> [3] 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/lockd/svclock.c?h=v6.4-rc1#n731
>
> Cc: sta...@vger.kernel.org
> Signed-off-by: Alexander Aring 
> ---
>  fs/dlm/plock.c | 47 ++-
>  1 file changed, 34 insertions(+), 13 deletions(-)
>
> diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
> index c9e1d5f54194..540a30a342f0 100644
> --- a/fs/dlm/plock.c
> +++ b/fs/dlm/plock.c
> @@ -17,6 +17,7 @@
>  static DEFINE_SPINLOCK(ops_lock);
>  static LIST_HEAD(send_list);
>  static LIST_HEAD(recv_list);
> +static LIST_HEAD(recv_setlkw_list);
>  static DECLARE_WAIT_QUEUE_HEAD(send_wq);
>  static DECLARE_WAIT_QUEUE_HEAD(recv_wq);
>
> @@ -392,10 +393,14 @@ static ssize_t dev_read(struct file *file, char __user 
> *u, size_t count,
> spin_lock(_lock);
> if (!list_empty(_list)) {
> op = list_first_entry(_list, struct plock_op, list);
> -   if (op->info.flags & DLM_PLOCK_FL_CLOSE)
> +   if (op->info.flags & DLM_PLOCK_FL_CLOSE) {
> list_del(>list);
> -   else
> -   list_move(>list, _list);
> +   } else {
> +   if (op->info.wait)
> +   list_move(>list, _setlkw_list);
> +   else
> +   list_move_tail(>list, _list);
> +   }
> memcpy(, >info, sizeof(info));
> }
> spin_unlock(_lock);
> @@ -434,18 +439,34 @@ static ssize_t dev_write(struct file *file, const char 
> __user *u, size_t count,
> return -EINVAL;
>
> spin_lock(_lock);
> -   list_for_each_entry(iter, _list, list) {
> -   if (iter->info.fsid == info.fsid &&
> -   iter->info.number == info.number &&
> -   iter->info.owner == info.owner) {
> -   list_del_init(>list);
> -   memcpy(>info, , sizeof(info));
> -   if (iter->data)
> +   if (info.wait) {
> +   list_for_each_entry(iter, _setlkw_list, list) {
> +   if (iter->info.fsid == info.fsid &&
> +   iter->info.number == info.number &&
> +   iter->info.owner == info.owner &&
> +   iter->info.pid == info.pid &&
> +   iter->info.start == info.start &&
> +   iter->info.end == info.end) {

There is a missing condition for info.ex, otherwise lock requests for
F_WRLCK and F_RDLCK could be evaluated as the same request. NFS is
doing this check as well by checking fl1->fl_type == fl2->fl_type;
we don't have fl_type but info.ex, which is the only field in
F_SETLKW that distinguishes F_WRLCK from F_RDLCK.

I will send a v2.

- Alex



[Cluster-devel] [PATCH dlm/next 1/2] fs: dlm: plock debugfs to check for pending operations

2023-05-19 Thread Alexander Aring
In the past, issues were found where there were still ongoing plock
operations in the kernel even though cleanup routines should have cleared
them up because there was no plock activity by the user anymore. To
check that, "dlm_tool plocks $LS" can be used, but this only shows
pending operations in the dlm_controld daemon. To check the kernel part,
i.e. whether the kernel waits for an answer from user space, this patch
introduces a debugfs entry which reports whether there are ongoing plock
operations or not.

Signed-off-by: Alexander Aring 
---
 fs/dlm/debug_fs.c | 26 ++
 fs/dlm/dlm_internal.h |  5 +
 fs/dlm/plock.c| 16 
 3 files changed, 47 insertions(+)

diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index a1aca41c49d0..494a6e73f8e8 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -25,6 +25,7 @@ static struct mutex debug_buf_lock;
 
 static struct dentry *dlm_root;
 static struct dentry *dlm_comms;
+static struct dentry *dlm_plock;
 
 static char *print_lockmode(int mode)
 {
@@ -883,6 +884,30 @@ void dlm_delete_debug_comms_file(void *ctx)
debugfs_remove(ctx);
 }
 
+static int dlm_plock_ops_pending_show(struct seq_file *file, void *offset)
+{
+   seq_printf(file, "%d\n", dlm_plock_ops_pending());
+   return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(dlm_plock_ops_pending);
+
+void dlm_create_debug_plock_file(void)
+{
+   /* TODO currently use case if only to look if everything got cleaned
+* up probably if user space dlm_tool plocks $LS shows no activity
+* anymore on all lockspaces.
+*
+* However in future a dump could be useful as well.
+*/
+   debugfs_create_file("plock_ops_pending", 0444, dlm_plock, NULL,
+   _plock_ops_pending_fops);
+}
+
+void dlm_remove_debug_plock_file(void)
+{
+   debugfs_remove(dlm_plock);
+}
+
 void dlm_create_debug_file(struct dlm_ls *ls)
 {
char name[DLM_LOCKSPACE_LEN + 8];
@@ -943,6 +968,7 @@ void __init dlm_register_debugfs(void)
mutex_init(_buf_lock);
dlm_root = debugfs_create_dir("dlm", NULL);
dlm_comms = debugfs_create_dir("comms", dlm_root);
+   dlm_plock = debugfs_create_dir("plock", dlm_root);
 }
 
 void dlm_unregister_debugfs(void)
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 986a9d7b1f33..f5f741ee527b 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -805,6 +805,7 @@ static inline void dlm_set_sbflags_val(struct dlm_lkb *lkb, 
uint32_t val)
  __DLM_SBF_MAX_BIT);
 }
 
+int dlm_plock_ops_pending(void);
 int dlm_plock_init(void);
 void dlm_plock_exit(void);
 
@@ -815,6 +816,8 @@ void dlm_create_debug_file(struct dlm_ls *ls);
 void dlm_delete_debug_file(struct dlm_ls *ls);
 void *dlm_create_debug_comms_file(int nodeid, void *data);
 void dlm_delete_debug_comms_file(void *ctx);
+void dlm_create_debug_plock_file(void);
+void dlm_remove_debug_plock_file(void);
 #else
 static inline void dlm_register_debugfs(void) { }
 static inline void dlm_unregister_debugfs(void) { }
@@ -822,6 +825,8 @@ static inline void dlm_create_debug_file(struct dlm_ls *ls) 
{ }
 static inline void dlm_delete_debug_file(struct dlm_ls *ls) { }
 static inline void *dlm_create_debug_comms_file(int nodeid, void *data) { 
return NULL; }
 static inline void dlm_delete_debug_comms_file(void *ctx) { }
+static inline void dlm_create_debug_plock_file(void) { };
+static inline void dlm_remove_debug_plock_file(void) { };
 #endif
 
 #endif /* __DLM_INTERNAL_DOT_H__ */
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 540a30a342f0..578bf8a1325f 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -36,6 +36,19 @@ struct plock_op {
struct plock_async_data *data;
 };
 
+int dlm_plock_ops_pending(void)
+{
+   int rv;
+
+   spin_lock(_lock);
+   rv = !list_empty(_list);
+   rv |= !list_empty(_list);
+   rv |= !list_empty(_setlkw_list);
+   spin_unlock(_lock);
+
+   return rv;
+}
+
 static inline void set_version(struct dlm_plock_info *info)
 {
info->version[0] = DLM_PLOCK_VERSION_MAJOR;
@@ -517,11 +530,14 @@ int dlm_plock_init(void)
rv = misc_register(_dev_misc);
if (rv)
log_print("dlm_plock_init: misc_register failed %d", rv);
+
+   dlm_create_debug_plock_file();
return rv;
 }
 
 void dlm_plock_exit(void)
 {
+   dlm_remove_debug_plock_file();
misc_deregister(_dev_misc);
 }
 
-- 
2.31.1



[Cluster-devel] [PATCH dlm/next 2/2] fs: dlm: cleanup plock op lookup functionality

2023-05-19 Thread Alexander Aring
This patch introduces plock_op_lookup() to look up a plock op when a
result needs to find its original request.
Besides that, we add an additional sanity check to confirm the lookup is
still working in the case of a non-F_SETLKW request, as it requires a
specific order in recv_list.

Signed-off-by: Alexander Aring 
---
 fs/dlm/plock.c | 107 +++--
 1 file changed, 68 insertions(+), 39 deletions(-)

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 578bf8a1325f..933fb575e83a 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -433,14 +433,58 @@ static ssize_t dev_read(struct file *file, char __user 
*u, size_t count,
return sizeof(info);
 }
 
+static struct plock_op *plock_op_lookup(const struct dlm_plock_info *info)
+{
+   struct plock_op *op = NULL, *iter;
+
+   if (info->wait) {
+   list_for_each_entry(iter, _setlkw_list, list) {
+   if (iter->info.fsid == info->fsid &&
+   iter->info.number == info->number &&
+   iter->info.owner == info->owner &&
+   iter->info.pid == info->pid &&
+   iter->info.start == info->start &&
+   iter->info.end == info->end) {
+   op = iter;
+   break;
+   }
+   }
+   } else {
+   op = list_first_entry_or_null(_list, struct plock_op,
+ list);
+   if (op) {
+   /* sanity check to check we got the right one */
+   switch (op->info.optype) {
+   case DLM_PLOCK_OP_GET:
+   /* we can't check on some fields on get */
+   WARN_ON(op->info.fsid != info->fsid ||
+   op->info.number != info->number ||
+   op->info.owner != info->owner ||
+   op->info.optype != info->optype);
+   break;
+   default:
+   WARN_ON(op->info.fsid != info->fsid ||
+   op->info.number != info->number ||
+   op->info.owner != info->owner ||
+   op->info.pid != info->pid ||
+   op->info.start != info->start ||
+   op->info.end != info->end ||
+   op->info.optype != info->optype);
+   break;
+   }
+   }
+   }
+
+   return op;
+}
+
 /* a write copies in one plock result that should match a plock_op
on the recv list */
 static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
 loff_t *ppos)
 {
-   struct plock_op *op = NULL, *iter;
struct dlm_plock_info info;
-   int do_callback = 0;
+   struct plock_op *op;
 
if (count != sizeof(info))
return -EINVAL;
@@ -452,46 +496,31 @@ static ssize_t dev_write(struct file *file, const char 
__user *u, size_t count,
return -EINVAL;
 
spin_lock(_lock);
-   if (info.wait) {
-   list_for_each_entry(iter, _setlkw_list, list) {
-   if (iter->info.fsid == info.fsid &&
-   iter->info.number == info.number &&
-   iter->info.owner == info.owner &&
-   iter->info.pid == info.pid &&
-   iter->info.start == info.start &&
-   iter->info.end == info.end) {
-   list_del_init(>list);
-   memcpy(>info, , sizeof(info));
-   if (iter->data)
-   do_callback = 1;
-   else
-   iter->done = 1;
-   op = iter;
-   break;
-   }
-   }
+   op = plock_op_lookup();
+   if (!op) {
+   spin_unlock(_lock);
+   pr_debug("%s: no op %x %llx", __func__,
+info.fsid, (unsigned long long)info.number);
+   return count;
+   }
+
+   list_del_init(>list);
+   /* update set new fields by user space e.g. F_GETLK */
+   memcpy(>info, , sizeof(info));
+
+   /* check for async handling */
+   if (op->data) {
+   spin_unlock(_lock);
+   dlm_plock_callback(op);
} else {
-   op = list_first_entry_or_null(_list, struct plock_op,
- list);
-   if (op) {

[Cluster-devel] [PATCH v6.4-rc2 3/5] fs: dlm: switch posix lock to killable only

2023-05-19 Thread Alexander Aring
This patch will revert commit a6b1533e9a57 ("dlm: make posix locks
interruptible"). It was probably introduced to reach the fcntl()
F_SETLKW requirement to make fcntl() calls interruptible, see:

"F_SETLKW (struct flock *):

 As for F_SETLK, but if a conflicting lock is held on the file,
 then wait for that lock to be released. If a signal is caught
 while waiting, then the call is interrupted and (after the signal
 handler has returned) returns immediately (with return value -1
 and errno set to EINTR; see signal(7))."

This requires interrupting the current plock operation in question
sitting in the wait_event_interruptible() waiting for op->done to become
true. This isn't currently the case, as do_unlock_close() will act like
the process was killed in that it unlocks all previously acquired locks,
which can result in data corruption because the process still thinks it
holds the previously acquired locks.

To test this behaviour an ltp testcase was created [0]. After this patch
the process can't be interrupted anymore, which is against the API
but currently considered a limitation of DLM. However, it will stop
unlocking all previously acquired locks behind the user process's back.

It requires more work in DLM to support such a feature as intended. It
requires some lock request cancellation request, which does not yet
exist for dlm plock user space communication. As this feature never
worked as intended and has side effects as mentioned above, this patch
moves the wait back to killable.

[0] 
https://gitlab.com/netcoder/ltp/-/blob/dlm_fcntl_owner_testcase/testcases/kernel/syscalls/fcntl/fcntl42.c

Cc: sta...@vger.kernel.org
Fixes: a6b1533e9a57 ("dlm: make posix locks interruptible")
Signed-off-by: Alexander Aring 
---
 fs/dlm/plock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index fea2157fac5b..31bc601ee3d8 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -155,7 +155,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, 
struct file *file,
 
send_op(op);
 
-   rv = wait_event_interruptible(recv_wq, (op->done != 0));
+   rv = wait_event_killable(recv_wq, (op->done != 0));
if (rv == -ERESTARTSYS) {
spin_lock(_lock);
/* recheck under ops_lock if we got a done != 0,
-- 
2.31.1



[Cluster-devel] [PATCH v6.4-rc2 1/5] fs: dlm: change local pids to be positive pids

2023-05-19 Thread Alexander Aring
This patch fixes the pid value of local processes as represented
inside struct flock when using F_GETLK if there is a conflict with
another process.

Currently every pid in struct flock l_pid is set as negative pid number.
This was changed by commit 9d5b86ac13c5 ("fs/locks: Remove fl_nspid and use
fs-specific l_pid for remote locks"). There is still the question how to
represent remote pid lock holders, which is possible for DLM posix
handling, in the flock structure. This patch however will only change
local process holders to be represented as positive pids.

Further patches may change the behaviour for remote pid lock holders,
for now this patch will leave the behaviour of remote lock holders
unchanged which will be represented as negative pid.

There exists a simple ltp testcase [0] as reproducer.

[0] 
https://gitlab.com/netcoder/ltp/-/blob/dlm_fcntl_owner_testcase/testcases/kernel/syscalls/fcntl/fcntl05.c

Cc: sta...@vger.kernel.org
Fixes: 9d5b86ac13c5 ("fs/locks: Remove fl_nspid and use fs-specific l_pid for 
remote locks")
Signed-off-by: Alexander Aring 
---
 fs/dlm/plock.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index ed4357e62f35..ff364901f22b 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -360,7 +360,9 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, 
struct file *file,
locks_init_lock(fl);
fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
fl->fl_flags = FL_POSIX;
-   fl->fl_pid = -op->info.pid;
+   fl->fl_pid = op->info.pid;
+   if (op->info.nodeid != dlm_our_nodeid())
+   fl->fl_pid = -fl->fl_pid;
fl->fl_start = op->info.start;
fl->fl_end = op->info.end;
rv = 0;
-- 
2.31.1



[Cluster-devel] [PATCH v6.4-rc2 5/5] fs: dlm: avoid F_SETLKW plock op lookup collisions

2023-05-19 Thread Alexander Aring
This patch fixes possible plock op collisions when using F_SETLKW lock
requests and fsid, number and owner are not enough to identify a result
for a pending request. The ltp testcases [0] and [1] are examples where
this is not enough in the case of using classic posix locks with threads
and open filedescriptor posix locks.

The idea to fix the issue here is to split recv_list, which contains
plock ops expecting a result from user space, into a F_SETLKW op
recv_setlkw_list and, for all other commands, recv_list. Due to DLM user
space behavior, e.g. dlm_controld, a request and writing a result back can
only happen in an ordered way. That means a lookup and iterating over
the recv_list is not required. To place the right plock op as the first
entry of recv_list a change to list_move_tail() was made.

This behaviour is not possible for F_SETLKW as multiple waiters can
get a result back in a random order. To avoid collisions in cases
like [0] or [1] this patch adds more fields to compare the plock
operations when the lock request is the same. This is also being done in
NFS to find a result for an asynchronous F_SETLKW lock request [2][3]. We
still can't find the exact lock request for a specific result if the
lock request is the same, but if this is the case we don't care about the
order in which the identical lock requests get their results back to
grant the lock.

[0] 
https://gitlab.com/netcoder/ltp/-/blob/dlm_fcntl_owner_testcase/testcases/kernel/syscalls/fcntl/fcntl40.c
[1] 
https://gitlab.com/netcoder/ltp/-/blob/dlm_fcntl_owner_testcase/testcases/kernel/syscalls/fcntl/fcntl41.c
[2] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/linux/lockd/lockd.h?h=v6.4-rc1#n373
[3] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/fs/lockd/svclock.c?h=v6.4-rc1#n731

Cc: sta...@vger.kernel.org
Signed-off-by: Alexander Aring 
---
 fs/dlm/plock.c | 47 ++-
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index c9e1d5f54194..540a30a342f0 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -17,6 +17,7 @@
 static DEFINE_SPINLOCK(ops_lock);
 static LIST_HEAD(send_list);
 static LIST_HEAD(recv_list);
+static LIST_HEAD(recv_setlkw_list);
 static DECLARE_WAIT_QUEUE_HEAD(send_wq);
 static DECLARE_WAIT_QUEUE_HEAD(recv_wq);
 
@@ -392,10 +393,14 @@ static ssize_t dev_read(struct file *file, char __user 
*u, size_t count,
spin_lock(_lock);
if (!list_empty(_list)) {
op = list_first_entry(_list, struct plock_op, list);
-   if (op->info.flags & DLM_PLOCK_FL_CLOSE)
+   if (op->info.flags & DLM_PLOCK_FL_CLOSE) {
list_del(>list);
-   else
-   list_move(>list, _list);
+   } else {
+   if (op->info.wait)
+   list_move(>list, _setlkw_list);
+   else
+   list_move_tail(>list, _list);
+   }
memcpy(, >info, sizeof(info));
}
spin_unlock(_lock);
@@ -434,18 +439,34 @@ static ssize_t dev_write(struct file *file, const char 
__user *u, size_t count,
return -EINVAL;
 
spin_lock(_lock);
-   list_for_each_entry(iter, _list, list) {
-   if (iter->info.fsid == info.fsid &&
-   iter->info.number == info.number &&
-   iter->info.owner == info.owner) {
-   list_del_init(>list);
-   memcpy(>info, , sizeof(info));
-   if (iter->data)
+   if (info.wait) {
+   list_for_each_entry(iter, _setlkw_list, list) {
+   if (iter->info.fsid == info.fsid &&
+   iter->info.number == info.number &&
+   iter->info.owner == info.owner &&
+   iter->info.pid == info.pid &&
+   iter->info.start == info.start &&
+   iter->info.end == info.end) {
+   list_del_init(>list);
+   memcpy(>info, , sizeof(info));
+   if (iter->data)
+   do_callback = 1;
+   else
+   iter->done = 1;
+   op = iter;
+   break;
+   }
+   }
+   } else {
+   op = list_first_entry_or_null(_list, struct plock_op,
+ list);
+   if (op) {
+   list_del_init(>list);
+   memcpy(>info, , sizeof(info));
+   if (op->data)
do_callback = 1;
else
-   iter->done = 1;
-  

[Cluster-devel] [PATCH v6.4-rc2 4/5] fs: dlm: make F_SETLK not killable

2023-05-19 Thread Alexander Aring
This patch changes things so that only F_SETLKW will be killable. The man
page of fcntl() states that F_SETLKW is the only interruptible cmd, as
I suppose it can block for an unknown amount of time when it hits
contention. We use killable for the same reason, except that the process
isn't alive anymore.

The command F_SETLK is a trylock command; a result can be expected very
soon after the operation is sent to user space, like the unlock or get
operations, which use wait_event() only.

Cc: sta...@vger.kernel.org
Signed-off-by: Alexander Aring 
---
 fs/dlm/plock.c | 38 +-
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 31bc601ee3d8..c9e1d5f54194 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -155,25 +155,29 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 
number, struct file *file,
 
send_op(op);
 
-   rv = wait_event_killable(recv_wq, (op->done != 0));
-   if (rv == -ERESTARTSYS) {
-   spin_lock(_lock);
-   /* recheck under ops_lock if we got a done != 0,
-* if so this interrupt case should be ignored
-*/
-   if (op->done != 0) {
+   if (op->info.wait) {
+   rv = wait_event_killable(recv_wq, (op->done != 0));
+   if (rv == -ERESTARTSYS) {
+   spin_lock(_lock);
+   /* recheck under ops_lock if we got a done != 0,
+* if so this interrupt case should be ignored
+*/
+   if (op->done != 0) {
+   spin_unlock(_lock);
+   goto do_lock_wait;
+   }
+   list_del(>list);
spin_unlock(_lock);
-   goto do_lock_wait;
-   }
-   list_del(>list);
-   spin_unlock(_lock);
 
-   log_debug(ls, "%s: wait interrupted %x %llx pid %d",
- __func__, ls->ls_global_id,
- (unsigned long long)number, op->info.pid);
-   do_unlock_close(>info);
-   dlm_release_plock_op(op);
-   goto out;
+   log_debug(ls, "%s: wait interrupted %x %llx pid %d",
+ __func__, ls->ls_global_id,
+ (unsigned long long)number, op->info.pid);
+   do_unlock_close(>info);
+   dlm_release_plock_op(op);
+   goto out;
+   }
+   } else {
+   wait_event(recv_wq, (op->done != 0));
}
 
 do_lock_wait:
-- 
2.31.1



[Cluster-devel] [PATCH v6.4-rc2 2/5] fs: dlm: fix cleanup pending ops when interrupted

2023-05-19 Thread Alexander Aring
This patch mainly reverts what commit b92a4e3f86b1 ("fs: dlm: change posix
lock sigint handling") introduced. Except two things, checking if
op->done got true under ops_lock after it got interrupted and changing
"no op" messages to debug printout.

There are currently problems with cleaning up pending operations. The
main idea of commit b92a4e3f86b1 ("fs: dlm: change posix lock sigint
handling") was to wait for a reply, and if it was interrupted then the
cleanup routines, e.g. list_del(), do_unlock_close(), would be executed.

This requires that for every dlm op request an answer comes back in
dev_write(). The cleanup routine do_unlock_close() does not operate in
the dlm user space software on a per-request basis and will clean up
everything else that matches certain plock op fields, which means
that we no longer get a result back for every request. This will
leave some leftovers inside the dlm plock recv_list which will never
be deleted.

It was confirmed with a new debugfs entry, used to check whether some
plock lists still have entries left when there is no posix lock activity
ongoing anymore, as checked by dlm_tool plocks $LS. In the specific
testcase on a gfs2 mountpoint the following command was executed:

stress-ng --fcntl 32

and the stress-ng program was killed after a certain time.

This is due to the fact that do_unlock_close() cleans more than just a
specific operation and the dlm operation is already removed by list_del().
This list_del() can operate on either send_list or recv_list. If it hits
recv_list, it can still be the case that answers are coming back for an
ongoing operation while do_unlock_close() is not synchronized with the
list_del(). This will end in a "no op ..." log_print(). To not confuse
the user about such issues, which seem to be there by design, we move this
logging information to pr_debug() as those are expected log messages.

Cc: sta...@vger.kernel.org
Fixes: b92a4e3f86b1 ("fs: dlm: change posix lock sigint handling")
Signed-off-by: Alexander Aring 
---
 fs/dlm/plock.c | 25 ++---
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index ff364901f22b..fea2157fac5b 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -30,8 +30,6 @@ struct plock_async_data {
 struct plock_op {
struct list_head list;
int done;
-   /* if lock op got interrupted while waiting dlm_controld reply */
-   bool sigint;
struct dlm_plock_info info;
/* if set indicates async handling */
struct plock_async_data *data;
@@ -167,12 +165,14 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 
number, struct file *file,
spin_unlock(_lock);
goto do_lock_wait;
}
-
-   op->sigint = true;
+   list_del(>list);
spin_unlock(_lock);
+
log_debug(ls, "%s: wait interrupted %x %llx pid %d",
  __func__, ls->ls_global_id,
  (unsigned long long)number, op->info.pid);
+   do_unlock_close(>info);
+   dlm_release_plock_op(op);
goto out;
}
 
@@ -434,19 +434,6 @@ static ssize_t dev_write(struct file *file, const char 
__user *u, size_t count,
if (iter->info.fsid == info.fsid &&
iter->info.number == info.number &&
iter->info.owner == info.owner) {
-   if (iter->sigint) {
-   list_del(>list);
-   spin_unlock(_lock);
-
-   pr_debug("%s: sigint cleanup %x %llx pid %d",
- __func__, iter->info.fsid,
- (unsigned long long)iter->info.number,
- iter->info.pid);
-   do_unlock_close(>info);
-   memcpy(>info, , sizeof(info));
-   dlm_release_plock_op(iter);
-   return count;
-   }
list_del_init(>list);
memcpy(>info, , sizeof(info));
if (iter->data)
@@ -465,8 +452,8 @@ static ssize_t dev_write(struct file *file, const char 
__user *u, size_t count,
else
wake_up(_wq);
} else
-   log_print("%s: no op %x %llx", __func__,
- info.fsid, (unsigned long long)info.number);
+   pr_debug("%s: no op %x %llx", __func__,
+info.fsid, (unsigned long long)info.number);
return count;
 }
 
-- 
2.31.1



Re: [Cluster-devel] [PATCH 01/17] fs: unexport buffer_check_dirty_writeback

2023-05-19 Thread Hannes Reinecke

On 4/24/23 07:49, Christoph Hellwig wrote:

buffer_check_dirty_writeback is only used by the block device aops,
remove the export.

Signed-off-by: Christoph Hellwig 
---
  fs/buffer.c | 1 -
  1 file changed, 1 deletion(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index 9e1e2add541e07..eb14fbaa7d35f7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -111,7 +111,6 @@ void buffer_check_dirty_writeback(struct folio *folio,
bh = bh->b_this_page;
} while (bh != head);
  }
-EXPORT_SYMBOL(buffer_check_dirty_writeback);
  
  /*

   * Block until a buffer comes unlocked.  This doesn't stop it


Reviewed-by: Hannes Reinecke 

Cheers,

Hannes



Re: [Cluster-devel] [PATCH 16/17] block: use iomap for writes to block devices

2023-05-19 Thread Hannes Reinecke

On 4/24/23 07:49, Christoph Hellwig wrote:

Use iomap in buffer_head compat mode to write to block devices.

Signed-off-by: Christoph Hellwig 
---
  block/Kconfig |  1 +
  block/fops.c  | 33 +
  2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index 941b2dca70db73..672b08f0096ab4 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,6 +5,7 @@
  menuconfig BLOCK
 bool "Enable the block layer" if EXPERT
 default y
+   select IOMAP
 select SBITMAP
 help
 Provide block layer support for the kernel.
diff --git a/block/fops.c b/block/fops.c
index 318247832a7bcf..7910636f8df33b 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -15,6 +15,7 @@
  #include 
  #include 
  #include 
+#include 
  #include 
  #include "blk.h"
  
@@ -386,6 +387,27 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)

return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
  }
  
+static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,

+   unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+   struct block_device *bdev = I_BDEV(inode);
+   loff_t isize = i_size_read(inode);
+
+   iomap->bdev = bdev;
+   iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
+   if (WARN_ON_ONCE(iomap->offset >= isize))
+   return -EIO;


I'm hitting this during booting:
[5.016324]  
[5.030256]  iomap_iter+0x11a/0x350
[5.030264]  iomap_readahead+0x1eb/0x2c0
[5.030272]  read_pages+0x5d/0x220
[5.030279]  page_cache_ra_unbounded+0x131/0x180
[5.030284]  filemap_get_pages+0xff/0x5a0
[5.030292]  filemap_read+0xca/0x320
[5.030296]  ? aa_file_perm+0x126/0x500
[5.040216]  ? touch_atime+0xc8/0x150
[5.040224]  blkdev_read_iter+0xb0/0x150
[5.040228]  vfs_read+0x226/0x2d0
[5.040234]  ksys_read+0xa5/0xe0
[5.040238]  do_syscall_64+0x5b/0x80

Maybe we should consider this patch:

diff --git a/block/fops.c b/block/fops.c
index 524b8a828aad..d202fb663f25 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -386,10 +386,13 @@ static int blkdev_iomap_begin(struct inode *inode, 
loff_t offset, loff_t length,


iomap->bdev = bdev;
iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
-   if (WARN_ON_ONCE(iomap->offset >= isize))
-   return -EIO;
-   iomap->type = IOMAP_MAPPED;
-   iomap->addr = iomap->offset;
+   if (WARN_ON_ONCE(iomap->offset >= isize)) {
+   iomap->type = IOMAP_HOLE;
+   iomap->addr = IOMAP_NULL_ADDR;
+   } else {
+   iomap->type = IOMAP_MAPPED;
+   iomap->addr = iomap->offset;
+   }
iomap->length = isize - iomap->offset;
if (IS_ENABLED(CONFIG_BUFFER_HEAD))
iomap->flags |= IOMAP_F_BUFFER_HEAD;


Other that the the system seems fine.

Cheers,

Hannes



[Cluster-devel] [PATCH 10/13] fs: factor out a direct_write_fallback helper

2023-05-19 Thread Christoph Hellwig
Add a helper dealing with handling the syncing of a buffered write fallback
for direct I/O.

Signed-off-by: Christoph Hellwig 
---
 fs/libfs.c | 36 
 include/linux/fs.h |  2 ++
 mm/filemap.c   | 59 ++
 3 files changed, 50 insertions(+), 47 deletions(-)

diff --git a/fs/libfs.c b/fs/libfs.c
index 89cf614a327158..9f3791fc6e0715 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1613,3 +1613,39 @@ u64 inode_query_iversion(struct inode *inode)
return cur >> I_VERSION_QUERIED_SHIFT;
 }
 EXPORT_SYMBOL(inode_query_iversion);
+
+ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
+   ssize_t direct_written, ssize_t buffered_written)
+{
+   struct address_space *mapping = iocb->ki_filp->f_mapping;
+   loff_t pos = iocb->ki_pos, end;
+   int err;
+
+   /*
+* If the buffered write fallback returned an error, we want to return
+* the number of bytes which were written by direct I/O, or the error
+* code if that was zero.
+*
+* Note that this differs from normal direct-io semantics, which will
+* return -EFOO even if some bytes were written.
+*/
+   if (unlikely(buffered_written < 0))
+   return buffered_written;
+
+   /*
+* We need to ensure that the page cache pages are written to disk and
+* invalidated to preserve the expected O_DIRECT semantics.
+*/
+   end = pos + buffered_written - 1;
+   err = filemap_write_and_wait_range(mapping, pos, end);
+   if (err < 0) {
+   /*
+* We don't know how much we wrote, so just return the number of
+* bytes which were direct-written
+*/
+   return err;
+   }
+   invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
+   return direct_written + buffered_written;
+}
+EXPORT_SYMBOL_GPL(direct_write_fallback);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e4efc1792a877a..576a945db178ef 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2738,6 +2738,8 @@ extern ssize_t __generic_file_write_iter(struct kiocb *, 
struct iov_iter *);
 extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_direct_write(struct kiocb *, struct iov_iter *);
 ssize_t generic_perform_write(struct kiocb *, struct iov_iter *);
+ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
+   ssize_t direct_written, ssize_t buffered_written);
 
 ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
rwf_t flags);
diff --git a/mm/filemap.c b/mm/filemap.c
index c1b988199aece5..875b2108d0a05f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -4008,25 +4008,21 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
 {
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
-   struct inode*inode = mapping->host;
-   ssize_t written = 0;
-   ssize_t err;
-   ssize_t status;
+   struct inode *inode = mapping->host;
+   ssize_t ret;
 
/* We can write back this queue in page reclaim */
current->backing_dev_info = inode_to_bdi(inode);
-   err = file_remove_privs(file);
-   if (err)
+   ret = file_remove_privs(file);
+   if (ret)
goto out;
 
-   err = file_update_time(file);
-   if (err)
+   ret = file_update_time(file);
+   if (ret)
goto out;
 
if (iocb->ki_flags & IOCB_DIRECT) {
-   loff_t pos, endbyte;
-
-   written = generic_file_direct_write(iocb, from);
+   ret = generic_file_direct_write(iocb, from);
/*
 * If the write stopped short of completing, fall back to
 * buffered writes.  Some filesystems do this for writes to
@@ -4034,46 +4030,15 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
 * not succeed (even if it did, DAX does not handle dirty
 * page-cache pages correctly).
 */
-   if (written < 0 || !iov_iter_count(from) || IS_DAX(inode))
-   goto out;
-
-   pos = iocb->ki_pos;
-   status = generic_perform_write(iocb, from);
-   /*
-* If generic_perform_write() returned a synchronous error
-* then we want to return the number of bytes which were
-* direct-written, or the error code if that was zero.  Note
-* that this differs from normal direct-io semantics, which
-* will return -EFOO even if some bytes were written.
-*/
-   if (unlikely(status < 0)) {
-   err = status;
-  

[Cluster-devel] [PATCH 07/13] iomap: update ki_pos in iomap_file_buffered_write

2023-05-19 Thread Christoph Hellwig
All callers of iomap_file_buffered_write need to update ki_pos, so move
that update into the common code.

Signed-off-by: Christoph Hellwig 
---
 fs/gfs2/file.c | 4 +---
 fs/iomap/buffered-io.c | 9 ++---
 fs/xfs/xfs_file.c  | 2 --
 fs/zonefs/file.c   | 4 +---
 4 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 300844f50dcd28..499ef174dec138 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1046,10 +1046,8 @@ static ssize_t gfs2_file_buffered_write(struct kiocb 
*iocb,
ret = iomap_file_buffered_write(iocb, from, _iomap_ops);
pagefault_enable();
current->backing_dev_info = NULL;
-   if (ret > 0) {
-   iocb->ki_pos += ret;
+   if (ret > 0)
written += ret;
-   }
 
if (inode == sdp->sd_rindex)
gfs2_glock_dq_uninit(statfs_gh);
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 063133ec77f49e..550525a525c45c 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -864,16 +864,19 @@ iomap_file_buffered_write(struct kiocb *iocb, struct 
iov_iter *i,
.len= iov_iter_count(i),
.flags  = IOMAP_WRITE,
};
-   int ret;
+   ssize_t ret;
 
if (iocb->ki_flags & IOCB_NOWAIT)
iter.flags |= IOMAP_NOWAIT;
 
while ((ret = iomap_iter(, ops)) > 0)
iter.processed = iomap_write_iter(, i);
-   if (iter.pos == iocb->ki_pos)
+
+   if (unlikely(ret < 0))
return ret;
-   return iter.pos - iocb->ki_pos;
+   ret = iter.pos - iocb->ki_pos;
+   iocb->ki_pos += ret;
+   return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_file_buffered_write);
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index aede746541f8ae..bfba10e0b0f3c2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -723,8 +723,6 @@ xfs_file_buffered_write(
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
_buffered_write_iomap_ops);
-   if (likely(ret >= 0))
-   iocb->ki_pos += ret;
 
/*
 * If we hit a space limit, try to free up some lingering preallocated
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index 132f01d3461f14..e212d0636f848e 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -643,9 +643,7 @@ static ssize_t zonefs_file_buffered_write(struct kiocb 
*iocb,
goto inode_unlock;
 
ret = iomap_file_buffered_write(iocb, from, _write_iomap_ops);
-   if (ret > 0)
-   iocb->ki_pos += ret;
-   else if (ret == -EIO)
+   if (ret == -EIO)
zonefs_io_error(inode, true);
 
 inode_unlock:
-- 
2.39.2



[Cluster-devel] [PATCH 06/13] filemap: add a kiocb_invalidate_post_write helper

2023-05-19 Thread Christoph Hellwig
Add a helper to invalidate page cache after a dio write.

Signed-off-by: Christoph Hellwig 
---
 fs/direct-io.c  | 10 ++
 fs/iomap/direct-io.c| 12 ++--
 include/linux/fs.h  |  5 -
 include/linux/pagemap.h |  1 +
 mm/filemap.c| 37 -
 5 files changed, 25 insertions(+), 40 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 0b380bb8a81e11..c25d68eabf4281 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -285,14 +285,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, 
unsigned int flags)
 * zeros from unwritten extents.
 */
if (flags & DIO_COMPLETE_INVALIDATE &&
-   ret > 0 && dio_op == REQ_OP_WRITE &&
-   dio->inode->i_mapping->nrpages) {
-   err = invalidate_inode_pages2_range(dio->inode->i_mapping,
-   offset >> PAGE_SHIFT,
-   (offset + ret - 1) >> PAGE_SHIFT);
-   if (err)
-   dio_warn_stale_pagecache(dio->iocb->ki_filp);
-   }
+   ret > 0 && dio_op == REQ_OP_WRITE)
+   kiocb_invalidate_post_write(dio->iocb, ret);
 
inode_dio_end(dio->inode);
 
diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 6207a59d2162e1..45accd98344e79 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -81,7 +81,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 {
const struct iomap_dio_ops *dops = dio->dops;
struct kiocb *iocb = dio->iocb;
-   struct inode *inode = file_inode(iocb->ki_filp);
loff_t offset = iocb->ki_pos;
ssize_t ret = dio->error;
 
@@ -108,15 +107,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 * ->end_io() when necessary, otherwise a racing buffer read would cache
 * zeros from unwritten extents.
 */
-   if (!dio->error && dio->size &&
-   (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
-   int err;
-   err = invalidate_inode_pages2_range(inode->i_mapping,
-   offset >> PAGE_SHIFT,
-   (offset + dio->size - 1) >> PAGE_SHIFT);
-   if (err)
-   dio_warn_stale_pagecache(iocb->ki_filp);
-   }
+   if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE))
+   kiocb_invalidate_post_write(iocb, dio->size);
 
inode_dio_end(file_inode(iocb->ki_filp));
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 21a98168085641..e4efc1792a877a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2837,11 +2837,6 @@ static inline void inode_dio_end(struct inode *inode)
wake_up_bit(>i_state, __I_DIO_WAKEUP);
 }
 
-/*
- * Warn about a page cache invalidation failure diring a direct I/O write.
- */
-void dio_warn_stale_pagecache(struct file *filp);
-
 extern void inode_set_flags(struct inode *inode, unsigned int flags,
unsigned int mask);
 
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 6e4c9ee40baa99..9695730ea86a98 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -31,6 +31,7 @@ int invalidate_inode_pages2(struct address_space *mapping);
 int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
 int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
+void kiocb_invalidate_post_write(struct kiocb *iocb, size_t count);
 
 int write_inode_now(struct inode *, int sync);
 int filemap_fdatawrite(struct address_space *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 8607220e20eae3..c1b988199aece5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3816,7 +3816,7 @@ EXPORT_SYMBOL(read_cache_page_gfp);
 /*
  * Warn about a page cache invalidation failure during a direct I/O write.
  */
-void dio_warn_stale_pagecache(struct file *filp)
+static void dio_warn_stale_pagecache(struct file *filp)
 {
static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
char pathname[128];
@@ -3833,19 +3833,23 @@ void dio_warn_stale_pagecache(struct file *filp)
}
 }
 
+void kiocb_invalidate_post_write(struct kiocb *iocb, size_t count)
+{
+   struct address_space *mapping = iocb->ki_filp->f_mapping;
+
+   if (mapping->nrpages &&
+   invalidate_inode_pages2_range(mapping,
+   iocb->ki_pos >> PAGE_SHIFT,
+   (iocb->ki_pos + count - 1) >> PAGE_SHIFT))
+   dio_warn_stale_pagecache(iocb->ki_filp);
+}
+
 ssize_t
 generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
 {
-   struct file *file = iocb->ki_filp;
-   struct address_space *mapping = file->f_mapping;
-   struct inode*inode = mapping->host;
-   loff_t  pos = iocb->ki_pos;
-   ssize_t written;
-   size_t  write_len;
-   

[Cluster-devel] cleanup the filemap / direct I/O interaction

2023-05-19 Thread Christoph Hellwig
Hi all,

this series cleans up some of the generic write helper calling
conventions and the page cache writeback / invalidation for
direct I/O.  This is a spinoff from the no-bufferhead kernel
project, for which we'll want to use an iomap based buffered
write path in the block layer.

diffstat:
 block/fops.c|   18 
 fs/ceph/file.c  |6 -
 fs/direct-io.c  |   10 --
 fs/ext4/file.c  |   12 ---
 fs/f2fs/file.c  |3 
 fs/fuse/file.c  |   47 ++--
 fs/gfs2/file.c  |7 -
 fs/iomap/buffered-io.c  |   12 ++-
 fs/iomap/direct-io.c|   88 --
 fs/libfs.c  |   36 +
 fs/nfs/file.c   |6 -
 fs/xfs/xfs_file.c   |7 -
 fs/zonefs/file.c|4 -
 include/linux/fs.h  |7 -
 include/linux/pagemap.h |4 +
 mm/filemap.c|  184 +---
 16 files changed, 190 insertions(+), 261 deletions(-)



[Cluster-devel] [PATCH 11/13] fuse: update ki_pos in fuse_perform_write

2023-05-19 Thread Christoph Hellwig
Both callers of fuse_perform_write need to update ki_pos, so move that
update into the common code.

Signed-off-by: Christoph Hellwig 
---
 fs/fuse/file.c | 25 +++--
 1 file changed, 11 insertions(+), 14 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 89d97f6188e05e..fd2f27f2144750 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1329,7 +1329,10 @@ static ssize_t fuse_perform_write(struct kiocb *iocb,
fuse_write_update_attr(inode, pos, res);
clear_bit(FUSE_I_SIZE_UNSTABLE, >state);
 
-   return res > 0 ? res : err;
+   if (!res)
+   return err;
+   iocb->ki_pos += res;
+   return res;
 }
 
 static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
@@ -1378,42 +1381,36 @@ static ssize_t fuse_cache_write_iter(struct kiocb 
*iocb, struct iov_iter *from)
goto out;
 
if (iocb->ki_flags & IOCB_DIRECT) {
-   loff_t pos = iocb->ki_pos;
written = generic_file_direct_write(iocb, from);
if (written < 0 || !iov_iter_count(from))
goto out;
 
-   pos += written;
-
-   written_buffered = fuse_perform_write(iocb, mapping, from, pos);
+   written_buffered = fuse_perform_write(iocb, mapping, from,
+ iocb->ki_pos);
if (written_buffered < 0) {
err = written_buffered;
goto out;
}
-   endbyte = pos + written_buffered - 1;
+   endbyte = iocb->ki_pos + written_buffered - 1;
 
-   err = filemap_write_and_wait_range(file->f_mapping, pos,
+   err = filemap_write_and_wait_range(file->f_mapping,
+  iocb->ki_pos,
   endbyte);
if (err)
goto out;
 
invalidate_mapping_pages(file->f_mapping,
-pos >> PAGE_SHIFT,
+iocb->ki_pos >> PAGE_SHIFT,
 endbyte >> PAGE_SHIFT);
 
written += written_buffered;
-   iocb->ki_pos = pos + written_buffered;
+   iocb->ki_pos += written_buffered;
} else {
written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos);
-   if (written >= 0)
-   iocb->ki_pos += written;
}
 out:
current->backing_dev_info = NULL;
inode_unlock(inode);
-   if (written > 0)
-   written = generic_write_sync(iocb, written);
-
return written ? written : err;
 }
 
-- 
2.39.2



[Cluster-devel] [PATCH 05/13] filemap: add a kiocb_invalidate_pages helper

2023-05-19 Thread Christoph Hellwig
Factor out a helper that calls filemap_write_and_wait_range and
invalidate_inode_pages2_range for the range covered by a write kiocb, or
returns -EAGAIN if the kiocb is marked as nowait and there would be pages
to write or invalidate.

Signed-off-by: Christoph Hellwig 
---
 include/linux/pagemap.h |  1 +
 mm/filemap.c| 48 -
 2 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 36fc2cea13ce20..6e4c9ee40baa99 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -30,6 +30,7 @@ static inline void invalidate_remote_inode(struct inode 
*inode)
 int invalidate_inode_pages2(struct address_space *mapping);
 int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
+int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
 
 int write_inode_now(struct inode *, int sync);
 int filemap_fdatawrite(struct address_space *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 2d7712b13b95c9..8607220e20eae3 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2777,6 +2777,33 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t 
count)
return filemap_write_and_wait_range(mapping, pos, end);
 }
 
+int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
+{
+   struct address_space *mapping = iocb->ki_filp->f_mapping;
+   loff_t pos = iocb->ki_pos;
+   loff_t end = pos + count - 1;
+   int ret;
+
+   if (iocb->ki_flags & IOCB_NOWAIT) {
+   /* we could block if there are any pages in the range */
+   if (filemap_range_has_page(mapping, pos, end))
+   return -EAGAIN;
+   } else {
+   ret = filemap_write_and_wait_range(mapping, pos, end);
+   if (ret)
+   return ret;
+   }
+
+   /*
+* After a write we want buffered reads to be sure to go to disk to get
+* the new data.  We invalidate clean cached page from the region we're
+* about to write.  We do this *before* the write so that we can return
+* without clobbering -EIOCBQUEUED from ->direct_IO().
+*/
+   return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
+end >> PAGE_SHIFT);
+}
+
 /**
  * generic_file_read_iter - generic filesystem read routine
  * @iocb:  kernel I/O control block
@@ -3820,30 +3847,11 @@ generic_file_direct_write(struct kiocb *iocb, struct 
iov_iter *from)
write_len = iov_iter_count(from);
end = (pos + write_len - 1) >> PAGE_SHIFT;
 
-   if (iocb->ki_flags & IOCB_NOWAIT) {
-   /* If there are pages to writeback, return */
-   if (filemap_range_has_page(file->f_mapping, pos,
-  pos + write_len - 1))
-   return -EAGAIN;
-   } else {
-   written = filemap_write_and_wait_range(mapping, pos,
-   pos + write_len - 1);
-   if (written)
-   goto out;
-   }
-
-   /*
-* After a write we want buffered reads to be sure to go to disk to get
-* the new data.  We invalidate clean cached page from the region we're
-* about to write.  We do this *before* the write so that we can return
-* without clobbering -EIOCBQUEUED from ->direct_IO().
-*/
-   written = invalidate_inode_pages2_range(mapping,
-   pos >> PAGE_SHIFT, end);
/*
 * If a page can not be invalidated, return 0 to fall back
 * to buffered write.
 */
+   written = kiocb_invalidate_pages(iocb, write_len);
if (written) {
if (written == -EBUSY)
return 0;
-- 
2.39.2



[Cluster-devel] [PATCH 02/13] filemap: update ki_pos in generic_perform_write

2023-05-19 Thread Christoph Hellwig
All callers of generic_perform_write need to update ki_pos, so move that
update into the common code.

Signed-off-by: Christoph Hellwig 
---
 fs/ceph/file.c | 2 --
 fs/ext4/file.c | 9 +++--
 fs/f2fs/file.c | 1 -
 fs/nfs/file.c  | 1 -
 mm/filemap.c   | 8 
 5 files changed, 7 insertions(+), 14 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index f4d8bf7dec88a8..feeb9882ef635a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1894,8 +1894,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct 
iov_iter *from)
 * can not run at the same time
 */
written = generic_perform_write(iocb, from);
-   if (likely(written >= 0))
-   iocb->ki_pos = pos + written;
ceph_end_io_write(inode);
}
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d101b3b0c7dad8..50824831d31def 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -291,12 +291,9 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
 
 out:
inode_unlock(inode);
-   if (likely(ret > 0)) {
-   iocb->ki_pos += ret;
-   ret = generic_write_sync(iocb, ret);
-   }
-
-   return ret;
+   if (unlikely(ret <= 0))
+   return ret;
+   return generic_write_sync(iocb, ret);
 }
 
 static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 5ac53d2627d20d..9e3855e43a7a63 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4522,7 +4522,6 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb 
*iocb,
current->backing_dev_info = NULL;
 
if (ret > 0) {
-   iocb->ki_pos += ret;
f2fs_update_iostat(F2FS_I_SB(inode), inode,
APP_BUFFERED_IO, ret);
}
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f0edf5a36237d1..3cc87ae8473356 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -658,7 +658,6 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter 
*from)
goto out;
 
written = result;
-   iocb->ki_pos += written;
nfs_add_stats(inode, NFSIOS_NORMALWRITTENBYTES, written);
 
if (mntflags & NFS_MOUNT_WRITE_EAGER) {
diff --git a/mm/filemap.c b/mm/filemap.c
index b4c9bd368b7e58..4d0ec2fa1c7070 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3957,7 +3957,10 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct 
iov_iter *i)
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
 
-   return written ? written : status;
+   if (!written)
+   return status;
+   iocb->ki_pos += written;
+   return written;
 }
 EXPORT_SYMBOL(generic_perform_write);
 
@@ -4036,7 +4039,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
endbyte = pos + status - 1;
err = filemap_write_and_wait_range(mapping, pos, endbyte);
if (err == 0) {
-   iocb->ki_pos = endbyte + 1;
written += status;
invalidate_mapping_pages(mapping,
 pos >> PAGE_SHIFT,
@@ -4049,8 +4051,6 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, 
struct iov_iter *from)
}
} else {
written = generic_perform_write(iocb, from);
-   if (likely(written > 0))
-   iocb->ki_pos += written;
}
 out:
current->backing_dev_info = NULL;
-- 
2.39.2



[Cluster-devel] [PATCH 08/13] iomap: assign current->backing_dev_info in iomap_file_buffered_write

2023-05-19 Thread Christoph Hellwig
Move the assignment to current->backing_dev_info from the callers into
iomap_file_buffered_write to reduce boiler plate code and reduce the
scope to just around the page dirtying loop.

Note that zonefs was missing this assignment before.

Signed-off-by: Christoph Hellwig 
---
 fs/gfs2/file.c | 3 ---
 fs/iomap/buffered-io.c | 3 +++
 fs/xfs/xfs_file.c  | 5 -
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 499ef174dec138..261897fcfbc495 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -25,7 +25,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 
 #include "gfs2.h"
@@ -1041,11 +1040,9 @@ static ssize_t gfs2_file_buffered_write(struct kiocb 
*iocb,
goto out_unlock;
}
 
-   current->backing_dev_info = inode_to_bdi(inode);
pagefault_disable();
ret = iomap_file_buffered_write(iocb, from, _iomap_ops);
pagefault_enable();
-   current->backing_dev_info = NULL;
if (ret > 0)
written += ret;
 
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 550525a525c45c..b2779bd1f10611 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -3,6 +3,7 @@
  * Copyright (C) 2010 Red Hat, Inc.
  * Copyright (C) 2016-2019 Christoph Hellwig.
  */
+#include 
 #include 
 #include 
 #include 
@@ -869,8 +870,10 @@ iomap_file_buffered_write(struct kiocb *iocb, struct 
iov_iter *i,
if (iocb->ki_flags & IOCB_NOWAIT)
iter.flags |= IOMAP_NOWAIT;
 
+   current->backing_dev_info = inode_to_bdi(iter.inode);
while ((ret = iomap_iter(, ops)) > 0)
iter.processed = iomap_write_iter(, i);
+   current->backing_dev_info = NULL;
 
if (unlikely(ret < 0))
return ret;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index bfba10e0b0f3c2..98d763cc3b114c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -27,7 +27,6 @@
 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -717,9 +716,6 @@ xfs_file_buffered_write(
if (ret)
goto out;
 
-   /* We can write back this queue in page reclaim */
-   current->backing_dev_info = inode_to_bdi(inode);
-
trace_xfs_file_buffered_write(iocb, from);
ret = iomap_file_buffered_write(iocb, from,
_buffered_write_iomap_ops);
@@ -751,7 +747,6 @@ xfs_file_buffered_write(
goto write_retry;
}
 
-   current->backing_dev_info = NULL;
 out:
if (iolock)
xfs_iunlock(ip, iolock);
-- 
2.39.2



[Cluster-devel] [PATCH 09/13] iomap: use kiocb_write_and_wait and kiocb_invalidate_pages

2023-05-19 Thread Christoph Hellwig
Use the common helpers for direct I/O page invalidation instead of
open coding the logic.  This leads to a slight reordering of checks
in __iomap_dio_rw to keep the logic straight.

Signed-off-by: Christoph Hellwig 
---
 fs/iomap/direct-io.c | 55 
 1 file changed, 20 insertions(+), 35 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 45accd98344e79..ccf51d57619721 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -472,7 +472,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, void *private, size_t done_before)
 {
-   struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
struct iomap_iter iomi = {
.inode  = inode,
@@ -481,11 +480,11 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
.flags  = IOMAP_DIRECT,
.private= private,
};
-   loff_t end = iomi.pos + iomi.len - 1, ret = 0;
bool wait_for_completion =
is_sync_kiocb(iocb) || (dio_flags & IOMAP_DIO_FORCE_WAIT);
struct blk_plug plug;
struct iomap_dio *dio;
+   loff_t ret = 0;
 
trace_iomap_dio_rw_begin(iocb, iter, dio_flags, done_before);
 
@@ -509,31 +508,29 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
dio->submit.waiter = current;
dio->submit.poll_bio = NULL;
 
+   if (iocb->ki_flags & IOCB_NOWAIT)
+   iomi.flags |= IOMAP_NOWAIT;
+
if (iov_iter_rw(iter) == READ) {
if (iomi.pos >= dio->i_size)
goto out_free_dio;
 
-   if (iocb->ki_flags & IOCB_NOWAIT) {
-   if (filemap_range_needs_writeback(mapping, iomi.pos,
-   end)) {
-   ret = -EAGAIN;
-   goto out_free_dio;
-   }
-   iomi.flags |= IOMAP_NOWAIT;
-   }
-
if (user_backed_iter(iter))
dio->flags |= IOMAP_DIO_DIRTY;
+
+   ret = kiocb_write_and_wait(iocb, iomi.len);
+   if (ret)
+   goto out_free_dio;
} else {
iomi.flags |= IOMAP_WRITE;
dio->flags |= IOMAP_DIO_WRITE;
 
-   if (iocb->ki_flags & IOCB_NOWAIT) {
-   if (filemap_range_has_page(mapping, iomi.pos, end)) {
-   ret = -EAGAIN;
+   if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
+   ret = -EAGAIN;
+   if (iomi.pos >= dio->i_size ||
+   iomi.pos + iomi.len > dio->i_size)
goto out_free_dio;
-   }
-   iomi.flags |= IOMAP_NOWAIT;
+   iomi.flags |= IOMAP_OVERWRITE_ONLY;
}
 
/* for data sync or sync, we need sync completion processing */
@@ -549,31 +546,19 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
if (!(iocb->ki_flags & IOCB_SYNC))
dio->flags |= IOMAP_DIO_WRITE_FUA;
}
-   }
-
-   if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
-   ret = -EAGAIN;
-   if (iomi.pos >= dio->i_size ||
-   iomi.pos + iomi.len > dio->i_size)
-   goto out_free_dio;
-   iomi.flags |= IOMAP_OVERWRITE_ONLY;
-   }
 
-   ret = filemap_write_and_wait_range(mapping, iomi.pos, end);
-   if (ret)
-   goto out_free_dio;
-
-   if (iov_iter_rw(iter) == WRITE) {
/*
 * Try to invalidate cache pages for the range we are writing.
 * If this invalidation fails, let the caller fall back to
 * buffered I/O.
 */
-   if (invalidate_inode_pages2_range(mapping,
-   iomi.pos >> PAGE_SHIFT, end >> PAGE_SHIFT)) {
-   trace_iomap_dio_invalidate_fail(inode, iomi.pos,
-   iomi.len);
-   ret = -ENOTBLK;
+   ret = kiocb_invalidate_pages(iocb, iomi.len);
+   if (ret) {
+   if (ret != -EAGAIN) {
+   trace_iomap_dio_invalidate_fail(inode, iomi.pos,
+   iomi.len);
+   ret = -ENOTBLK;
+   }
goto out_free_dio;
}
 
-- 
2.39.2



[Cluster-devel] [PATCH 04/13] filemap: add a kiocb_write_and_wait helper

2023-05-19 Thread Christoph Hellwig
Factor out a helper that does filemap_write_and_wait_range for the
range covered by a read kiocb, or returns -EAGAIN if the kiocb
is marked as nowait and there would be pages to write.

Signed-off-by: Christoph Hellwig 
---
 block/fops.c| 18 +++---
 include/linux/pagemap.h |  2 ++
 mm/filemap.c| 30 ++
 3 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index d2e6be4e3d1c7d..c194939b851cfb 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -576,21 +576,9 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct 
iov_iter *to)
goto reexpand; /* skip atime */
 
if (iocb->ki_flags & IOCB_DIRECT) {
-   struct address_space *mapping = iocb->ki_filp->f_mapping;
-
-   if (iocb->ki_flags & IOCB_NOWAIT) {
-   if (filemap_range_needs_writeback(mapping, pos,
- pos + count - 1)) {
-   ret = -EAGAIN;
-   goto reexpand;
-   }
-   } else {
-   ret = filemap_write_and_wait_range(mapping, pos,
-  pos + count - 1);
-   if (ret < 0)
-   goto reexpand;
-   }
-
+   ret = kiocb_write_and_wait(iocb, count);
+   if (ret < 0)
+   goto reexpand;
file_accessed(iocb->ki_filp);
 
ret = blkdev_direct_IO(iocb, to);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a56308a9d1a450..36fc2cea13ce20 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -30,6 +30,7 @@ static inline void invalidate_remote_inode(struct inode 
*inode)
 int invalidate_inode_pages2(struct address_space *mapping);
 int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end);
+
 int write_inode_now(struct inode *, int sync);
 int filemap_fdatawrite(struct address_space *);
 int filemap_flush(struct address_space *);
@@ -54,6 +55,7 @@ int filemap_check_errors(struct address_space *mapping);
 void __filemap_set_wb_err(struct address_space *mapping, int err);
 int filemap_fdatawrite_wbc(struct address_space *mapping,
   struct writeback_control *wbc);
+int kiocb_write_and_wait(struct kiocb *iocb, size_t count);
 
 static inline int filemap_write_and_wait(struct address_space *mapping)
 {
diff --git a/mm/filemap.c b/mm/filemap.c
index bf693ad1da1ece..2d7712b13b95c9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2762,6 +2762,21 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter 
*iter,
 }
 EXPORT_SYMBOL_GPL(filemap_read);
 
+int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
+{
+   struct address_space *mapping = iocb->ki_filp->f_mapping;
+   loff_t pos = iocb->ki_pos;
+   loff_t end = pos + count - 1;
+
+   if (iocb->ki_flags & IOCB_NOWAIT) {
+   if (filemap_range_needs_writeback(mapping, pos, end))
+   return -EAGAIN;
+   return 0;
+   }
+
+   return filemap_write_and_wait_range(mapping, pos, end);
+}
+
 /**
  * generic_file_read_iter - generic filesystem read routine
  * @iocb:  kernel I/O control block
@@ -2797,18 +2812,9 @@ generic_file_read_iter(struct kiocb *iocb, struct 
iov_iter *iter)
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
 
-   if (iocb->ki_flags & IOCB_NOWAIT) {
-   if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
-   iocb->ki_pos + count - 1))
-   return -EAGAIN;
-   } else {
-   retval = filemap_write_and_wait_range(mapping,
-   iocb->ki_pos,
-   iocb->ki_pos + count - 1);
-   if (retval < 0)
-   return retval;
-   }
-
+   retval = kiocb_write_and_wait(iocb, count);
+   if (retval < 0)
+   return retval;
file_accessed(file);
 
retval = mapping->a_ops->direct_IO(iocb, iter);
-- 
2.39.2



[Cluster-devel] [PATCH 01/13] iomap: update ki_pos a little later in iomap_dio_complete

2023-05-19 Thread Christoph Hellwig
Move the ki_pos update down a bit to prepare for a better common
helper that invalidates pages based on an iocb.

Signed-off-by: Christoph Hellwig 
---
 fs/iomap/direct-io.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c
index 019cc87d0fb339..6207a59d2162e1 100644
--- a/fs/iomap/direct-io.c
+++ b/fs/iomap/direct-io.c
@@ -94,7 +94,6 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
if (offset + ret > dio->i_size &&
!(dio->flags & IOMAP_DIO_WRITE))
ret = dio->i_size - offset;
-   iocb->ki_pos += ret;
}
 
/*
@@ -120,19 +119,21 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
}
 
inode_dio_end(file_inode(iocb->ki_filp));
-   /*
-* If this is a DSYNC write, make sure we push it to stable storage now
-* that we've written data.
-*/
-   if (ret > 0 && (dio->flags & IOMAP_DIO_NEED_SYNC))
-   ret = generic_write_sync(iocb, ret);
 
-   if (ret > 0)
-   ret += dio->done_before;
+   if (ret > 0) {
+   iocb->ki_pos += ret;
 
+   /*
+* If this is a DSYNC write, make sure we push it to stable
+* storage now that we've written data.
+*/
+   if (dio->flags & IOMAP_DIO_NEED_SYNC)
+   ret = generic_write_sync(iocb, ret);
+   if (ret > 0)
+   ret += dio->done_before;
+   }
trace_iomap_dio_complete(iocb, dio->error, ret);
kfree(dio);
-
return ret;
 }
 EXPORT_SYMBOL_GPL(iomap_dio_complete);
-- 
2.39.2



[Cluster-devel] [PATCH 03/13] filemap: assign current->backing_dev_info in generic_perform_write

2023-05-19 Thread Christoph Hellwig
Move the assignment to current->backing_dev_info from the callers into
generic_perform_write to reduce boiler plate code and reduce the scope
to just around the page dirtying loop.

Signed-off-by: Christoph Hellwig 
---
 fs/ceph/file.c | 4 
 fs/ext4/file.c | 3 ---
 fs/f2fs/file.c | 2 --
 fs/nfs/file.c  | 5 +
 mm/filemap.c   | 2 ++
 5 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index feeb9882ef635a..767f4dfe7def64 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1791,9 +1791,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct 
iov_iter *from)
else
ceph_start_io_write(inode);
 
-   /* We can write back this queue in page reclaim */
-   current->backing_dev_info = inode_to_bdi(inode);
-
if (iocb->ki_flags & IOCB_APPEND) {
err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
if (err < 0)
@@ -1938,7 +1935,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct 
iov_iter *from)
ceph_end_io_write(inode);
 out_unlocked:
ceph_free_cap_flush(prealloc_cf);
-   current->backing_dev_info = NULL;
return written ? written : err;
 }
 
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 50824831d31def..3cb83a3e2e4a2a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -29,7 +29,6 @@
 #include 
 #include 
 #include 
-#include 
 #include "ext4.h"
 #include "ext4_jbd2.h"
 #include "xattr.h"
@@ -285,9 +284,7 @@ static ssize_t ext4_buffered_write_iter(struct kiocb *iocb,
if (ret <= 0)
goto out;
 
-   current->backing_dev_info = inode_to_bdi(inode);
ret = generic_perform_write(iocb, from);
-   current->backing_dev_info = NULL;
 
 out:
inode_unlock(inode);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 9e3855e43a7a63..7134fe8bd008cb 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -4517,9 +4517,7 @@ static ssize_t f2fs_buffered_write_iter(struct kiocb 
*iocb,
if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;
 
-   current->backing_dev_info = inode_to_bdi(inode);
ret = generic_perform_write(iocb, from);
-   current->backing_dev_info = NULL;
 
if (ret > 0) {
f2fs_update_iostat(F2FS_I_SB(inode), inode,
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3cc87ae8473356..e8bb4c48a3210a 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -648,11 +648,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
since = filemap_sample_wb_err(file->f_mapping);
nfs_start_io_write(inode);
result = generic_write_checks(iocb, from);
-   if (result > 0) {
-   current->backing_dev_info = inode_to_bdi(inode);
+   if (result > 0)
result = generic_perform_write(iocb, from);
-   current->backing_dev_info = NULL;
-   }
nfs_end_io_write(inode);
if (result <= 0)
goto out;
diff --git a/mm/filemap.c b/mm/filemap.c
index 4d0ec2fa1c7070..bf693ad1da1ece 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3892,6 +3892,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
long status = 0;
ssize_t written = 0;
 
+   current->backing_dev_info = inode_to_bdi(mapping->host);
do {
struct page *page;
unsigned long offset;   /* Offset into pagecache page */
@@ -3956,6 +3957,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
 
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
+   current->backing_dev_info = NULL;
 
if (!written)
return status;
-- 
2.39.2



[Cluster-devel] [PATCH 13/13] fuse: use direct_write_fallback

2023-05-19 Thread Christoph Hellwig
Use the generic direct_write_fallback helper instead of duplicating the
logic.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/fuse/file.c | 27 +++------------------------
 1 file changed, 3 insertions(+), 24 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 5f7b58798f99fc..02ab446ab57f1f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1340,11 +1340,9 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
ssize_t written = 0;
-   ssize_t written_buffered = 0;
struct inode *inode = mapping->host;
ssize_t err;
struct fuse_conn *fc = get_fuse_conn(inode);
-   loff_t endbyte = 0;
 
if (fc->writeback_cache) {
/* Update size (EOF optimization) and mode (SUID clearing) */
@@ -1382,28 +1380,9 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
if (iocb->ki_flags & IOCB_DIRECT) {
written = generic_file_direct_write(iocb, from);
-   if (written < 0 || !iov_iter_count(from))
-   goto out;
-
-   written_buffered = fuse_perform_write(iocb, from);
-   if (written_buffered < 0) {
-   err = written_buffered;
-   goto out;
-   }
-   endbyte = iocb->ki_pos + written_buffered - 1;
-
-   err = filemap_write_and_wait_range(file->f_mapping,
-  iocb->ki_pos,
-  endbyte);
-   if (err)
-   goto out;
-
-   invalidate_mapping_pages(file->f_mapping,
-iocb->ki_pos >> PAGE_SHIFT,
-endbyte >> PAGE_SHIFT);
-
-   written += written_buffered;
-   iocb->ki_pos += written_buffered;
+   if (written >= 0 && iov_iter_count(from))
+   written = direct_write_fallback(iocb, from, written,
+   fuse_perform_write(iocb, from));
} else {
written = fuse_perform_write(iocb, from);
}
-- 
2.39.2



[Cluster-devel] [PATCH 12/13] fuse: drop redundant arguments to fuse_perform_write

2023-05-19 Thread Christoph Hellwig
pos is always equal to iocb->ki_pos, and mapping is always equal to
iocb->ki_filp->f_mapping.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/fuse/file.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index fd2f27f2144750..5f7b58798f99fc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1280,13 +1280,13 @@ static inline unsigned int fuse_wr_pages(loff_t pos, size_t len,
 max_pages);
 }
 
-static ssize_t fuse_perform_write(struct kiocb *iocb,
- struct address_space *mapping,
- struct iov_iter *ii, loff_t pos)
+static ssize_t fuse_perform_write(struct kiocb *iocb, struct iov_iter *ii)
 {
+   struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
struct fuse_conn *fc = get_fuse_conn(inode);
struct fuse_inode *fi = get_fuse_inode(inode);
+   loff_t pos = iocb->ki_pos;
int err = 0;
ssize_t res = 0;
 
@@ -1385,8 +1385,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (written < 0 || !iov_iter_count(from))
goto out;
 
-   written_buffered = fuse_perform_write(iocb, mapping, from,
- iocb->ki_pos);
+   written_buffered = fuse_perform_write(iocb, from);
if (written_buffered < 0) {
err = written_buffered;
goto out;
@@ -1406,7 +1405,7 @@ static ssize_t fuse_cache_write_iter(struct kiocb *iocb, struct iov_iter *from)
written += written_buffered;
iocb->ki_pos += written_buffered;
} else {
-   written = fuse_perform_write(iocb, mapping, from, iocb->ki_pos);
+   written = fuse_perform_write(iocb, from);
}
 out:
current->backing_dev_info = NULL;
-- 
2.39.2