[Cluster-devel] [PATCH v5 09/45] gfs2: dynamically allocate the gfs2-qd shrinker

2023-08-23 Thread Qi Zheng
Use new APIs to dynamically allocate the gfs2-qd shrinker.

Signed-off-by: Qi Zheng 
Reviewed-by: Muchun Song 
CC: Bob Peterson 
CC: Andreas Gruenbacher 
CC: cluster-devel@redhat.com
---
 fs/gfs2/main.c  |  6 +++---
 fs/gfs2/quota.c | 26 --
 fs/gfs2/quota.h |  3 ++-
 3 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index afcb32854f14..e47b1cc79f59 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -147,7 +147,7 @@ static int __init init_gfs2_fs(void)
if (!gfs2_trans_cachep)
goto fail_cachep8;
 
-   error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd");
+   error = gfs2_qd_shrinker_init();
if (error)
goto fail_shrinker;
 
@@ -196,7 +196,7 @@ static int __init init_gfs2_fs(void)
 fail_wq2:
destroy_workqueue(gfs_recovery_wq);
 fail_wq1:
-   unregister_shrinker(&gfs2_qd_shrinker);
+   gfs2_qd_shrinker_exit();
 fail_shrinker:
kmem_cache_destroy(gfs2_trans_cachep);
 fail_cachep8:
@@ -229,7 +229,7 @@ static int __init init_gfs2_fs(void)
 
 static void __exit exit_gfs2_fs(void)
 {
-   unregister_shrinker(&gfs2_qd_shrinker);
+   gfs2_qd_shrinker_exit();
gfs2_glock_exit();
gfs2_unregister_debugfs();
unregister_filesystem(&gfs2_fs_type);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 632806c5ed67..d1e4d8ab8fa1 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -186,13 +186,27 @@ static unsigned long gfs2_qd_shrink_count(struct shrinker 
*shrink,
return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
 }
 
-struct shrinker gfs2_qd_shrinker = {
-   .count_objects = gfs2_qd_shrink_count,
-   .scan_objects = gfs2_qd_shrink_scan,
-   .seeks = DEFAULT_SEEKS,
-   .flags = SHRINKER_NUMA_AWARE,
-};
+static struct shrinker *gfs2_qd_shrinker;
+
+int __init gfs2_qd_shrinker_init(void)
+{
+   gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd");
+   if (!gfs2_qd_shrinker)
+   return -ENOMEM;
+
+   gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count;
+   gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan;
+   gfs2_qd_shrinker->seeks = DEFAULT_SEEKS;
+
+   shrinker_register(gfs2_qd_shrinker);
 
+   return 0;
+}
+
+void gfs2_qd_shrinker_exit(void)
+{
+   shrinker_free(gfs2_qd_shrinker);
+}
 
 static u64 qd2index(struct gfs2_quota_data *qd)
 {
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 21ada332d555..f0d54dcbbc75 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -59,7 +59,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
 }
 
 extern const struct quotactl_ops gfs2_quotactl_ops;
-extern struct shrinker gfs2_qd_shrinker;
+int __init gfs2_qd_shrinker_init(void);
+void gfs2_qd_shrinker_exit(void);
 extern struct list_lru gfs2_qd_lru;
 extern void __init gfs2_quota_hash_init(void);
 
-- 
2.30.2



[Cluster-devel] [PATCH v5 08/45] gfs2: dynamically allocate the gfs2-glock shrinker

2023-08-23 Thread Qi Zheng
Use new APIs to dynamically allocate the gfs2-glock shrinker.

Signed-off-by: Qi Zheng 
Reviewed-by: Muchun Song 
CC: Bob Peterson 
CC: Andreas Gruenbacher 
CC: cluster-devel@redhat.com
---
 fs/gfs2/glock.c | 20 +++-
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 675bfec77706..fd3eba1856a5 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2039,11 +2039,7 @@ static unsigned long gfs2_glock_shrink_count(struct 
shrinker *shrink,
return vfs_pressure_ratio(atomic_read(&lru_count));
 }
 
-static struct shrinker glock_shrinker = {
-   .seeks = DEFAULT_SEEKS,
-   .count_objects = gfs2_glock_shrink_count,
-   .scan_objects = gfs2_glock_shrink_scan,
-};
+static struct shrinker *glock_shrinker;
 
 /**
  * glock_hash_walk - Call a function for glock in a hash bucket
@@ -2463,13 +2459,19 @@ int __init gfs2_glock_init(void)
return -ENOMEM;
}
 
-   ret = register_shrinker(&glock_shrinker, "gfs2-glock");
-   if (ret) {
+   glock_shrinker = shrinker_alloc(0, "gfs2-glock");
+   if (!glock_shrinker) {
destroy_workqueue(glock_workqueue);
rhashtable_destroy(&gl_hash_table);
-   return ret;
+   return -ENOMEM;
}
 
+   glock_shrinker->count_objects = gfs2_glock_shrink_count;
+   glock_shrinker->scan_objects = gfs2_glock_shrink_scan;
+   glock_shrinker->seeks = DEFAULT_SEEKS;
+
+   shrinker_register(glock_shrinker);
+
for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++)
init_waitqueue_head(glock_wait_table + i);
 
@@ -2478,7 +2480,7 @@ int __init gfs2_glock_init(void)
 
 void gfs2_glock_exit(void)
 {
-   unregister_shrinker(&glock_shrinker);
+   shrinker_free(glock_shrinker);
rhashtable_destroy(&gl_hash_table);
destroy_workqueue(glock_workqueue);
 }
-- 
2.30.2



[Cluster-devel] [PATCH 0/7] lockd: dlm: async lock request changes

2023-08-23 Thread Alexander Aring
Hi,

I sent this as a PATCH now and drop the RFC. I got some review back from
Jeff Layton and hope I was successful to follow it. There are still
issues with lockd and asynchronous lock request but it will at least not
directly crash when somebody is using nfs on top of GFS2. The remaining
issues are related to the cancellation of locks, and to lockd deciding to
drop the nlm_block for some reason while a pending request is in flight.

I did not change more documentation about the asynchronous lock request
functionality to not confuse users. In my opinion there should be more
documentation about what you SHOULD NOT do with this API. Otherwise I
think the documentation is still correct.

I will send a follow up patch to fsdevel to change all users using
IS_SETLKW() to use FL_SLEEP to determine if it's blocking or non-blocking
and send it to fsdevel as it was recommended. This will just be a grep
and replace patch and look what happens. 

- Alex

changes since RFC:

- drop the pending callback flag but introduce "lockd: don't call
  vfs_lock_file() for pending requests", see patch why I need it.
- drop per nlm_block callback mutex
- change async lock request documentation
- use always FL_SLEEP to determine if it's blocking vs non-blocking in
  DLM

Alexander Aring (7):
  lockd: introduce safe async lock op
  lockd: don't call vfs_lock_file() for pending requests
  lockd: fix race in async lock request handling
  lockd: add doc to enable EXPORT_OP_SAFE_ASYNC_LOCK
  dlm: use fl_owner from lockd
  dlm: use FL_SLEEP to determine blocking vs non-blocking
  dlm: implement EXPORT_OP_SAFE_ASYNC_LOCK

 fs/dlm/plock.c   | 20 +---
 fs/gfs2/export.c |  1 +
 fs/lockd/svclock.c   | 40 +++-
 fs/locks.c   | 12 +++-
 fs/nfsd/nfs4state.c  | 13 ++---
 fs/ocfs2/export.c|  1 +
 include/linux/exportfs.h |  8 
 7 files changed, 59 insertions(+), 36 deletions(-)

-- 
2.31.1



[Cluster-devel] [PATCH 3/7] lockd: fix race in async lock request handling

2023-08-23 Thread Alexander Aring
This patch fixes a race in async lock request handling between adding
the relevant struct nlm_block to the nlm_blocked list after the request was
sent by vfs_lock_file(), and nlmsvc_grant_deferred() doing a lookup of the
nlm_block in the nlm_blocked list. It could be that the async request is
completed before the nlm_block was added to the list. This would end
in a -ENOENT and a kernel log message of "lockd: grant for unknown
block".

To solve this issue we add the nlm_block before the vfs_lock_file() call
to be sure it has been added when a possible nlmsvc_grant_deferred() is
called. If the vfs_lock_file() call results in a case in which the
nlm_block wouldn't have been added to the nlm_blocked list, the nlm_block
struct is removed from this list again.

Introducing the new B_PENDING_CALLBACK nlm_block flag handles async lock
requests on an already pending lock request as a retry at the caller
level, so that the -EAGAIN case is hit.

Signed-off-by: Alexander Aring 
---
 fs/lockd/svclock.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index aa4174fbaf5b..3b158446203b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -546,6 +546,9 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
ret = nlm_lck_blocked;
goto out;
}
+
+   /* Append to list of blocked */
+   nlmsvc_insert_block_locked(block, NLM_NEVER);
spin_unlock(&nlm_blocked_lock);
 
if (!wait)
@@ -557,9 +560,12 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
dprintk("lockd: vfs_lock_file returned %d\n", error);
switch (error) {
case 0:
+   nlmsvc_remove_block(block);
ret = nlm_granted;
goto out;
case -EAGAIN:
+   if (!wait)
+   nlmsvc_remove_block(block);
ret = async_block ? nlm_lck_blocked : nlm_lck_denied;
goto out;
case FILE_LOCK_DEFERRED:
@@ -570,17 +576,16 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
ret = nlmsvc_defer_lock_rqst(rqstp, block);
goto out;
case -EDEADLK:
+   nlmsvc_remove_block(block);
ret = nlm_deadlock;
goto out;
default:/* includes ENOLCK */
+   nlmsvc_remove_block(block);
ret = nlm_lck_denied_nolocks;
goto out;
}
 
ret = nlm_lck_blocked;
-
-   /* Append to list of blocked */
-   nlmsvc_insert_block(block, NLM_NEVER);
 out:
mutex_unlock(&file->f_mutex);
nlmsvc_release_block(block);
-- 
2.31.1



[Cluster-devel] [PATCH 2/7] lockd: don't call vfs_lock_file() for pending requests

2023-08-23 Thread Alexander Aring
This patch returns nlm_lck_blocked in nlmsvc_lock() when an asynchronous
lock request is pending. During testing I ran into the case with the
side-effects that lockd is waiting for only one lm_grant() callback
because it's already part of the nlm_blocked list. If another
asynchronous request for the same nlm_block is triggered, two lm_grant()
callbacks will occur but lockd will only be waiting for one.

To avoid changing behaviour for existing users, this handling is only
applied when export_op_support_safe_async_lock() returns true.

Signed-off-by: Alexander Aring 
---
 fs/lockd/svclock.c | 24 +---
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 6e3b230e8317..aa4174fbaf5b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -531,6 +531,23 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
goto out;
}
 
+   spin_lock(&nlm_blocked_lock);
+   /*
+* If this is a lock request for an already pending
+* lock request we return nlm_lck_blocked without calling
+* vfs_lock_file() again. Otherwise we have two pending
+* requests on the underlaying ->lock() implementation but
+* only one nlm_block to being granted by lm_grant().
+*/
+   if (export_op_support_safe_async_lock(inode->i_sb->s_export_op,
+ nlmsvc_file_file(file)->f_op) &&
+   !list_empty(&block->b_list)) {
+   spin_unlock(&nlm_blocked_lock);
+   ret = nlm_lck_blocked;
+   goto out;
+   }
+   spin_unlock(&nlm_blocked_lock);
+
if (!wait)
lock->fl.fl_flags &= ~FL_SLEEP;
mode = lock_to_openmode(&lock->fl);
@@ -543,13 +560,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
ret = nlm_granted;
goto out;
case -EAGAIN:
-   /*
-* If this is a blocking request for an
-* already pending lock request then we need
-* to put it back on lockd's block list
-*/
-   if (wait)
-   break;
ret = async_block ? nlm_lck_blocked : nlm_lck_denied;
goto out;
case FILE_LOCK_DEFERRED:
-- 
2.31.1



[Cluster-devel] [PATCH 1/7] lockd: introduce safe async lock op

2023-08-23 Thread Alexander Aring
This patch reverts mostly commit 40595cdc93ed ("nfs: block notification
on fs with its own ->lock") and introduces an EXPORT_OP_SAFE_ASYNC_LOCK
export flag to signal that the "own ->lock" implementation supports
async lock requests. The only main user is DLM that is used by GFS2 and
OCFS2 filesystem. Those implement their own lock() implementation and
return FILE_LOCK_DEFERRED as return value. Since commit 40595cdc93ed
("nfs: block notification on fs with its own ->lock") the DLM
implementation was never updated. This patch prepares for DLM
to set the EXPORT_OP_SAFE_ASYNC_LOCK export flag and for the DLM
plock implementation to be updated accordingly.

Acked-by: Jeff Layton 
Signed-off-by: Alexander Aring 
---
 fs/lockd/svclock.c   |  5 ++---
 fs/nfsd/nfs4state.c  | 13 ++---
 include/linux/exportfs.h |  8 
 3 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index c43ccdf28ed9..6e3b230e8317 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -470,9 +470,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
struct nlm_host *host, struct nlm_lock *lock, int wait,
struct nlm_cookie *cookie, int reclaim)
 {
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
struct inode*inode = nlmsvc_file_inode(file);
-#endif
struct nlm_block*block = NULL;
int error;
int mode;
@@ -486,7 +484,8 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
(long long)lock->fl.fl_end,
wait);
 
-   if (nlmsvc_file_file(file)->f_op->lock) {
+   if (!export_op_support_safe_async_lock(inode->i_sb->s_export_op,
+  nlmsvc_file_file(file)->f_op)) {
async_block = wait;
wait = 0;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3aefbad4cc09..14ca06424ff1 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -7430,6 +7430,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct 
nfsd4_compound_state *cstate,
struct nfsd4_blocked_lock *nbl = NULL;
struct file_lock *file_lock = NULL;
struct file_lock *conflock = NULL;
+   struct super_block *sb;
__be32 status = 0;
int lkflg;
int err;
@@ -7451,6 +7452,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct 
nfsd4_compound_state *cstate,
dprintk("NFSD: nfsd4_lock: permission denied!\n");
return status;
}
+   sb = cstate->current_fh.fh_dentry->d_sb;
 
if (lock->lk_is_new) {
if (nfsd4_has_session(cstate))
@@ -7502,7 +7504,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct 
nfsd4_compound_state *cstate,
fp = lock_stp->st_stid.sc_file;
switch (lock->lk_type) {
case NFS4_READW_LT:
-   if (nfsd4_has_session(cstate))
+   if (nfsd4_has_session(cstate) ||
+   export_op_support_safe_async_lock(sb->s_export_op,
+ 
nf->nf_file->f_op))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_READ_LT:
@@ -7514,7 +7518,9 @@ nfsd4_lock(struct svc_rqst *rqstp, struct 
nfsd4_compound_state *cstate,
fl_type = F_RDLCK;
break;
case NFS4_WRITEW_LT:
-   if (nfsd4_has_session(cstate))
+   if (nfsd4_has_session(cstate) ||
+   export_op_support_safe_async_lock(sb->s_export_op,
+ 
nf->nf_file->f_op))
fl_flags |= FL_SLEEP;
fallthrough;
case NFS4_WRITE_LT:
@@ -7542,7 +7548,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct 
nfsd4_compound_state *cstate,
 * for file locks), so don't attempt blocking lock notifications
 * on those filesystems:
 */
-   if (nf->nf_file->f_op->lock)
+   if (!export_op_support_safe_async_lock(sb->s_export_op,
+  nf->nf_file->f_op))
fl_flags &= ~FL_SLEEP;
 
nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index 11fbd0ee1370..10358a93cdc1 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -3,6 +3,7 @@
 #define LINUX_EXPORTFS_H 1
 
 #include 
+#include 
 
 struct dentry;
 struct iattr;
@@ -224,9 +225,16 @@ struct export_operations {
  atomic attribute updates
*/
 #define EXPORT_OP_FLUSH_ON_CLOSE   (0x20) /* fs flushes file data on close 
*/
+#define EXPORT_OP_SAFE_ASYNC_LOCK 

[Cluster-devel] [PATCH 4/7] lockd: add doc to enable EXPORT_OP_SAFE_ASYNC_LOCK

2023-08-23 Thread Alexander Aring
This patch adds a note to enable EXPORT_OP_SAFE_ASYNC_LOCK for
asynchronous lock request handling.

Signed-off-by: Alexander Aring 
---
 fs/locks.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index df8b26a42524..edee02d1ca93 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2255,11 +2255,13 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, 
struct flock *flock)
  * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
  * locks, the ->lock() interface may return asynchronously, before the lock has
  * been granted or denied by the underlying filesystem, if (and only if)
- * lm_grant is set. Callers expecting ->lock() to return asynchronously
- * will only use F_SETLK, not F_SETLKW; they will set FL_SLEEP if (and only if)
- * the request is for a blocking lock. When ->lock() does return 
asynchronously,
- * it must return FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock
- * request completes.
+ * lm_grant is set. Additionally EXPORT_OP_SAFE_ASYNC_LOCK in export_operations
+ * flags need to be set.
+ *
+ * Callers expecting ->lock() to return asynchronously will only use F_SETLK,
+ * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
+ * blocking lock. When ->lock() does return asynchronously, it must return
+ * FILE_LOCK_DEFERRED, and call ->lm_grant() when the lock request completes.
  * If the request is for non-blocking lock the file system should return
  * FILE_LOCK_DEFERRED then try to get the lock and call the callback routine
  * with the result. If the request timed out the callback routine will return a
-- 
2.31.1



[Cluster-devel] [PATCH 6/7] dlm: use FL_SLEEP to determine blocking vs non-blocking

2023-08-23 Thread Alexander Aring
This patch uses the FL_SLEEP flag in struct file_lock to determine if
the lock request is a blocking or non-blocking request. Before this
patch, DLM was using IS_SETLKW(), which is not usable for lock requests
coming from lockd when EXPORT_OP_SAFE_ASYNC_LOCK is set in the export
flags.

Signed-off-by: Alexander Aring 
---
 fs/dlm/plock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 0094fa4004cc..0c6ed5eeb840 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -140,7 +140,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, 
struct file *file,
op->info.optype = DLM_PLOCK_OP_LOCK;
op->info.pid= fl->fl_pid;
op->info.ex = (fl->fl_type == F_WRLCK);
-   op->info.wait   = IS_SETLKW(cmd);
+   op->info.wait   = !!(fl->fl_flags & FL_SLEEP);
op->info.fsid   = ls->ls_global_id;
op->info.number = number;
op->info.start  = fl->fl_start;
-- 
2.31.1



[Cluster-devel] [PATCH 7/7] dlm: implement EXPORT_OP_SAFE_ASYNC_LOCK

2023-08-23 Thread Alexander Aring
This patch is activating the EXPORT_OP_SAFE_ASYNC_LOCK export flag to
signal lockd that both filesystems are able to handle async lock
requests. The cluster filesystems gfs2 and ocfs2 will redirect their
lock requests to DLMs plock implementation that can handle async lock
requests.

Signed-off-by: Alexander Aring 
---
 fs/gfs2/export.c  | 1 +
 fs/ocfs2/export.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index cf40895233f5..36bc43b9d141 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -192,5 +192,6 @@ const struct export_operations gfs2_export_ops = {
.fh_to_parent = gfs2_fh_to_parent,
.get_name = gfs2_get_name,
.get_parent = gfs2_get_parent,
+   .flags = EXPORT_OP_SAFE_ASYNC_LOCK,
 };
 
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index eaa8c80ace3c..8a1169e01dd9 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -280,4 +280,5 @@ const struct export_operations ocfs2_export_ops = {
.fh_to_dentry   = ocfs2_fh_to_dentry,
.fh_to_parent   = ocfs2_fh_to_parent,
.get_parent = ocfs2_get_parent,
+   .flags  = EXPORT_OP_SAFE_ASYNC_LOCK,
 };
-- 
2.31.1



[Cluster-devel] [PATCH 5/7] dlm: use fl_owner from lockd

2023-08-23 Thread Alexander Aring
This patch is changing the fl_owner value in case of an nfs lock request
to not be the pid of lockd. Instead this patch changes it to be the
owner value that nfs is giving us.

Currently there exist proven problems with this behaviour. One nfsd
server was created to export a gfs2 filesystem mount. Two nfs clients
doing a nfs mount of this export. Those two clients should conflict each
other operating on the same nfs file.

A small test program was written:

int main(int argc, const char *argv[])
{
struct flock fl = {
.l_type = F_WRLCK,
.l_whence = SEEK_SET,
.l_start = 1L,
.l_len = 1L,
};
int fd;

fd = open("filename", O_RDWR | O_CREAT, 0700);
printf("try to lock...\n");
fcntl(fd, F_SETLKW, &fl);
printf("locked!\n");
getc(stdin);

return 0;
}

Run it on both clients at the same time without interrupting it by
pressing any key. It will show that both clients are able to acquire the
lock, which shouldn't be the case. The issue here is that the fl_owner
value is the same, whereas the lock contexts of the two clients should
be separate.

This patch lets lockd define how to deal with lock contexts and,
hopefully, choose the right fl_owner value. A test was made after this
patch, and the locks conflict with each other, as should be the case.

Acked-by: Jeff Layton 
Signed-off-by: Alexander Aring 
---
 fs/dlm/plock.c | 18 --
 1 file changed, 4 insertions(+), 14 deletions(-)

diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 00e1d802a81c..0094fa4004cc 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -145,6 +145,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, 
struct file *file,
op->info.number = number;
op->info.start  = fl->fl_start;
op->info.end= fl->fl_end;
+   op->info.owner = (__u64)(long)fl->fl_owner;
/* async handling */
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
@@ -154,9 +155,6 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, 
struct file *file,
goto out;
}
 
-   /* fl_owner is lockd which doesn't distinguish
-  processes on the nfs client */
-   op->info.owner  = (__u64) fl->fl_pid;
op_data->callback = fl->fl_lmops->lm_grant;
locks_init_lock(&op_data->flc);
locks_copy_lock(&op_data->flc, fl);
@@ -168,8 +166,6 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, 
struct file *file,
send_op(op);
rv = FILE_LOCK_DEFERRED;
goto out;
-   } else {
-   op->info.owner  = (__u64)(long) fl->fl_owner;
}
 
send_op(op);
@@ -326,10 +322,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 
number, struct file *file,
op->info.number = number;
op->info.start  = fl->fl_start;
op->info.end= fl->fl_end;
-   if (fl->fl_lmops && fl->fl_lmops->lm_grant)
-   op->info.owner  = (__u64) fl->fl_pid;
-   else
-   op->info.owner  = (__u64)(long) fl->fl_owner;
+   op->info.owner = (__u64)(long)fl->fl_owner;
 
if (fl->fl_flags & FL_CLOSE) {
op->info.flags |= DLM_PLOCK_FL_CLOSE;
@@ -389,7 +382,7 @@ int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 
number, struct file *file,
info.number = number;
info.start = fl->fl_start;
info.end = fl->fl_end;
-   info.owner = (__u64)fl->fl_pid;
+   info.owner = (__u64)(long)fl->fl_owner;
 
rv = do_lock_cancel(&info);
switch (rv) {
@@ -450,10 +443,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, 
struct file *file,
op->info.number = number;
op->info.start  = fl->fl_start;
op->info.end= fl->fl_end;
-   if (fl->fl_lmops && fl->fl_lmops->lm_grant)
-   op->info.owner  = (__u64) fl->fl_pid;
-   else
-   op->info.owner  = (__u64)(long) fl->fl_owner;
+   op->info.owner = (__u64)(long)fl->fl_owner;
 
send_op(op);
wait_event(recv_wq, (op->done != 0));
-- 
2.31.1



[Cluster-devel] [PATCH] generic: add fcntl corner cases tests

2023-08-23 Thread Alexander Aring
This patch adds generic 730 testcase. It will test on various fcntl()
corner cases that were causing problems on a GFS2 filesystem. GFS2 has
its own lock() implementation that has its own posix lock
implementation behind it. There are testcases to find issues with struct
file_lock matches. Currently the Linux kernel does not have a unique
identifier per lock request to e.g. find the original lock request when
a complete handler of an async lock request comes back. The current way
is to use struct file_lock fields to find the original lock request.
However, issues were found where in some cases it wasn't a
unique match because multiple pending struct file_lock instances could have the
same state. To find issues the testcases fcntl_lock_equal_file_lock and
fcntl_lock_same_owner are introduced and their OFD variants.

Other tests, like fcntl_lock_kill_child, exercise cleanup routines when a
process that is blocking in F_SETLKW, waiting for the lock request to be
granted, gets killed.

A similar test is fcntl_lock_signal_interrupt which checks for
side-effects e.g. unlock all previous acquired locks when a blocking
F_SETLKW gets interrupted by a signal.

Signed-off-by: Alexander Aring 
---
 src/Makefile |   5 +-
 src/fcntl_lock_equal_file_lock.c | 140 +++
 src/fcntl_lock_equal_file_lock_ofd.c | 144 
 src/fcntl_lock_kill_child.c  | 148 +
 src/fcntl_lock_same_owner.c  | 146 
 src/fcntl_lock_same_owner_ofd.c  | 144 
 src/fcntl_lock_signal_interrupt.c| 159 +++
 tests/generic/730|  70 
 tests/generic/730.out|   2 +
 9 files changed, 957 insertions(+), 1 deletion(-)
 create mode 100644 src/fcntl_lock_equal_file_lock.c
 create mode 100644 src/fcntl_lock_equal_file_lock_ofd.c
 create mode 100644 src/fcntl_lock_kill_child.c
 create mode 100644 src/fcntl_lock_same_owner.c
 create mode 100644 src/fcntl_lock_same_owner_ofd.c
 create mode 100644 src/fcntl_lock_signal_interrupt.c
 create mode 100755 tests/generic/730
 create mode 100644 tests/generic/730.out

diff --git a/src/Makefile b/src/Makefile
index 24cd4747..e633f748 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -19,7 +19,10 @@ TARGETS = dirstress fill fill2 getpagesize holes lstat64 \
t_ofd_locks t_mmap_collision mmap-write-concurrent \
t_get_file_time t_create_short_dirs t_create_long_dirs t_enospc \
t_mmap_writev_overlap checkpoint_journal mmap-rw-fault allocstale \
-   t_mmap_cow_memory_failure fake-dump-rootino dio-buf-fault
+   t_mmap_cow_memory_failure fake-dump-rootino dio-buf-fault \
+   fcntl_lock_equal_file_lock fcntl_lock_equal_file_lock_ofd \
+   fcntl_lock_kill_child fcntl_lock_same_owner fcntl_lock_same_owner_ofd \
+   fcntl_lock_signal_interrupt
 
 LINUX_TARGETS = xfsctl bstat t_mtab getdevicesize preallo_rw_pattern_reader \
preallo_rw_pattern_writer ftrunc trunc fs_perms testx looptest \
diff --git a/src/fcntl_lock_equal_file_lock.c b/src/fcntl_lock_equal_file_lock.c
new file mode 100644
index ..38b111c5
--- /dev/null
+++ b/src/fcntl_lock_equal_file_lock.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * This program tests fcntl() operations that have two
+ * of struct file_lock in the kernel in waiting state. Those
+ * two struct file_lock have exact the identitcal fields. Currently
+ * the Linux kernel matches lock requests by file_lock fields and not
+ * by an unique identifiers. There is a verifier to check if the
+ * posix locks got unlocked.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static const char *filename;
+static int fd;
+
+static void *do_equal_file_lock_thread0(void *arg)
+{
+   struct flock fl = {
+   .l_type = F_WRLCK,
+   .l_whence = SEEK_SET,
+   .l_start = 0L,
+   .l_len = 1L,
+   };
+   int rv;
+
+   rv = fcntl(fd, F_SETLKW, &fl);
+   if (rv == -1)
+   _exit(1);
+
+   return NULL;
+}
+
+static void *do_equal_file_lock_thread1(void *arg)
+{
+   struct flock fl = {
+   .l_type = F_WRLCK,
+   .l_whence = SEEK_SET,
+   .l_start = 0L,
+   .l_len = 1L,
+   };
+   int rv;
+
+   rv = fcntl(fd, F_SETLKW, &fl);
+   if (rv == -1)
+   _exit(1);
+
+   return NULL;
+}
+
+static void do_setup()
+{
+   fd = open(filename, O_RDWR | O_CREAT, 0700);
+   if (fd == -1)
+   _exit(1);
+}
+
+static void do_teardown()
+{
+   close(fd);
+}
+
+static void do_equal_file_lock()
+{
+   struct flock fl = {
+   .l_type = F_WRLCK,
+   .l_whence = SEEK_SET,
+   .l_start = 0L,
+   .l_len = 1L,
+   };
+   pthread_t t[2];
+   int pid, rv;
+
+   rv = fcntl(fd, F_SETLK, &

Re: [Cluster-devel] [PATCH] gfs2: Fix uaf for qda in gfs2_quota_sync

2023-08-23 Thread Andreas Gruenbacher
On Tue, Aug 22, 2023 at 9:32 PM Bob Peterson  wrote:
> On 1/26/23 11:10 PM, eada...@sina.com wrote:
> > From: Edward Adam Davis 
> >
> > [   81.372851][ T5532] CPU: 1 PID: 5532 Comm: syz-executor.0 Not tainted 
> > 6.2.0-rc1-syzkaller-dirty #0
> > [   81.382080][ T5532] Hardware name: Google Google Compute Engine/Google 
> > Compute Engine, BIOS Google 01/12/2023
> > [   81.392343][ T5532] Call Trace:
> > [   81.395654][ T5532]  
> > [   81.398603][ T5532]  dump_stack_lvl+0x1b1/0x290
> > [   81.418421][ T5532]  gfs2_assert_warn_i+0x19a/0x2e0
> > [   81.423480][ T5532]  gfs2_quota_cleanup+0x4c6/0x6b0
> > [   81.428611][ T5532]  gfs2_make_fs_ro+0x517/0x610
> > [   81.457802][ T5532]  gfs2_withdraw+0x609/0x1540
> > [   81.481452][ T5532]  gfs2_inode_refresh+0xb2d/0xf60
> > [   81.506658][ T5532]  gfs2_instantiate+0x15e/0x220
> > [   81.511504][ T5532]  gfs2_glock_wait+0x1d9/0x2a0
> > [   81.516352][ T5532]  do_sync+0x485/0xc80
> > [   81.554943][ T5532]  gfs2_quota_sync+0x3da/0x8b0
> > [   81.559738][ T5532]  gfs2_sync_fs+0x49/0xb0
> > [   81.564063][ T5532]  sync_filesystem+0xe8/0x220
> > [   81.568740][ T5532]  generic_shutdown_super+0x6b/0x310
> > [   81.574112][ T5532]  kill_block_super+0x79/0xd0
> > [   81.578779][ T5532]  deactivate_locked_super+0xa7/0xf0
> > [   81.584064][ T5532]  cleanup_mnt+0x494/0x520
> > [   81.593753][ T5532]  task_work_run+0x243/0x300
> > [   81.608837][ T5532]  exit_to_user_mode_loop+0x124/0x150
> > [   81.614232][ T5532]  exit_to_user_mode_prepare+0xb2/0x140
> > [   81.619820][ T5532]  syscall_exit_to_user_mode+0x26/0x60
> > [   81.625287][ T5532]  do_syscall_64+0x49/0xb0
> > [   81.629710][ T5532]  entry_SYSCALL_64_after_hwframe+0x63/0xcd
> > [   81.636292][ T5532] RIP: 0033:0x7efdd688d517
> > [   81.640728][ T5532] Code: ff ff ff f7 d8 64 89 01 48 83 c8 ff c3 66 0f 
> > 1f 44 00 00 31 f6 e9 09 00 00 00 66 0f 1f 84 00 00 00 00 00 b8 a6 00 00 00 
> > 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48
> > [   81.660550][ T5532] RSP: 002b:7fff34520ce8 EFLAGS: 0246 
> > ORIG_RAX: 00a6
> > [   81.669413][ T5532] RAX:  RBX:  RCX: 
> > 7efdd688d517
> > [   81.677403][ T5532] RDX: 7fff34520db9 RSI: 000a RDI: 
> > 7fff34520db0
> > [   81.685388][ T5532] RBP: 7fff34520db0 R08:  R09: 
> > 7fff34520b80
> > [   81.695973][ T5532] R10: 55ca38b3 R11: 0246 R12: 
> > 7efdd68e6b24
> > [   81.704152][ T5532] R13: 7fff34521e70 R14: 55ca3810 R15: 
> > 7fff34521eb0
> > [   81.712868][ T5532]  
> >
> > The function "gfs2_quota_cleanup()" may be called in the function 
> > "do_sync()",
> > This will cause the qda obtained in the function "qd_check_sync" to be 
> > released, resulting in the occurrence of uaf.
> > In order to avoid this uaf, we can increase the judgment of 
> > "sdp->sd_quota_bitmap" released in the function
> > "gfs2_quota_cleanup" to confirm that "sdp->sd_quota_list" has been released.
> >
> > Link: https://lore.kernel.org/all/2b5e2405f14e8...@google.com
> > Reported-and-tested-by: 
> > syzbot+3f6a670108ce43356...@syzkaller.appspotmail.com
> > Signed-off-by: Edward Adam Davis 
> > ---
> >   fs/gfs2/quota.c | 3 +++
> >   1 file changed, 3 insertions(+)
> >
> > diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
> > index 1ed1722..4cf66bd 100644
> > --- a/fs/gfs2/quota.c
> > +++ b/fs/gfs2/quota.c
> > @@ -1321,6 +1321,9 @@ int gfs2_quota_sync(struct super_block *sb, int type)
> >   qda[x]->qd_sync_gen =
> >   sdp->sd_quota_sync_gen;
> >
> > + if (!sdp->sd_quota_bitmap)
> > + break;
> > +
> >   for (x = 0; x < num_qd; x++)
> >   qd_unlock(qda[x]);
> >   }
>
> Hi Edward,
>
> Can you try to recreate this problem on a newer version of the gfs2 code?

The problem is that gfs2_quota_sync() is holding references to
gfs2_quota_data objects when the filesystem decides to withdraw. The
withdraw code calls gfs2_quota_cleanup(), which wipes away the
gfs2_quota_data objects that gfs2_quota_sync() is still referencing.
So that reference counting needs to be fixed. Alternatively, if we
could delay the withdraw until the end of gfs2_quota_sync(), we would
be fine as well, but we don't have that kind of mechanism.

> In the linux-gfs2 repository I've got a branch called "bobquota" that
> has a bunch of patches related to quota syncing. I don't know if these
> will fix your problem, but it's worth trying.

That branch doesn't change the reference counting. It doesn't address this bug.

> The thing is, the qda array should have been populated by previous
> calls, like qd_fish and such, and should be okay to release by
> quota_cleanup.
>
> I can tell you this:
>
> In the call trace above, function do_sync tried to lock an inode glock,
> which tried to i