the changes to zvol_open added to 2.1.2 (for coping with kernel
changes in 5.13) seem to have introduced a lock order inversion [0].

(noticed while reviewing the 2.0.6->2.0.7 changes (the patch was
applied after 2.1.2 was already tagged)

[0] https://github.com/openzfs/zfs/pull/12863
Signed-off-by: Stoiko Ivanov <s.iva...@proxmox.com>
---
did not reproduce the dead-lock myself (in my very limited tests)
but given that our 2.1.2 packages (and kernel modules) have not seen
too much testing in public yet - thought it makes sense to proactively
pull this in

quickly tested a kernel+userspace with this patch on a physical testhost

 .../0012-Fix-zvol_open-lock-inversion.patch   | 212 ++++++++++++++++++
 debian/patches/series                         |   1 +
 2 files changed, 213 insertions(+)
 create mode 100644 debian/patches/0012-Fix-zvol_open-lock-inversion.patch

diff --git a/debian/patches/0012-Fix-zvol_open-lock-inversion.patch 
b/debian/patches/0012-Fix-zvol_open-lock-inversion.patch
new file mode 100644
index 00000000..eb74550f
--- /dev/null
+++ b/debian/patches/0012-Fix-zvol_open-lock-inversion.patch
@@ -0,0 +1,212 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendo...@llnl.gov>
+Date: Fri, 17 Dec 2021 09:52:13 -0800
+Subject: [PATCH] Fix zvol_open() lock inversion
+
+When restructuring the zvol_open() logic for the Linux 5.13 kernel
+a lock inversion was accidentally introduced.  In the updated code
+the spa_namespace_lock is now taken before the zv_suspend_lock
+allowing the following scenario to occur:
+
+    down_read <=== waiting for zv_suspend_lock
+    zvol_open <=== holds spa_namespace_lock
+    __blkdev_get
+    blkdev_get_by_dev
+    blkdev_open
+    ...
+
+     mutex_lock <== waiting for spa_namespace_lock
+     spa_open_common
+     spa_open
+     dsl_pool_hold
+     dmu_objset_hold_flags
+     dmu_objset_hold
+     dsl_prop_get
+     dsl_prop_get_integer
+     zvol_create_minor
+     dmu_recv_end
+     zfs_ioc_recv_impl <=== holds zv_suspend_lock via zvol_suspend()
+     zfs_ioc_recv
+     ...
+
+This commit resolves the issue by moving the acquisition of the
+spa_namespace_lock back to after the zv_suspend_lock which restores
+the original ordering.
+
+Additionally, as part of this change the error exit paths were
+simplified where possible.
+
+Reviewed-by: Tony Hutter <hutt...@llnl.gov>
+Reviewed-by: Rich Ercolani <rincebr...@gmail.com>
+Signed-off-by: Brian Behlendorf <behlendo...@llnl.gov>
+Closes #12863
+(cherry picked from commit 8a02d01e85556bbe3a1c6947bc11b8ef028d4023)
+Signed-off-by: Stoiko Ivanov <s.iva...@proxmox.com>
+---
+ module/os/linux/zfs/zvol_os.c | 121 ++++++++++++++++------------------
+ 1 file changed, 58 insertions(+), 63 deletions(-)
+
+diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c
+index 44caadd58..69479b3f7 100644
+--- a/module/os/linux/zfs/zvol_os.c
++++ b/module/os/linux/zfs/zvol_os.c
+@@ -496,8 +496,7 @@ zvol_open(struct block_device *bdev, fmode_t flag)
+ {
+       zvol_state_t *zv;
+       int error = 0;
+-      boolean_t drop_suspend = B_TRUE;
+-      boolean_t drop_namespace = B_FALSE;
++      boolean_t drop_suspend = B_FALSE;
+ #ifndef HAVE_BLKDEV_GET_ERESTARTSYS
+       hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms);
+       hrtime_t start = gethrtime();
+@@ -517,7 +516,36 @@ retry:
+               return (SET_ERROR(-ENXIO));
+       }
+ 
+-      if (zv->zv_open_count == 0 && !mutex_owned(&spa_namespace_lock)) {
++      mutex_enter(&zv->zv_state_lock);
++      /*
++       * Make sure zvol is not suspended during first open
++       * (hold zv_suspend_lock) and respect proper lock acquisition
++       * ordering - zv_suspend_lock before zv_state_lock
++       */
++      if (zv->zv_open_count == 0) {
++              if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
++                      mutex_exit(&zv->zv_state_lock);
++                      rw_enter(&zv->zv_suspend_lock, RW_READER);
++                      mutex_enter(&zv->zv_state_lock);
++                      /* check to see if zv_suspend_lock is needed */
++                      if (zv->zv_open_count != 0) {
++                              rw_exit(&zv->zv_suspend_lock);
++                      } else {
++                              drop_suspend = B_TRUE;
++                      }
++              } else {
++                      drop_suspend = B_TRUE;
++              }
++      }
++      rw_exit(&zvol_state_lock);
++
++      ASSERT(MUTEX_HELD(&zv->zv_state_lock));
++
++      if (zv->zv_open_count == 0) {
++              boolean_t drop_namespace = B_FALSE;
++
++              ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
++
+               /*
+                * In all other call paths the spa_namespace_lock is taken
+                * before the bdev->bd_mutex lock.  However, on open(2)
+@@ -542,84 +570,51 @@ retry:
+                * the kernel so the only option is to return the error for
+                * the caller to handle it.
+                */
+-              if (!mutex_tryenter(&spa_namespace_lock)) {
+-                      rw_exit(&zvol_state_lock);
++              if (!mutex_owned(&spa_namespace_lock)) {
++                      if (!mutex_tryenter(&spa_namespace_lock)) {
++                              mutex_exit(&zv->zv_state_lock);
++                              rw_exit(&zv->zv_suspend_lock);
+ 
+ #ifdef HAVE_BLKDEV_GET_ERESTARTSYS
+-                      schedule();
+-                      return (SET_ERROR(-ERESTARTSYS));
+-#else
+-                      if ((gethrtime() - start) > timeout)
++                              schedule();
+                               return (SET_ERROR(-ERESTARTSYS));
++#else
++                              if ((gethrtime() - start) > timeout)
++                                      return (SET_ERROR(-ERESTARTSYS));
+ 
+-                      schedule_timeout(MSEC_TO_TICK(10));
+-                      goto retry;
++                              schedule_timeout(MSEC_TO_TICK(10));
++                              goto retry;
+ #endif
+-              } else {
+-                      drop_namespace = B_TRUE;
+-              }
+-      }
+-
+-      mutex_enter(&zv->zv_state_lock);
+-      /*
+-       * make sure zvol is not suspended during first open
+-       * (hold zv_suspend_lock) and respect proper lock acquisition
+-       * ordering - zv_suspend_lock before zv_state_lock
+-       */
+-      if (zv->zv_open_count == 0) {
+-              if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) {
+-                      mutex_exit(&zv->zv_state_lock);
+-                      rw_enter(&zv->zv_suspend_lock, RW_READER);
+-                      mutex_enter(&zv->zv_state_lock);
+-                      /* check to see if zv_suspend_lock is needed */
+-                      if (zv->zv_open_count != 0) {
+-                              rw_exit(&zv->zv_suspend_lock);
+-                              drop_suspend = B_FALSE;
++                      } else {
++                              drop_namespace = B_TRUE;
+                       }
+               }
+-      } else {
+-              drop_suspend = B_FALSE;
+-      }
+-      rw_exit(&zvol_state_lock);
+-
+-      ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+ 
+-      if (zv->zv_open_count == 0) {
+-              ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+               error = -zvol_first_open(zv, !(flag & FMODE_WRITE));
+-              if (error)
+-                      goto out_mutex;
+-      }
+ 
+-      if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+-              error = -EROFS;
+-              goto out_open_count;
++              if (drop_namespace)
++                      mutex_exit(&spa_namespace_lock);
+       }
+ 
+-      zv->zv_open_count++;
+-
+-      mutex_exit(&zv->zv_state_lock);
+-      if (drop_namespace)
+-              mutex_exit(&spa_namespace_lock);
+-      if (drop_suspend)
+-              rw_exit(&zv->zv_suspend_lock);
+-
+-      zfs_check_media_change(bdev);
+-
+-      return (0);
++      if (error == 0) {
++              if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
++                      if (zv->zv_open_count == 0)
++                              zvol_last_close(zv);
+ 
+-out_open_count:
+-      if (zv->zv_open_count == 0)
+-              zvol_last_close(zv);
++                      error = SET_ERROR(-EROFS);
++              } else {
++                      zv->zv_open_count++;
++              }
++      }
+ 
+-out_mutex:
+       mutex_exit(&zv->zv_state_lock);
+-      if (drop_namespace)
+-              mutex_exit(&spa_namespace_lock);
+       if (drop_suspend)
+               rw_exit(&zv->zv_suspend_lock);
+ 
+-      return (SET_ERROR(error));
++      if (error == 0)
++              zfs_check_media_change(bdev);
++
++      return (error);
+ }
+ 
+ static void
diff --git a/debian/patches/series b/debian/patches/series
index d2770d39..8166db91 100644
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -9,3 +9,4 @@
 0009-Patch-move-manpage-arcstat-1-to-arcstat-8.patch
 0010-arcstat-Fix-integer-division-with-python3.patch
 0011-arc-stat-summary-guard-access-to-l2arc-MFU-MRU-stats.patch
+0012-Fix-zvol_open-lock-inversion.patch
-- 
2.30.2



_______________________________________________
pve-devel mailing list
pve-devel@lists.proxmox.com
https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel

Reply via email to