The branch main has been updated by asomers:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=a13ddd6210f349dec341eba0506b5f2395a7a2d6

commit a13ddd6210f349dec341eba0506b5f2395a7a2d6
Author:     Alan Somers <asom...@freebsd.org>
AuthorDate: 2025-05-02 12:45:32 +0000
Commit:     Alan Somers <asom...@freebsd.org>
CommitDate: 2025-08-04 16:00:52 +0000

    zfsd: don't try to fix an OFFLINE condition
    
    If the system administrator does "zpool offline", he's doing it for a
    reason.  zfsd shouldn't consider an offline disk to be an event that
    requires automatic healing.  Don't online it in response to a GEOM
    event, and don't try to activate a hotspare to take over from it.
    
    MFC after:      2 weeks
    Sponsored by:   ConnectWise
---
 cddl/usr.sbin/zfsd/case_file.cc                    | 18 +++++-
 cddl/usr.sbin/zfsd/zfsd_event.cc                   |  7 +++
 tests/sys/cddl/zfs/tests/zfsd/Makefile             |  2 +
 .../cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh   | 64 +++++++++++++++++++++
 .../cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh   | 66 ++++++++++++++++++++++
 tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh         | 60 ++++++++++++++++++++
 6 files changed, 215 insertions(+), 2 deletions(-)

diff --git a/cddl/usr.sbin/zfsd/case_file.cc b/cddl/usr.sbin/zfsd/case_file.cc
index 7adfb08b75c6..852767aeb227 100644
--- a/cddl/usr.sbin/zfsd/case_file.cc
+++ b/cddl/usr.sbin/zfsd/case_file.cc
@@ -299,6 +299,15 @@ CaseFile::ReEvaluate(const string &devPath, const string 
&physPath, Vdev *vdev)
                    PoolGUIDString().c_str(), VdevGUIDString().c_str());
                return (/*consumed*/false);
        }
+       if (VdevState() == VDEV_STATE_OFFLINE) {
+               /*
+                * OFFLINE is an administrative decision.  No need for zfsd to
+                * do anything.
+                */
+               syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev 
ignored",
+                   PoolGUIDString().c_str(), VdevGUIDString().c_str());
+               return (/*consumed*/false);
+       }
 
        if (vdev != NULL
         && ( vdev->PoolGUID() == m_poolGUID
@@ -401,7 +410,8 @@ CaseFile::ReEvaluate(const ZfsEvent &event)
                return (/*consumed*/true);
        } else if (event.Value("type") == "sysevent.fs.zfs.config_sync") {
                RefreshVdevState();
-               if (VdevState() < VDEV_STATE_HEALTHY)
+               if (VdevState() < VDEV_STATE_HEALTHY &&
+                   VdevState() != VDEV_STATE_OFFLINE)
                        consumed = ActivateSpare();
        }
 
@@ -694,6 +704,11 @@ CaseFile::CloseIfSolved()
                switch (VdevState()) {
                case VDEV_STATE_HEALTHY:
                        /* No need to keep cases for healthy vdevs */
+               case VDEV_STATE_OFFLINE:
+                       /*
+                        * Offline is a deliberate administrative action.  zfsd
+                        * doesn't need to do anything for this state.
+                        */
                        Close();
                        return (true);
                case VDEV_STATE_REMOVED:
@@ -710,7 +725,6 @@ CaseFile::CloseIfSolved()
                         */
                case VDEV_STATE_UNKNOWN:
                case VDEV_STATE_CLOSED:
-               case VDEV_STATE_OFFLINE:
                        /*
                         * Keep open?  This may not be the correct behavior,
                         * but it's what we've always done
diff --git a/cddl/usr.sbin/zfsd/zfsd_event.cc b/cddl/usr.sbin/zfsd/zfsd_event.cc
index 7a19b95abeed..afdabd99a8c3 100644
--- a/cddl/usr.sbin/zfsd/zfsd_event.cc
+++ b/cddl/usr.sbin/zfsd/zfsd_event.cc
@@ -355,6 +355,13 @@ ZfsEvent::Process() const
 
        Vdev vdev(zpl.front(), vdevConfig);
        caseFile = &CaseFile::Create(vdev);
+       if (caseFile->VdevState() == VDEV_STATE_OFFLINE) {
+               /*
+                * An administrator did this deliberately.  It's not considered
+                * an error that zfsd must fix.
+                */
+               return (false);
+       }
        if (caseFile->ReEvaluate(*this) == false) {
                stringstream msg;
                int priority = LOG_INFO;
diff --git a/tests/sys/cddl/zfs/tests/zfsd/Makefile 
b/tests/sys/cddl/zfs/tests/zfsd/Makefile
index e34e24b40906..588ca6e6c145 100644
--- a/tests/sys/cddl/zfs/tests/zfsd/Makefile
+++ b/tests/sys/cddl/zfs/tests/zfsd/Makefile
@@ -30,6 +30,8 @@ ${PACKAGE}FILES+=     zfsd_hotspare_006_pos.ksh
 ${PACKAGE}FILES+=      zfsd_hotspare_007_pos.ksh
 ${PACKAGE}FILES+=      zfsd_hotspare_008_neg.ksh
 ${PACKAGE}FILES+=      zfsd_import_001_pos.ksh
+${PACKAGE}FILES+=      zfsd_offline_001_neg.ksh
+${PACKAGE}FILES+=      zfsd_offline_002_neg.ksh
 ${PACKAGE}FILES+=      zfsd_replace_001_pos.ksh
 ${PACKAGE}FILES+=      zfsd_replace_002_pos.ksh
 ${PACKAGE}FILES+=      zfsd_replace_003_pos.ksh
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh 
b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh
new file mode 100644
index 000000000000..de7996976504
--- /dev/null
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh
@@ -0,0 +1,64 @@
+#!/usr/local/bin/ksh93 -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2025 ConnectWise.  All rights reserved.
+# Use is subject to license terms.
+
+. $STF_SUITE/tests/hotspare/hotspare.kshlib
+
+verify_runnable "global"
+
+function cleanup
+{
+       $ZPOOL status $TESTPOOL
+       if poolexists $TESTPOOL ; then
+               destroy_pool $TESTPOOL
+       fi
+
+       partition_cleanup
+}
+
+function verify_assertion
+{
+       log_must $ZPOOL offline $TESTPOOL $FAULT_DISK
+
+       # Wait a few seconds before verifying the state
+       $SLEEP 10
+       log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE"
+}
+
+log_onexit cleanup
+
+log_assert "ZFSD will not automatically reactivate a disk which has been 
administratively offlined"
+
+ensure_zfsd_running
+
+typeset FAULT_DISK=$DISK0
+typeset POOLDEVS="$DISK0 $DISK1 $DISK2"
+set -A MY_KEYWORDS mirror raidz1
+for keyword in "${MY_KEYWORDS[@]}" ; do
+       log_must create_pool $TESTPOOL $keyword $POOLDEVS
+       verify_assertion
+
+       destroy_pool "$TESTPOOL"
+done
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh 
b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh
new file mode 100644
index 000000000000..7d8dfc62d365
--- /dev/null
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh
@@ -0,0 +1,66 @@
+#!/usr/local/bin/ksh93 -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2025 ConnectWise.  All rights reserved.
+# Use is subject to license terms.
+
+. $STF_SUITE/tests/hotspare/hotspare.kshlib
+
+verify_runnable "global"
+
+function cleanup
+{
+       $ZPOOL status $TESTPOOL
+       if poolexists $TESTPOOL ; then
+               destroy_pool $TESTPOOL
+       fi
+
+       partition_cleanup
+}
+
+function verify_assertion
+{
+       log_must $ZPOOL offline $TESTPOOL $FAULT_DISK
+
+       # Wait a few seconds before verifying the state
+       $SLEEP 10
+       log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE"
+       log_must check_state $TESTPOOL "$SPARE_DISK" "AVAIL"
+}
+
+log_onexit cleanup
+
+log_assert "ZFSD will not automatically activate a spare when a disk has been 
administratively offlined"
+
+ensure_zfsd_running
+
+typeset FAULT_DISK=$DISK0
+typeset SPARE_DISK=$DISK3
+typeset POOLDEVS="$DISK0 $DISK1 $DISK2"
+set -A MY_KEYWORDS mirror raidz1
+for keyword in "${MY_KEYWORDS[@]}" ; do
+       log_must create_pool $TESTPOOL $keyword $POOLDEVS spare $SPARE_DISK
+       verify_assertion
+
+       destroy_pool "$TESTPOOL"
+done
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh 
b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
index fe4ac4866ed3..b9924500a298 100755
--- a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
@@ -483,6 +483,64 @@ zfsd_autoreplace_003_pos_cleanup()
        ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
 }
 
+atf_test_case zfsd_offline_001_neg cleanup
+zfsd_offline_001_neg_head()
+{
+       atf_set "descr" "ZFSD will not automatically reactivate a disk which 
has been administratively offlined"
+       atf_set "require.progs" "ksh93 zpool zfs"
+}
+zfsd_offline_001_neg_body()
+{
+       . $(atf_get_srcdir)/../../include/default.cfg
+       . $(atf_get_srcdir)/../hotspare/hotspare.cfg
+       . $(atf_get_srcdir)/zfsd.cfg
+
+       verify_disk_count "$DISKS" 3
+       verify_zfsd_running
+       ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"
+       ksh93 $(atf_get_srcdir)/zfsd_offline_001_neg.ksh
+       if [[ $? != 0 ]]; then
+               save_artifacts
+               atf_fail "Testcase failed"
+       fi
+}
+zfsd_offline_001_neg_cleanup()
+{
+       . $(atf_get_srcdir)/../../include/default.cfg
+       . $(atf_get_srcdir)/zfsd.cfg
+
+       ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
+}
+
+atf_test_case zfsd_offline_002_neg cleanup
+zfsd_offline_002_neg_head()
+{
+       atf_set "descr" "ZFSD will not automatically activate a spare when a 
disk has been administratively offlined"
+       atf_set "require.progs" "ksh93 zpool zfs"
+}
+zfsd_offline_002_neg_body()
+{
+       . $(atf_get_srcdir)/../../include/default.cfg
+       . $(atf_get_srcdir)/../hotspare/hotspare.cfg
+       . $(atf_get_srcdir)/zfsd.cfg
+
+       verify_disk_count "$DISKS" 4
+       verify_zfsd_running
+       ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"
+       ksh93 $(atf_get_srcdir)/zfsd_offline_002_neg.ksh
+       if [[ $? != 0 ]]; then
+               save_artifacts
+               atf_fail "Testcase failed"
+       fi
+}
+zfsd_offline_002_neg_cleanup()
+{
+       . $(atf_get_srcdir)/../../include/default.cfg
+       . $(atf_get_srcdir)/zfsd.cfg
+
+       ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
+}
+
 atf_test_case zfsd_replace_001_pos cleanup
 zfsd_replace_001_pos_head()
 {
@@ -676,6 +734,8 @@ atf_init_test_cases()
        atf_add_test_case zfsd_autoreplace_001_neg
        atf_add_test_case zfsd_autoreplace_002_pos
        atf_add_test_case zfsd_autoreplace_003_pos
+       atf_add_test_case zfsd_offline_001_neg
+       atf_add_test_case zfsd_offline_002_neg
        atf_add_test_case zfsd_replace_001_pos
        atf_add_test_case zfsd_replace_002_pos
        atf_add_test_case zfsd_replace_003_pos

Reply via email to