The branch main has been updated by asomers:

URL: https://cgit.FreeBSD.org/src/commit/?id=a13ddd6210f349dec341eba0506b5f2395a7a2d6
commit a13ddd6210f349dec341eba0506b5f2395a7a2d6
Author:     Alan Somers <asom...@freebsd.org>
AuthorDate: 2025-05-02 12:45:32 +0000
Commit:     Alan Somers <asom...@freebsd.org>
CommitDate: 2025-08-04 16:00:52 +0000

    zfsd: don't try to fix an OFFLINE condition

    If the system administrator does "zpool offline", he's doing it for a
    reason. zfsd shouldn't consider an offline disk to be an event that
    requires automatic healing. Don't online it in response to a GEOM
    event, and don't try to activate a hotspare to take over from it.

    MFC after:      2 weeks
    Sponsored by:   ConnectWise
---
 cddl/usr.sbin/zfsd/case_file.cc                    | 18 +++++-
 cddl/usr.sbin/zfsd/zfsd_event.cc                   |  7 +++
 tests/sys/cddl/zfs/tests/zfsd/Makefile             |  2 +
 .../cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh   | 64 +++++++++++++++++++++
 .../cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh   | 66 ++++++++++++++++++++++
 tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh         | 60 ++++++++++++++++++++
 6 files changed, 215 insertions(+), 2 deletions(-)

diff --git a/cddl/usr.sbin/zfsd/case_file.cc b/cddl/usr.sbin/zfsd/case_file.cc
index 7adfb08b75c6..852767aeb227 100644
--- a/cddl/usr.sbin/zfsd/case_file.cc
+++ b/cddl/usr.sbin/zfsd/case_file.cc
@@ -299,6 +299,15 @@ CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
 		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
 		return (/*consumed*/false);
 	}
+	if (VdevState() == VDEV_STATE_OFFLINE) {
+		/*
+		 * OFFLINE is an administrative decision. No need for zfsd to
+		 * do anything.
+		 */
+		syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
+		    PoolGUIDString().c_str(), VdevGUIDString().c_str());
+		return (/*consumed*/false);
+	}
 
 	if (vdev != NULL
 	 && ( vdev->PoolGUID() == m_poolGUID
@@ -401,7 +410,8 @@ CaseFile::ReEvaluate(const ZfsEvent &event)
 		return (/*consumed*/true);
 	} else if (event.Value("type") == "sysevent.fs.zfs.config_sync") {
 		RefreshVdevState();
-		if (VdevState() < VDEV_STATE_HEALTHY)
+		if (VdevState() < VDEV_STATE_HEALTHY &&
+		    VdevState() != VDEV_STATE_OFFLINE)
 			consumed = ActivateSpare();
 	}
 
@@ -694,6 +704,11 @@ CaseFile::CloseIfSolved()
 	switch (VdevState()) {
 	case VDEV_STATE_HEALTHY:
 		/* No need to keep cases for healthy vdevs */
+	case VDEV_STATE_OFFLINE:
+		/*
+		 * Offline is a deliberate administrative action. zfsd
+		 * doesn't need to do anything for this state.
+		 */
 		Close();
 		return (true);
 	case VDEV_STATE_REMOVED:
@@ -710,7 +725,6 @@
 		 */
 	case VDEV_STATE_UNKNOWN:
 	case VDEV_STATE_CLOSED:
-	case VDEV_STATE_OFFLINE:
 		/*
 		 * Keep open?  This may not be the correct behavior,
 		 * but it's what we've always done
diff --git a/cddl/usr.sbin/zfsd/zfsd_event.cc b/cddl/usr.sbin/zfsd/zfsd_event.cc
index 7a19b95abeed..afdabd99a8c3 100644
--- a/cddl/usr.sbin/zfsd/zfsd_event.cc
+++ b/cddl/usr.sbin/zfsd/zfsd_event.cc
@@ -355,6 +355,13 @@ ZfsEvent::Process() const
 
 	Vdev vdev(zpl.front(), vdevConfig);
 	caseFile = &CaseFile::Create(vdev);
+	if (caseFile->VdevState() == VDEV_STATE_OFFLINE) {
+		/*
+		 * An administrator did this deliberately. It's not considered
+		 * an error that zfsd must fix.
+		 */
+		return (false);
+	}
 	if (caseFile->ReEvaluate(*this) == false) {
 		stringstream msg;
 		int priority = LOG_INFO;
diff --git a/tests/sys/cddl/zfs/tests/zfsd/Makefile b/tests/sys/cddl/zfs/tests/zfsd/Makefile
index e34e24b40906..588ca6e6c145 100644
--- a/tests/sys/cddl/zfs/tests/zfsd/Makefile
+++ b/tests/sys/cddl/zfs/tests/zfsd/Makefile
@@ -30,6 +30,8 @@ ${PACKAGE}FILES+=	zfsd_hotspare_006_pos.ksh
 ${PACKAGE}FILES+=	zfsd_hotspare_007_pos.ksh
 ${PACKAGE}FILES+=	zfsd_hotspare_008_neg.ksh
 ${PACKAGE}FILES+=	zfsd_import_001_pos.ksh
+${PACKAGE}FILES+=	zfsd_offline_001_neg.ksh
+${PACKAGE}FILES+=	zfsd_offline_002_neg.ksh
 ${PACKAGE}FILES+=	zfsd_replace_001_pos.ksh
 ${PACKAGE}FILES+=	zfsd_replace_002_pos.ksh
 ${PACKAGE}FILES+=	zfsd_replace_003_pos.ksh
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh
new file mode 100644
index 000000000000..de7996976504
--- /dev/null
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_001_neg.ksh
@@ -0,0 +1,64 @@
+#!/usr/local/bin/ksh93 -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2025 ConnectWise. All rights reserved.
+# Use is subject to license terms.
+
+. $STF_SUITE/tests/hotspare/hotspare.kshlib
+
+verify_runnable "global"
+
+function cleanup
+{
+	$ZPOOL status $TESTPOOL
+	if poolexists $TESTPOOL ; then
+		destroy_pool $TESTPOOL
+	fi
+
+	partition_cleanup
+}
+
+function verify_assertion
+{
+	log_must $ZPOOL offline $TESTPOOL $FAULT_DISK
+
+	# Wait a few seconds before verifying the state
+	$SLEEP 10
+	log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE"
+}
+
+log_onexit cleanup
+
+log_assert "ZFSD will not automatically reactivate a disk which has been administratively offlined"
+
+ensure_zfsd_running
+
+typeset FAULT_DISK=$DISK0
+typeset POOLDEVS="$DISK0 $DISK1 $DISK2"
+set -A MY_KEYWORDS mirror raidz1
+for keyword in "${MY_KEYWORDS[@]}" ; do
+	log_must create_pool $TESTPOOL $keyword $POOLDEVS
+	verify_assertion
+
+	destroy_pool "$TESTPOOL"
+done
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh
new file mode 100644
index 000000000000..7d8dfc62d365
--- /dev/null
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_offline_002_neg.ksh
@@ -0,0 +1,66 @@
+#!/usr/local/bin/ksh93 -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2025 ConnectWise. All rights reserved.
+# Use is subject to license terms.
+
+. $STF_SUITE/tests/hotspare/hotspare.kshlib
+
+verify_runnable "global"
+
+function cleanup
+{
+	$ZPOOL status $TESTPOOL
+	if poolexists $TESTPOOL ; then
+		destroy_pool $TESTPOOL
+	fi
+
+	partition_cleanup
+}
+
+function verify_assertion
+{
+	log_must $ZPOOL offline $TESTPOOL $FAULT_DISK
+
+	# Wait a few seconds before verifying the state
+	$SLEEP 10
+	log_must check_state $TESTPOOL "$FAULT_DISK" "OFFLINE"
+	log_must check_state $TESTPOOL "$SPARE_DISK" "AVAIL"
+}
+
+log_onexit cleanup
+
+log_assert "ZFSD will not automatically activate a spare when a disk has been administratively offlined"
+
+ensure_zfsd_running
+
+typeset FAULT_DISK=$DISK0
+typeset SPARE_DISK=$DISK3
+typeset POOLDEVS="$DISK0 $DISK1 $DISK2"
+set -A MY_KEYWORDS mirror raidz1
+for keyword in "${MY_KEYWORDS[@]}" ; do
+	log_must create_pool $TESTPOOL $keyword $POOLDEVS spare $SPARE_DISK
+	verify_assertion
+
+	destroy_pool "$TESTPOOL"
+done
diff --git a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
index fe4ac4866ed3..b9924500a298 100755
--- a/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
+++ b/tests/sys/cddl/zfs/tests/zfsd/zfsd_test.sh
@@ -483,6 +483,64 @@ zfsd_autoreplace_003_pos_cleanup()
 	ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
 }
 
+atf_test_case zfsd_offline_001_neg cleanup
+zfsd_offline_001_neg_head()
+{
+	atf_set "descr" "ZFSD will not automatically reactivate a disk which has been administratively offlined"
+	atf_set "require.progs" "ksh93 zpool zfs"
+}
+zfsd_offline_001_neg_body()
+{
+	. $(atf_get_srcdir)/../../include/default.cfg
+	. $(atf_get_srcdir)/../hotspare/hotspare.cfg
+	. $(atf_get_srcdir)/zfsd.cfg
+
+	verify_disk_count "$DISKS" 3
+	verify_zfsd_running
+	ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"
+	ksh93 $(atf_get_srcdir)/zfsd_offline_001_neg.ksh
+	if [[ $? != 0 ]]; then
+		save_artifacts
+		atf_fail "Testcase failed"
+	fi
+}
+zfsd_offline_001_neg_cleanup()
+{
+	. $(atf_get_srcdir)/../../include/default.cfg
+	. $(atf_get_srcdir)/zfsd.cfg
+
+	ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
+}
+
+atf_test_case zfsd_offline_002_neg cleanup
+zfsd_offline_002_neg_head()
+{
+	atf_set "descr" "ZFSD will not automatically activate a spare when a disk has been administratively offlined"
+	atf_set "require.progs" "ksh93 zpool zfs"
+}
+zfsd_offline_002_neg_body()
+{
+	. $(atf_get_srcdir)/../../include/default.cfg
+	. $(atf_get_srcdir)/../hotspare/hotspare.cfg
+	. $(atf_get_srcdir)/zfsd.cfg
+
+	verify_disk_count "$DISKS" 4
+	verify_zfsd_running
+	ksh93 $(atf_get_srcdir)/setup.ksh || atf_fail "Setup failed"
+	ksh93 $(atf_get_srcdir)/zfsd_offline_002_neg.ksh
+	if [[ $? != 0 ]]; then
+		save_artifacts
+		atf_fail "Testcase failed"
+	fi
+}
+zfsd_offline_002_neg_cleanup()
+{
+	. $(atf_get_srcdir)/../../include/default.cfg
+	. $(atf_get_srcdir)/zfsd.cfg
+
+	ksh93 $(atf_get_srcdir)/cleanup.ksh || atf_fail "Cleanup failed"
+}
+
 atf_test_case zfsd_replace_001_pos cleanup
 zfsd_replace_001_pos_head()
 {
@@ -676,6 +734,8 @@ atf_init_test_cases()
 	atf_add_test_case zfsd_autoreplace_001_neg
 	atf_add_test_case zfsd_autoreplace_002_pos
 	atf_add_test_case zfsd_autoreplace_003_pos
+	atf_add_test_case zfsd_offline_001_neg
+	atf_add_test_case zfsd_offline_002_neg
 	atf_add_test_case zfsd_replace_001_pos
 	atf_add_test_case zfsd_replace_002_pos
 	atf_add_test_case zfsd_replace_003_pos
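
For reference, the behavior the commit message describes can be exercised by
hand. The sketch below is illustrative only and assumes a pool named "tank"
built from da1/da2/da3 with da4 attached as a hot spare; substitute your own
pool and device names.

    # Administratively take a member disk offline. With this change, zfsd
    # treats OFFLINE as deliberate: it will not online the disk again in
    # response to GEOM events and will not activate the spare for it.
    zpool offline tank da1
    sleep 10
    zpool status tank     # da1 stays OFFLINE, da4 stays AVAIL

    # Bring the disk back once maintenance is done.
    zpool online tank da1

The new regression tests can be run through kyua once the ZFS test suite is
installed and configured with test disks; the path below is the usual install
location but may differ on your system:

    cd /usr/tests/sys/cddl/zfs/tests/zfsd
    kyua test zfsd_test:zfsd_offline_001_neg zfsd_test:zfsd_offline_002_neg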