Script 'mail_helper' called by obssrc
Hello community,
here is the log from the commit of package resource-agents for openSUSE:Factory
checked in at 2025-12-19 16:41:54
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/resource-agents (Old)
and /work/SRC/openSUSE:Factory/.resource-agents.new.1928 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Package is "resource-agents"
Fri Dec 19 16:41:54 2025 rev:144 rq:1323402 version:4.17.0+git18.92719d83
Changes:
--------
--- /work/SRC/openSUSE:Factory/resource-agents/resource-agents.changes
2025-12-03 14:13:50.221881824 +0100
+++
/work/SRC/openSUSE:Factory/.resource-agents.new.1928/resource-agents.changes
2025-12-19 16:41:57.172105135 +0100
@@ -1,0 +2,14 @@
+Wed Dec 17 17:47:09 UTC 2025 - Peter Varkoly <[email protected]>
+
+- Update to version 4.17.0+git18.92719d83:
+ * Filesystem: signal many processes in parallel
+ * podman-etcd: sync environment variables with Pod manifest
+ * OCPEDGE-2231: podman-etcd: improve error handling to support retry on
start errors (#2105)
+ * Filesystem: new force_unmount=move option
+ * Filesystem: try umount immediately after signals are sent
+ * Filesystem: tmpfs/overlayfs have no need for systemd_drop_in
+ * Filesystem: improve shell trace (set -x) output
+ * OCPEDGE-2213: podman-etcd: fix to prevent learner from starting before
cluster is ready (#2098)
+ * exportfs: fix grep error on stop
+
+-------------------------------------------------------------------
Old:
----
resource-agents-4.17.0+git5.e3a22113.tar.xz
New:
----
resource-agents-4.17.0+git18.92719d83.tar.xz
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Other differences:
------------------
++++++ resource-agents.spec ++++++
--- /var/tmp/diff_new_pack.IV6dkY/_old 2025-12-19 16:41:59.220190667 +0100
+++ /var/tmp/diff_new_pack.IV6dkY/_new 2025-12-19 16:41:59.224190835 +0100
@@ -17,7 +17,7 @@
Name: resource-agents
-Version: 4.17.0+git5.e3a22113
+Version: 4.17.0+git18.92719d83
Release: 0
Summary: HA Reusable Cluster Resource Scripts
License: GPL-2.0-only AND LGPL-2.1-or-later AND GPL-3.0-or-later
++++++ _servicedata ++++++
--- /var/tmp/diff_new_pack.IV6dkY/_old 2025-12-19 16:41:59.304194176 +0100
+++ /var/tmp/diff_new_pack.IV6dkY/_new 2025-12-19 16:41:59.312194510 +0100
@@ -1,7 +1,7 @@
<servicedata>
<service name="tar_scm">
<param name="url">https://github.com/ClusterLabs/resource-agents.git</param>
-<param name="changesrevision">e3a22113c7cdce932de605f55b929daaee254bc3</param>
+<param name="changesrevision">92719d83353a8c7128f5fc72812e4b7c06cf8a6b</param>
</service>
</servicedata>
(No newline at EOF)
++++++ resource-agents-4.17.0+git5.e3a22113.tar.xz ->
resource-agents-4.17.0+git18.92719d83.tar.xz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore'
old/resource-agents-4.17.0+git5.e3a22113/heartbeat/Filesystem
new/resource-agents-4.17.0+git18.92719d83/heartbeat/Filesystem
--- old/resource-agents-4.17.0+git5.e3a22113/heartbeat/Filesystem
2025-11-25 13:04:52.000000000 +0100
+++ new/resource-agents-4.17.0+git18.92719d83/heartbeat/Filesystem
2025-12-08 15:46:10.000000000 +0100
@@ -19,6 +19,12 @@
# OCF_RESKEY_run_fsck
# OCF_RESKEY_fast_stop
# OCF_RESKEY_force_clones
+# OCF_RESKEY_force_unmount
+# OCF_RESKEY_term_signals
+# OCF_RESKEY_kill_signals
+# OCF_RESKEY_signal_delay
+# See also the ocf_heartbeat_Filesystem(7) man page.
+
#
#OCF_RESKEY_device : name of block device for the filesystem. e.g.
/dev/sda1, /dev/md0
# Or a -U or -L option for mount, or an NFS mount
specification
@@ -91,12 +97,6 @@
fi
-: ${OCF_RESKEY_device=${OCF_RESKEY_device_default}}
-: ${OCF_RESKEY_directory=${OCF_RESKEY_directory_default}}
-: ${OCF_RESKEY_fstype=${OCF_RESKEY_fstype_default}}
-: ${OCF_RESKEY_options=${OCF_RESKEY_options_default}}
-: ${OCF_RESKEY_statusfile_prefix=${OCF_RESKEY_statusfile_prefix_default}}
-: ${OCF_RESKEY_run_fsck=${OCF_RESKEY_run_fsck_default}}
if [ -z "${OCF_RESKEY_fast_stop}" ]; then
case "$OCF_RESKEY_fstype" in
gfs2)
@@ -105,11 +105,18 @@
OCF_RESKEY_fast_stop=${OCF_RESKEY_fast_stop_default};;
esac
fi
-: ${OCF_RESKEY_force_clones=${OCF_RESKEY_force_clones_default}}
-: ${OCF_RESKEY_force_unmount=${OCF_RESKEY_force_unmount_default}}
-: ${OCF_RESKEY_term_signals=${OCF_RESKEY_term_signals_default}}
-: ${OCF_RESKEY_kill_signals=${OCF_RESKEY_kill_signals_default}}
-: ${OCF_RESKEY_signal_delay=${OCF_RESKEY_signal_delay_default}}
+: "fast_stop ::" ${OCF_RESKEY_fast_stop}
+: "device ::" ${OCF_RESKEY_device=${OCF_RESKEY_device_default}}
+: "directory ::" ${OCF_RESKEY_directory=${OCF_RESKEY_directory_default}}
+: "fstype ::" ${OCF_RESKEY_fstype=${OCF_RESKEY_fstype_default}}
+: "options ::" ${OCF_RESKEY_options=${OCF_RESKEY_options_default}}
+: "statusfile_prefix::"
${OCF_RESKEY_statusfile_prefix=${OCF_RESKEY_statusfile_prefix_default}}
+: "run_fsck ::" ${OCF_RESKEY_run_fsck=${OCF_RESKEY_run_fsck_default}}
+: "force_clones ::"
${OCF_RESKEY_force_clones=${OCF_RESKEY_force_clones_default}}
+: "force_umount ::"
${OCF_RESKEY_force_unmount=${OCF_RESKEY_force_unmount_default}}
+: "term_signals ::"
${OCF_RESKEY_term_signals=${OCF_RESKEY_term_signals_default}}
+: "kill_signals ::"
${OCF_RESKEY_kill_signals=${OCF_RESKEY_kill_signals_default}}
+: "signal_delay ::"
${OCF_RESKEY_signal_delay=${OCF_RESKEY_signal_delay_default}}
# Variables used by multiple methods
HOSTOS=$(uname)
@@ -253,7 +260,7 @@
</parameter>
<parameter name="force_unmount">
-<longdesc lang="en">
+<longdesc lang="en"><![CDATA[
This option allows specifying how to handle processes that are
currently accessing the mount directory.
@@ -262,12 +269,25 @@
avoid functions that could potentially block during process
detection
"false" : Do not kill any processes.
+"move" : like "safe", but try to mount --move first
-The 'safe' option uses shell logic to walk the /procs/ directory
+The 'safe' option uses shell logic to walk the /proc/<pid>/ directories
for pids using the mount point while the default option uses the
fuser cli tool. fuser is known to perform operations that can potentially
block if unresponsive nfs mounts are in use on the system.
-</longdesc>
+
+If new users of the file system are being spawned continuously by unmanaged 3rd
+party apps, we likely never win the race and the file system will be kept busy.
+Which may result in a timeout and stop failure, potentially escalating to
+hard-reset of this node via fencing.
+
+The 'move' option tries to move the mount point somewhere those "rogue apps"
+do not expect it, then proceed to kill current users and attempt to umount.
+
+For 'move' to work, you will have to make sure the mount point does not reside
+under a shared mount, for example by mount -o bind,private /mount /mount
+before mounting /mount/point.
+]]></longdesc>
<shortdesc lang="en">Kill processes before unmount</shortdesc>
<content type="string" default="${OCF_RESKEY_force_unmount_default}" />
</parameter>
@@ -582,8 +602,21 @@
return $OCF_ERR_CONFIGURED
fi
+ if $move_before_umount && test -d "$MOVED_CANONICALIZED_MOUNTPOINT";
then
+ CANONICALIZED_MOUNTPOINT=$MOVED_CANONICALIZED_MOUNTPOINT \
+ move_before_umount=false \
+ Filesystem_status
+ rc=$?
+ if [ $rc != $OCF_NOT_RUNNING ]; then
+ msg="move_before_umount=$move_before_umount and
$MOVED_CANONICALIZED_MOUNTPOINT status is [$rc]"
+ rc=$OCF_ERR_GENERIC
+ ocf_exit_reason "$msg"
+ return $rc
+ fi
+ fi
+
# See if the device is already mounted.
- Filesystem_status
+ move_before_umount=false Filesystem_status
case "$?" in
$OCF_SUCCESS)
ocf_log info "Filesystem $MOUNTPOINT is already
mounted."
@@ -663,11 +696,11 @@
if ocf_is_true "$FORCE_UNMOUNT"; then
if [ "X${HOSTOS}" = "XOpenBSD" ]; then
- fstat | grep $dir | awk '{print $3}'
+ fstat | grep "$dir" | awk '{print $3}'
elif [ "X${HOSTOS}" = "XFreeBSD" ]; then
- $FUSER -c $dir 2>/dev/null
+ $FUSER -c "$dir" 2>/dev/null
else
- $FUSER -Mm $dir 2>/dev/null
+ $FUSER -Mm "$dir" 2>/dev/null
fi
elif [ "$FORCE_UNMOUNT" = "safe" ]; then
# Yes, in theory, ${dir} could contain "intersting" characters
@@ -721,18 +754,36 @@
local dir=$1
local sig=$2
local pids pid
+ local nr_pids
+ local sed_script=""
# fuser returns a non-zero return code if none of the
# specified files is accessed or in case of a fatal
# error.
- pids=$(get_pids "$dir")
- if [ -z "$pids" ]; then
+ # We don't know whether we have one single line or multiple lines.
+ # Canonicalize.
+ pids=$(get_pids "$dir" | tr -s ' \t\n' '\n')
+ nr_pids=$(echo "$pids" | grep -c "^.")
+
+ # for many pids, reporting and killing them individually just takes too
long.
+ # may even be too many words for the shell!
+ #
+ # If the list of pids is too long for a single shell variable,
+ # fix your fork bomb workload, and get a better shell anyways.
+ #
+ if [ $nr_pids = 0 ]; then
ocf_log info "No processes on $dir were signalled.
force_unmount is set to '$FORCE_UNMOUNT'"
return 1
+ elif [ $nr_pids -le 24 ]; then
+ for pid in $pids; do
+ ocf_log info "sending signal $sig to: $(ps -f $pid |
tail -1)"
+ kill -s $sig $pid
+ done
+ else
+ echo "$pids" | xargs -r kill -s $sig
+ sed_script="11 s/^.*/... and more .../; 12,$(( $nr_pids - 10))d"
+ pids=$(echo "$pids" | sed -e "$sed_script" | tr '\n' ' ')
+ ocf_log info "sent signals $sig to ${nr_pids} processes
[${pids}]"
fi
- for pid in $pids; do
- ocf_log info "sending signal $sig to: $(ps -f $pid | tail -1)"
- kill -s $sig $pid
- done
return 0
}
try_umount() {
@@ -760,13 +811,21 @@
return $ret
}
fs_stop_loop() {
- local force_arg="$1" SUB="$2" signals="$3" sig send_signal
+ local force_arg="$1" SUB="$2" signals="$3" sig sent_signal
while true; do
- send_signal=false
+ sent_signal=false
for sig in $signals; do
- signal_processes "$SUB" $sig && send_signal=true
+ signal_processes "$SUB" $sig && sent_signal=true
done
- $send_signal && sleep $OCF_RESKEY_signal_delay
+ if $sent_signal; then
+ # Try to umount immediately after signalling, to reduce
+ # the time window in which new users of the file system
+ # may be spawned.
+ try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS
+ sleep $OCF_RESKEY_signal_delay
+ fi
+ # Try to umount after the signal_delay, maybe some processes
+ # needed a moment to "exit cleanly" after receiving SIGTERM.
try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS
done
}
@@ -797,16 +856,34 @@
return $OCF_ERR_GENERIC
}
+try_mount_move()
+{
+ test -d "$MOVED_CANONICALIZED_MOUNTPOINT" || mkdir
"$MOVED_CANONICALIZED_MOUNTPOINT" || return
+ mount --move "$CANONICALIZED_MOUNTPOINT"
"$MOVED_CANONICALIZED_MOUNTPOINT" || return
+ ocf_log info "Moved $MOUNTPOINT to $MOVED_CANONICALIZED_MOUNTPOINT"
+ return 0
+ # To test really bad timing of "action timeout":
+ # test -e /tmp/fail-after-move && rm -f /tmp/fail-after-move && kill
-KILL $$
+}
+
#
# STOP: Unmount the filesystem
#
Filesystem_stop()
{
# See if the device is currently mounted
- Filesystem_status >/dev/null 2>&1
+ move_before_umount=false Filesystem_status >/dev/null 2>&1
if [ $? -eq $OCF_NOT_RUNNING ]; then
# Already unmounted, wonderful.
- rc=$OCF_SUCCESS
+ # But did we also unmount the moved fs?
+ if $move_before_umount; then
+
CANONICALIZED_MOUNTPOINT=$MOVED_CANONICALIZED_MOUNTPOINT \
+ move_before_umount=false \
+ Filesystem_stop
+ return $?
+ else
+ rc=$OCF_SUCCESS
+ fi
else
# Wipe the status file, but continue with a warning if
# removal fails -- the file system might be read only
@@ -826,6 +903,13 @@
nfs4|nfs|aznfs|efs|cifs|smbfs) umount_force="-f" ;;
esac
+ if $move_before_umount && try_mount_move ; then
+
CANONICALIZED_MOUNTPOINT=$MOVED_CANONICALIZED_MOUNTPOINT \
+ move_before_umount=false \
+ Filesystem_stop
+ return $?
+ fi
+
# Umount all sub-filesystems mounted under $MOUNTPOINT/ too.
local timeout
while read SUB; do
@@ -859,6 +943,21 @@
#
Filesystem_status()
{
+ if $move_before_umount && test -d $MOVED_CANONICALIZED_MOUNTPOINT; then
+ # Have to recurse once.
+ CANONICALIZED_MOUNTPOINT=$MOVED_CANONICALIZED_MOUNTPOINT \
+ move_before_umount=false \
+ OP= \
+ Filesystem_status
+ rc=$?
+ if [ $rc = $OCF_SUCCESS ]; then
+ rc=$OCF_ERR_GENERIC
+ msg="move_before_umount=$move_before_umount and
something is mounted on $MOVED_CANONICALIZED_MOUNTPOINT"
+ ocf_exit_reason "$msg"
+ return $rc
+ fi
+ fi
+
local match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}"
local mounted_device=$(list_mounts | grep "$match_string" | awk '{print
$1}')
@@ -1082,6 +1181,11 @@
fi
FAST_STOP=${OCF_RESKEY_fast_stop:="yes"}
+case $FORCE_UNMOUNT in
+ move) move_before_umount=true; FORCE_UNMOUNT=safe ;;
+ *) move_before_umount=false ;;
+esac
+
OP=$1
# These operations do not require instance parameters
@@ -1143,6 +1247,9 @@
fi
fi
+MOVED_CANONICALIZED_MOUNTPOINT=$(echo "$CANONICALIZED_MOUNTPOINT" | sed -e
's,/\([^/]\+\)$,/.\1,')
+
+
# Check to make sure the utilites are found
if [ "X${HOSTOS}" != "XOpenBSD" ];then
check_binary $MODPROBE
@@ -1176,7 +1283,10 @@
CLUSTERSAFE=2
case "$FSTYPE" in
-nfs4|nfs|aznfs|efs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs|cvfs|lustre)
+none|overlay|overlayfs|tmpfs)
+ CLUSTERSAFE=1 # this is kind of safe too
+ ;;
+nfs4|nfs|aznfs|efs|smbfs|cifs|gfs2|glusterfs|ceph|ocfs2|cvfs|lustre)
CLUSTERSAFE=1 # this is kind of safe too
systemd_drop_in "99-Filesystem-remote" "After" "remote-fs.target"
;;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore'
old/resource-agents-4.17.0+git5.e3a22113/heartbeat/exportfs
new/resource-agents-4.17.0+git18.92719d83/heartbeat/exportfs
--- old/resource-agents-4.17.0+git5.e3a22113/heartbeat/exportfs 2025-11-25
13:04:52.000000000 +0100
+++ new/resource-agents-4.17.0+git18.92719d83/heartbeat/exportfs
2025-12-08 15:46:10.000000000 +0100
@@ -390,10 +390,10 @@
local contentfile=/proc/net/rpc/nfsd.export/content
local fsid_re
local i=1
- fsid_re="fsid=(echo `forall get_fsid`|sed 's/ /|/g'),"
+ local fsid_all=`forall get_fsid`
+ fsid_re="fsid=(`echo $fsid_all | sed 's/ /|/g'`),"
while :; do
- grep -E -q "$fsid_re" $contentfile ||
- break
+ grep -E -q "$fsid_re" $contentfile || break
ocf_log info "Cleanup export cache ... (try $i)"
ocf_run exportfs -f
sleep 0.5
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn'
'--exclude=.svnignore'
old/resource-agents-4.17.0+git5.e3a22113/heartbeat/podman-etcd
new/resource-agents-4.17.0+git18.92719d83/heartbeat/podman-etcd
--- old/resource-agents-4.17.0+git5.e3a22113/heartbeat/podman-etcd
2025-11-25 13:04:52.000000000 +0100
+++ new/resource-agents-4.17.0+git18.92719d83/heartbeat/podman-etcd
2025-12-08 15:46:10.000000000 +0100
@@ -604,8 +604,8 @@
fi
ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT")
ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF")
- ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest
"ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
-
ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest
"ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
+ ETCD_WARNING_APPLY_DURATION=$(get_env_from_manifest
"ETCD_WARNING_APPLY_DURATION")
+ ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest
"ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL")
ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest
"ETCD_HEARTBEAT_INTERVAL")
ETCD_QUOTA_BACKEND_BYTES=$(get_env_from_manifest
"ETCD_QUOTA_BACKEND_BYTES")
ETCD_SOCKET_REUSE_ADDRESS=$(get_env_from_manifest
"ETCD_SOCKET_REUSE_ADDRESS")
@@ -617,9 +617,13 @@
LISTEN_CLIENT_URLS="0.0.0.0"
LISTEN_PEER_URLS="0.0.0.0"
LISTEN_METRICS_URLS="0.0.0.0"
+
+ return $OCF_SUCCESS
}
compute_bump_revision() {
+ local rc
+
# Same logic used by cluster-etcd-operator quorum-restore-pod utility.
# see
https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
# set a default value: 1bn would be an etcd running at 1000 writes/s
for about eleven days.
@@ -656,6 +660,7 @@
# the space indentation for client-transport-security and
peer-transport-security
# is required for correct YAML formatting.
+ # TODO: replace flags deprecated in Etcd v3.6
cat > "$ETCD_CONFIGURATION_FILE" << EOF
logger: zap
log-level: info
@@ -688,10 +693,16 @@
metrics: extensive
experimental-initial-corrupt-check: true
experimental-max-learners: 1
-experimental-warning-apply-duration: $(convert_duration_in_nanoseconds
"$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
-experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds
"$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
+experimental-warning-apply-duration: $(convert_duration_in_nanoseconds
"$ETCD_WARNING_APPLY_DURATION")
+experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds
"$ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL")
EOF
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ ocf_log err "could not create etcd configuration, 'cat' error
code: $rc"
+ return $OCF_ERR_CONFIGURED
+ fi
+ # Append cipher suites from the env variable where the entries are
comma separated.
{
if [ -n "$ETCD_CIPHER_SUITES" ]; then
echo "cipher-suites:"
@@ -700,6 +711,13 @@
done
fi
} >> "$ETCD_CONFIGURATION_FILE"
+ rc=$?
+ if [ $rc -ne 0 ]; then
+ ocf_log err "could not append cipher suites to etcd
configuration, error code: $rc"
+ return $OCF_ERR_CONFIGURED
+ fi
+
+ return $OCF_SUCCESS
}
archive_data_folder()
@@ -880,11 +898,11 @@
local endpoint_url=$(ip_url $(attribute_node_ip get))
local peer_url=$(ip_url $member_ip)
- ocf_log info "add $member_name ($member_ip, $endpoint_url) to the
member list as learner"
+ ocf_log info "add $member_name ($member_ip) to the member list as
learner"
out=$(podman exec "${CONTAINER}" etcdctl
--endpoints="$endpoint_url:2379" member add "$member_name"
--peer-urls="$peer_url:2380" --learner)
rc=$?
if [ $rc -ne 0 ]; then
- ocf_log err "could not add $member_name as learner, error code:
$rc"
+ ocf_log err "could not add $member_name as learner, error code
$rc, etcdctl output: $out"
return $rc
fi
ocf_log info "$out"
@@ -1032,7 +1050,7 @@
if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote
"$learner_member_id_hex" 2>&1; then
# promotion is expected to fail if the peer is not yet
up-to-date
ocf_log info "could not promote member $learner_member_id_hex,
error code: $?"
- return $OCF_SUCCESS
+ return $OCF_ERR_GENERIC
fi
ocf_log info "successfully promoted member '$learner_member_id_hex'"
return $OCF_SUCCESS
@@ -1063,19 +1081,19 @@
fi
if [ -n "$learner_member_id" ]; then
- promote_learner_member "$learner_member_id"
- return $?
- fi
-
- if [ -z "$learner_member_id" ]; then
- if ! clear_standalone_node; then
- ocf_log error "could not clear standalone_node
attribute, error code: $?"
- return $OCF_ERR_GENERIC
- fi
- if ! attribute_learner_node clear; then
- ocf_log error "could not clear learner_node attribute,
error code: $?"
+ if ! promote_learner_member "$learner_member_id"; then
return $OCF_ERR_GENERIC
fi
+ # promotion succeeded: continue to clear standalone_node and
learner_node
+ fi
+
+ if ! clear_standalone_node; then
+ ocf_log error "could not clear standalone_node attribute, error
code: $?"
+ return $OCF_ERR_GENERIC
+ fi
+ if ! attribute_learner_node clear; then
+ ocf_log error "could not clear learner_node attribute, error
code: $?"
+ return $OCF_ERR_GENERIC
fi
return $OCF_SUCCESS
@@ -1258,6 +1276,7 @@
set_standalone_node
else
ocf_log debug "$name is in the members list by IP: $ip"
+ # Errors from reconcile_member_state are logged
internally. Ignoring them here prevents stopping a healthy voter agent;
critical local failures are caught by detect_cluster_leadership_loss.
reconcile_member_state "$member_list_json"
fi
done
@@ -1369,7 +1388,7 @@
# Could not execute monitor check command and state file exists - the
container failed, check recovery status in this lifecycle
local time_since_heartbeat
time_since_heartbeat=$(get_time_since_last_heartbeat)
- ocf_log err "Container ${CONTAINER} failed (last healthy:
${time_since_heartbeat}s ago)"
+ ocf_log err "Container ${CONTAINER} failed (last healthy:
${time_since_heartbeat}s ago, error code: $rc)"
# Check if peer has set force_new_cluster for recovery
local fnc_holders
@@ -1762,7 +1781,7 @@
fnc_holder_count=$(echo "$fnc_holders" | wc -w)
if [ "$fnc_holder_count" -gt 1 ]; then
ocf_exit_reason "force_new_cluster attribute is set on
multiple nodes ($fnc_holders)"
- return "$OCF_ERR_GENERIC"
+ return "$OCF_ERR_CONFIGURED"
fi
if [ "$fnc_holder_count" -eq 1 ]; then
@@ -1795,6 +1814,9 @@
fi
;;
0)
+ # No active resources: clear any stale
learner_node attribute from previous failed session
+ ocf_log debug "clearing stale learner_node
attribute (safe when active_resources_count=0)"
+ attribute_learner_node clear
# count how many agents are starting now
local start_resources_count
start_resources_count=$(echo
"$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
@@ -1833,7 +1855,7 @@
ocf_log info "same
cluster_id and revision: start normal"
else
ocf_exit_reason "same
revision but different cluster id"
- return
"$OCF_ERR_GENERIC"
+ return
"$OCF_ERR_CONFIGURED"
fi
fi
;;
@@ -1858,12 +1880,6 @@
run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
- # check to see if the container has already started
- podman_simple_status
- if [ $? -eq $OCF_SUCCESS ]; then
- return "$OCF_SUCCESS"
- fi
-
if ocf_is_true "$JOIN_AS_LEARNER"; then
local wait_timeout_sec=$((10*60))
local poll_interval_sec=5
@@ -1890,9 +1906,8 @@
ocf_log info "check for changes in pod manifest to decide if the
container should be reused or replaced"
if ! can_reuse_container ; then
- rc="$?"
- ocf_log err "could not determine etcd container reuse strategy,
rc: $rc"
- return "$rc"
+ ocf_log err "could not determine etcd container reuse strategy"
+ return $OCF_ERR_GENERIC
fi
# Archive current container and its configuration before creating
@@ -1908,13 +1923,13 @@
fi
if ! prepare_env; then
- ocf_log err "Could not prepare environment for podman, error
code: $?"
+ ocf_log err "Could not prepare environment for podman"
return $OCF_ERR_GENERIC
fi
if ! generate_etcd_configuration; then
- ocf_log err "Could not generate etcd configuration, error code:
$?"
- return $OCF_ERR_GENERIC
+ ocf_log err "Could not generate etcd configuration"
+ return $OCF_ERR_CONFIGURED
fi
run_opts="$run_opts \
@@ -2090,6 +2105,7 @@
ocf_log err "could not delete container health check state file"
fi
+ attribute_learner_node clear
attribute_node_revision update
attribute_node_cluster_id update