Script 'mail_helper' called by obssrc
Hello community,

here is the log from the commit of package resource-agents for openSUSE:Factory 
checked in at 2025-12-19 16:41:54
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Comparing /work/SRC/openSUSE:Factory/resource-agents (Old)
 and      /work/SRC/openSUSE:Factory/.resource-agents.new.1928 (New)
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Package is "resource-agents"

Fri Dec 19 16:41:54 2025 rev:144 rq:1323402 version:4.17.0+git18.92719d83

Changes:
--------
--- /work/SRC/openSUSE:Factory/resource-agents/resource-agents.changes	2025-12-03 14:13:50.221881824 +0100
+++ /work/SRC/openSUSE:Factory/.resource-agents.new.1928/resource-agents.changes	2025-12-19 16:41:57.172105135 +0100
@@ -1,0 +2,14 @@
+Wed Dec 17 17:47:09 UTC 2025 - Peter Varkoly <[email protected]>
+
+- Update to version 4.17.0+git18.92719d83:
+  * Filesystem: signal many processes in parallel
+  * podman-etcd: sync environment variables with Pod manifest
+  * OCPEDGE-2231: podman-etcd: improve error handling to support retry on start errors (#2105)
+  * Filesystem: new force_unmount=move option
+  * Filesystem: try umount immediately after signals are sent
+  * Filesystem: tmpfs/overlayfs have no need for systemd_drop_in
+  * Filesystem: improve shell trace (set -x) output
+  * OCPEDGE-2213: podman-etcd: fix to prevent learner from starting before cluster is ready (#2098)
+  * exportfs: fix grep error on stop
+
+-------------------------------------------------------------------

Old:
----
  resource-agents-4.17.0+git5.e3a22113.tar.xz

New:
----
  resource-agents-4.17.0+git18.92719d83.tar.xz

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

Other differences:
------------------
++++++ resource-agents.spec ++++++
--- /var/tmp/diff_new_pack.IV6dkY/_old  2025-12-19 16:41:59.220190667 +0100
+++ /var/tmp/diff_new_pack.IV6dkY/_new  2025-12-19 16:41:59.224190835 +0100
@@ -17,7 +17,7 @@
 
 
 Name:           resource-agents
-Version:        4.17.0+git5.e3a22113
+Version:        4.17.0+git18.92719d83
 Release:        0
 Summary:        HA Reusable Cluster Resource Scripts
 License:        GPL-2.0-only AND LGPL-2.1-or-later AND GPL-3.0-or-later

++++++ _servicedata ++++++
--- /var/tmp/diff_new_pack.IV6dkY/_old  2025-12-19 16:41:59.304194176 +0100
+++ /var/tmp/diff_new_pack.IV6dkY/_new  2025-12-19 16:41:59.312194510 +0100
@@ -1,7 +1,7 @@
 <servicedata>
 <service name="tar_scm">
 <param name="url">https://github.com/ClusterLabs/resource-agents.git</param>
-<param name="changesrevision">e3a22113c7cdce932de605f55b929daaee254bc3</param>
+<param name="changesrevision">92719d83353a8c7128f5fc72812e4b7c06cf8a6b</param>
 </service>
 </servicedata>
 (No newline at EOF)

++++++ resource-agents-4.17.0+git5.e3a22113.tar.xz -> resource-agents-4.17.0+git18.92719d83.tar.xz ++++++
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-4.17.0+git5.e3a22113/heartbeat/Filesystem new/resource-agents-4.17.0+git18.92719d83/heartbeat/Filesystem
--- old/resource-agents-4.17.0+git5.e3a22113/heartbeat/Filesystem	2025-11-25 13:04:52.000000000 +0100
+++ new/resource-agents-4.17.0+git18.92719d83/heartbeat/Filesystem	2025-12-08 15:46:10.000000000 +0100
@@ -19,6 +19,12 @@
 #              OCF_RESKEY_run_fsck
 #              OCF_RESKEY_fast_stop
 #              OCF_RESKEY_force_clones
+#              OCF_RESKEY_force_unmount
+#              OCF_RESKEY_term_signals
+#              OCF_RESKEY_kill_signals
+#              OCF_RESKEY_signal_delay
+#      See also the ocf_heartbeat_Filesystem(7) man page.
+
 #
 #OCF_RESKEY_device    : name of block device for the filesystem. e.g. /dev/sda1, /dev/md0
 #                       Or a -U or -L option for mount, or an NFS mount specification
@@ -91,12 +97,6 @@
 fi
 
 
-: ${OCF_RESKEY_device=${OCF_RESKEY_device_default}}
-: ${OCF_RESKEY_directory=${OCF_RESKEY_directory_default}}
-: ${OCF_RESKEY_fstype=${OCF_RESKEY_fstype_default}}
-: ${OCF_RESKEY_options=${OCF_RESKEY_options_default}}
-: ${OCF_RESKEY_statusfile_prefix=${OCF_RESKEY_statusfile_prefix_default}}
-: ${OCF_RESKEY_run_fsck=${OCF_RESKEY_run_fsck_default}}
 if [ -z "${OCF_RESKEY_fast_stop}" ]; then
        case "$OCF_RESKEY_fstype" in
                gfs2)
@@ -105,11 +105,18 @@
                        OCF_RESKEY_fast_stop=${OCF_RESKEY_fast_stop_default};;
        esac
 fi
-: ${OCF_RESKEY_force_clones=${OCF_RESKEY_force_clones_default}}
-: ${OCF_RESKEY_force_unmount=${OCF_RESKEY_force_unmount_default}}
-: ${OCF_RESKEY_term_signals=${OCF_RESKEY_term_signals_default}}
-: ${OCF_RESKEY_kill_signals=${OCF_RESKEY_kill_signals_default}}
-: ${OCF_RESKEY_signal_delay=${OCF_RESKEY_signal_delay_default}}
+: "fast_stop        ::" ${OCF_RESKEY_fast_stop}
+: "device           ::" ${OCF_RESKEY_device=${OCF_RESKEY_device_default}}
+: "directory        ::" ${OCF_RESKEY_directory=${OCF_RESKEY_directory_default}}
+: "fstype           ::" ${OCF_RESKEY_fstype=${OCF_RESKEY_fstype_default}}
+: "options          ::" ${OCF_RESKEY_options=${OCF_RESKEY_options_default}}
+: "statusfile_prefix::" 
${OCF_RESKEY_statusfile_prefix=${OCF_RESKEY_statusfile_prefix_default}}
+: "run_fsck         ::" ${OCF_RESKEY_run_fsck=${OCF_RESKEY_run_fsck_default}}
+: "force_clones     ::" 
${OCF_RESKEY_force_clones=${OCF_RESKEY_force_clones_default}}
+: "force_umount     ::" 
${OCF_RESKEY_force_unmount=${OCF_RESKEY_force_unmount_default}}
+: "term_signals     ::" 
${OCF_RESKEY_term_signals=${OCF_RESKEY_term_signals_default}}
+: "kill_signals     ::" 
${OCF_RESKEY_kill_signals=${OCF_RESKEY_kill_signals_default}}
+: "signal_delay     ::" 
${OCF_RESKEY_signal_delay=${OCF_RESKEY_signal_delay_default}}
 
 # Variables used by multiple methods
 HOSTOS=$(uname)
@@ -253,7 +260,7 @@
 </parameter>
 
 <parameter name="force_unmount">
-<longdesc lang="en">
+<longdesc lang="en"><![CDATA[
 This option allows specifying how to handle processes that are
 currently accessing the mount directory.
 
@@ -262,12 +269,25 @@
           avoid functions that could potentially block during process
           detection
 "false" : Do not kill any processes.
+"move"  : like "safe", but try to mount --move first
 
-The 'safe' option uses shell logic to walk the /procs/ directory
+The 'safe' option uses shell logic to walk the /proc/<pid>/ directories
 for pids using the mount point while the default option uses the
 fuser cli tool. fuser is known to perform operations that can potentially
 block if unresponsive nfs mounts are in use on the system.
-</longdesc>
+
+If new users of the file system are being spawned continuously by unmanaged 3rd
+party apps, we likely never win the race and the file system will be kept busy.
+Which may result in a timeout and stop failure, potentially escalating to
+hard-reset of this node via fencing.
+
+The 'move' option tries to move the mount point somewhere those "rogue apps"
+do not expect it, then proceed to kill current users and attempt to umount.
+
+For 'move' to work, you will have to make sure the mount point does not reside
+under a shared mount, for example by mount -o bind,private /mount /mount
+before mounting /mount/point.
+]]></longdesc>
 <shortdesc lang="en">Kill processes before unmount</shortdesc>
 <content type="string" default="${OCF_RESKEY_force_unmount_default}" />
 </parameter>
@@ -582,8 +602,21 @@
                return $OCF_ERR_CONFIGURED
        fi
 
+       if $move_before_umount && test -d "$MOVED_CANONICALIZED_MOUNTPOINT"; then
+               CANONICALIZED_MOUNTPOINT=$MOVED_CANONICALIZED_MOUNTPOINT \
+               move_before_umount=false \
+               Filesystem_status
+               rc=$?
+               if [ $rc != $OCF_NOT_RUNNING ]; then
+                       msg="move_before_umount=$move_before_umount and $MOVED_CANONICALIZED_MOUNTPOINT status is [$rc]"
+                       rc=$OCF_ERR_GENERIC
+                       ocf_exit_reason "$msg"
+                       return $rc
+               fi
+       fi
+
        # See if the device is already mounted.
-       Filesystem_status
+       move_before_umount=false Filesystem_status
        case "$?" in
                $OCF_SUCCESS)
                        ocf_log info "Filesystem $MOUNTPOINT is already 
mounted."
@@ -663,11 +696,11 @@
 
        if ocf_is_true  "$FORCE_UNMOUNT"; then
                if [ "X${HOSTOS}" = "XOpenBSD" ]; then
-                       fstat | grep $dir | awk '{print $3}'
+                       fstat | grep "$dir" | awk '{print $3}'
                elif [ "X${HOSTOS}" = "XFreeBSD" ]; then
-                       $FUSER -c $dir 2>/dev/null
+                       $FUSER -c "$dir" 2>/dev/null
                else
-                       $FUSER -Mm $dir 2>/dev/null
+                       $FUSER -Mm "$dir" 2>/dev/null
                fi
        elif [ "$FORCE_UNMOUNT" = "safe" ]; then
                # Yes, in theory, ${dir} could contain "intersting" characters
@@ -721,18 +754,36 @@
        local dir=$1
        local sig=$2
        local pids pid
+       local nr_pids
+       local sed_script=""
        # fuser returns a non-zero return code if none of the
        # specified files is accessed or in case of a fatal
        # error.
-       pids=$(get_pids "$dir")
-       if [ -z "$pids" ]; then
+       # We don't know whether we have one single line or multiple lines.
+       # Canonicalize.
+       pids=$(get_pids "$dir" | tr -s ' \t\n' '\n')
+       nr_pids=$(echo "$pids" | grep -c "^.")
+
+       # for many pids, reporting and killing them individually just takes too long.
+       # may even be too many words for the shell!
+       #
+       # If the list of pids is too long for a single shell variable,
+       # fix your fork bomb workload, and get a better shell anyways.
+       #
+       if [ $nr_pids = 0 ]; then
                ocf_log info "No processes on $dir were signalled. 
force_unmount is set to '$FORCE_UNMOUNT'"
                return 1
+       elif [ $nr_pids -le 24 ]; then
+               for pid in $pids; do
+                       ocf_log info "sending signal $sig to: $(ps -f $pid | tail -1)"
+                       kill -s $sig $pid
+               done
+       else
+               echo "$pids" | xargs -r kill -s $sig
+               sed_script="11 s/^.*/... and more .../; 12,$(( $nr_pids - 10))d"
+               pids=$(echo "$pids" | sed -e "$sed_script" | tr '\n' ' ')
+               ocf_log info "sent signals $sig to ${nr_pids} processes [${pids}]"
        fi
-       for pid in $pids; do
-               ocf_log info "sending signal $sig to: $(ps -f $pid | tail -1)"
-               kill -s $sig $pid
-       done
        return 0
 }
 try_umount() {
@@ -760,13 +811,21 @@
        return $ret
 }
 fs_stop_loop() {
-       local force_arg="$1" SUB="$2" signals="$3" sig send_signal
+       local force_arg="$1" SUB="$2" signals="$3" sig sent_signal
        while true; do
-               send_signal=false
+               sent_signal=false
                for sig in $signals; do
-                       signal_processes "$SUB" $sig && send_signal=true
+                       signal_processes "$SUB" $sig && sent_signal=true
                done
-               $send_signal && sleep $OCF_RESKEY_signal_delay
+               if $sent_signal; then
+                       # Try to umount immediately after signalling, to reduce
+                       # the time window in which new users of the file system
+                       # may be spawned.
+                       try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS
+                       sleep $OCF_RESKEY_signal_delay
+               fi
+               # Try to umount after the signal_delay, maybe some processes
+               # needed a moment to "exit cleanly" after receiving SIGTERM.
                try_umount "$force_arg" "$SUB" && return $OCF_SUCCESS
        done
 }
@@ -797,16 +856,34 @@
        return $OCF_ERR_GENERIC
 }
 
+try_mount_move()
+{
+       test -d "$MOVED_CANONICALIZED_MOUNTPOINT" || mkdir "$MOVED_CANONICALIZED_MOUNTPOINT" || return
+       mount --move "$CANONICALIZED_MOUNTPOINT" "$MOVED_CANONICALIZED_MOUNTPOINT" || return
+       ocf_log info "Moved $MOUNTPOINT to $MOVED_CANONICALIZED_MOUNTPOINT"
+       return 0
+       # To test really bad timing of "action timeout":
+       # test -e /tmp/fail-after-move && rm -f /tmp/fail-after-move && kill -KILL $$
+}
+
 #
 # STOP: Unmount the filesystem
 #
 Filesystem_stop()
 {
        # See if the device is currently mounted
-       Filesystem_status >/dev/null 2>&1
+       move_before_umount=false Filesystem_status >/dev/null 2>&1
        if [ $? -eq $OCF_NOT_RUNNING ]; then
                # Already unmounted, wonderful.
-               rc=$OCF_SUCCESS
+               # But did we also unmount the moved fs?
+               if $move_before_umount; then
+                       CANONICALIZED_MOUNTPOINT=$MOVED_CANONICALIZED_MOUNTPOINT \
+                       move_before_umount=false \
+                       Filesystem_stop
+                       return $?
+               else
+                       rc=$OCF_SUCCESS
+               fi
        else
                # Wipe the status file, but continue with a warning if
                # removal fails -- the file system might be read only
@@ -826,6 +903,13 @@
                nfs4|nfs|aznfs|efs|cifs|smbfs) umount_force="-f" ;;
                esac
 
+               if $move_before_umount && try_mount_move ; then
+                       CANONICALIZED_MOUNTPOINT=$MOVED_CANONICALIZED_MOUNTPOINT \
+                       move_before_umount=false \
+                       Filesystem_stop
+                       return $?
+               fi
+
                # Umount all sub-filesystems mounted under $MOUNTPOINT/ too.
                local timeout
                while read SUB; do
@@ -859,6 +943,21 @@
 #
 Filesystem_status()
 {
+       if $move_before_umount && test -d $MOVED_CANONICALIZED_MOUNTPOINT; then
+               # Have to recurse once.
+               CANONICALIZED_MOUNTPOINT=$MOVED_CANONICALIZED_MOUNTPOINT \
+               move_before_umount=false \
+               OP= \
+               Filesystem_status
+               rc=$?
+               if [ $rc = $OCF_SUCCESS ]; then
+                       rc=$OCF_ERR_GENERIC
+                       msg="move_before_umount=$move_before_umount and something is mounted on $MOVED_CANONICALIZED_MOUNTPOINT"
+                       ocf_exit_reason "$msg"
+                       return $rc
+               fi
+       fi
+
        local match_string="${TAB}${CANONICALIZED_MOUNTPOINT}${TAB}"
        local mounted_device=$(list_mounts | grep "$match_string" | awk '{print $1}')
 
@@ -1082,6 +1181,11 @@
 fi
 FAST_STOP=${OCF_RESKEY_fast_stop:="yes"}
 
+case $FORCE_UNMOUNT in
+       move) move_before_umount=true; FORCE_UNMOUNT=safe ;;
+       *)    move_before_umount=false ;;
+esac
+
 OP=$1
 
 # These operations do not require instance parameters
@@ -1143,6 +1247,9 @@
        fi
 fi
 
+MOVED_CANONICALIZED_MOUNTPOINT=$(echo "$CANONICALIZED_MOUNTPOINT" | sed -e 's,/\([^/]\+\)$,/.\1,')
+
+
 # Check to make sure the utilites are found
 if [ "X${HOSTOS}" != "XOpenBSD" ];then
 check_binary $MODPROBE
@@ -1176,7 +1283,10 @@
        CLUSTERSAFE=2
 
 case "$FSTYPE" in
-nfs4|nfs|aznfs|efs|smbfs|cifs|none|gfs2|glusterfs|ceph|ocfs2|overlay|overlayfs|tmpfs|cvfs|lustre)
+none|overlay|overlayfs|tmpfs)
+       CLUSTERSAFE=1 # this is kind of safe too
+       ;;
+nfs4|nfs|aznfs|efs|smbfs|cifs|gfs2|glusterfs|ceph|ocfs2|cvfs|lustre)
        CLUSTERSAFE=1 # this is kind of safe too
        systemd_drop_in "99-Filesystem-remote" "After" "remote-fs.target"
        ;;
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-4.17.0+git5.e3a22113/heartbeat/exportfs new/resource-agents-4.17.0+git18.92719d83/heartbeat/exportfs
--- old/resource-agents-4.17.0+git5.e3a22113/heartbeat/exportfs	2025-11-25 13:04:52.000000000 +0100
+++ new/resource-agents-4.17.0+git18.92719d83/heartbeat/exportfs	2025-12-08 15:46:10.000000000 +0100
@@ -390,10 +390,10 @@
        local contentfile=/proc/net/rpc/nfsd.export/content
        local fsid_re
        local i=1
-       fsid_re="fsid=(echo `forall get_fsid`|sed 's/ /|/g'),"
+       local fsid_all=`forall get_fsid`
+       fsid_re="fsid=(`echo $fsid_all | sed 's/ /|/g'`),"
        while :; do
-               grep -E -q "$fsid_re" $contentfile ||
-                       break
+               grep -E -q "$fsid_re" $contentfile || break
                ocf_log info "Cleanup export cache ... (try $i)"
                ocf_run exportfs -f
                sleep 0.5
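
The exportfs fix above builds the fsid regular expression from the output of `forall get_fsid`; the previous line embedded the echo/sed command text literally in the pattern, so the grep against the export cache could not work as intended. A minimal sketch of how the corrected construction expands, with illustrative fsid values:

  fsid_all="10 11 12"    # stand-in for the output of `forall get_fsid`
  fsid_re="fsid=(`echo $fsid_all | sed 's/ /|/g'`),"
  echo "$fsid_re"        # prints: fsid=(10|11|12),
  # On a real NFS server, the stop path keeps flushing the cache until this
  # pattern no longer matches:
  #   grep -E -q "$fsid_re" /proc/net/rpc/nfsd.export/content
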
diff -urN '--exclude=CVS' '--exclude=.cvsignore' '--exclude=.svn' '--exclude=.svnignore' old/resource-agents-4.17.0+git5.e3a22113/heartbeat/podman-etcd new/resource-agents-4.17.0+git18.92719d83/heartbeat/podman-etcd
--- old/resource-agents-4.17.0+git5.e3a22113/heartbeat/podman-etcd	2025-11-25 13:04:52.000000000 +0100
+++ new/resource-agents-4.17.0+git18.92719d83/heartbeat/podman-etcd	2025-12-08 15:46:10.000000000 +0100
@@ -604,8 +604,8 @@
        fi
        ETCD_ELECTION_TIMEOUT=$(get_env_from_manifest "ETCD_ELECTION_TIMEOUT")
        ETCD_ENABLE_PPROF=$(get_env_from_manifest "ETCD_ENABLE_PPROF")
-       ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
-       ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
+       ETCD_WARNING_APPLY_DURATION=$(get_env_from_manifest "ETCD_WARNING_APPLY_DURATION")
+       ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL=$(get_env_from_manifest "ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL")
        ETCD_HEARTBEAT_INTERVAL=$(get_env_from_manifest "ETCD_HEARTBEAT_INTERVAL")
        ETCD_QUOTA_BACKEND_BYTES=$(get_env_from_manifest "ETCD_QUOTA_BACKEND_BYTES")
        ETCD_SOCKET_REUSE_ADDRESS=$(get_env_from_manifest "ETCD_SOCKET_REUSE_ADDRESS")
@@ -617,9 +617,13 @@
        LISTEN_CLIENT_URLS="0.0.0.0"
        LISTEN_PEER_URLS="0.0.0.0"
        LISTEN_METRICS_URLS="0.0.0.0"
+
+       return $OCF_SUCCESS
 }
 
 compute_bump_revision() {
+       local rc
+
        # Same logic used by cluster-etcd-operator quorum-restore-pod utility.
        # see https://github.com/openshift/cluster-etcd-operator/blob/215998939f5223da916622c71fd07d17656faf6b/bindata/etcd/quorum-restore-pod.yaml#L26-L34
        # set a default value: 1bn would be an etcd running at 1000 writes/s for about eleven days.
@@ -656,6 +660,7 @@
 
        # the space indentation for client-transport-security and peer-transport-security
        # is required for correct YAML formatting.
+       # TODO: replace flags deprecated in Etcd v3.6
        cat > "$ETCD_CONFIGURATION_FILE" << EOF
 logger: zap
 log-level: info
@@ -688,10 +693,16 @@
 metrics: extensive
 experimental-initial-corrupt-check: true
 experimental-max-learners: 1
-experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WARNING_APPLY_DURATION")
-experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_EXPERIMENTAL_WATCH_PROGRESS_NOTIFY_INTERVAL")
+experimental-warning-apply-duration: $(convert_duration_in_nanoseconds "$ETCD_WARNING_APPLY_DURATION")
+experimental-watch-progress-notify-interval: $(convert_duration_in_nanoseconds "$ETCD_WATCH_PROGRESS_NOTIFY_INTERVAL")
 EOF
+       rc=$?
+       if [ $rc -ne 0 ]; then
+               ocf_log err "could not create etcd configuration, 'cat' error code: $rc"
+               return $OCF_ERR_CONFIGURED
+       fi
 
+       # Append cipher suites from the env variable where the entries are comma separated.
        {
                if [ -n "$ETCD_CIPHER_SUITES" ]; then
                        echo "cipher-suites:"
@@ -700,6 +711,13 @@
                        done
                fi
        } >> "$ETCD_CONFIGURATION_FILE"
+       rc=$?
+       if [ $rc -ne 0 ]; then
+               ocf_log err "could not append cipher suites to etcd configuration, error code: $rc"
+               return $OCF_ERR_CONFIGURED
+       fi
+
+       return $OCF_SUCCESS
 }
 
 archive_data_folder()
@@ -880,11 +898,11 @@
        local endpoint_url=$(ip_url $(attribute_node_ip get))
        local peer_url=$(ip_url $member_ip)
 
-       ocf_log info "add $member_name ($member_ip, $endpoint_url) to the member list as learner"
+       ocf_log info "add $member_name ($member_ip) to the member list as learner"
        out=$(podman exec "${CONTAINER}" etcdctl --endpoints="$endpoint_url:2379" member add "$member_name" --peer-urls="$peer_url:2380" --learner)
        rc=$?
        if [ $rc -ne 0 ]; then
-               ocf_log err "could not add $member_name as learner, error code: $rc"
+               ocf_log err "could not add $member_name as learner, error code $rc, etcdctl output: $out"
                return $rc
        fi
        ocf_log info "$out"
@@ -1032,7 +1050,7 @@
        if ! ocf_run podman exec "${CONTAINER}" etcdctl member promote "$learner_member_id_hex" 2>&1; then
                # promotion is expected to fail if the peer is not yet up-to-date
                ocf_log info "could not promote member $learner_member_id_hex, error code: $?"
-               return $OCF_SUCCESS
+               return $OCF_ERR_GENERIC
        fi
        ocf_log info "successfully promoted member '$learner_member_id_hex'"
        return $OCF_SUCCESS
@@ -1063,19 +1081,19 @@
        fi
 
        if [ -n "$learner_member_id" ]; then
-               promote_learner_member "$learner_member_id"
-               return $?
-       fi
-
-       if [ -z "$learner_member_id" ]; then
-               if ! clear_standalone_node; then
-                       ocf_log error "could not clear standalone_node attribute, error code: $?"
-                       return $OCF_ERR_GENERIC
-               fi
-               if ! attribute_learner_node clear; then
-                       ocf_log error "could not clear learner_node attribute, error code: $?"
+               if ! promote_learner_member "$learner_member_id"; then
                        return $OCF_ERR_GENERIC
                fi
+               # promotion succeeded: continue to clear standalone_node and learner_node
+       fi
+
+       if ! clear_standalone_node; then
+               ocf_log error "could not clear standalone_node attribute, error code: $?"
+               return $OCF_ERR_GENERIC
+       fi
+       if ! attribute_learner_node clear; then
+               ocf_log error "could not clear learner_node attribute, error code: $?"
+               return $OCF_ERR_GENERIC
        fi
 
        return $OCF_SUCCESS
@@ -1258,6 +1276,7 @@
                        set_standalone_node
                else
                        ocf_log debug "$name is in the members list by IP: $ip"
+                       # Errors from reconcile_member_state are logged internally. Ignoring them here prevents stopping a healthy voter agent; critical local failures are caught by detect_cluster_leadership_loss.
                        reconcile_member_state "$member_list_json"
                fi
        done
@@ -1369,7 +1388,7 @@
        # Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
        local time_since_heartbeat
        time_since_heartbeat=$(get_time_since_last_heartbeat)
-       ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
+       ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)"
 
        # Check if peer has set force_new_cluster for recovery
        local fnc_holders
@@ -1762,7 +1781,7 @@
                fnc_holder_count=$(echo "$fnc_holders" | wc -w)
                if [ "$fnc_holder_count" -gt 1 ]; then
                        ocf_exit_reason "force_new_cluster attribute is set on multiple nodes ($fnc_holders)"
-                       return "$OCF_ERR_GENERIC"
+                       return "$OCF_ERR_CONFIGURED"
                fi
 
                if [ "$fnc_holder_count" -eq 1 ]; then
@@ -1795,6 +1814,9 @@
                                fi
                                ;;
                        0)
+                               # No active resources: clear any stale learner_node attribute from previous failed session
+                               ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)"
+                               attribute_learner_node clear
                                # count how many agents are starting now
                                local start_resources_count
                                start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
@@ -1833,7 +1855,7 @@
                                                        ocf_log info "same 
cluster_id and revision: start normal"
                                                else
                                                        ocf_exit_reason "same 
revision but different cluster id"
-                                                       return 
"$OCF_ERR_GENERIC"
+                                                       return 
"$OCF_ERR_CONFIGURED"
                                                fi
                                        fi
                                        ;;
@@ -1858,12 +1880,6 @@
 
        run_opts="$run_opts --oom-score-adj=${OCF_RESKEY_oom}"
 
-       # check to see if the container has already started
-       podman_simple_status
-       if [ $? -eq $OCF_SUCCESS ]; then
-               return "$OCF_SUCCESS"
-       fi
-
        if ocf_is_true "$JOIN_AS_LEARNER"; then
                local wait_timeout_sec=$((10*60))
                local poll_interval_sec=5
@@ -1890,9 +1906,8 @@
 
        ocf_log info "check for changes in pod manifest to decide if the 
container should be reused or replaced"
        if ! can_reuse_container ; then
-               rc="$?"
-               ocf_log err "could not determine etcd container reuse strategy, rc: $rc"
-               return "$rc"
+               ocf_log err "could not determine etcd container reuse strategy"
+               return $OCF_ERR_GENERIC
        fi
 
        # Archive current container and its configuration before creating
@@ -1908,13 +1923,13 @@
        fi
 
        if ! prepare_env; then
-               ocf_log err "Could not prepare environment for podman, error code: $?"
+               ocf_log err "Could not prepare environment for podman"
                return $OCF_ERR_GENERIC
        fi
 
        if ! generate_etcd_configuration; then
-               ocf_log err "Could not generate etcd configuration, error code: $?"
-               return $OCF_ERR_GENERIC
+               ocf_log err "Could not generate etcd configuration"
+               return $OCF_ERR_CONFIGURED
        fi
 
        run_opts="$run_opts \
@@ -2090,6 +2105,7 @@
                ocf_log err "could not delete container health check state file"
        fi
 
+       attribute_learner_node clear
        attribute_node_revision update
        attribute_node_cluster_id update
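
The podman-etcd changes above add explicit return-code checks after the etcd configuration file is written and appended to. A minimal standalone sketch of that check pattern, using a grouped redirect instead of the agent's heredoc and an illustrative file path:

  cfg=/tmp/etcd-config-example.yaml    # illustrative path, not the agent's
  {
          echo "logger: zap"
          echo "log-level: info"
  } > "$cfg"
  rc=$?
  if [ $rc -ne 0 ]; then
          echo "could not create etcd configuration, error code: $rc" >&2
          exit 1
  fi
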
 
