Hi Vu, see my comment below/Hans On 1/28/19 10:59, Vu Minh Nguyen wrote: > Hi Hans, > > Thanks for your comments. See my comment inline. Thanks > > Regards, Vu > >> -----Original Message----- >> From: Hans Nordebäck <hans.nordeb...@ericsson.com> >> Sent: Monday, January 28, 2019 4:37 PM >> To: Hans Nordebäck <hans.nordeb...@ericsson.com>; Vu Minh Nguyen >> <vu.m.ngu...@dektech.com.au>; Gary Lee <gary....@dektech.com.au>; >> Minh Hon Chau <minh.c...@dektech.com.au> >> Cc: opensaf-devel@lists.sourceforge.net >> Subject: Re: [devel] [PATCH 1/1] osaf: do quick local node reboot when > split >> network [#3001] >> >> Hi Vu, >> See one more comment below/Thanks HansN >> >> -----Original Message----- >> From: Hans Nordebäck <hans.nordeb...@ericsson.com> >> Sent: den 28 januari 2019 10:15 >> To: Vu Minh Nguyen <vu.m.ngu...@dektech.com.au>; Gary Lee >> <gary....@dektech.com.au>; Minh Hon Chau <minh.c...@dektech.com.au> >> Cc: opensaf-devel@lists.sourceforge.net >> Subject: Re: [devel] [PATCH 1/1] osaf: do quick local node reboot when > split >> network [#3001] >> >> Hi Vu, ack review only. Two comments below/Thanks HansN >> >> On 1/25/19 12:34, Vu Minh Nguyen wrote: >>> --- >>> scripts/opensaf_reboot | 33 +++++++++++++++++++++++++++------ >>> src/amf/amfd/ndproc.cc | 4 ++-- >>> src/base/ncssysf_def.h | 6 ++++++ >>> src/base/sysf_def.c | 10 ++++++++++ >>> src/fm/fmd/fm_main.cc | 6 +++--- >>> src/fm/fmd/fm_rda.cc | 5 ++--- >>> src/rde/rded/rde_main.cc | 6 ++---- >>> 7 files changed, 52 insertions(+), 18 deletions(-) >>> >>> diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot index >>> 727272e1d..2f7a7daeb 100644 >>> --- a/scripts/opensaf_reboot >>> +++ b/scripts/opensaf_reboot >>> @@ -31,7 +31,7 @@ export >> LD_LIBRARY_PATH=$libdir:$LD_LIBRARY_PATH >>> # Node fencing: OpenSAF cannot reboot a node when there's no CLM >> node to >>> # PLM EE mapping in the information model. In such cases rebooting >>> would be done -# through proprietary mechanisms, i.e. not through PLM. >>> Node_id is (the only >>> +# through proprietary mechanisms, i.e. not through PLM. Node_id is >>> +(the only >>> # entity) at the disposal of such a mechanism. >>> >>> if [ -f "$pkgsysconfdir/fmd.conf" ]; then @@ -81,7 +81,6 @@ >>> opensaf_reboot_with_remote_fencing() >>> #if plm exists in the system,then the reboot is performed using the >> eename. >>> opensaf_reboot_with_plm() >>> { >>> - >>> immadm -o 7 $ee_name >>> retval=$? >>> if [ $retval != 0 ]; then >>> @@ -96,12 +95,29 @@ opensaf_reboot_with_plm() >>> logger -t "opensaf_reboot" "abrupt restart failed for $ee_name: unable > to >> restart remote node" >>> exit 1 >>> fi >>> -fi >>> +fi >>> fi >>> #Note: Operation Id SA_PLM_ADMIN_RESTART=7 >>> #In the example the $ee_name would expand to (for eg:-) >> safEE=my_linux_os,safHE=64bitmulticore,safDomain=my_domain >>> } >>> >>> +# Force local node reboot as fast as possible >>> +quick_local_node_reboot() >>> +{ >>> +logger -t "opensaf_reboot" "Do quick local node reboot" >> [HansN] perhaps reuse the same logic as in sysf_def.c, i.e. use the sysrq > as >> fallback and use a short timeout > [Vu] > Forcing node reboot by touching /proc/sysrq-trigger is not allowed on > containers such as LXC > (as container is immutable), therefore I provided 02 more alternatives below > in case the first try is failed. [HansN] preferable to only run the sysrq if the reboot fails, i.e. the same logic as in sysf_def.c, see the SIGALRM and supervision_time. >>> + >>> +$icmd /bin/echo -n 'b' 2> /dev/null > /proc/sysrq-trigger >> [HansN] if not run as root, i.e. icmd is sudo, I think you need to use >> cmd: /bin/echo -n 'b' | $icmd tee /proc/sysrq-trigger , please check >> [HansN] or $icmd /bin/sh -c "/bin/echo -n 'b' 2> /dev/null > /proc/sysrq- >> trigger" > [Vu] Thanks for your suggestion. Will update accordingly. >>> +ret_code=$? >>> + >>> +if [ $ret_code != 0 ] && [ -x /bin/systemctl ]; then >>> +$icmd /bin/systemctl --force --force reboot >>> +ret_code=$? >>> +fi >>> + >>> +if [ $ret_code != 0 ]; then >>> +$icmd /sbin/reboot -f >>> +fi >>> +} >>> >>> if ! test -f "$NODE_ID_FILE"; then >>> logger -t "opensaf_reboot" "$NODE_ID_FILE doesnt exists,reboot failed > " >>> @@ -112,8 +128,13 @@ temp_node_id=`cat "$NODE_ID_FILE"` >>> temp_node_id=`echo "$temp_node_id" |sed -e 's:^0[bBxX]::'| sed -e >> 's:^:0x:'` >>> self_node_id=`printf "%d" $temp_node_id` >>> >>> -# If clm cluster reboot requested argument one and two are set but >>> not used, argument 3 is set to 1, "safe reboot" request -if [ >>> "$safe_reboot" = 1 ]; then >>> +# If no argument is provided, forcing node reboot immediately without >>> +log # flushing, process terminating, disk un-mounting. >>> +# If clm cluster reboot requested argument one and two are set but >>> +not used, # argument 3 is set to 1, "safe reboot" request. >>> +if [ "$#" = 0 ]; then >>> +quick_local_node_reboot >>> +elif [ "$safe_reboot" = 1 ]; then >>> opensaf_safe_reboot >>> else >>> # A node ID of zero(0) means an order to reboot the local node @@ >>> -165,7 +186,7 @@ else >>> logger -t "opensaf_reboot" "Not rebooting remote node $ee_name as it > is >> not in INSTANTIATED state" >>> elif [ $plm_node_state != 2 ]; then >>> opensaf_reboot_with_plm >>> -else >>> +else >>> logger -t "opensaf_reboot" "Not rebooting remote node $ee_name as it > is >> already in locked state" >>> fi >>> else >>> diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc index >>> ec347fc71..c5060e67b 100644 >>> --- a/src/amf/amfd/ndproc.cc >>> +++ b/src/amf/amfd/ndproc.cc >>> @@ -1262,7 +1262,7 @@ void check_quorum(AVD_CL_CB *cb) { >>> >>> // remote fencing is disabled and we have lost write access >>> // reboot this node to prevent split brain >>> - opensaf_reboot(0, nullptr, >>> - "Quorum lost. Rebooting this node to prevent split-brain"); >>> + opensaf_quick_reboot("Quorum lost. Rebooting this node to " >>> + "prevent split-brain"); >>> } >>> } >>> diff --git a/src/base/ncssysf_def.h b/src/base/ncssysf_def.h index >>> 3df8b221e..3de6f44f6 100644 >>> --- a/src/base/ncssysf_def.h >>> +++ b/src/base/ncssysf_def.h >>> @@ -76,6 +76,12 @@ void opensaf_reboot_prepare(void); >>> */ >>> void opensaf_reboot(unsigned node_id, const char* ee_name, const >>> char* reason); >>> >>> +/** >>> + * Do quick local node reboot - without first unmounting file systems >>> + * or syncing disks attached to the system. >>> +*/ >>> +void opensaf_quick_reboot(const char* reason); >>> + >>> >> /************************************************************** >> *************** >>> ** > ** >>> ** ncs_os_get_time_stamp: Return the current timestamp as > "time_t" >> in ** >>> diff --git a/src/base/sysf_def.c b/src/base/sysf_def.c index >>> d0cfc94ef..880f08ec4 100644 >>> --- a/src/base/sysf_def.c >>> +++ b/src/base/sysf_def.c >>> @@ -271,6 +271,16 @@ void opensaf_reboot(unsigned node_id, const >> char *ee_name, const char *reason) >>> } >>> } >>> >>> +void opensaf_quick_reboot(const char *reason) { >>> +syslog(LOG_CRIT, "Quick local node rebooting, Reason: %s ", reason); >>> +int reboot_result = system(PKGLIBDIR "/opensaf_reboot"); >>> +if (reboot_result != EXIT_SUCCESS) { >>> +syslog(LOG_CRIT, "node reboot failure: exit code %d", >>> + WEXITSTATUS(reboot_result)); >>> +} >>> +} >>> + >>> /** >>> * syslog and abort to generate a core dump (if enabled) >>> * @param __file >>> diff --git a/src/fm/fmd/fm_main.cc b/src/fm/fmd/fm_main.cc index >>> 122b2175c..b473b68aa 100644 >>> --- a/src/fm/fmd/fm_main.cc >>> +++ b/src/fm/fmd/fm_main.cc >>> @@ -665,9 +665,9 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb, >> FM_EVT *fm_mbx_evt) { >>> "Two active controllers observed in a cluster, > newActive: %x and " >>> "old-Active: %x", >>> unsigned(fm_cb->node_id), > unsigned(fm_cb->peer_node_id)); >>> - opensaf_reboot(0, NULL, >>> - "Received svc up from peer node (old-active is > not " >>> - "fully DOWN), hence rebooting the new > Active"); >>> + opensaf_quick_reboot("Received svc up from peer node > (old-active " >>> + "is not fully DOWN), " >>> + "hence rebooting the new Active"); >>> } >>> } >>> >>> diff --git a/src/fm/fmd/fm_rda.cc b/src/fm/fmd/fm_rda.cc index >>> 2a9f1664f..028bfa380 100644 >>> --- a/src/fm/fmd/fm_rda.cc >>> +++ b/src/fm/fmd/fm_rda.cc >>> @@ -105,9 +105,8 @@ uint32_t fm_rda_set_role(FM_CB *fm_cb, >> PCS_RDA_ROLE role) { >>> LOG_ER( >>> "A controller is already active. We were separated from the > " >>> "cluster?"); >>> - opensaf_reboot(0, nullptr, >>> - "A controller is already active. We were separated > " >>> - "from the cluster?"); >>> + opensaf_quick_reboot("A controller is already active. We were >> separated " >>> + "from the cluster?"); >>> } >>> } >>> >>> diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc index >>> 2d9aa51dc..b4c842942 100644 >>> --- a/src/rde/rded/rde_main.cc >>> +++ b/src/rde/rded/rde_main.cc >>> @@ -140,9 +140,7 @@ static void handle_mbx_event() { >>> active_controller.c_str()); >>> if (consensus_service.IsRemoteFencingEnabled() == false) { >>> LOG_ER("Probable split-brain. Rebooting this node"); >>> - >>> - opensaf_reboot(0, nullptr, >>> - "Split-brain detected by consensus > service"); >>> + opensaf_quick_reboot("Split-brain detected by consensus >>> + service"); >>> } >>> } >>> >>> @@ -258,7 +256,7 @@ static void CheckForSplitBrain(const rde_msg *msg) >> { >>> PCS_RDA_ROLE own_role = role->role(); >>> PCS_RDA_ROLE other_role = msg->info.peer_info.ha_role; >>> if (own_role == PCS_RDA_ACTIVE && other_role == PCS_RDA_ACTIVE) { >>> - opensaf_reboot(0, nullptr, "Split-brain detected"); >>> + opensaf_quick_reboot("Split-brain detected"); >>> } >>> } >>> >> _______________________________________________ >> Opensaf-devel mailing list >> Opensaf-devel@lists.sourceforge.net >> https://lists.sourceforge.net/lists/listinfo/opensaf-devel
_______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel