Hi Vu, see my comment below/Hans

On 1/28/19 10:59, Vu Minh Nguyen wrote:
> Hi Hans,
>
> Thanks for your comments. See my comment inline. Thanks
>
> Regards, Vu
>
>> -----Original Message-----
>> From: Hans Nordebäck <hans.nordeb...@ericsson.com>
>> Sent: Monday, January 28, 2019 4:37 PM
>> To: Hans Nordebäck <hans.nordeb...@ericsson.com>; Vu Minh Nguyen
>> <vu.m.ngu...@dektech.com.au>; Gary Lee <gary....@dektech.com.au>;
>> Minh Hon Chau <minh.c...@dektech.com.au>
>> Cc: opensaf-devel@lists.sourceforge.net
>> Subject: Re: [devel] [PATCH 1/1] osaf: do quick local node reboot when
> split
>> network [#3001]
>>
>> Hi Vu,
>> See one more comment below/Thanks HansN
>>
>> -----Original Message-----
>> From: Hans Nordebäck <hans.nordeb...@ericsson.com>
>> Sent: den 28 januari 2019 10:15
>> To: Vu Minh Nguyen <vu.m.ngu...@dektech.com.au>; Gary Lee
>> <gary....@dektech.com.au>; Minh Hon Chau <minh.c...@dektech.com.au>
>> Cc: opensaf-devel@lists.sourceforge.net
>> Subject: Re: [devel] [PATCH 1/1] osaf: do quick local node reboot when
> split
>> network [#3001]
>>
>> Hi Vu, ack review only. Two comments below/Thanks HansN
>>
>> On 1/25/19 12:34, Vu Minh Nguyen wrote:
>>> ---
>>>    scripts/opensaf_reboot   | 33 +++++++++++++++++++++++++++------
>>>    src/amf/amfd/ndproc.cc   |  4 ++--
>>>    src/base/ncssysf_def.h   |  6 ++++++
>>>    src/base/sysf_def.c      | 10 ++++++++++
>>>    src/fm/fmd/fm_main.cc    |  6 +++---
>>>    src/fm/fmd/fm_rda.cc     |  5 ++---
>>>    src/rde/rded/rde_main.cc |  6 ++----
>>>    7 files changed, 52 insertions(+), 18 deletions(-)
>>>
>>> diff --git a/scripts/opensaf_reboot b/scripts/opensaf_reboot index
>>> 727272e1d..2f7a7daeb 100644
>>> --- a/scripts/opensaf_reboot
>>> +++ b/scripts/opensaf_reboot
>>> @@ -31,7 +31,7 @@ export
>> LD_LIBRARY_PATH=$libdir:$LD_LIBRARY_PATH
>>>    # Node fencing: OpenSAF cannot reboot a node when there's no CLM
>> node to
>>>    # PLM EE mapping in the information model. In such cases rebooting
>>> would be done -# through proprietary mechanisms, i.e. not through PLM.
>>> Node_id is (the only
>>> +# through proprietary mechanisms, i.e. not through PLM. Node_id is
>>> +(the only
>>>    # entity) at the disposal of such a mechanism.
>>>
>>>    if [ -f "$pkgsysconfdir/fmd.conf" ]; then @@ -81,7 +81,6 @@
>>> opensaf_reboot_with_remote_fencing()
>>>    #if plm exists in the system,then the reboot is performed using the
>> eename.
>>>    opensaf_reboot_with_plm()
>>>    {
>>> -
>>>    immadm -o 7 $ee_name
>>>    retval=$?
>>>    if [ $retval != 0 ]; then
>>> @@ -96,12 +95,29 @@ opensaf_reboot_with_plm()
>>>    logger -t "opensaf_reboot" "abrupt restart failed for $ee_name: unable
> to
>> restart remote node"
>>>    exit 1
>>>    fi
>>> -fi
>>> +fi
>>>    fi
>>>    #Note: Operation Id SA_PLM_ADMIN_RESTART=7
>>>    #In the example the $ee_name would expand to (for eg:-)
>> safEE=my_linux_os,safHE=64bitmulticore,safDomain=my_domain
>>>    }
>>>
>>> +# Force local node reboot as fast as possible
>>> +quick_local_node_reboot()
>>> +{
>>> +logger -t "opensaf_reboot" "Do quick local node reboot"
>> [HansN] perhaps reuse the same logic as in sysf_def.c, i.e. use the sysrq
> as
>> fallback and use a short timeout
> [Vu]
> Forcing node reboot by touching /proc/sysrq-trigger is not allowed on
> containers such as LXC
> (as container is immutable), therefore I provided 02 more alternatives below
> in case the first try is failed.
[HansN] preferable to only run the sysrq if the reboot fails, i.e. the 
same logic as in sysf_def.c, see the SIGALRM and supervision_time.
>>> +
>>> +$icmd /bin/echo -n 'b' 2> /dev/null > /proc/sysrq-trigger
>> [HansN] if not run as root, i.e. icmd is sudo, I think you need to use
>> cmd: /bin/echo -n 'b' | $icmd tee /proc/sysrq-trigger , please check
>> [HansN] or $icmd  /bin/sh -c "/bin/echo -n 'b' 2> /dev/null > /proc/sysrq-
>> trigger"
> [Vu] Thanks for your suggestion. Will update accordingly.
>>> +ret_code=$?
>>> +
>>> +if [ $ret_code != 0 ] && [ -x /bin/systemctl ]; then
>>> +$icmd /bin/systemctl --force --force reboot
>>> +ret_code=$?
>>> +fi
>>> +
>>> +if [ $ret_code != 0 ]; then
>>> +$icmd /sbin/reboot -f
>>> +fi
>>> +}
>>>
>>>    if ! test -f "$NODE_ID_FILE"; then
>>>    logger -t "opensaf_reboot" "$NODE_ID_FILE doesnt exists,reboot failed
> "
>>> @@ -112,8 +128,13 @@ temp_node_id=`cat "$NODE_ID_FILE"`
>>>    temp_node_id=`echo "$temp_node_id" |sed -e 's:^0[bBxX]::'| sed -e
>> 's:^:0x:'`
>>>    self_node_id=`printf "%d" $temp_node_id`
>>>
>>> -# If clm cluster reboot requested argument one and two are set but
>>> not used, argument 3 is set to 1, "safe reboot" request -if [
>>> "$safe_reboot" = 1 ]; then
>>> +# If no argument is provided, forcing node reboot immediately without
>>> +log # flushing, process terminating, disk un-mounting.
>>> +# If clm cluster reboot requested argument one and two are set but
>>> +not used, # argument 3 is set to 1, "safe reboot" request.
>>> +if [ "$#" = 0 ]; then
>>> +quick_local_node_reboot
>>> +elif [ "$safe_reboot" = 1 ]; then
>>>    opensaf_safe_reboot
>>>    else
>>>    # A node ID of zero(0) means an order to reboot the local node @@
>>> -165,7 +186,7 @@ else
>>>    logger -t "opensaf_reboot" "Not rebooting remote node $ee_name as it
> is
>> not in INSTANTIATED state"
>>>    elif [ $plm_node_state != 2 ]; then
>>>    opensaf_reboot_with_plm
>>> -else
>>> +else
>>>    logger -t "opensaf_reboot" "Not rebooting remote node $ee_name as it
> is
>> already in locked state"
>>>    fi
>>>    else
>>> diff --git a/src/amf/amfd/ndproc.cc b/src/amf/amfd/ndproc.cc index
>>> ec347fc71..c5060e67b 100644
>>> --- a/src/amf/amfd/ndproc.cc
>>> +++ b/src/amf/amfd/ndproc.cc
>>> @@ -1262,7 +1262,7 @@ void check_quorum(AVD_CL_CB *cb) {
>>>
>>>        // remote fencing is disabled and we have lost write access
>>>        // reboot this node to prevent split brain
>>> -    opensaf_reboot(0, nullptr,
>>> -      "Quorum lost. Rebooting this node to prevent split-brain");
>>> +    opensaf_quick_reboot("Quorum lost. Rebooting this node to "
>>> +                         "prevent split-brain");
>>>      }
>>>    }
>>> diff --git a/src/base/ncssysf_def.h b/src/base/ncssysf_def.h index
>>> 3df8b221e..3de6f44f6 100644
>>> --- a/src/base/ncssysf_def.h
>>> +++ b/src/base/ncssysf_def.h
>>> @@ -76,6 +76,12 @@ void opensaf_reboot_prepare(void);
>>>     */
>>>    void opensaf_reboot(unsigned node_id, const char* ee_name, const
>>> char* reason);
>>>
>>> +/**
>>> + * Do quick local node reboot - without first unmounting file systems
>>> + * or syncing disks attached to the system.
>>> +*/
>>> +void opensaf_quick_reboot(const char* reason);
>>> +
>>>
>> /**************************************************************
>> ***************
>>>     **
> **
>>>     ** ncs_os_get_time_stamp:      Return the current timestamp as
> "time_t"
>> in **
>>> diff --git a/src/base/sysf_def.c b/src/base/sysf_def.c index
>>> d0cfc94ef..880f08ec4 100644
>>> --- a/src/base/sysf_def.c
>>> +++ b/src/base/sysf_def.c
>>> @@ -271,6 +271,16 @@ void opensaf_reboot(unsigned node_id, const
>> char *ee_name, const char *reason)
>>>    }
>>>    }
>>>
>>> +void opensaf_quick_reboot(const char *reason) {
>>> +syslog(LOG_CRIT, "Quick local node rebooting, Reason: %s ", reason);
>>> +int reboot_result = system(PKGLIBDIR "/opensaf_reboot");
>>> +if (reboot_result != EXIT_SUCCESS) {
>>> +syslog(LOG_CRIT, "node reboot failure: exit code %d",
>>> +       WEXITSTATUS(reboot_result));
>>> +}
>>> +}
>>> +
>>>    /**
>>>     * syslog and abort to generate a core dump (if enabled)
>>>     * @param __file
>>> diff --git a/src/fm/fmd/fm_main.cc b/src/fm/fmd/fm_main.cc index
>>> 122b2175c..b473b68aa 100644
>>> --- a/src/fm/fmd/fm_main.cc
>>> +++ b/src/fm/fmd/fm_main.cc
>>> @@ -665,9 +665,9 @@ static void fm_mbx_msg_handler(FM_CB *fm_cb,
>> FM_EVT *fm_mbx_evt) {
>>>                  "Two active controllers observed in a cluster,
> newActive: %x and "
>>>                  "old-Active: %x",
>>>                  unsigned(fm_cb->node_id),
> unsigned(fm_cb->peer_node_id));
>>> -          opensaf_reboot(0, NULL,
>>> -                         "Received svc up from peer node (old-active is
> not "
>>> -                         "fully DOWN), hence rebooting the new
> Active");
>>> +          opensaf_quick_reboot("Received svc up from peer node
> (old-active "
>>> +                               "is not fully DOWN), "
>>> +                               "hence rebooting the new Active");
>>>            }
>>>          }
>>>
>>> diff --git a/src/fm/fmd/fm_rda.cc b/src/fm/fmd/fm_rda.cc index
>>> 2a9f1664f..028bfa380 100644
>>> --- a/src/fm/fmd/fm_rda.cc
>>> +++ b/src/fm/fmd/fm_rda.cc
>>> @@ -105,9 +105,8 @@ uint32_t fm_rda_set_role(FM_CB *fm_cb,
>> PCS_RDA_ROLE role) {
>>>          LOG_ER(
>>>              "A controller is already active. We were separated from the
> "
>>>              "cluster?");
>>> -      opensaf_reboot(0, nullptr,
>>> -                     "A controller is already active. We were separated
> "
>>> -                     "from the cluster?");
>>> +      opensaf_quick_reboot("A controller is already active. We were
>> separated "
>>> +                           "from the cluster?");
>>>        }
>>>      }
>>>
>>> diff --git a/src/rde/rded/rde_main.cc b/src/rde/rded/rde_main.cc index
>>> 2d9aa51dc..b4c842942 100644
>>> --- a/src/rde/rded/rde_main.cc
>>> +++ b/src/rde/rded/rde_main.cc
>>> @@ -140,9 +140,7 @@ static void handle_mbx_event() {
>>>                     active_controller.c_str());
>>>              if (consensus_service.IsRemoteFencingEnabled() == false) {
>>>                LOG_ER("Probable split-brain. Rebooting this node");
>>> -
>>> -            opensaf_reboot(0, nullptr,
>>> -                           "Split-brain detected by consensus
> service");
>>> +            opensaf_quick_reboot("Split-brain detected by consensus
>>> + service");
>>>              }
>>>            }
>>>
>>> @@ -258,7 +256,7 @@ static void CheckForSplitBrain(const rde_msg *msg)
>> {
>>>      PCS_RDA_ROLE own_role = role->role();
>>>      PCS_RDA_ROLE other_role = msg->info.peer_info.ha_role;
>>>      if (own_role == PCS_RDA_ACTIVE && other_role == PCS_RDA_ACTIVE) {
>>> -    opensaf_reboot(0, nullptr, "Split-brain detected");
>>> +    opensaf_quick_reboot("Split-brain detected");
>>>      }
>>>    }
>>>
>> _______________________________________________
>> Opensaf-devel mailing list
>> Opensaf-devel@lists.sourceforge.net
>> https://lists.sourceforge.net/lists/listinfo/opensaf-devel

_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to