[ https://issues.apache.org/jira/browse/TRAFODION-2881?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16324361#comment-16324361 ]
ASF GitHub Bot commented on TRAFODION-2881: ------------------------------------------- Github user zcorrea commented on a diff in the pull request: https://github.com/apache/trafodion/pull/1392#discussion_r161296519 --- Diff: core/sqf/monitor/linux/cmsh.cxx --- @@ -128,31 +168,97 @@ int CCmsh::GetClusterState( PhysicalNodeNameMap_t &physicalNodeMap ) if (it != physicalNodeMap.end()) { // TEST_POINT and Exclude List : to force state down on node name - const char *downNodeName = getenv( TP001_NODE_DOWN ); - const char *downNodeList = getenv( TRAF_EXCLUDE_LIST ); - string downNodeString = " "; - if (downNodeList) - { - downNodeString += downNodeList; - downNodeString += " "; - } - string downNodeToFind = " "; - downNodeToFind += nodeName.c_str(); - downNodeToFind += " "; - if (((downNodeList != NULL) && - strstr(downNodeString.c_str(),downNodeToFind.c_str())) || - ( (downNodeName != NULL) && - !strcmp( downNodeName, nodeName.c_str()) )) - { - nodeState = StateDown; - } - + const char *downNodeName = getenv( TP001_NODE_DOWN ); + const char *downNodeList = getenv( TRAF_EXCLUDE_LIST ); + string downNodeString = " "; + if (downNodeList) + { + downNodeString += downNodeList; + downNodeString += " "; + } + string downNodeToFind = " "; + downNodeToFind += nodeName.c_str(); + downNodeToFind += " "; + if (((downNodeList != NULL) && + strstr(downNodeString.c_str(),downNodeToFind.c_str())) || + ((downNodeName != NULL) && + !strcmp(downNodeName, nodeName.c_str()))) + { + nodeState = StateDown; + } + // Set physical node state physicalNode = it->second; physicalNode->SetState( nodeState ); } } - } + } + + TRACE_EXIT; + return( rc ); +} + +/////////////////////////////////////////////////////////////////////////////// +// +// Function/Method: CCmsh::GetNodeState +// +// Description: Updates the state of the nodeName in the physicalNode passed in +// as a parameter. Caller should ensure that the node names are already +// present in the physicalNodeMap. 
+// +// Return: +// 0 - success +// -1 - failure +// +/////////////////////////////////////////////////////////////////////////////// +int CCmsh::GetNodeState( char *name ,CPhysicalNode *physicalNode ) +{ + const char method_name[] = "CCmsh::GetNodeState"; + TRACE_ENTRY; + + int rc; + + rc = PopulateNodeState( name ); + + if ( rc != -1 ) + { + // Parse each line extracting name and state + string nodeName; + NodeState_t nodeState; + PhysicalNodeNameMap_t::iterator it; + + StringList_t::iterator alit; + for ( alit = nodeStateList_.begin(); alit != nodeStateList_.end() ; alit++ ) + { + ParseNodeStatus( *alit, nodeName, nodeState ); + + // TEST_POINT and Exclude List : to force state down on node name + const char *downNodeName = getenv( TP001_NODE_DOWN ); + const char *downNodeList = getenv( TRAF_EXCLUDE_LIST ); --- End diff -- Created JIRA (TRAFODION-2907) to clean up the use of TRAF_EXCLUDE_LIST in the monitor code. > Multiple node failures occur during HA testing > ---------------------------------------------- > > Key: TRAFODION-2881 > URL: https://issues.apache.org/jira/browse/TRAFODION-2881 > Project: Apache Trafodion > Issue Type: Bug > Components: foundation > Affects Versions: 2.3 > Reporter: Gonzalo E Correa > Assignee: Gonzalo E Correa > Fix For: 2.3 > > > Inflicting server failure in certain modes will cause multiple monitor > processes to also bring their nodes down along with the intended target of the > test. > Server down modes: > init 6 > reboot -f > shutdown -r now > shell node down command > In addition, after a server down, the shell 'node up' command will also fail > intermittently. Reproducing this requires a longevity HA test that brings nodes down and up over a > long period of time, such as 24-48 hours. -- This message was sent by Atlassian JIRA (v6.4.14#64029)