Multiple fixes with Name Server enabled logic. - More than one Name Server is now supported - Node failures handled with/without Name Server in node - Long process names are now the default and support clusters larger than 1000 nodes
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/32fe8565 Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/32fe8565 Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/32fe8565 Branch: refs/heads/master Commit: 32fe856510ed9c741de13bdd2a3b80561c6b0e2f Parents: 7184f8f Author: Zalo Correa <[email protected]> Authored: Wed Jul 25 16:24:18 2018 -0700 Committer: Zalo Correa <[email protected]> Committed: Wed Jul 25 16:24:18 2018 -0700 ---------------------------------------------------------------------- .../export/include/common/evl_sqlog_eventnum.h | 49 +- core/sqf/monitor/linux/cluster.cxx | 221 ++++- core/sqf/monitor/linux/cluster.h | 7 + core/sqf/monitor/linux/commaccept.cxx | 8 +- core/sqf/monitor/linux/config.cxx | 8 +- core/sqf/monitor/linux/healthcheck.cxx | 3 +- core/sqf/monitor/linux/internal.h | 5 + core/sqf/monitor/linux/lnode.cxx | 12 +- core/sqf/monitor/linux/makefile | 1 + core/sqf/monitor/linux/monitor.cxx | 18 +- core/sqf/monitor/linux/msgdef.h | 2 +- core/sqf/monitor/linux/nameserver.cxx | 470 ++++++++-- core/sqf/monitor/linux/nameserver.h | 18 +- core/sqf/monitor/linux/nscommacceptmon.cxx | 92 +- core/sqf/monitor/linux/nscommacceptmon.h | 5 +- core/sqf/monitor/linux/nsreqprocinfons.cxx | 13 +- core/sqf/monitor/linux/nsreqshutdown.cxx | 6 +- core/sqf/monitor/linux/nsreqstop.cxx | 2 +- core/sqf/monitor/linux/pnode.cxx | 90 +- core/sqf/monitor/linux/process.cxx | 217 ++++- core/sqf/monitor/linux/process.h | 9 +- core/sqf/monitor/linux/ptpclient.cxx | 934 +++++++++++-------- core/sqf/monitor/linux/ptpclient.h | 66 +- core/sqf/monitor/linux/ptpcommaccept.cxx | 328 ++++--- core/sqf/monitor/linux/ptpcommaccept.h | 12 +- core/sqf/monitor/linux/redirector.cxx | 53 +- core/sqf/monitor/linux/reqdump.cxx | 16 +- core/sqf/monitor/linux/reqevent.cxx | 16 +- core/sqf/monitor/linux/reqkill.cxx | 26 +- core/sqf/monitor/linux/reqnotify.cxx | 18 +- 
core/sqf/monitor/linux/reqopen.cxx | 20 +- core/sqf/monitor/linux/reqprocinfo.cxx | 15 +- core/sqf/monitor/linux/reqqueue.cxx | 43 +- core/sqf/monitor/linux/reqqueue.h | 19 + core/sqf/monitor/linux/shell.cxx | 67 +- core/sqf/monitor/linux/tmsync.cxx | 4 +- core/sqf/monitor/linux/zclient.cxx | 2 +- core/sqf/sqenvcom.sh | 18 +- core/sqf/sql/scripts/gomon.cold | 13 +- 39 files changed, 2026 insertions(+), 900 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/export/include/common/evl_sqlog_eventnum.h ---------------------------------------------------------------------- diff --git a/core/sqf/export/include/common/evl_sqlog_eventnum.h b/core/sqf/export/include/common/evl_sqlog_eventnum.h index 8930ccc..10268d8 100644 --- a/core/sqf/export/include/common/evl_sqlog_eventnum.h +++ b/core/sqf/export/include/common/evl_sqlog_eventnum.h @@ -84,6 +84,7 @@ #define MON_CLUSTER_MARKDOWN_1 101011101 #define MON_CLUSTER_MARKDOWN_2 101011102 #define MON_CLUSTER_MARKDOWN_3 101011103 +#define MON_CLUSTER_MARKDOWN_4 101011104 //#define MON_CLUSTER_MARKUP 101011201 #define MON_CLUSTER_NODE_TM_READY_1 101011301 #define MON_CLUSTER_NODE_TM_READY_2 101011302 @@ -253,8 +254,16 @@ #define MON_CLUSTER_ASSIGNMONITORLEADER_2 101015302 #define MON_CLUSTER_ASSIGNMONITORLEADER_3 101015303 #define MON_CLUSTER_ASSIGNMONITORLEADER_4 101015304 + #define MON_CLUSTER_CHECKIFDONE_1 101015401 +#define MON_CLUSTER_HARDNODEUPNS_1 101015501 +#define MON_CLUSTER_HARDNODEUPNS_2 101015502 + +#define MON_CLUSTER_RECEIVESOCK_1 101015601 + +#define MON_CLUSTER_SENDSOCK_1 101015701 + /* Module: monitor.cxx = 02 */ #define MON_MONITOR_MAIN_1 101020101 @@ -1016,24 +1025,30 @@ #define ZCONFIG_DELETECONFIGZNODE_3 101381003 /* Module nameserver.cxx = 39 */ -#define MON_NAMESERVER_MKCLTSOCK_1 101390101 -#define MON_NAMESERVER_MKCLTSOCK_2 101390102 -#define MON_NAMESERVER_MKCLTSOCK_3 101390103 -#define 
MON_NAMESERVER_MKCLTSOCK_4 101390104 -#define MON_NAMESERVER_MKCLTSOCK_5 101390105 -#define MON_NAMESERVER_MKCLTSOCK_6 101390106 +#define NAMESERVER_CLIENTSOCKCREATE_1 101390101 +#define NAMESERVER_CLIENTSOCKCREATE_2 101390102 +#define NAMESERVER_CLIENTSOCKCREATE_3 101390103 +#define NAMESERVER_CLIENTSOCKCREATE_4 101390104 +#define NAMESERVER_CLIENTSOCKCREATE_5 101390105 +#define NAMESERVER_CLIENTSOCKCREATE_6 101390106 +#define NAMESERVER_SENDTONS_1 101390201 +#define NAMESERVER_SENDTONS_2 101390202 +#define NAMESERVER_SOCKRECEIVE_1 101390301 +#define NAMESERVER_SOCKSEND_1 101390401 +#define NAMESERVER_GETM2NPORT_1 101390501 +#define NAMESERVER_CHOOSENEXTNS_1 101390601 /* Module nscommaccept.cxx = 40 */ -#define NS_COMMACCEPT_1 101400101 -#define NS_COMMACCEPT_3 101400102 -#define NS_COMMACCEPT_2 101400103 -#define NS_COMMACCEPT_4 101400104 -#define NS_COMMACCEPT_5 101400105 -#define NS_COMMACCEPT_6 101400106 -#define NS_COMMACCEPT_7 101400107 -#define NS_COMMACCEPT_8 101400108 -#define NS_COMMACCEPT_9 101400109 -#define NS_COMMACCEPT_10 101400110 +#define NS_COMMACCEPT_PROCESSMONREQS_1 101400101 +#define NS_COMMACCEPT_PROCESSMONREQS_2 101400102 +#define NS_COMMACCEPT_PROCESSMONREQS_3 101400103 +#define NS_COMMACCEPT_PROCESSMONREQS_4 101400104 +#define NS_COMMACCEPT_PROCESSMONREQS_5 101400105 +#define NS_COMMACCEPT_PROCESSNEWSOCK_1 101400201 +#define NS_COMMACCEPT_COMMACCEPTORSOCK_1 101400301 +#define NS_COMMACCEPT_MON2NSACCEPTMON_1 101400401 +#define NS_COMMACCEPT_MON2NSPROCESS_1 101400501 +#define NS_COMMACCEPT_START_1 101400601 /* Module: reqnodedown.cxx = 41 */ #define MON_EXT_NAMESERVERDOWN_REQ 101410101 @@ -1067,6 +1082,8 @@ #define PTPCLIENT_STDINREQ_2 101930202 #define PTPCLIENT_STDIODATA_1 101930301 #define PTPCLIENT_STDIODATA_2 101930302 +#define PTPCLIENT_SENDTOMON_1 101930401 +#define PTPCLIENT_SENDTOMON_2 101930402 /* Module ptpcommaccept.cxx = 94 */ #define PTP_COMMACCEPT_1 101940101 
http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/cluster.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/cluster.cxx b/core/sqf/monitor/linux/cluster.cxx index 4d0d3c2..762c1a0 100644 --- a/core/sqf/monitor/linux/cluster.cxx +++ b/core/sqf/monitor/linux/cluster.cxx @@ -69,6 +69,7 @@ using namespace std; #include "nscommacceptmon.h" #else #include "nameserver.h" +#include "ptpclient.h" #endif extern bool IAmIntegrating; @@ -88,7 +89,9 @@ extern char MySyncPort[MPI_MAX_PORT_NAME]; extern CCommAcceptMon CommAcceptMon; extern char MyMon2NsPort[MPI_MAX_PORT_NAME]; #else +extern CProcess *NameServerProcess; extern CNameServer *NameServer; +extern CPtpClient *PtpClient; extern bool NameServerEnabled; extern char MyPtPPort[MPI_MAX_PORT_NAME]; #endif @@ -1069,12 +1072,11 @@ unsigned long long CCluster::EnsureAndGetSeqNum(cluster_state_def_t nodestate[]) } +#ifndef NAMESERVER_PROCESS void CCluster::HardNodeDown (int pnid, bool communicate_state) { -#ifndef NAMESERVER_PROCESS char port_fname[MAX_PROCESS_PATH]; char temp_fname[MAX_PROCESS_PATH]; -#endif CNode *node; CLNode *lnode; char buf[MON_STRING_BUF_SIZE]; @@ -1130,7 +1132,6 @@ void CCluster::HardNodeDown (int pnid, bool communicate_state) return; } -#ifndef NAMESERVER_PROCESS if ( !Emulate_Down ) { if( !IsRealCluster ) @@ -1161,7 +1162,6 @@ void CCluster::HardNodeDown (int pnid, bool communicate_state) remove(temp_fname); rename(port_fname, temp_fname); } -#endif if (node->GetState() != State_Down || !node->isInQuiesceState()) { @@ -1194,9 +1194,7 @@ void CCluster::HardNodeDown (int pnid, bool communicate_state) if ( ! 
Emulate_Down ) { // make sure no processes are alive if in the middle of re-integration -#ifndef NAMESERVER_PROCESS node->KillAllDown(); -#endif snprintf(buf, sizeof(buf), "[CCluster::HardNodeDown], Node %s (%d)is down.\n", node->GetName(), node->GetPNid()); @@ -1212,29 +1210,29 @@ void CCluster::HardNodeDown (int pnid, bool communicate_state) } else { - if ( node->GetPNid() == integratingPNid_ ) - { - ResetIntegratingPNid(); - } -#ifndef NAMESERVER_PROCESS - node->KillAllDown(); -#endif - node->SetState( State_Down ); - // Send node down message to local node's processes - lnode = node->GetFirstLNode(); - for ( ; lnode; lnode = lnode->GetNextP() ) + if (node->GetState() != State_Down) { - lnode->Down(); - } - if ( ZClientEnabled ) - { - ZClient->WatchNodeDelete( node->GetName() ); - ZClient->WatchNodeMasterDelete( node->GetName() ); + if ( node->GetPNid() == integratingPNid_ ) + { + ResetIntegratingPNid(); + } + node->KillAllDown(); + node->SetState( State_Down ); + // Send node down message to local node's processes + lnode = node->GetFirstLNode(); + for ( ; lnode; lnode = lnode->GetNextP() ) + { + lnode->Down(); + } + if ( ZClientEnabled ) + { + ZClient->WatchNodeDelete( node->GetName() ); + ZClient->WatchNodeMasterDelete( node->GetName() ); + } } } } -#ifndef NAMESERVER_PROCESS // we need to abort any active TmSync if (( MyNode->GetTmSyncState() == SyncState_Start ) || ( MyNode->GetTmSyncState() == SyncState_Continue ) || @@ -1245,21 +1243,79 @@ void CCluster::HardNodeDown (int pnid, bool communicate_state) if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() )); } -#endif -#ifndef NAMESERVER_PROCESS if ( Emulate_Down ) { AssignTmLeader(pnid, false); } else -#endif { AssignLeaders(pnid, node->GetName(), false); } TRACE_EXIT; } +#endif + +#ifdef 
NAMESERVER_PROCESS +void CCluster::HardNodeDownNs( int pnid ) +{ + CNode *node; + char buf[MON_STRING_BUF_SIZE]; + + const char method_name[] = "CCluster::HardNodeDownNs"; + TRACE_ENTRY; + + node = Nodes->GetNode(pnid); + + if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) + trace_printf( "%s@%d - pnid=%d, state=%s, isInQuiesceState=%d," + " (local pnid=%d, state=%s, isInQuiesceState=%d, " + "shutdown level=%d)\n", method_name, __LINE__, + pnid, StateString(node->GetState()), + node->isInQuiesceState(), + MyPNID, StateString(MyNode->GetState()), + MyNode->isInQuiesceState(), MyNode->GetShutdownLevel() ); + + if (( MyPNID == pnid ) && + ( MyNode->GetState() == State_Down || + MyNode->IsKillingNode() ) ) + { + // we are coming down ... don't process it + if ( !IsRealCluster && MyNode->isInQuiesceState()) + { + // in virtual env, this would be called after node quiescing, + // so continue with mark down processing. + } + else + { + return; + } + } + + if (node->GetState() != State_Down) + { + snprintf( buf, sizeof(buf) + , "[%s], Node %s (%d) is going down.\n" + , method_name, node->GetName(), node->GetPNid()); + mon_log_write(MON_CLUSTER_MARKDOWN_4, SQ_LOG_INFO, buf); + + node->SetKillingNode( true ); + node->DeleteAllDown(); + node->SetState( State_Down ); + + if ( ZClientEnabled ) + { + //ZClient->WatchNodeDelete( node->GetName() ); + ZClient->WatchNodeMasterDelete( node->GetName() ); + } + } + + AssignLeaders(pnid, node->GetName(), false); + + TRACE_EXIT; +} +#endif void CCluster::SoftNodeDown( int pnid ) { @@ -1651,8 +1707,10 @@ int CCluster::HardNodeUp( int pnid, char *node_name ) if ( nodeState == State_Down ) { node->SetKillingNode( false ); +#ifndef NAMESERVER_PROCESS if ( Emulate_Down ) { +#endif // Any DTMs running? 
for ( int i=0; !tmCount && i < Nodes->GetPNodesCount(); i++ ) { @@ -1706,6 +1764,7 @@ int CCluster::HardNodeUp( int pnid, char *node_name ) } } } +#ifndef NAMESERVER_PROCESS } else { @@ -1714,6 +1773,7 @@ int CCluster::HardNodeUp( int pnid, char *node_name ) method_name, __LINE__ ); } +#endif } else if ( nodeState == State_Merged ) { @@ -1866,6 +1926,74 @@ int CCluster::HardNodeUp( int pnid, char *node_name ) return( rc ); } +#ifdef NAMESERVER_PROCESS +int CCluster::HardNodeUpNs( int pnid ) +{ + int rc = 0; + CNode *node; + STATE nodeState; + + const char method_name[] = "CCluster::HardNodeUpNs"; + TRACE_ENTRY; + + if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) + trace_printf( "%s@%d - pnid=%d, MyPNID = %d, currentNodes_=%d\n" + , method_name, __LINE__, pnid, MyPNID, currentNodes_ ); + + node = Nodes->GetNode( pnid ); + if ( node == NULL ) + { + if ( rc ) + { // Handle error + char buf[MON_STRING_BUF_SIZE]; + snprintf( buf, sizeof(buf) + , "[%s], Invalid node, pnid=%d\n" + , method_name, pnid ); + mon_log_write(MON_CLUSTER_HARDNODEUPNS_1, SQ_LOG_ERR, buf); + return( -1 ); + } + } + + nodeState = node->GetState(); + + if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY)) + trace_printf( "%s@%d" " - Node state=%s" "\n" + , method_name, __LINE__, StateString( nodeState ) ); + + if ( nodeState != State_Up ) + { + if ( nodeState == State_Down ) + { + node->SetKillingNode( false ); + // We need to remove any old process objects before we restart the node. 
+ node->CleanUpProcesses(); + node->SetState( State_Up ); + if ( MyPNID != pnid ) + { + // Let other monitors know this node is up + CReplNodeUp *repl = new CReplNodeUp(pnid); + Replicator.addItem(repl); + } + } + } + else + { // Handle error + char buf[MON_STRING_BUF_SIZE]; + snprintf( buf, sizeof(buf) + , "[%s], Invalid node state, node %s, pnid=%d, state=%s\n" + , method_name + , node->GetName() + , node->GetPNid() + , StateString( nodeState ) ); + mon_log_write(MON_CLUSTER_HARDNODEUPNS_2, SQ_LOG_ERR, buf); + return( -1 ); + } + + TRACE_EXIT; + return( rc ); +} +#endif + int CCluster::SoftNodeUpPrepare( int pnid ) { char buf[MON_STRING_BUF_SIZE]; @@ -7456,7 +7584,10 @@ void CCluster::UpdateClusterState( bool &doShutdown, case State_Unknown: break; case State_Down: - doShutdown = true; + if (IsRealCluster) + { + doShutdown = true; + } break; case State_Stopped: case State_Shutdown: @@ -7780,19 +7911,23 @@ bool CCluster::checkIfDone ( ) // let the watchdog process exit HealthCheck.setState(MON_EXIT_PRIMITIVES); } - else if ( (MyNode->GetNumProcs() <= // only My Name Server alive - myNameServerCount ) + else if ( NameServerProcess != NULL + && myNameServerCount > 0 + && (MyNode->GetNumProcs() <= myNameServerCount ) // only My Name Server alive && !MyNode->isInQuiesceState() // post-quiescing will // expire WDG (cluster) && !waitForNameServerExit_ ) // Name Server not yet exiting { if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC)) - trace_printf("%s@%d - Stopping Name Server process. " - "(process count: cluster=%d, MyNode=%d)\n", - method_name, __LINE__, - Nodes->ProcessCount(), MyNode->ProcessCount()); - + { + trace_printf("%s@%d - Stopping Name Server process. 
" + "(process count: cluster=%d, MyNode=%d)\n", + method_name, __LINE__, + Nodes->ProcessCount(), MyNode->ProcessCount()); + } + waitForNameServerExit_ = true; + MyNode->SetProcessState( NameServerProcess, State_Down, false ); int rc = NameServer->ProcessShutdown(); if (rc) { @@ -10196,6 +10331,14 @@ int CCluster::ReceiveSock(char *buf, int size, int sockFd, const char *desc) if ( errno != EINTR) { error = errno; + char la_buf[MON_STRING_BUF_SIZE]; + sprintf( la_buf, "[%s], recv(), received=%d, sock=%d, error=%d(%s), desc=%s\n" + , method_name + , received + , sockFd + , error, strerror(error) + , desc ); + mon_log_write(MON_CLUSTER_RECEIVESOCK_1, SQ_LOG_ERR, la_buf); readAgain = false; } else @@ -10264,6 +10407,14 @@ int CCluster::SendSock(char *buf, int size, int sockFd, const char *desc) if ( errno != EINTR) { error = errno; + char la_buf[MON_STRING_BUF_SIZE]; + sprintf( la_buf, "[%s], send(), sent=%d, sock=%d, error=%d(%s), desc=%s\n" + , method_name + , sent + , sockFd + , error, strerror(error) + , desc ); + mon_log_write(MON_CLUSTER_SENDSOCK_1, SQ_LOG_ERR, la_buf); sendAgain = false; } else http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/cluster.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/cluster.h b/core/sqf/monitor/linux/cluster.h index 970cf4c..2dfbfb8 100644 --- a/core/sqf/monitor/linux/cluster.h +++ b/core/sqf/monitor/linux/cluster.h @@ -163,7 +163,11 @@ public: int GetConfigPNodesMax() { return configPNodesMax_; } bool ImAlive( bool needed=false, struct sync_def *sync = NULL ); int MapRank( int current_rank ); +#ifndef NAMESERVER_PROCESS void HardNodeDown( int nid, bool communicate_state=false ); +#else + void HardNodeDownNs( int nid ); +#endif void SoftNodeDown( int pnid ); int SoftNodeUpPrepare( int pnid ); bool CheckSpareSet( int pnid ); @@ -174,6 +178,9 @@ public: void ResetIntegratingPNid( void ); void SetIntegratingPNid( int pnid ); int 
HardNodeUp( int pnid, char *node_name ); +#ifdef NAMESERVER_PROCESS + int HardNodeUpNs( int pnid ); +#endif inline CNode *GetIntegratingNode() { return Node[integratingPNid_]; } inline CNode *GetNode( int pnid ) { return Node[pnid]; } static char *Timestamp( void ); http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/commaccept.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/commaccept.cxx b/core/sqf/monitor/linux/commaccept.cxx index 5c4e3a5..13c2ebd 100644 --- a/core/sqf/monitor/linux/commaccept.cxx +++ b/core/sqf/monitor/linux/commaccept.cxx @@ -220,12 +220,18 @@ bool CCommAccept::sendNodeInfoSock( int sockFd ) , i, node->GetPNid(), node->GetName()); } - nodeInfo[i].pnid = -1; nodeInfo[i].nodeName[0] = '\0'; nodeInfo[i].commPort[0] = '\0'; nodeInfo[i].syncPort[0] = '\0'; + nodeInfo[i].pnid = -1; nodeInfo[i].creatorPNid = -1; } + nodeInfo[i].creatorShellPid = -1; + nodeInfo[i].creatorShellVerifier = -1; + nodeInfo[i].creator = false; + nodeInfo[i].ping = false; + nodeInfo[i].nsPid = -1; + nodeInfo[i].nsPNid = -1; } if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/config.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/config.cxx b/core/sqf/monitor/linux/config.cxx index 4ef9b72..9794199 100644 --- a/core/sqf/monitor/linux/config.cxx +++ b/core/sqf/monitor/linux/config.cxx @@ -1193,7 +1193,7 @@ int CConfigContainer::PackRegistry( char *&buffer, ConfigType type ) regClusterEntry->valueLength = strlen (regClusterConfig[i].value); if (trace_settings & (TRACE_INIT | TRACE_REQUEST)) { - trace_printf ("%s%d pack type %d, scope %s (%d), key %s (%d), value %s(%d)\n",method_name, __LINE__, + trace_printf ("%s@%d pack type %d, scope %s (%d), key %s (%d), value %s(%d)\n",method_name, __LINE__, regClusterEntry->type, 
regClusterConfig[i].scope, regClusterEntry->scopeLength,regClusterConfig[i].key,regClusterEntry->keyLength, regClusterConfig[i].value, regClusterEntry->valueLength); @@ -1226,7 +1226,7 @@ int CConfigContainer::PackRegistry( char *&buffer, ConfigType type ) if (regClusterConfig) { - delete regClusterConfig; + delete [] regClusterConfig; } return numberOfEntries; @@ -1258,7 +1258,7 @@ void CConfigContainer::UnpackRegistry( char *&buffer, int count ) if (trace_settings & (TRACE_INIT | TRACE_REQUEST)) { - trace_printf ("%s%d scope length %d, key length %d, value length %d\n", method_name, __LINE__, + trace_printf ("%s@%d scope length %d, key length %d, value length %d\n", method_name, __LINE__, clusterObj2->scopeLength, clusterObj2->keyLength, clusterObj2->valueLength); } @@ -1317,7 +1317,7 @@ int CConfigContainer::PackUniqueStrings( char *&buffer ) stringObj->stringLength = strlen(unique_string); if (trace_settings & (TRACE_INIT | TRACE_REQUEST)) { - trace_printf ("%s%d packing nid %d, unique id %d, stringt %s (length %d)\n", method_name, __LINE__, + trace_printf ("%s@%d packing nid %d, unique id %d, stringt %s (length %d)\n", method_name, __LINE__, pnid, maxId, unique_string,stringObj->stringLength ); } stringObj->unique_id = maxId; http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/healthcheck.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/healthcheck.cxx b/core/sqf/monitor/linux/healthcheck.cxx index 1f344fc..203465c 100644 --- a/core/sqf/monitor/linux/healthcheck.cxx +++ b/core/sqf/monitor/linux/healthcheck.cxx @@ -54,7 +54,6 @@ using namespace std; #include "redirector.h" #include "replicate.h" - extern CReqQueue ReqQueue; extern CMonitor *Monitor; extern CNode *MyNode; @@ -64,6 +63,7 @@ extern CRedirector Redirector; extern CHealthCheck HealthCheck; extern CReplicate Replicator; extern int MyPNID; +extern bool IsRealCluster; // constructor 
CHealthCheck::CHealthCheck() @@ -229,7 +229,6 @@ void CHealthCheck::healthCheckThread() TRACE_ENTRY; HealthCheckStates state; - struct timespec ts; if (trace_settings & TRACE_HEALTH) http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/internal.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/internal.h b/core/sqf/monitor/linux/internal.h index 35fa32a..5dc6456 100644 --- a/core/sqf/monitor/linux/internal.h +++ b/core/sqf/monitor/linux/internal.h @@ -537,5 +537,10 @@ struct sync_buffer_def char msg[MAX_SYNC_SIZE]; }; +typedef struct ptpMsgInfo +{ + int pnid; // Current offset into the msg buffer + int size; // Number if messages to replicate +} ptpMsgInfo_t; #endif http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/lnode.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/lnode.cxx b/core/sqf/monitor/linux/lnode.cxx index 69a186d..bbe5ac4 100644 --- a/core/sqf/monitor/linux/lnode.cxx +++ b/core/sqf/monitor/linux/lnode.cxx @@ -41,6 +41,7 @@ using namespace std; #include "lnode.h" #include "pnode.h" #include "mlio.h" +#include "nameserver.h" extern bool IsRealCluster; extern CommType_t CommType; @@ -50,6 +51,10 @@ extern CMonitor *Monitor; extern CMonStats *MonStats; extern bool usingCpuAffinity; extern bool usingTseCpuAffinity; +#ifndef NAMESERVER_PROCESS +extern CNameServer *NameServer; +extern bool NameServerEnabled; +#endif void CoreMaskString( char *str, cpu_set_t coreMask, int totalCores ) { @@ -396,7 +401,12 @@ void CLNode::Down( void ) , method_name, __LINE__, GetNid() , GetNode()->GetName(), msg->u.request.u.down.takeover ); } - +#ifndef NAMESERVER_PROCESS + if ( NameServerEnabled ) + { + NameServer->ProcessNodeDown( Nid, msg->u.request.u.down.node_name ); + } +#endif MyNode->Bcast( msg ); delete msg; } 
http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/makefile ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/makefile b/core/sqf/monitor/linux/makefile index 73127e4..3d16bab 100644 --- a/core/sqf/monitor/linux/makefile +++ b/core/sqf/monitor/linux/makefile @@ -265,6 +265,7 @@ NSOBJS += $(OUTDIR)/nsreqqueue.o NSOBJS += $(OUTDIR)/nsreqdelproc.o NSOBJS += $(OUTDIR)/nsreqstop.o NSOBJS += $(OUTDIR)/nsreqnewproc.o +NSOBJS += $(OUTDIR)/nsreqnodedown.o NSOBJS += $(OUTDIR)/nsreqprocinfo.o NSOBJS += $(OUTDIR)/nsreqprocinfons.o NSOBJS += $(OUTDIR)/nsreqstart.o http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/monitor.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/monitor.cxx b/core/sqf/monitor/linux/monitor.cxx index d588945..2ad0528 100755 --- a/core/sqf/monitor/linux/monitor.cxx +++ b/core/sqf/monitor/linux/monitor.cxx @@ -1367,7 +1367,8 @@ int main (int argc, char *argv[]) env = getenv("SQ_NAMESERVER_ENABLED"); if ( env && isdigit(*env) ) { - NameServerEnabled = atoi(env); + int val = atoi(env); + NameServerEnabled = (val != 0) ? true : false; } #endif @@ -1605,6 +1606,7 @@ int main (int argc, char *argv[]) } setlinebuf(stdout); +#ifndef NAMESERVER_PROCESS // Send stderr output to same file as stdout. (Note: the monitor does // not write to stderr but perhaps there could be components included in // the monitor build that do write to stderr.) @@ -1612,6 +1614,10 @@ int main (int argc, char *argv[]) { printf ( "dup2 failed for stderr: %s (%d)\n", strerror(errno), errno); } +#else + // Name Server is a child process of the monitor, the process create logic + // will establish IO redirection between the monitor process and the child. 
+#endif switch( CommType ) { @@ -2052,13 +2058,15 @@ int main (int argc, char *argv[]) #ifdef NAMESERVER_PROCESS Monitor = new CMonitor (); #else - Monitor = new CMonitor (procTermSig); -#endif -#ifndef NAMESERVER_PROCESS if (NameServerEnabled) { + PtpClient = new CPtpClient (); + Monitor = new CMonitor (procTermSig); NameServer = new CNameServer (); - PtpClient = new CPtpClient (); + } + else + { + Monitor = new CMonitor (procTermSig); } #endif http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/msgdef.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/msgdef.h b/core/sqf/monitor/linux/msgdef.h index 639c15c..8218a8e 100644 --- a/core/sqf/monitor/linux/msgdef.h +++ b/core/sqf/monitor/linux/msgdef.h @@ -89,7 +89,7 @@ #define MAX_PROCINFO_LIST 64 #define MAX_PROC_CONTEXT 5 #define MAX_PROCESS_NAME MAX_KEY_NAME -#define MAX_PROCESS_NAME_STR 12 +#define MAX_PROCESS_NAME_STR 13 #define MAX_PROCESS_PATH 256 #define MAX_PROCESSOR_NAME 128 #define MAX_RECONN_PING_WAIT_TIMEOUT 5 http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nameserver.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/nameserver.cxx b/core/sqf/monitor/linux/nameserver.cxx index e9f1900..ad024f6 100644 --- a/core/sqf/monitor/linux/nameserver.cxx +++ b/core/sqf/monitor/linux/nameserver.cxx @@ -44,6 +44,7 @@ using namespace std; #include <limits.h> #include <unistd.h> +#include "trafconf/trafconfig.h" #include "lnode.h" #include "pnode.h" #include "nameserver.h" @@ -51,21 +52,24 @@ using namespace std; #include "montrace.h" #include "nameserverconfig.h" #include "meas.h" +#include "reqqueue.h" extern CNode *MyNode; extern CProcess *NameServerProcess; extern CNodeContainer *Nodes; +extern CReqQueue ReqQueue; extern bool IsRealCluster; extern int MyPNID; extern CNameServerConfigContainer *NameServerConfig; extern CMeas 
Meas; +#define NAMESERVER_IO_RETRIES 3 + CNameServer::CNameServer( void ) -: mon2nsSock_(-1) -, nsConfigInx_(-1) -, nsStartupComplete_(false) -, seqNum_(0) -, shutdown_(false) + : mon2nsSock_(-1) + , nsStartupComplete_(false) + , seqNum_(0) + , shutdown_(false) { const char method_name[] = "CNameServer::CNameServer"; TRACE_ENTRY; @@ -84,7 +88,7 @@ CNameServer::~CNameServer( void ) TRACE_EXIT; } -void CNameServer::ChooseNextNs( void ) +int CNameServer::ChooseNextNs( void ) { const char method_name[] = "CNameServer::ChooseNextNs"; TRACE_ENTRY; @@ -103,17 +107,58 @@ void CNameServer::ChooseNextNs( void ) { config = config->GetNext(); } - strcpy( mon2nsHost_, config->GetName() ); - if ( trace_settings & TRACE_NS ) + CNode *node = Nodes->GetNode( (char*) config->GetName() ); + if (node && node->GetState() == State_Up) { - trace_printf( "%s@%d - nameserver=%s, rnd=%d, cnt=%d\n" - , method_name, __LINE__ - , mon2nsHost_ - , rnd - , cnt ); + strcpy( mon2nsHost_, config->GetName() ); + if ( trace_settings & TRACE_NS ) + { + trace_printf( "%s@%d - nameserver=%s, rnd=%d, cnt=%d\n" + , method_name, __LINE__ + , mon2nsHost_ + , rnd + , cnt ); + } + } + else + { + config = config->GetNext()?config->GetNext():NameServerConfig->GetFirstConfig(); + while (config) + { + node = Nodes->GetNode( (char*) config->GetName() ); + if (node && node->GetState() != State_Up) + { + config = config->GetNext(); + continue; + } + + strcpy( mon2nsHost_, config->GetName() ); + if ( trace_settings & TRACE_NS ) + { + trace_printf( "%s@%d - selected alternate nameserver=%s\n" + , method_name, __LINE__ + , mon2nsHost_ ); + } + break; + } + } + + if (strlen(mon2nsHost_) == 0) + { + char la_buf[MON_STRING_BUF_SIZE]; + sprintf( la_buf + , "[%s], No Name Server nodes available.\n" + "Scheduling shutdown (abrupt)!\n" + , method_name ); + mon_log_write(NAMESERVER_CHOOSENEXTNS_1, SQ_LOG_CRIT, la_buf ); + ReqQueue.enqueueShutdownReq( ShutdownLevel_Abrupt ); + + TRACE_EXIT; + return( -2 ); } TRACE_EXIT; + 
return( 0 ); } int CNameServer::ConnectToNs( bool *retry ) @@ -123,21 +168,32 @@ int CNameServer::ConnectToNs( bool *retry ) int err = 0; +reconnect: + if ( !mon2nsPort_[0] ) - CNameServer::GetM2NPort( -1 ); - if ( !mon2nsHost_[0] ) - ChooseNextNs(); + { + err = GetM2NPort( -1 ); + } + if ( err == 0 && !mon2nsHost_[0] ) + { + err = ChooseNextNs(); + } int sock = 0; if ( shutdown_ ) + { err = -1; + } if ( err == 0 ) { - sock = SockCreate(); + sock = ClientSockCreate(); if ( sock < 0 ) + { err = sock; + goto reconnect; + } } if ( err == 0 ) { @@ -191,7 +247,7 @@ int CNameServer::ConnectToNs( bool *retry ) , nodeId.ping ); } err = SockSend( ( char *) &nodeId, sizeof(nodeId) ); - if ( err == 0 ) + if (err == 0) { if ( trace_settings & TRACE_NS ) { @@ -252,7 +308,7 @@ int CNameServer::ConnectToNs( bool *retry ) if ( IsRealCluster ) { CNode *node = Nodes->GetNode( nodeId.nsPNid ); - if ( node ) + if (node && node->GetState() == State_Up) { strcpy( mon2nsHost_, node->GetName() ); GetM2NPort( nodeId.nsPNid ); @@ -273,50 +329,123 @@ int CNameServer::ConnectToNs( bool *retry ) return err; } -void CNameServer::GetM2NPort( int PNid ) +int CNameServer::GetM2NPort( int nsPNid ) { + const char method_name[] = "CNameServer::GetM2NPort"; + TRACE_ENTRY; + + bool done = false; int port; char *p = getenv( "NS_M2N_COMM_PORT" ); if ( p ) + { port = atoi(p); + } else + { port = 0; + } if ( !IsRealCluster ) - port += PNid < 0 ? MyPNID : PNid; + { + // choose initial port + int nsMax = NameServerConfig->GetCount(); + int candidatePNid = nsPNid < 0 ? MyPNID : nsPNid; + int chosenPNid = + candidatePNid < nsMax ? 
candidatePNid : candidatePNid%nsMax; + int lastChosenPNid = chosenPNid; + while (!done) + { + // check that corresponding node is UP + // node is up, chosen is good to go + // not up, + // round-robin on other name server nodes and chose 1st up node + // no name server nodes available + // log event and down my node (MyPNID) + CNode *node = Nodes->GetNode( chosenPNid ); + if (node && node->GetState() == State_Up) + { + port += chosenPNid; + + if ( trace_settings & TRACE_NS ) + { + trace_printf( "%s@%d - nsMax=%d, nsPNid=%d, MyPNID=%d, " + "candidatePNid=%d, chosenPNid=%d, port=%d\n" + , method_name, __LINE__ + , nsMax + , nsPNid + , MyPNID + , candidatePNid + , chosenPNid + , port ); + } + done = true; + } + else + { + chosenPNid = (chosenPNid+1) < nsMax ? (chosenPNid+1) : 0; + if (chosenPNid == lastChosenPNid) + { + char la_buf[MON_STRING_BUF_SIZE]; + sprintf( la_buf + , "[%s], No Name Server nodes available, " + "chosenPNid=%d, lastChosenPNid=%d.\n" + "Scheduling shutdown (abrupt)!\n" + , method_name + , chosenPNid, lastChosenPNid ); + mon_log_write(NAMESERVER_GETM2NPORT_1, SQ_LOG_CRIT, la_buf ); + ReqQueue.enqueueShutdownReq( ShutdownLevel_Abrupt ); + done = true; + } + port += chosenPNid; + TRACE_EXIT; + return( -2 ); + } + } + } sprintf( mon2nsPort_, "%d", port ); -} -void CNameServer::SetLocalHost( void ) -{ - gethostname( mon2nsHost_, MAX_PROCESSOR_NAME ); + TRACE_EXIT; + return( 0 ); } -void CNameServer::SetShutdown( bool shutdown ) +bool CNameServer::IsNameServerConfigured( int pnid ) { - const char method_name[] = "CNameServer::SetShutdown"; + const char method_name[] = "CNameServer::IsNameServerConfigured"; TRACE_ENTRY; - if ( trace_settings & TRACE_NS ) - trace_printf( "%s@%d - set shutdown_=%d\n" - , method_name, __LINE__, shutdown ); - shutdown_ = shutdown; + bool rs = false; - TRACE_EXIT; -} + if ( IsRealCluster ) + { + CNameServerConfig *config; + CNode *node = Nodes->GetNode( pnid ); + if ( node ) + { + config = NameServerConfig->GetConfig( 
node->GetName() ); + if ( config ) + { + rs = true; + } + } + } + else + { + rs = pnid < NameServerConfig->GetCount() ? true : false; + } -void CNameServer::SockClose( void ) -{ - const char method_name[] = "CNameServer::SockClose"; - TRACE_ENTRY; + if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) + { + trace_printf( "%s@%d - pnid=%d, configured=%s\n" + , method_name, __LINE__, pnid, rs?"True":"False" ); + } - close( mon2nsSock_ ); - mon2nsSock_ = -1; TRACE_EXIT; + return(rs); } -int CNameServer::SockCreate( void ) +int CNameServer::ClientSockCreate( void ) { - const char method_name[] = "CNameServer::SockCreate"; + const char method_name[] = "CNameServer::ClientSockCreate"; TRACE_ENTRY; int sock; // socket @@ -363,7 +492,8 @@ int CNameServer::SockCreate( void ) snprintf( la_buf, sizeof(la_buf) , "[%s], socket() failed! errno=%d (%s)\n" , method_name, err, strerror(err) ); - mon_log_write( MON_NAMESERVER_MKCLTSOCK_1, SQ_LOG_ERR, la_buf ); + mon_log_write( NAMESERVER_CLIENTSOCKCREATE_1, SQ_LOG_ERR, la_buf ); + TRACE_EXIT; return ( -1 ); } @@ -375,8 +505,9 @@ int CNameServer::SockCreate( void ) snprintf( la_buf, sizeof(la_buf ), "[%s] gethostbyname(%s) failed! errno=%d (%s)\n" , method_name, host, err, strerror(err) ); - mon_log_write(MON_NAMESERVER_MKCLTSOCK_2, SQ_LOG_ERR, la_buf ); + mon_log_write(NAMESERVER_CLIENTSOCKCREATE_2, SQ_LOG_ERR, la_buf ); close( sock ); + TRACE_EXIT; return ( -1 ); } @@ -418,7 +549,7 @@ int CNameServer::SockCreate( void ) int err = errno; sprintf( la_buf, "[%s], connect() failed! errno=%d (%s)\n" , method_name, err, strerror(err) ); - mon_log_write(MON_NAMESERVER_MKCLTSOCK_3, SQ_LOG_ERR, la_buf ); + mon_log_write(NAMESERVER_CLIENTSOCKCREATE_3, SQ_LOG_ERR, la_buf ); struct timespec req, rem; req.tv_sec = 0; req.tv_nsec = 500000000L; // 500,000,000 @@ -439,8 +570,9 @@ int CNameServer::SockCreate( void ) char la_buf[MON_STRING_BUF_SIZE]; sprintf( la_buf, "[%s], connect() exceeded retries! 
count=%d\n" , method_name, retries ); - mon_log_write(MON_NAMESERVER_MKCLTSOCK_4, SQ_LOG_ERR, la_buf ); + mon_log_write(NAMESERVER_CLIENTSOCKCREATE_4, SQ_LOG_ERR, la_buf ); close( sock ); + TRACE_EXIT; return ( -1 ); } struct timespec req, rem; @@ -449,6 +581,8 @@ int CNameServer::SockCreate( void ) nanosleep( &req, &rem ); } close( sock ); + TRACE_EXIT; + return( -1 ); } if ( trace_settings & TRACE_NS ) @@ -470,8 +604,9 @@ int CNameServer::SockCreate( void ) int err = errno; sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n" , method_name, err, strerror(err) ); - mon_log_write(MON_NAMESERVER_MKCLTSOCK_5, SQ_LOG_ERR, la_buf ); + mon_log_write(NAMESERVER_CLIENTSOCKCREATE_5, SQ_LOG_ERR, la_buf ); close( sock ); + TRACE_EXIT; return ( -2 ); } @@ -481,8 +616,9 @@ int CNameServer::SockCreate( void ) int err = errno; sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n" , method_name, err, strerror(err) ); - mon_log_write(MON_NAMESERVER_MKCLTSOCK_6, SQ_LOG_ERR, la_buf ); + mon_log_write(NAMESERVER_CLIENTSOCKCREATE_6, SQ_LOG_ERR, la_buf ); close( sock ); + TRACE_EXIT; return ( -2 ); } @@ -490,6 +626,19 @@ int CNameServer::SockCreate( void ) return ( sock ); } +void CNameServer::NameServerExited( void ) +{ + const char method_name[] = "CNameServer::NameServerExited"; + TRACE_ENTRY; + + mon2nsHost_[0] = '\0'; + mon2nsPort_[0] = '\0'; + nsStartupComplete_ = false; + SockClose(); + + TRACE_EXIT; +} + int CNameServer::NameServerStop( struct message_def* msg ) { const char method_name[] = "CNameServer::NameServerStop"; @@ -599,9 +748,6 @@ int CNameServer::ProcessNew(CProcess* process ) msgnew->unhooked = process->IsUnhooked(); msgnew->event_messages = process->IsEventMessages(); msgnew->system_messages = process->IsSystemMessages(); -// msgnew->pathStrId = process->pathStrId(); -// msgnew->ldpathStrId = process->ldPathStrId(); -// msgnew->programStrId = process->programStrId(); strcpy( msgnew->path, process->path() ); strcpy( msgnew->ldpath, process->ldpath() 
); strcpy( msgnew->program, process->program() ); @@ -682,6 +828,45 @@ int CNameServer::ProcessNew(CProcess* process ) return error; } +int CNameServer::ProcessNodeDown( int nid, char *nodeName ) +{ + const char method_name[] = "CNameServer::ProcessNodeDown"; + TRACE_ENTRY; + + int error = 0; + CProcess *process = MyNode->GetProcessByType( ProcessType_NameServer ); + if (process) + { + struct message_def msg; + memset(&msg, 0, sizeof(msg) ); // TODO: remove! + msg.type = MsgType_Service; + msg.noreply = false; + msg.reply_tag = seqNum_++; + msg.u.request.type = ReqType_NodeDown; + struct NodeDown_def *msgdown = &msg.u.request.u.down; + msgdown->nid = nid; + strcpy( msgdown->node_name, nodeName ); + msgdown->takeover = 0; + msgdown->reason[0] = 0; + + if ( trace_settings & TRACE_NS ) + { + trace_printf( "%s@%d - sending node-down request to nameserver=%s:%s\n" + " msg.down.nid=%d\n" + " msg.down.node_name=%s\n" + , method_name, __LINE__ + , mon2nsHost_, mon2nsPort_ + , msgdown->nid + , msgdown->node_name ); + } + + error = SendReceive(&msg ); + } + + TRACE_EXIT; + return error; +} + int CNameServer::ProcessShutdown( void ) { const char method_name[] = "CNameServer::ProcessShutdown"; @@ -696,7 +881,6 @@ int CNameServer::ProcessShutdown( void ) struct ShutdownNs_def *msgshutdown = &msg.u.request.u.shutdown_ns; msgshutdown->nid = -1; msgshutdown->pid = -1; - //msgshutdown->level = msgIn->u.request.u.shutdown.level; msgshutdown->level = ShutdownLevel_Normal; int error = SendReceive(&msg ); @@ -711,16 +895,20 @@ int CNameServer::ProcessShutdown( void ) int CNameServer::SendReceive( struct message_def* msg ) { const char method_name[] = "CNameServer::SendReceive"; + TRACE_ENTRY; + + int retryCount = 0; char desc[256]; char* descp; - struct DelProcessNs_def *msgdel; - struct NewProcessNs_def *msgnew; - struct ShutdownNs_def *msgshutdown; - struct NameServerStart_def *msgstart; - struct NameServerStop_def *msgstop; - struct ProcessInfo_def *msginfo; - - TRACE_ENTRY; + 
struct DelProcessNs_def* msgdel; + struct NameServerStart_def* msgstart; + struct NameServerStop_def* msgstop; + struct NewProcessNs_def* msgnew; + struct NodeDown_def* msgdown; + struct ProcessInfo_def* msginfo; + struct ShutdownNs_def* msgshutdown; + struct message_def msg_reply; + struct message_def* pmsg_reply = &msg_reply; descp = desc; int size = offsetof(struct message_def, u.request.u); @@ -750,6 +938,13 @@ int CNameServer::SendReceive( struct message_def* msg ) msgnew->nid, msgnew->pid, msgnew->verifier, msgnew->process_name ); size += sizeof(msg->u.request.u.new_process_ns); break; + case ReqType_NodeDown: + msgdown = &msg->u.request.u.down; + sprintf( desc, "node-down (nid=%d, node-name=%s, takeover=%d, reason=%s)", + msgdown->nid, msgdown->node_name, + msgdown->takeover, msgdown->reason ); + size += sizeof(msg->u.request.u.down); + break; case ReqType_ProcessInfo: msginfo = &msg->u.request.u.process_info; sprintf( desc, "process-info (nid=%d, pid=%d, verifier=%d, name=%s)\n" @@ -774,7 +969,7 @@ int CNameServer::SendReceive( struct message_def* msg ) break; case ReqType_ShutdownNs: msgshutdown = &msg->u.request.u.shutdown_ns; - sprintf( desc, "shutdown (nid=%d, pid=%d, level=%d)", + sprintf( desc, "shutdown-ns (nid=%d, pid=%d, level=%d)", msgshutdown->nid, msgshutdown->pid, msgshutdown->level ); size += sizeof(msg->u.request.u.shutdown_ns); break; @@ -783,13 +978,16 @@ int CNameServer::SendReceive( struct message_def* msg ) break; } +retryIO: + int error = SendToNs( descp, msg, size ); if ( error == 0 ) error = SockReceive( (char *) &size, sizeof(size ) ); if ( error == 0 ) - error = SockReceive( (char *) msg, size ); + error = SockReceive( (char *) pmsg_reply, size ); if ( error == 0 ) { + memcpy( msg, pmsg_reply, size ); if ( trace_settings & ( TRACE_NS | TRACE_PROCESS ) ) { char desc[2048]; @@ -827,7 +1025,6 @@ int CNameServer::SendReceive( struct message_def* msg ) msg->u.reply.u.process_info.more_data ); break; case ReplyType_ProcessInfoNs: -// int 
argvLen = sizeof(msg->u.reply.u.process_info_ns.argv); sprintf( desc, "process-info-ns reply:\n" " process_info_ns.nid=%d\n" @@ -847,18 +1044,14 @@ int CNameServer::SendReceive( struct message_def* msg ) " process_info_ns.path=%s\n" " process_info_ns.ldpath=%s\n" " process_info_ns.program=%s\n" -// " process_info_ns.pathStrId=%d:%d\n" -// " process_info_ns.ldpathStrId=%d:%d\n" -// " process_info_ns.programStrId=%d:%d\n" " process_info_ns.port_name=%s\n" " process_info_ns.argc=%d\n" -// " process_info_ns.argv=[%.*s]\n" " process_info_ns.infile=%s\n" " process_info_ns.outfile=%s\n" -//#if 0 -// " process_info_ns.creation_time=%ld(secs)\n", -// " process_info_ns.creation_time=%ld(secs):%ld(nsecs)\n", -//#endif +#if 0 + " process_info_ns.creation_time=%ld(secs)\n", + " process_info_ns.creation_time=%ld(secs):%ld(nsecs)\n", +#endif " process_info_ns.return_code=%d" , msg->u.reply.u.process_info_ns.nid , msg->u.reply.u.process_info_ns.pid @@ -877,21 +1070,14 @@ int CNameServer::SendReceive( struct message_def* msg ) , msg->u.reply.u.process_info_ns.path , msg->u.reply.u.process_info_ns.ldpath , msg->u.reply.u.process_info_ns.program -// , msg->u.reply.u.process_info_ns.pathStrId.nid -// , msg->u.reply.u.process_info_ns.pathStrId.id -// , msg->u.reply.u.process_info_ns.ldpathStrId.nid -// , msg->u.reply.u.process_info_ns.ldpathStrId.id -// , msg->u.reply.u.process_info_ns.programStrId.nid -// , msg->u.reply.u.process_info_ns.programStrId.id , msg->u.reply.u.process_info_ns.port_name , msg->u.reply.u.process_info_ns.argc -// , &msg->u.reply.u.process_info_ns.argv , msg->u.reply.u.process_info_ns.infile , msg->u.reply.u.process_info_ns.outfile -//#if 0 -// , msg->u.reply.u.process_info_ns.creation_time.tv_sec -// , msg->u.reply.u.process_info_ns.creation_time.tv_nsec -//#endif +#if 0 + , msg->u.reply.u.process_info_ns.creation_time.tv_sec + , msg->u.reply.u.process_info_ns.creation_time.tv_nsec +#endif , msg->u.reply.u.process_info_ns.return_code ); break; default: @@ 
-905,7 +1091,20 @@ int CNameServer::SendReceive( struct message_def* msg ) ); } } - else + else if ( error != -2 && retryCount < NAMESERVER_IO_RETRIES ) + { + retryCount++; + if ( trace_settings & TRACE_NS ) + { + trace_printf( "%s@%d - retrying IO (%d) to nameserver=%s:%s\n" + , method_name, __LINE__ + , retryCount + , mon2nsHost_, mon2nsPort_ ); + } + goto retryIO; + } + + if ( error ) { // create a synthetic reply msg->u.reply.u.generic.nid = -1; @@ -943,9 +1142,10 @@ int CNameServer::SendToNs( const char *reqType, struct message_def *msg, int siz if ( trace_settings & TRACE_NS ) { - trace_printf( "%s@%d - sending %s REQ to nameserver=%s:%s, sock=%d, shutdown=%d\n" + trace_printf( "%s@%d - sending %s\tREQ (size=%d) to nameserver=%s:%s, sock=%d, shutdown=%d\n" , method_name, __LINE__ , reqType + , size , mon2nsHost_ , mon2nsPort_ , mon2nsSock_ @@ -967,15 +1167,72 @@ int CNameServer::SendToNs( const char *reqType, struct message_def *msg, int siz error = ConnectToNs( &retry ); } } + if ( error == 0 ) + { error = SockSend( (char *) &size, sizeof(size) ); - if ( error == 0 ) - error = SockSend( (char *) msg, size ); + if (error) + { + int err = error; + char buf[MON_STRING_BUF_SIZE]; + snprintf( buf, sizeof(buf) + , "[%s], unable to send %s request size %d to " + "nameserver=%s:%s, error: %d(%s)\n" + , method_name, reqType, size, mon2nsHost_, mon2nsPort_, err, strerror(err) ); + mon_log_write(NAMESERVER_SENDTONS_1, SQ_LOG_ERR, buf); + } + else + { + error = SockSend( (char *) msg, size ); + if (error) + { + int err = error; + char buf[MON_STRING_BUF_SIZE]; + snprintf( buf, sizeof(buf) + , "[%s], unable to send %s request to " + "nameserver=%s:%s, error: %d(%s)\n" + , method_name, reqType, mon2nsHost_, mon2nsPort_, err, strerror(err) ); + mon_log_write(NAMESERVER_SENDTONS_2, SQ_LOG_ERR, buf); + } + } + } TRACE_EXIT; return error; } +void CNameServer::SetLocalHost( void ) +{ + gethostname( mon2nsHost_, MAX_PROCESSOR_NAME ); +} + +void CNameServer::SetShutdown( bool 
shutdown ) +{ + const char method_name[] = "CNameServer::SetShutdown"; + TRACE_ENTRY; + + if ( trace_settings & TRACE_NS ) + trace_printf( "%s@%d - set shutdown_=%d\n" + , method_name, __LINE__, shutdown ); + shutdown_ = shutdown; + + TRACE_EXIT; +} + +void CNameServer::SockClose( void ) +{ + const char method_name[] = "CNameServer::SockClose"; + TRACE_ENTRY; + + if (mon2nsSock_ != -1) + { + close( mon2nsSock_ ); + mon2nsSock_ = -1; + } + + TRACE_EXIT; +} + int CNameServer::SockReceive( char *buf, int size ) { const char method_name[] = "CNameServer::SockReceive"; @@ -1045,9 +1302,29 @@ int CNameServer::SockReceive( char *buf, int size ) , error, strerror(error) ); } - if ( error ) + if (error) + { SockClose(); + int err = error; + char buf[MON_STRING_BUF_SIZE]; + snprintf( buf, sizeof(buf) + , "[%s], unable to receive request size %d to " + "nameserver=%s:%s, error: %d(%s)\n" + , method_name, size, mon2nsHost_, mon2nsPort_, err, strerror(err) ); + mon_log_write(NAMESERVER_SOCKRECEIVE_1, SQ_LOG_ERR, buf); + + // Choose another name server on IO retry + if (IsRealCluster) + { + mon2nsHost_[0] = 0; + } + else + { + mon2nsPort_[0] = 0; + } + } + TRACE_EXIT; return error; } @@ -1112,9 +1389,28 @@ int CNameServer::SockSend( char *buf, int size ) , error, strerror(error) ); } - if ( error ) + if (error) + { SockClose(); + int err = error; + char buf[MON_STRING_BUF_SIZE]; + snprintf( buf, sizeof(buf) + , "[%s], unable to send request size %d to " + "nameserver=%s:%s, error: %d(%s)\n" + , method_name, size, mon2nsHost_, mon2nsPort_, err, strerror(err) ); + mon_log_write(NAMESERVER_SOCKSEND_1, SQ_LOG_ERR, buf); + // Choose another name server on IO retry + if (IsRealCluster) + { + mon2nsHost_[0] = 0; + } + else + { + mon2nsPort_[0] = 0; + } + } + TRACE_EXIT; return error; } http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nameserver.h ---------------------------------------------------------------------- diff --git 
a/core/sqf/monitor/linux/nameserver.h b/core/sqf/monitor/linux/nameserver.h index a8ccb4b..009eced 100644 --- a/core/sqf/monitor/linux/nameserver.h +++ b/core/sqf/monitor/linux/nameserver.h @@ -40,12 +40,15 @@ public: CNameServer( void ); virtual ~CNameServer( void ); + bool IsNameServerConfigured( int pnid ); + void NameServerExited( void ); int NameServerStop( struct message_def* msg ); int ProcessDelete(CProcess* process ); int ProcessInfo( struct message_def* msg ); int ProcessInfoCont( struct message_def* msg ); int ProcessInfoNs( struct message_def* msg ); int ProcessNew(CProcess* process ); + int ProcessNodeDown( int nid, char* nodeName ); int ProcessShutdown( void ); void SetLocalHost( void ); @@ -53,21 +56,20 @@ private: char mon2nsHost_[MAX_PROCESSOR_NAME]; char mon2nsPort_[10]; int mon2nsSock_; - int nsConfigInx_; bool nsStartupComplete_; int seqNum_; bool shutdown_; - void ChooseNextNs( void ); - int ConnectToNs( bool *retry ); - void GetM2NPort( int PNid ); + int ChooseNextNs( void ); + int ClientSockCreate(); + int ConnectToNs( bool* retry ); + int GetM2NPort( int PNid ); int SendReceive( struct message_def* msg ); - int SendToNs( const char *reqType, struct message_def *msg, int size ); + int SendToNs( const char* reqType, struct message_def* msg, int size ); void SetShutdown( bool shutdown ); void SockClose( void ); - int SockCreate(); - int SockReceive( char *buf, int size ); - int SockSend( char *buf, int size ); + int SockReceive( char* buf, int size ); + int SockSend( char* buf, int size ); }; #endif http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nscommacceptmon.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/nscommacceptmon.cxx b/core/sqf/monitor/linux/nscommacceptmon.cxx index edddca7..0857cc9 100644 --- a/core/sqf/monitor/linux/nscommacceptmon.cxx +++ b/core/sqf/monitor/linux/nscommacceptmon.cxx @@ -155,6 +155,36 @@ void 
CCommAcceptMon::monReqNameServerStop( struct message_def* msg, int sockFd ) TRACE_EXIT; } +void CCommAcceptMon::monReqNodeDown( struct message_def* msg, int sockFd ) +{ + const char method_name[] = "CCommAcceptMon::monReqNodeDown"; + TRACE_ENTRY; + + if ( trace_settings & ( TRACE_NS | TRACE_REQUEST) ) + { + trace_printf( "%s@%d - Received monitor node-down request.\n" + " msg.down.nid=%d\n" + " msg.down.node_name=%s\n" + " msg.down.takeover=%d\n" + " msg.down.reason=%s\n" + , method_name, __LINE__ + , msg->u.request.u.down.nid + , msg->u.request.u.down.node_name + , msg->u.request.u.down.takeover + , msg->u.request.u.down.reason + ); + } + + CExternalReq::reqQueueMsg_t msgType; + msgType = CExternalReq::NonStartupMsg; + int nid = msg->u.request.u.down.nid; + int pid = -1; + // Place new request on request queue + ReqQueue.enqueueReq(msgType, nid, pid, sockFd, msg); + + TRACE_EXIT; +} + void CCommAcceptMon::monReqProcessInfo( struct message_def* msg, int sockFd ) { const char method_name[] = "CCommAcceptMon::monReqProcessInfo"; @@ -412,12 +442,12 @@ void CCommAcceptMon::monReqUnknown( struct message_def* msg, int sockFd ) void CCommAcceptMon::processMonReqs( int sockFd ) { const char method_name[] = "CCommAcceptMon::processMonReqs"; + TRACE_ENTRY; + int rc; nodeId_t nodeId; struct message_def msg; - TRACE_ENTRY; - if ( trace_settings & ( TRACE_NS ) ) { trace_printf( "%s@%d - Accepted connection sock=%d\n" @@ -435,7 +465,7 @@ void CCommAcceptMon::processMonReqs( int sockFd ) char buf[MON_STRING_BUF_SIZE]; snprintf(buf, sizeof(buf), "[%s], unable to obtain node id from new " "monitor: %s.\n", method_name, ErrorMsg(rc)); - mon_log_write(NS_COMMACCEPT_2, SQ_LOG_ERR, buf); + mon_log_write(NS_COMMACCEPT_PROCESSMONREQS_1, SQ_LOG_ERR, buf); return; } @@ -462,6 +492,36 @@ void CCommAcceptMon::processMonReqs( int sockFd ) , nodeId.ping ); } + CNode *node; + node = Nodes->GetNode( nodeId.pnid ); + if ( node != NULL ) + { + if ( node->GetState() != State_Up ) + { + if ( 
trace_settings & ( TRACE_NS ) ) + { + trace_printf( "%s@%d - Bringing node up, node=%s, pnid=%d\n" + , method_name, __LINE__ + , node->GetName(), node->GetPNid() ); + } + rc = Monitor->HardNodeUpNs( node->GetPNid() ); + if ( rc ) + { // Handle error + close( sockFd ); + return; + } + } + } + else + { // Handle error + close( sockFd ); + char buf[MON_STRING_BUF_SIZE]; + snprintf(buf, sizeof(buf), "[%s], invalid physical node id, " + "pnid: %d\n", method_name, nodeId.pnid ); + mon_log_write(NS_COMMACCEPT_PROCESSMONREQS_2, SQ_LOG_ERR, buf); + return; + } + strcpy(nodeId.nodeName, MyNode->GetName()); strcpy(nodeId.commPort, MyNode->GetCommPort()); strcpy(nodeId.syncPort, MyNode->GetSyncPort()); @@ -504,7 +564,7 @@ void CCommAcceptMon::processMonReqs( int sockFd ) char buf[MON_STRING_BUF_SIZE]; snprintf(buf, sizeof(buf), "[%s], unable to send node id from new " "monitor: %s.\n", method_name, ErrorMsg(rc)); - mon_log_write(NS_COMMACCEPT_3, SQ_LOG_ERR, buf); + mon_log_write(NS_COMMACCEPT_PROCESSMONREQS_3, SQ_LOG_ERR, buf); return; } @@ -517,9 +577,9 @@ void CCommAcceptMon::processMonReqs( int sockFd ) { // Handle error close( sockFd ); char buf[MON_STRING_BUF_SIZE]; - snprintf(buf, sizeof(buf), "[%s], unable to obtain node id from new " + snprintf(buf, sizeof(buf), "[%s], unable to obtain message size from " "monitor: %s.\n", method_name, ErrorMsg(rc)); - mon_log_write(NS_COMMACCEPT_4, SQ_LOG_ERR, buf); + mon_log_write(NS_COMMACCEPT_PROCESSMONREQS_4, SQ_LOG_ERR, buf); return; } @@ -528,9 +588,9 @@ void CCommAcceptMon::processMonReqs( int sockFd ) { // Handle error close( sockFd ); char buf[MON_STRING_BUF_SIZE]; - snprintf(buf, sizeof(buf), "[%s], unable to obtain node id from new " + snprintf(buf, sizeof(buf), "[%s], unable to obtain message from " "monitor: %s.\n", method_name, ErrorMsg(rc)); - mon_log_write(NS_COMMACCEPT_5, SQ_LOG_ERR, buf); + mon_log_write(NS_COMMACCEPT_PROCESSMONREQS_5, SQ_LOG_ERR, buf); return; } if ( trace_settings & ( TRACE_NS ) ) @@ -591,6 +651,10 
@@ void CCommAcceptMon::processMonReqs( int sockFd ) monReqNameServerStop(&msg, sockFd); break; + case ReqType_NodeDown: + monReqNodeDown(&msg, sockFd); + break; + case ReqType_ProcessInfo: monReqProcessInfo(&msg, sockFd); break; @@ -663,9 +727,9 @@ void CCommAcceptMon::processNewSock( int joinFd ) if (rc != 0) { char buf[MON_STRING_BUF_SIZE]; - snprintf(buf, sizeof(buf), "[%s], thread create error=%d\n", + snprintf(buf, sizeof(buf), "[%s], mon2nsProcess thread create error=%d\n", method_name, rc); - mon_log_write(NS_COMMACCEPT_6, SQ_LOG_ERR, buf); + mon_log_write(NS_COMMACCEPT_PROCESSNEWSOCK_1, SQ_LOG_ERR, buf); } TRACE_EXIT; @@ -743,7 +807,7 @@ void CCommAcceptMon::commAcceptorSock() char buf[MON_STRING_BUF_SIZE]; snprintf(buf, sizeof(buf), "[%s], cannot accept new monitor: %s.\n", method_name, strerror(errno)); - mon_log_write(NS_COMMACCEPT_7, SQ_LOG_ERR, buf); + mon_log_write(NS_COMMACCEPT_COMMACCEPTORSOCK_1, SQ_LOG_ERR, buf); } else @@ -800,7 +864,7 @@ static void *mon2nsAcceptMon(void *arg) char buf[MON_STRING_BUF_SIZE]; snprintf(buf, sizeof(buf), "[%s], pthread_sigmask error=%d\n", method_name, rc); - mon_log_write(NS_COMMACCEPT_8, SQ_LOG_ERR, buf); + mon_log_write(NS_COMMACCEPT_MON2NSACCEPTMON_1, SQ_LOG_ERR, buf); } // Enter thread processing loop @@ -830,7 +894,7 @@ static void *mon2nsProcess(void *arg) char buf[MON_STRING_BUF_SIZE]; snprintf(buf, sizeof(buf), "[%s], pthread_sigmask error=%d\n", method_name, rc); - mon_log_write(NS_COMMACCEPT_9, SQ_LOG_ERR, buf); + mon_log_write(NS_COMMACCEPT_MON2NSPROCESS_1, SQ_LOG_ERR, buf); } MyNode->AddMonConnCount(1); @@ -858,7 +922,7 @@ void CCommAcceptMon::start() char buf[MON_STRING_BUF_SIZE]; snprintf(buf, sizeof(buf), "[%s], thread create error=%d\n", method_name, rc); - mon_log_write(NS_COMMACCEPT_10, SQ_LOG_ERR, buf); + mon_log_write(NS_COMMACCEPT_START_1, SQ_LOG_ERR, buf); } TRACE_EXIT; http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nscommacceptmon.h 
---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/nscommacceptmon.h b/core/sqf/monitor/linux/nscommacceptmon.h index 41b2c9b..1749b16 100644 --- a/core/sqf/monitor/linux/nscommacceptmon.h +++ b/core/sqf/monitor/linux/nscommacceptmon.h @@ -46,6 +46,7 @@ public: void monReqExec( CExternalReq * request ); void monReqNameServerStop( struct message_def* msg, int sockFd ); void monReqNewProcess( struct message_def* msg, int sockFd ); + void monReqNodeDown( struct message_def* msg, int sockFd ); void monReqProcessInfo( struct message_def* msg, int sockFd ); void monReqProcessInfoCont( struct message_def* msg, int sockFd ); void monReqProcessInfoNs( struct message_def* msg, int sockFd ); @@ -68,9 +69,9 @@ private: bool accepting_; bool shutdown_; - // commAccept thread's id + // mon2nsAcceptMon thread's id pthread_t thread_id_; - // commAccept thread's id + // mon2nsProcess thread's id pthread_t process_thread_id_; enum { HEURISTIC_COUNT = 10 }; http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nsreqprocinfons.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/nsreqprocinfons.cxx b/core/sqf/monitor/linux/nsreqprocinfons.cxx index aa53437..37c09f6 100644 --- a/core/sqf/monitor/linux/nsreqprocinfons.cxx +++ b/core/sqf/monitor/linux/nsreqprocinfons.cxx @@ -222,18 +222,21 @@ void CExtProcInfoNsReq::performRequest() if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) { - trace_printf( "%s@%d request #%ld: ProcessInfoNs, for (%d, %d:%d), " + trace_printf( "%s@%d request #%ld: ProcessInfoNs, for %s (%d, %d:%d), " "process type=%s\n" , method_name, __LINE__, id_ - , target_nid, target_pid, target_verifier + , target_process_name.c_str(), target_nid, target_pid, target_verifier , ProcessTypeString(target_type)); } if (target_process_name.size()) { // find by name (don't check node state, don't check process state, not backup) - 
process = Nodes->GetProcess( target_process_name.c_str() - , target_verifier - , false, false, false ); + if (msg_->u.request.u.process_info.target_process_name[0] == '$' ) + { + process = Nodes->GetProcess( target_process_name.c_str() + , target_verifier + , false, false, false ); + } } else { http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nsreqshutdown.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/nsreqshutdown.cxx b/core/sqf/monitor/linux/nsreqshutdown.cxx index e5888d2..63d9400 100644 --- a/core/sqf/monitor/linux/nsreqshutdown.cxx +++ b/core/sqf/monitor/linux/nsreqshutdown.cxx @@ -30,12 +30,10 @@ #include "montrace.h" #include "monsonar.h" #include "monlogging.h" -#include "replicate.h" extern CMonStats *MonStats; extern CNode *MyNode; extern CNodeContainer *Nodes; -extern CReplicate Replicator; CExtShutdownNsReq::CExtShutdownNsReq (reqQueueMsg_t msgType, int nid, int pid, int sockFd, @@ -43,7 +41,7 @@ CExtShutdownNsReq::CExtShutdownNsReq (reqQueueMsg_t msgType, : CExternalReq(msgType, nid, pid, sockFd, msg) { // Add eyecatcher sequence as a debugging aid - memcpy(&eyecatcher_, "RQER", 4); // TODO + memcpy(&eyecatcher_, "RqER", 4); priority_ = High; } @@ -51,7 +49,7 @@ CExtShutdownNsReq::CExtShutdownNsReq (reqQueueMsg_t msgType, CExtShutdownNsReq::~CExtShutdownNsReq() { // Alter eyecatcher sequence as a debugging aid to identify deleted object - memcpy(&eyecatcher_, "rqer", 4); // TODO + memcpy(&eyecatcher_, "rQer", 4); } void CExtShutdownNsReq::populateRequestString( void ) http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nsreqstop.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/nsreqstop.cxx b/core/sqf/monitor/linux/nsreqstop.cxx index 4ff46ce..a20e9bd 100644 --- a/core/sqf/monitor/linux/nsreqstop.cxx +++ b/core/sqf/monitor/linux/nsreqstop.cxx @@ -90,7 +90,7 
@@ void CExtNameServerStopNsReq::performRequest() int nid = atoi( msg_->u.request.u.nameserver_stop.node_name ); node = Nodes->GetLNode( nid )->GetNode(); } - Monitor->HardNodeDown( node->GetPNid(), true ); + Monitor->HardNodeDownNs( node->GetPNid() ); char la_buf[MON_STRING_BUF_SIZE*2]; snprintf( la_buf, sizeof(la_buf) http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/pnode.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/pnode.cxx b/core/sqf/monitor/linux/pnode.cxx index 4a4b8c4..364837b 100644 --- a/core/sqf/monitor/linux/pnode.cxx +++ b/core/sqf/monitor/linux/pnode.cxx @@ -1065,11 +1065,11 @@ strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode, bool clone ) !MyNode->IsMyNode(targetLNode->GetNid())) { // Forward the unique string to the target node - int rc = PtpClient->AddUniqStr( id.nid - , id.id - , candidate - , targetLNode->GetNid() - , targetLNode->GetNode()->GetName() ); + int rc = PtpClient->ProcessAddUniqStr( id.nid + , id.id + , candidate + , targetLNode->GetNid() + , targetLNode->GetNode()->GetName() ); if (rc) { char la_buf[MON_STRING_BUF_SIZE]; @@ -1110,11 +1110,11 @@ strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode, bool clone ) !MyNode->IsMyNode(targetLNode->GetNid())) { // Forward the unique string to the target node - int rc = PtpClient->AddUniqStr( id.nid - , id.id - , candidate - , targetLNode->GetNid() - , targetLNode->GetNode()->GetName()); + int rc = PtpClient->ProcessAddUniqStr( id.nid + , id.id + , candidate + , targetLNode->GetNid() + , targetLNode->GetNode()->GetName()); if (rc) { char la_buf[MON_STRING_BUF_SIZE]; @@ -1240,6 +1240,16 @@ void CNode::StartNameServerProcess( void ) const char method_name[] = "CNode::StartNameServerProcess"; TRACE_ENTRY; + if ( !NameServer->IsNameServerConfigured( MyPNID ) ) + { + if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) + { + trace_printf( "%s@%d" " - NameServer is not 
configured in my node\n" + , method_name, __LINE__); + } + return; + } + char path[MAX_SEARCH_PATH]; char *ldpath = NULL; // = getenv("LD_LIBRARY_PATH"); char filename[MAX_PROCESS_PATH]; @@ -1250,7 +1260,9 @@ void CNode::StartNameServerProcess( void ) snprintf( stdout, sizeof(stdout), "stdout_TNS%d", MyNode->GetZone() ); if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) - trace_printf("%s@%d" " - Creating NameService Process\n", method_name, __LINE__); + { + trace_printf("%s@%d" " - Creating NameServer Process\n", method_name, __LINE__); + } strcpy(path,getenv("PATH")); strcat(path,":"); @@ -1281,12 +1293,14 @@ void CNode::StartNameServerProcess( void ) if ( NameServerProcess ) { if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) - trace_printf("%s@%d" " - NameService Process created\n", method_name, __LINE__); + { + trace_printf("%s@%d" " - NameServer Process created\n", method_name, __LINE__); + } } else { char la_buf[MON_STRING_BUF_SIZE]; - sprintf(la_buf, "[%s], NameService Process creation failed.\n", method_name); + sprintf(la_buf, "[%s], NameServer Process creation failed.\n", method_name); mon_log_write( MON_NODE_STARTNAMESERVER_1, SQ_LOG_ERR, la_buf ); } @@ -2556,11 +2570,27 @@ CProcess *CNodeContainer::CloneProcessNs( int nid } else { - char buf[MON_STRING_BUF_SIZE]; - snprintf( buf, sizeof(buf), - "[%s] ProcessInfo failed, rc=%d\n" - , method_name, msg.u.reply.u.process_info_ns.return_code ); - mon_log_write( MON_NODE_CLONEPROCESSNS_1, SQ_LOG_ERR, buf ); + if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) + { + trace_printf( "%s@%d - ProcessInfoNs(%d, %d:%d) -- can't find target process\n" + , method_name, __LINE__ + , msg.u.reply.u.process_info_ns.nid + , msg.u.reply.u.process_info_ns.pid + , msg.u.reply.u.process_info_ns.verifier); + } + + if ( msg.u.reply.u.process_info_ns.return_code != MPI_ERR_NAME ) + { + char buf[MON_STRING_BUF_SIZE]; + snprintf( buf, sizeof(buf), + "[%s] ProcessInfo(%d, %d:%d) failed, rc=%d\n" + , method_name + , 
msg.u.reply.u.process_info_ns.nid + , msg.u.reply.u.process_info_ns.pid + , msg.u.reply.u.process_info_ns.verifier + , msg.u.reply.u.process_info_ns.return_code ); + mon_log_write( MON_NODE_CLONEPROCESSNS_1, SQ_LOG_ERR, buf ); + } } } else @@ -2625,11 +2655,25 @@ CProcess *CNodeContainer::CloneProcessNs( const char *name, Verifier_t verifier } else { - char buf[MON_STRING_BUF_SIZE]; - snprintf( buf, sizeof(buf), - "[%s] ProcessInfo failed, rc=%d\n" - , method_name, msg.u.reply.u.process_info_ns.return_code ); - mon_log_write( MON_NODE_CLONEPROCESSNS_4, SQ_LOG_ERR, buf ); + if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) + { + trace_printf( "%s@%d - ProcessInfoNs(%s:%d) -- can't find target process\n" + , method_name, __LINE__ + , msg.u.reply.u.process_info_ns.process_name + , msg.u.reply.u.process_info_ns.verifier); + } + + if ( msg.u.reply.u.process_info_ns.return_code != MPI_ERR_NAME ) + { + char buf[MON_STRING_BUF_SIZE]; + snprintf( buf, sizeof(buf), + "[%s] ProcessInfo(%s:%d) failed, rc=%d\n" + , method_name + , msg.u.reply.u.process_info_ns.process_name + , msg.u.reply.u.process_info_ns.verifier + , msg.u.reply.u.process_info_ns.return_code ); + mon_log_write( MON_NODE_CLONEPROCESSNS_4, SQ_LOG_ERR, buf ); + } } } else
