Fixed parent clone issue and ptpCommAccept thread shutdown logic. Added -nid <nid> argument to specify which <nid> the shell attaches to when in a virtual cluster.
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/6dc990fe Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/6dc990fe Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/6dc990fe Branch: refs/heads/master Commit: 6dc990fe25798d46b51fb5c4932d92d58ad6aa0a Parents: 38eb84e Author: Zalo Correa <[email protected]> Authored: Thu Mar 29 17:17:47 2018 -0700 Committer: Zalo Correa <[email protected]> Committed: Thu Mar 29 17:17:47 2018 -0700 ---------------------------------------------------------------------- core/sqf/monitor/linux/cluster.cxx | 61 +++++++---- core/sqf/monitor/linux/cluster.h | 8 +- core/sqf/monitor/linux/monitor.cxx | 2 +- core/sqf/monitor/linux/notice.cxx | 57 ++++++---- core/sqf/monitor/linux/pnode.cxx | 2 + core/sqf/monitor/linux/pnode.h | 9 +- core/sqf/monitor/linux/process.cxx | 112 +++++++++++++------- core/sqf/monitor/linux/process.h | 5 +- core/sqf/monitor/linux/ptpclient.cxx | 82 +++++--------- core/sqf/monitor/linux/ptpclient.h | 6 +- core/sqf/monitor/linux/ptpcommaccept.cxx | 6 +- core/sqf/monitor/linux/shell.cxx | 38 ++++++- core/sqf/monitor/test/monitor.env | 18 ++-- core/sqf/monitor/test/runtest | 46 +++----- core/sqf/monitor/test/sqconfig.monitor.virtual | 5 +- 15 files changed, 270 insertions(+), 187 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/cluster.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/cluster.cxx b/core/sqf/monitor/linux/cluster.cxx index 101cec2..070230e 100644 --- a/core/sqf/monitor/linux/cluster.cxx +++ b/core/sqf/monitor/linux/cluster.cxx @@ -86,7 +86,7 @@ extern CCommAcceptMon CommAcceptMon; extern char MyMon2NsPort[MPI_MAX_PORT_NAME]; #else extern bool NameServerEnabled; -extern char MyMon2MonPort[MPI_MAX_PORT_NAME]; +extern char 
MyPtPPort[MPI_MAX_PORT_NAME]; #endif extern bool SMSIntegrating; extern int CreatorShellPid; @@ -8315,7 +8315,7 @@ void CCluster::InitServerSock( void ) #ifdef NAMESERVER_PROCESS int mon2nsPort = 0; #else - int mon2monPort = 0; + int ptpPort = 0; #endif unsigned char addr[4]; @@ -8490,7 +8490,7 @@ void CCluster::InitServerSock( void ) int val; errno = 0; val = strtol(env, NULL, 10); - if ( errno == 0) mon2monPort = val; + if ( errno == 0) ptpPort = val; } else { @@ -8505,38 +8505,39 @@ void CCluster::InitServerSock( void ) // For virtual env, add PNid to the port so we can still test without collisions of port numbers if (!IsRealCluster) { - mon2monPort += MyNode->GetPNid(); + ptpPort += MyNode->GetPNid(); } - mon2monSock_ = MkSrvSock( &mon2monPort ); - if ( mon2monSock_ < 0 ) + ptpSock_ = MkSrvSock( &ptpPort ); + if ( ptpSock_ < 0 ) { char ebuff[MON_STRING_BUF_SIZE]; char buf[MON_STRING_BUF_SIZE]; snprintf( buf, sizeof(buf) , "[%s@%d] MkSrvSock(MON2MON_COMM_PORT=%d) error: %s\n" - , method_name, __LINE__, mon2monPort + , method_name, __LINE__, ptpPort , strerror_r( errno, ebuff, MON_STRING_BUF_SIZE ) ); mon_log_write( MON_CLUSTER_INITSERVERSOCK_6, SQ_LOG_CRIT, buf ); abort(); } else { - snprintf( MyMon2MonPort, sizeof(MyMon2MonPort) + snprintf( MyPtPPort, sizeof(MyPtPPort) , "%d.%d.%d.%d:%d" , (int)((unsigned char *)addr)[0] , (int)((unsigned char *)addr)[1] , (int)((unsigned char *)addr)[2] , (int)((unsigned char *)addr)[3] - , mon2monPort ); - MyNode->SetMon2MonPort( MyMon2MonPort ); + , ptpPort ); + MyNode->SetPtPPort( MyPtPPort ); + MyNode->SetPtPSocketPort( ptpPort ); if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) - trace_printf( "%s@%d Initialized my mon2mon socket port, " - "pnid=%d (%s:%s) (mon2monPort=%s)\n" + trace_printf( "%s@%d Initialized my ptp socket port, " + "pnid=%d (%s:%s) (ptpPort=%s)\n" , method_name, __LINE__ - , MyPNID, MyNode->GetName(), MyMon2MonPort - , MyNode->GetMon2MonPort() ); + , MyPNID, MyNode->GetName(), MyPtPPort + , 
MyNode->GetPtPPort() ); } } @@ -8579,12 +8580,12 @@ int CCluster::AcceptSyncSock( void ) } #ifndef NAMESERVER_PROCESS -int CCluster::AcceptMon2MonSock( void ) +int CCluster::AcceptPtPSock( void ) { - const char method_name[] = "CCluster::AcceptMon2MonSock"; + const char method_name[] = "CCluster::AcceptPtPSock"; TRACE_ENTRY; - int csock = AcceptSock( mon2monSock_ ); + int csock = AcceptSock( ptpSock_ ); TRACE_EXIT; return( csock ); @@ -8871,11 +8872,33 @@ int CCluster::Connect( const char *portName ) return ( sock ); } +#ifndef NAMESERVER_PROCESS +void CCluster::ConnectToPtPCommSelf( void ) +{ + const char method_name[] = "CCluster::ConnectToPtPCommSelf"; + TRACE_ENTRY; + + Connect( MyNode->GetPtPSocketPort() ); + + TRACE_EXIT; +} +#endif + void CCluster::ConnectToSelf( void ) { const char method_name[] = "CCluster::ConnectToSelf"; TRACE_ENTRY; + Connect( MyNode->GetCommSocketPort() ); + + TRACE_EXIT; +} + +void CCluster::Connect( int socketPort ) +{ + const char method_name[] = "CCluster::Connect"; + TRACE_ENTRY; + int sock; // socket int ret; // returned value #if defined(_XOPEN_SOURCE_EXTENDED) @@ -8928,7 +8951,7 @@ void CCluster::ConnectToSelf( void ) memset( (char *) &sockinfo, 0, size ); memcpy( (char *) &sockinfo.sin_addr, (char *) he->h_addr, 4 ); sockinfo.sin_family = AF_INET; - sockinfo.sin_port = htons( (unsigned short) MyNode->GetCommSocketPort() ); + sockinfo.sin_port = htons( (unsigned short) socketPort ); connect_failures = 0; ret = 1; @@ -8942,7 +8965,7 @@ void CCluster::ConnectToSelf( void ) , (int)((unsigned char *)he->h_addr)[1] , (int)((unsigned char *)he->h_addr)[2] , (int)((unsigned char *)he->h_addr)[3] - , MyNode->GetCommSocketPort() + , socketPort , connect_failures ); } http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/cluster.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/cluster.h b/core/sqf/monitor/linux/cluster.h index 90c2092..f4f9147 
100644 --- a/core/sqf/monitor/linux/cluster.h +++ b/core/sqf/monitor/linux/cluster.h @@ -109,9 +109,13 @@ public: #ifdef NAMESERVER_PROCESS int AcceptMon2NsSock( void ); #else - int AcceptMon2MonSock( void ); + int AcceptPtPSock( void ); #endif int Connect( const char *portName ); + void Connect( int socketPort ); +#ifndef NAMESERVER_PROCESS + void ConnectToPtPCommSelf( void ); +#endif void ConnectToSelf( void ); int SetKeepAliveSockOpt( int sock ); int MkCltSock( const char *portName ); @@ -231,7 +235,7 @@ protected: #ifdef NAMESERVER_PROCESS int mon2nsSock_; #else - int mon2monSock_; + int ptpSock_; #endif int epollFD_; int *indexToPnid_; http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/monitor.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/monitor.cxx b/core/sqf/monitor/linux/monitor.cxx index 77d7509..e6aa68b 100755 --- a/core/sqf/monitor/linux/monitor.cxx +++ b/core/sqf/monitor/linux/monitor.cxx @@ -111,7 +111,7 @@ char MySyncPort[MPI_MAX_PORT_NAME] = {'\0'}; #ifdef NAMESERVER_PROCESS char MyMon2NsPort[MPI_MAX_PORT_NAME] = {'\0'}; #else -char MyMon2MonPort[MPI_MAX_PORT_NAME] = {'\0'}; +char MyPtPPort[MPI_MAX_PORT_NAME] = {'\0'}; #endif char Node_name[MPI_MAX_PROCESSOR_NAME] = {'\0'}; sigset_t SigSet; http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/notice.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/notice.cxx b/core/sqf/monitor/linux/notice.cxx index 85a9eab..f478f45 100644 --- a/core/sqf/monitor/linux/notice.cxx +++ b/core/sqf/monitor/linux/notice.cxx @@ -269,8 +269,9 @@ void CNotice::Notify( SQ_LocalIOToClient::bcastPids_t *bcastPids ) if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) - trace_printf( "%s@%d - Sending %s (%d, %d:%d) Death " - "message to %s (%d, %d:%d)\n" + { + trace_printf( "%s@%d - Sending Death message of" + " %s (%d, 
%d:%d) to %s (%d, %d:%d)\n" , method_name, __LINE__ , Process->GetName() , Process->GetNid() @@ -280,41 +281,55 @@ void CNotice::Notify( SQ_LocalIOToClient::bcastPids_t *bcastPids ) , notify->GetNid() , notify->GetPid() , notify->GetVerifier()); - - + } } else { if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) - trace_printf( "%s@%d - Process %s (%d, %d:%d)" - " doesn't want Death message" "\n" + { + trace_printf( "%s@%d - Death message of %s (%d, %d:%d)" + " not wanted by %s (%d, %d:%d)\n" , method_name, __LINE__ + , Process->GetName() + , Process->GetNid() + , Process->GetPid() + , Process->GetVerifier() , notify->GetName() , notify->GetNid() , notify->GetPid() - , notify->GetVerifier() ); + , notify->GetVerifier()); + } } } else { if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) - trace_printf( "%s@%d - Not processed for clone Process %s (%d, %d:%d)\n" + { + trace_printf( "%s@%d - Death message of %s (%d, %d:%d)" + " not processed for clone %s (%d, %d:%d)\n" , method_name, __LINE__ + , Process->GetName() + , Process->GetNid() + , Process->GetPid() + , Process->GetVerifier() , notify->GetName() , notify->GetNid() , notify->GetPid() - , notify->GetVerifier() ); + , notify->GetVerifier()); + } } } else { if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) - trace_printf( "%s@%d - Can't find process %s (%d, %d:%d)\n" - , method_name, __LINE__ - , name_.c_str() - , Nid - , Pid - , verifier_ ); + { + trace_printf( "%s@%d - Can't find process %s (%d, %d:%d)\n" + , method_name, __LINE__ + , name_.c_str() + , Nid + , Pid + , verifier_ ); + } } } TRACE_EXIT; @@ -456,8 +471,8 @@ void CNotice::NotifyNid( NidQueue_t *nidQueue ) if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) { CLNode *lnode = Nodes->GetLNode( Nid ); - trace_printf( "%s@%d - Sending process %s (%d, %d:%d) " - "exit message to %s (nid=%d)\n" + trace_printf( "%s@%d - Sending exit message of" + " %s (%d, %d:%d) to %s (nid=%d)\n" , method_name, __LINE__ , 
Process->GetName() , Process->GetNid() @@ -471,9 +486,13 @@ void CNotice::NotifyNid( NidQueue_t *nidQueue ) { if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) { - trace_printf( "%s@%d - Process %s (%d, %d:%d)" - " doesn't want Death message\n" + trace_printf( "%s@%d - Death message of %s (%d, %d:%d)" + " not wanted by %s (%d, %d:%d)\n" , method_name, __LINE__ + , Process->GetName() + , Process->GetNid() + , Process->GetPid() + , Process->GetVerifier() , remoteProcess->GetName() , remoteProcess->GetNid() , remoteProcess->GetPid() http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/pnode.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/pnode.cxx b/core/sqf/monitor/linux/pnode.cxx index e3923a0..6affd4f 100644 --- a/core/sqf/monitor/linux/pnode.cxx +++ b/core/sqf/monitor/linux/pnode.cxx @@ -169,6 +169,8 @@ CNode::CNode( char *name, int pnid, int rank ) ,zid_(pnid) #ifdef NAMESERVER_PROCESS ,monConnCount_(0) +#else + ,ptpSocketPort_(-1) #endif ,commSocketPort_(-1) ,syncSocketPort_(-1) http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/pnode.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/pnode.h b/core/sqf/monitor/linux/pnode.h index 48af3bc..f0b89bb 100644 --- a/core/sqf/monitor/linux/pnode.h +++ b/core/sqf/monitor/linux/pnode.h @@ -245,7 +245,8 @@ public: inline const char *GetMon2NsPort( void ) { return mon2NsPort_.c_str(); } inline int GetMonConnCount( void ) { return monConnCount_; } #else - inline const char *GetMon2MonPort( void ) { return mon2MonPort_.c_str(); } + inline const char *GetPtPPort( void ) { return ptpPort_.c_str(); } + inline int GetPtPSocketPort( void ) { return( ptpSocketPort_ ); } #endif inline int GetCommSocketPort( void ) { return( commSocketPort_ ); } inline int GetSyncSocketPort( void ) { return( syncSocketPort_ ); } @@ -315,7 +316,8 @@ public: 
#ifdef NAMESERVER_PROCESS inline void SetMon2NsPort( char *mon2NsPort) { mon2NsPort_ = mon2NsPort; } #else - inline void SetMon2MonPort( char *mon2MonPort) { mon2MonPort_ = mon2MonPort; } + inline void SetPtPPort( char *ptpPort) { ptpPort_ = ptpPort; } + inline void SetPtPSocketPort( int ptpSocketPort) { ptpSocketPort_ = ptpSocketPort; } #endif //inline void SetSockPort( int sockPort ) { sockPort_ = sockPort; } inline void SetCommSocketPort( int commSocketPort) { commSocketPort_ = commSocketPort; } @@ -414,7 +416,8 @@ private: string mon2NsPort_; // monitor to ns port int monConnCount_; // monitor connections #else - string mon2MonPort_; + string ptpPort_; + int ptpSocketPort_; // point-2-point socket port #endif int commSocketPort_; // re-integration socket port int syncSocketPort_; // algather socket port http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/process.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/process.cxx b/core/sqf/monitor/linux/process.cxx index 02d6276..017ce76 100644 --- a/core/sqf/monitor/linux/process.cxx +++ b/core/sqf/monitor/linux/process.cxx @@ -545,11 +545,14 @@ bool CProcess::procExitReg(CProcess *targetProcess, { // This process is not the parent of the target process (parent // processes automatically get process death notifications.) - // Add entry to list of processes that are being monitored - // by this process. nidPid_t target = { targetProcess->Nid, targetProcess->Pid }; deathInterestLock_.lock(); - deathInterest_.push_back ( target ); + // Add entry to list of processes that are being monitored + // by this process. + deathInterest_.push_back( target ); + // Add entry to set of nids of processes that are being monitored + // by this process. 
+ deathInterestNid_.insert( targetProcess->Nid ); deathInterestLock_.unlock(); // Register interest with the target process @@ -580,12 +583,54 @@ bool CProcess::procExitReg(CProcess *targetProcess, #endif #ifndef NAMESERVER_PROCESS +void CProcess::procExitNotifierNodes( void ) +{ + const char method_name[] = "CProcess::procExitNotifierNodes"; + TRACE_ENTRY; + + CLNode *targetLNode; + CNode *targetNode; + nidSet_t::iterator it; + + // Remove death notice registration for all entries on list + deathInterestLock_.lock(); + for ( it = deathInterestNid_.begin(); it != deathInterestNid_.end(); ++it) + { + targetLNode = Nodes->GetLNode ( *it ); + if (targetLNode) + { + targetNode = targetLNode->GetNode(); + } + + if ( targetNode ) + { + if (NameServerEnabled && targetNode->GetPNid() != MyPNID) + { + int rc = -1; + // Forward the process exit to the target node + rc = PtpClient->ProcessExit( this + , targetLNode->GetNid() + , targetNode->GetName() ); + if (rc) + { + // TODO: Error handling + } + } + } + } + deathInterestNid_.clear(); + deathInterestLock_.unlock(); + + TRACE_EXIT; +} +#endif + +#ifndef NAMESERVER_PROCESS void CProcess::procExitUnregAll ( _TM_Txid_External transId ) { const char method_name[] = "CProcess::procExitUnregAll"; TRACE_ENTRY; - nidPidList_t::iterator iter; CLNode *node; CProcess *targetProcess = NULL; nidPidList_t::iterator it; @@ -3222,48 +3267,39 @@ void CProcess::Exit( CProcess *parent ) #ifndef NAMESERVER_PROCESS if (NameServerEnabled) { - if ( parent && parent->IsClone() && Pid != -1 ) + if ( parent ) { - int targetNid = parent->GetNid(); - CLNode *targetLNode = Nodes->GetLNode( targetNid ); - // Send the process exit to the target node - int rc = PtpClient->ProcessExit( this - , targetNid - , targetLNode->GetNode()->GetName() ); - if (rc) + if ( parent->IsClone() && Pid != -1 ) { - // TODO: Error handling + int targetNid = parent->GetNid(); + CLNode *targetLNode = Nodes->GetLNode( targetNid ); + // Send the process exit to the parent node + 
int rc = PtpClient->ProcessExit( this + , targetNid + , targetLNode->GetNode()->GetName() ); + if (rc) + { + // TODO: Error handling + } } -#if 0 - // TODO: This is not the correct place. It needs to be found! - // When the parent process is in a remote node and - // the local node contains child processes, - // a clone of the parent is created at child creation time, - // when all child processes are deleted, it leaves the - // parent clone process. Need to determine when all - // child process objects which reference the parent clone - // are deleted so the parent clone object can be deleted. - // The symptom is that shutdown never occurs since there - // are object which have not been deleted and the process - // counts prevent the shutdown from completing. - if (parent->childCount() == 0) + } + else + { + if (GetParentNid() != -1) { - if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) + int targetNid = GetParentNid(); + CLNode *targetLNode = Nodes->GetLNode( targetNid ); + // Send the process exit to the parent node + int rc = PtpClient->ProcessExit( this + , targetNid + , targetLNode->GetNode()->GetName() ); + if (rc) { - trace_printf( "%s@%d" " - Deleting parent %s (%d,%d:%d) of last child %s (%d,%d:%d) \n" - , method_name, __LINE__ - , parent->GetName(), parent->GetNid() - , parent->GetPid(), parent->GetVerifier() - , GetName(), GetNid(), GetPid(), GetVerifier() ); + // TODO: Error handling } - - CNode *parentNode = Nodes->GetLNode(parent->GetNid())->GetNode(); - parentNode->DelFromNameMap( parent ); - parentNode->DelFromPidMap( parent ); - parentNode->DeleteFromList( parent ); } -#endif } + procExitNotifierNodes(); } #endif http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/process.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/process.h b/core/sqf/monitor/linux/process.h index 9445f6e..90954f3 100644 --- 
a/core/sqf/monitor/linux/process.h +++ b/core/sqf/monitor/linux/process.h @@ -409,6 +409,7 @@ class CProcess bool procExitReg(CProcess *targetProcess, _TM_Txid_External transId); + void procExitNotifierNodes( void ); void procExitUnregAll( _TM_Txid_External transId ); void validateObj( void ); @@ -549,8 +550,10 @@ private: // Container to keep track of the processes for which this process // is interested in process death. deathInterestLock_ is used to // protect both the deathInterest_ and CNotice list. + typedef set<int> nidSet_t; nidPidList_t deathInterest_; - CLock deathInterestLock_; + nidSet_t deathInterestNid_; + CLock deathInterestLock_; CNotice *NoticeHead; // List of processes requesting death notice CNotice *NoticeTail; http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/ptpclient.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/ptpclient.cxx b/core/sqf/monitor/linux/ptpclient.cxx index 5e1380e..4f3f292 100644 --- a/core/sqf/monitor/linux/ptpclient.cxx +++ b/core/sqf/monitor/linux/ptpclient.cxx @@ -57,7 +57,7 @@ extern bool IsRealCluster; extern CMeas Meas; CPtpClient::CPtpClient (void) - : mon2monSock_(0) + : ptpSock_(0) , seqNum_(0) { const char method_name[] = "CPtpClient::CPtpClient"; @@ -89,13 +89,13 @@ CPtpClient::~CPtpClient (void) TRACE_EXIT; } -int CPtpClient::InitializePtpClient( char * mon2monPort ) +int CPtpClient::InitializePtpClient( char * ptpPort ) { const char method_name[] = "CPtpClient::InitializePtpClient"; TRACE_ENTRY; int err = 0; - int sock = Monitor->MkCltSock( mon2monPort ); + int sock = Monitor->MkCltSock( ptpPort ); if (sock < 0) { err = sock; @@ -108,50 +108,16 @@ int CPtpClient::InitializePtpClient( char * mon2monPort ) } else { - mon2monSock_ = sock; + ptpSock_ = sock; if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) { trace_printf( "%s@%d - connected to monitor node=%s, sock=%d\n" , method_name, __LINE__ - , mon2monPort - , 
mon2monSock_ ); + , ptpPort + , ptpSock_ ); } } -#if 0 - // remove - if (err == 0) - { - nodeId_t msg; - strcpy(msg.nodeName, MyNode->GetName()); - strcpy(msg.commPort, MyNode->GetCommPort()); - strcpy(msg.syncPort, MyNode->GetSyncPort()); - msg.pnid = MyNode->GetPNid(); - msg.creatorPNid = -1; - msg.creatorShellPid = -1; - msg.creatorShellVerifier = -1; - msg.creator = false; - msg.ping = false; - if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) - { - trace_printf( "%s@%d - sending node-info to monitor=%s, sock=%d\n" - , method_name, __LINE__ - , mon2monPort - , mon2monSock_); - } - err = SendSock((char *) &msg, sizeof(msg), mon2monSock_); - if (err) - { - if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) - { - trace_printf( "%s@%d - error sending to monitor=%s, sock=%d, error=%d\n" - , method_name, __LINE__ - , mon2monPort - , mon2monSock_ - , err ); - } - } - } -#endif + TRACE_EXIT; return err; } @@ -807,7 +773,7 @@ int CPtpClient::SendToMon(const char *reqType, internal_msg_def *msg, int size, TRACE_ENTRY; char monPortString[MAX_PROCESSOR_NAME]; - char mon2monPort[MAX_PROCESSOR_NAME]; + char ptpPort[MAX_PROCESSOR_NAME]; int tempPort = basePort_; // For virtual env @@ -828,16 +794,16 @@ int CPtpClient::SendToMon(const char *reqType, internal_msg_def *msg, int size, , basePort_ ); } - memset( &mon2monPort, 0, MAX_PROCESSOR_NAME ); - memset( &mon2monPortBase_, 0, MAX_PROCESSOR_NAME+100 ); + memset( &ptpPort, 0, MAX_PROCESSOR_NAME ); + memset( &ptpPortBase_, 0, MAX_PROCESSOR_NAME+100 ); - strcat( mon2monPortBase_, hostName ); - strcat( mon2monPortBase_, ":" ); + strcat( ptpPortBase_, hostName ); + strcat( ptpPortBase_, ":" ); sprintf( monPortString,"%d", tempPort ); - strcat( mon2monPort, mon2monPortBase_ ); - strcat( mon2monPort, monPortString ); + strcat( ptpPort, ptpPortBase_ ); + strcat( ptpPort, monPortString ); - int error = InitializePtpClient( mon2monPort ); + int error = InitializePtpClient( ptpPort ); if (error < 0) { TRACE_EXIT; @@ -849,37 +815,37 
@@ int CPtpClient::SendToMon(const char *reqType, internal_msg_def *msg, int size, trace_printf( "%s@%d - sending %s REQ to Monitor=%s, sock=%d\n" , method_name, __LINE__ , reqType - , mon2monPort - , mon2monSock_); + , ptpPort + , ptpSock_); } - error = SendSock((char *) &size, sizeof(size), mon2monSock_); + error = SendSock((char *) &size, sizeof(size), ptpSock_); if (error) { if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) { trace_printf( "%s@%d - error sending to Monitor=%s, sock=%d, error=%d\n" , method_name, __LINE__ - , mon2monPort - , mon2monSock_ + , ptpPort + , ptpSock_ , error ); } } - error = SendSock((char *) msg, size, mon2monSock_); + error = SendSock((char *) msg, size, ptpSock_); if (error) { if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) { trace_printf( "%s@%d - error sending to nameserver=%s, sock=%d, error=%d\n" , method_name, __LINE__ - , mon2monPort - , mon2monSock_ + , ptpPort + , ptpSock_ , error ); } } - close( mon2monSock_ ); + close( ptpSock_ ); TRACE_EXIT; return error; http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/ptpclient.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/ptpclient.h b/core/sqf/monitor/linux/ptpclient.h index 7dd8b86..554bc71 100644 --- a/core/sqf/monitor/linux/ptpclient.h +++ b/core/sqf/monitor/linux/ptpclient.h @@ -40,7 +40,7 @@ public: CPtpClient( void ); virtual ~CPtpClient( void ); - int InitializePtpClient( char * mon2monPort ); + int InitializePtpClient( char * ptpPort ); int ProcessClone( CProcess *process ); int ProcessExit( CProcess* process , int parentNid @@ -67,8 +67,8 @@ public: private: int basePort_; - char mon2monPortBase_[MAX_PROCESSOR_NAME+100]; - int mon2monSock_; + char ptpPortBase_[MAX_PROCESSOR_NAME+100]; + int ptpSock_; int seqNum_; int ReceiveSock(char *buf, int size, int sockFd); http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/ptpcommaccept.cxx 
---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/ptpcommaccept.cxx b/core/sqf/monitor/linux/ptpcommaccept.cxx index b070508..c6d5145 100644 --- a/core/sqf/monitor/linux/ptpcommaccept.cxx +++ b/core/sqf/monitor/linux/ptpcommaccept.cxx @@ -39,7 +39,7 @@ extern CMonitor *Monitor; extern CNode *MyNode; extern CNodeContainer *Nodes; extern int MyPNID; -extern char MyMon2MonPort[MPI_MAX_PORT_NAME]; +extern char MyPtPPort[MPI_MAX_PORT_NAME]; extern char *ErrorMsg (int error_code); extern const char *StateString( STATE state); extern CommType_t CommType; @@ -235,7 +235,7 @@ void CPtpCommAccept::commAcceptorSock() } mem_log_write(CMonLog::MON_CONNTONEWMON_1); - sockFd = Monitor->AcceptMon2MonSock(); + sockFd = Monitor->AcceptPtPSock(); } else { @@ -287,7 +287,7 @@ void CPtpCommAccept::shutdownWork(void) // Set flag that tells the PtpCommAccept thread to exit shutdown_ = true; - Monitor->ConnectToSelf(); + Monitor->ConnectToPtPCommSelf(); CLock::wakeOne(); if (trace_settings & TRACE_INIT) http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/linux/shell.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/shell.cxx b/core/sqf/monitor/linux/shell.cxx index 52afe06..20b41e6 100644 --- a/core/sqf/monitor/linux/shell.cxx +++ b/core/sqf/monitor/linux/shell.cxx @@ -82,6 +82,7 @@ char Path[MAX_SEARCH_PATH]; char Wdir[MAX_SEARCH_PATH]; char prompt[13]; int VirtualNodes = 0; +int VirtualNid = -1; int NumNodes = 0; int NumLNodes = 0; int CurNodes = 0; @@ -848,7 +849,7 @@ void TraceInit( int & argc, char **&argv ) // line arguments. 
for (int j=i, k=i+2; k < argc; j++, k++) { - printf ("setting argv[%d] = argv[%d]\n", j, k); + //printf ("setting argv[%d] = argv[%d]\n", j, k); argv[j] = argv[k]; } argc -= 2; @@ -868,6 +869,32 @@ void TraceInit( int & argc, char **&argv ) } } +void VirtualNidInit( int & argc, char **&argv ) +{ + // Check for trace flags specified on the command line. + for (int i = 0; i < argc; i++) + { + if ( strcmp ( argv[i], "-nid" ) == 0 && (i != argc-1) ) + { // <nid> setting specified on command line. + VirtualNid = atoi ( argv[i+1] ); + + // Remove the virtual nid arguments from the list of command + // line arguments. + for (int j=i, k=i+2; k < argc; j++, k++) + { + //printf ("setting argv[%d] = argv[%d]\n", j, k); + argv[j] = argv[k]; + } + argc -= 2; + } + } + + if (VirtualNid != -1) + { + printf( "Using VirtualNid=%d\n", VirtualNid ); + } +} + void RedirectFd(int orig_fd, char *fifo_name) { int rdir_fd; @@ -9302,6 +9329,9 @@ int main (int argc, char *argv[]) // Initialize trace settings TraceInit ( argc, argv ); + // Initialize virtual <nid> from command line args + VirtualNidInit( argc, argv ); + MyName = new char [MAX_PROCESS_PATH]; // setup defaults strcpy (MyName, "SHELL"); @@ -9340,6 +9370,12 @@ int main (int argc, char *argv[]) MyNid = 0; } + if ( VirtualNodes && VirtualNid != -1) + { + // Override NyNid with the command line nid value + MyNid = VirtualNid; + } + msg = new struct message_def; // Load default node information http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/test/monitor.env ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/test/monitor.env b/core/sqf/monitor/test/monitor.env index b8cf913..dd2cbb3 100644 --- a/core/sqf/monitor/test/monitor.env +++ b/core/sqf/monitor/test/monitor.env @@ -40,21 +40,21 @@ MONITOR_COMM_PORT=23330 # Uncomment MON_TRACE_ENABLE and specific tracing level to enable # Trafodion monitor process tracing -#MON_TRACE_ENABLE=1 
-#MON_TRACE_EVLOG_MSG=1 -#MON_TRACE_INIT=1 -#MON_TRACE_RECOVERY=1 -#MON_TRACE_REQUEST=1 -#MON_TRACE_PROCESS=1 -#MON_TRACE_NOTICE=1 -#MON_TRACE_NS=1 +MON_TRACE_ENABLE=1 +MON_TRACE_EVLOG_MSG=1 +MON_TRACE_INIT=1 +MON_TRACE_RECOVERY=1 +MON_TRACE_REQUEST=1 +MON_TRACE_PROCESS=1 +MON_TRACE_NOTICE=1 +MON_TRACE_NS=1 #MON_TRACE_SYNC=1 # Enable TC_TRACE_* along with MON_TRACE_TRAFCONFIG for more detail #MON_TRACE_TRAFCONFIG=1 #MON_TRACE_MLIO=1 #MON_TRACE_REQUEST_DETAIL=1 -#MON_TRACE_PROCESS_DETAIL=1 +MON_TRACE_PROCESS_DETAIL=1 #MON_TRACE_NOTICE_DETAIL=1 #MON_TRACE_SYNC_DETAIL=1 #MON_TRACE_MLIO_DETAIL=1 http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/test/runtest ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/test/runtest b/core/sqf/monitor/test/runtest index 20c06a9..5580d0a 100755 --- a/core/sqf/monitor/test/runtest +++ b/core/sqf/monitor/test/runtest @@ -169,13 +169,14 @@ shell <<eof delay 3 exec {name \$CTRLR, nid 0, out $TRAF_HOME/monitor/test/childExit.lst} childExitCtrl $trace delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then -shell -c ps +shell -nid 0 -c ps +#shell -nid 0 -c ps;shell -nid 1 -c ps;shell -nid 2 -c ps;shell -nid 3 -c ps;shell -nid 4 -c ps;shell -nid 5 -c ps +shell -nid 0 -c ps monitor;shell -nid 1 -c ps monitor;shell -nid 2 -c ps monitor;shell -nid 3 -c ps monitor;shell -nid 4 -c ps monitor;shell -nid 5 -c ps monitor shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -211,12 +212,11 @@ shell <<eof exec {pri 10,name \$CLIENT,nid 0, out $TRAF_HOME/monitor/test/multiNode.lst} client $trace delay 3 !shutdown - ps exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -243,13 +243,12 @@ shell <<eof delay 3 exec {name \$CTRLR, nid 0, out $TRAF_HOME/monitor/test/regTest.lst} regTestCtrl $trace delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); 
then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -287,13 +286,12 @@ shell <<eof ! exec {name \$DEATH, nid 0, out $TRAF_HOME/monitor/test/deathNotice.lst} deathNotice $trace delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -324,13 +322,12 @@ shell <<eof delay 15 exec {name \$PPROC, nid 0, out $TRAF_HOME/monitor/test/persistentProc.lst} persistentProc $trace delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -350,13 +347,12 @@ shell <<eof delay 3 exec {name \$PPROC, nid 0, out $TRAF_HOME/monitor/test/persistentProc.lst} persistentProc $trace delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -382,13 +378,12 @@ shell <<eof delay 3 exec {name \$DTMCTRL, nid 0, out $TRAF_HOME/monitor/test/dtmTest.lst} dtmCtrl $trace delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -420,13 +415,12 @@ shell <<eof delay 3 exec {name \$SPXCTRL, nid 0, out $TRAF_HOME/monitor/test/spxTest.lst} spxCtrl $trace delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -453,13 +447,12 @@ shell <<eof delay 3 exec {name \$PCRE8, nid 0, out $TRAF_HOME/monitor/test/procCreate.lst} procCreate $trace -x delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -488,13 +481,12 @@ shell <<eof delay 3 down 1 ! 
delay 10 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -525,13 +517,12 @@ shell <<eof exec {nowait, nid 0, name \$CTRLR, out $TRAF_HOME/monitor/test/tmSync.lst} tmSyncCtrl -n 1,3,4,5,6 $trace wait $CTRLR delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -551,13 +542,12 @@ shell <<eof exec {nowait, nid 0, name \$CTRLR, out $TRAF_HOME/monitor/test/tmSync8.lst} tmSyncCtrl -n 8 $trace wait $CTRLR delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -579,13 +569,12 @@ shell <<eof exec {nowait, nid 0, name \$CTRLR, out $TRAF_HOME/monitor/test/tmSync10.lst} tmSyncCtrl -n 10 $trace wait $CTRLR delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit @@ -606,13 +595,12 @@ shell <<eof exec {nowait, nid 0, name \$CTRLR, out $TRAF_HOME/monitor/test/tmSync.lst} tmSyncCtrl -n 3,4,5,6,7 $trace wait $CTRLR delay 3 - ps !shutdown exit eof -if ( [ $test '==' -1 ] ); then shell -c ps shell -c ps monitor +if ( [ $test '==' -1 ] ); then shell -a<<eof shutdown exit http://git-wip-us.apache.org/repos/asf/trafodion/blob/6dc990fe/core/sqf/monitor/test/sqconfig.monitor.virtual ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/test/sqconfig.monitor.virtual b/core/sqf/monitor/test/sqconfig.monitor.virtual index 8f6750c..9ab56e8 100644 --- a/core/sqf/monitor/test/sqconfig.monitor.virtual +++ b/core/sqf/monitor/test/sqconfig.monitor.virtual @@ -24,6 +24,9 @@ _virtualnodes 6 end node begin name-server -nodes=0 +#nodes=0 +nodes=0,1 +#nodes=0,1,2 +#nodes=0,1,2,3 #nodes=0,1,2,3,4,5 end name-server
