Repository: trafodion Updated Branches: refs/heads/master ee4430046 -> fe87aa15e
http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/reqopen.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqopen.cxx b/core/sqf/monitor/linux/reqopen.cxx index f131a08..f44b8d4 100644 --- a/core/sqf/monitor/linux/reqopen.cxx +++ b/core/sqf/monitor/linux/reqopen.cxx @@ -229,8 +229,8 @@ bool CExtOpenReq::prepare() return false; } - CProcess * openerProcess; - CProcess * openedProcess; + CProcess * openerProcess = NULL; + CProcess * openedProcess = NULL; // Get process object for opener process if ( msg_->u.request.u.open.process_name[0] ) @@ -263,9 +263,12 @@ bool CExtOpenReq::prepare() // Get process object for process to open if ( msg_->u.request.u.open.target_process_name[0] ) { // find by name (check node state, don't check process state, backup is NOT Ok) - openedProcess = Nodes->GetProcess( msg_->u.request.u.open.target_process_name - , msg_->u.request.u.open.target_verifier - , true, false, false ); + if (msg_->u.request.u.open.target_process_name[0] == '$' ) + { + openedProcess = Nodes->GetProcess( msg_->u.request.u.open.target_process_name + , msg_->u.request.u.open.target_verifier + , true, false, false ); + } } else { // find by pid (check node state, don't check process state, backup is Ok) @@ -291,8 +294,11 @@ bool CExtOpenReq::prepare() , method_name, __LINE__ , target_process_name.c_str() , target_verifier ); - openedProcess = Nodes->CloneProcessNs( target_process_name.c_str() - , target_verifier ); + if (msg_->u.request.u.open.target_process_name[0] == '$' ) + { + openedProcess = Nodes->CloneProcessNs( target_process_name.c_str() + , target_verifier ); + } } else { // Name Server find by nid,pid:verifier http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/reqprocinfo.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqprocinfo.cxx 
b/core/sqf/monitor/linux/reqprocinfo.cxx index d3f04e2..c49a877 100644 --- a/core/sqf/monitor/linux/reqprocinfo.cxx +++ b/core/sqf/monitor/linux/reqprocinfo.cxx @@ -417,9 +417,6 @@ void CExtProcInfoReq::performRequest() requester = Nodes->GetProcess( nid_ , pid_ , verifier_ , false, false, true ); -// CLNode *lnode = Nodes->GetLNode( nid_ ); -// CNode *node = lnode->GetNode(); -// requester = node->GetProcess( pid_, verifier_ ); #else requester = MyNode->GetProcess( pid_ , verifier_ ); @@ -483,12 +480,16 @@ void CExtProcInfoReq::performRequest() , false, false , target_verifier == -1 ? false : true ); #else + CProcess *process = NULL; // find by name (check node state, don't check process state, // if verifier is -1, backup is NOT Ok, else is Ok) - CProcess *process = Nodes->GetProcess( target_process_name.c_str() - , target_verifier - , true, false - , target_verifier == -1 ? false : true ); + if (msg_->u.request.u.process_info.target_process_name[0] == '$' ) + { + process = Nodes->GetProcess( target_process_name.c_str() + , target_verifier + , true, false + , target_verifier == -1 ? 
false : true ); + } #endif if (process) { http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/reqqueue.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqqueue.cxx b/core/sqf/monitor/linux/reqqueue.cxx index 2238d71..3d425f2 100644 --- a/core/sqf/monitor/linux/reqqueue.cxx +++ b/core/sqf/monitor/linux/reqqueue.cxx @@ -64,6 +64,7 @@ extern char *ErrorMsg (int error_code); extern CRedirector Redirector; extern bool NameServerEnabled; extern CPtpClient *PtpClient; +extern CProcess *NameServerProcess; extern CNameServer *NameServer; extern CNameServerConfigContainer *NameServerConfig; #endif @@ -1578,15 +1579,18 @@ void CIntNewProcReq::performRequest() { if (NameServerEnabled) { - if (trace_settings & TRACE_REQUEST) - trace_printf( "%s@%d" " - Getting parent process from Name Server (%d,%d:%d)\n" - , method_name, __LINE__ - , parentNid_ - , parentPid_ - , parentVerifier_ ); - parentProcess = Nodes->CloneProcessNs( parentNid_ - , parentPid_ - , parentVerifier_ ); + if (parentNid_ != -1 && parentPid_ != -1) + { + if (trace_settings & TRACE_REQUEST) + trace_printf( "%s@%d" " - Getting parent process from Name Server (%d,%d:%d)\n" + , method_name, __LINE__ + , parentNid_ + , parentPid_ + , parentVerifier_ ); + parentProcess = Nodes->CloneProcessNs( parentNid_ + , parentPid_ + , parentVerifier_ ); + } } } } @@ -2598,7 +2602,7 @@ void CIntChildDeathReq::performRequest() , process_->GetVerifier() ); } #ifndef NAMESERVER_PROCESS - if ( NameServerEnabled ) + if ( NameServerEnabled && process_ != NameServerProcess) { int rc = NameServer->ProcessDelete(process_); // in reqQueue thread (CIntChildDeathReq) if (rc) @@ -2713,9 +2717,11 @@ void CIntShutdownReq::performRequest() else { // Stop all processes - Monitor->HardNodeDown( MyPNID ); #ifndef NAMESERVER_PROCESS + Monitor->HardNodeDown( MyPNID ); MyNode->EmptyQuiescingPids(); +#else + Monitor->HardNodeDownNs( MyPNID ); #endif // now 
stop the Watchdog process HealthCheck.setState(MON_NODE_DOWN); @@ -3261,7 +3267,11 @@ void CIntDownReq::performRequest() if (trace_settings & (TRACE_SYNC | TRACE_REQUEST)) trace_printf("%s@%d - Node down request, pnid=%d\n", method_name, __LINE__, pnid_); +#ifndef NAMESERVER_PROCESS Monitor->HardNodeDown( pnid_ ); +#else + Monitor->HardNodeDownNs( pnid_ ); +#endif TRACE_EXIT; } @@ -4063,7 +4073,11 @@ void CPostQuiesceReq::performRequest() else { // Stop all processes +#ifndef NAMESERVER_PROCESS Monitor->HardNodeDown( MyPNID ); +#else + Monitor->HardNodeDownNs( MyPNID ); +#endif #ifndef NAMESERVER_PROCESS MyNode->EmptyQuiescingPids(); #endif @@ -4241,6 +4255,11 @@ CExternalReq *CReqQueue::prepExternalReq(CExternalReq::reqQueueMsg_t msgType, request->setConcurrent(reqConcurrent[msg->u.request.type]); break; + case ReqType_NodeDown: + request = new CExtNodeDownNsReq(msgType, pid, sockFd, msg); + request->setConcurrent(reqConcurrent[msg->u.request.type]); + break; + case ReqType_NewProcessNs: request = new CExtNewProcNsReq(msgType, nid, pid, sockFd, msg); request->setConcurrent(reqConcurrent[msg->u.request.type]); @@ -5376,7 +5395,7 @@ CRequest* CReqQueue::getRequest() } } - if (!request->isShutdown()) + if (request && !request->isShutdown()) { // Take request out of list reqQueue_.erase (it); http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/reqqueue.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqqueue.h b/core/sqf/monitor/linux/reqqueue.h index 2f6f030..b600a0f 100644 --- a/core/sqf/monitor/linux/reqqueue.h +++ b/core/sqf/monitor/linux/reqqueue.h @@ -456,6 +456,23 @@ private: }; #endif +#ifdef NAMESERVER_PROCESS +class CExtNodeDownNsReq: public CExternalReq +{ +public: + CExtNodeDownNsReq( reqQueueMsg_t msgType + , int pid + , int sockFd + , struct message_def *msg ); + virtual ~CExtNodeDownNsReq(); + + void performRequest(); + +private: + void populateRequestString( 
void ); +}; +#endif + #ifndef NAMESERVER_PROCESS class CExtNameServerAddReq: public CExternalReq { @@ -1801,6 +1818,7 @@ private: RQEI CExtNewProcReq RqEB CExtNewProcessNsReq RQEJ CExtNodeDownReq + RqEJ CExtNodeDownNsReq RQEK CExtNodeInfoReq RQEK CExtPNodeInfoReq RQEL CExtNodeUpReq @@ -1816,6 +1834,7 @@ private: RQEP CExtProcInfoContReq RQEQ CExtSetReq RQER CExtShutdownReq + RqER CExtShutdownNsReq RQES CExtStartupReq RQET CExtTmLeaderReq RQEV CExtTmSyncReq http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/shell.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/shell.cxx b/core/sqf/monitor/linux/shell.cxx index 6c5f14b..0da8c32 100644 --- a/core/sqf/monitor/linux/shell.cxx +++ b/core/sqf/monitor/linux/shell.cxx @@ -80,7 +80,7 @@ char *MyName; char LDpath[MAX_SEARCH_PATH]; char Path[MAX_SEARCH_PATH]; char Wdir[MAX_SEARCH_PATH]; -char prompt[13]; +char prompt[MAX_PROCESS_NAME]; int VirtualNodes = 0; int VirtualNid = -1; int NumNodes = 0; @@ -394,51 +394,41 @@ bool check_environment( void ) { bool rs = true; bool isNameServerEnabled = false; + bool isAgentModeEnabled = false; char* env; char msgString[MAX_BUFFER] = { 0 }; int val = 0; - env = getenv("MONITOR_COMM_PORT"); - if ( env ) + env = getenv("SQ_MON_RUN_MODE"); + if ( env && (strcmp(env, "AGENT") == 0) ) { - val = atoi(env); - if ( val <= 0) + isAgentModeEnabled = true; + } + + if (isAgentModeEnabled) + { + env = getenv("MONITOR_COMM_PORT"); + if ( env ) { - if (VirtualNodes) + val = atoi(env); + if ( val <= 0) { sprintf( msgString, "[%s] Warning: MONITOR_COMM_PORT value is invalid (%s)!", MyName, env ); write_startup_log( msgString ); printf("%s\n", msgString ); } - else - { - sprintf( msgString, "[%s] Error: MONITOR_COMM_PORT value is invalid (%s)! 
Set MONITOR_COMM_PORT environment variable and try again.", MyName, env ); - write_startup_log( msgString ); - printf("%s\n", msgString ); - rs = false; - } } - } - - env = getenv("MONITOR_SYNC_PORT"); - if ( env ) - { - val = atoi(env); - if ( val <= 0) + + env = getenv("MONITOR_SYNC_PORT"); + if ( env ) { - if (VirtualNodes) + val = atoi(env); + if ( val <= 0) { sprintf( msgString, "[%s] Warning: MONITOR_SYNC_PORT value is invalid (%s)!", MyName, env ); write_startup_log( msgString ); printf("%s\n", msgString ); } - else - { - sprintf( msgString, "[%s] Error: MONITOR_SYNC_PORT value is invalid (%s)! Set MONITOR_COMM_PORT environment variable and try again.", MyName, env ); - write_startup_log( msgString ); - printf("%s\n", msgString ); - rs = false; - } } } @@ -446,10 +436,7 @@ bool check_environment( void ) if ( env ) { val = atoi(env); - if ( val > 0) - { - isNameServerEnabled = (val != 0); - } + isNameServerEnabled = (val != 0) ? true : false; } if (isNameServerEnabled) @@ -2391,8 +2378,8 @@ void get_proc_info( int nid { if (displayHeader) { - printf("[%s] NID,PID(os) PRI TYPE STATES NAME PARENT PROGRAM\n",MyName); - printf("[%s] ------------ --- ---- ------- ----------- ----------- ---------------\n",MyName); + printf("[%s] NID,PID(os) PRI TYPE STATES NAME PARENT PROGRAM\n",MyName); + printf("[%s] ------------ --- ---- ------- ------------ ------------ ---------------\n",MyName); } show_proc_info(); @@ -5257,7 +5244,7 @@ void show_proc_info( void ) msg->u.reply.u.process_info.process[i].type = ProcessType_Undefined; } - printf("%3.3d %-4s %c%c%c%c%c%c%c %-11s %-11s %-15s\n", + printf("%3.3d %-4s %c%c%c%c%c%c%c %-12s %-12s %-15s\n", msg->u.reply.u.process_info.process[i].priority, processTypeStr[msg->u.reply.u.process_info.process[i].type], (msg->u.reply.u.process_info.process[i].event_messages?'E':'-'), @@ -9572,14 +9559,8 @@ int main (int argc, char *argv[]) env = getenv("SQ_NAMESERVER_ENABLED"); if ( env && isdigit(*env) ) { - if ( strcmp(env,"0") == 0 ) - 
{ - NameServerEnabled = false; - } - else - { - NameServerEnabled = true; - } + int val = atoi(env); + NameServerEnabled = (val != 0) ? true : false; } if ( !VirtualNodes ) http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/tmsync.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/tmsync.cxx b/core/sqf/monitor/linux/tmsync.cxx index 548ae81..b56c5f8 100644 --- a/core/sqf/monitor/linux/tmsync.cxx +++ b/core/sqf/monitor/linux/tmsync.cxx @@ -1012,7 +1012,8 @@ void CTmSync_Container::SendUnsolicitedMessages (void) } if (NameServerEnabled) { - if (!MyNode->IsMyNode( tm->GetNid() )) + if (!MyNode->IsMyNode( tm->GetNid() ) + && (req->GetNext() && req->GetNext()->Nid != tm->GetNid() ) ) { if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) { @@ -1024,6 +1025,7 @@ void CTmSync_Container::SendUnsolicitedMessages (void) , tm->GetVerifier() ); } Nodes->DeleteCloneProcess( tm ); + tm = NULL; } } http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/zclient.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/zclient.cxx b/core/sqf/monitor/linux/zclient.cxx index f9bd698..19a7679 100644 --- a/core/sqf/monitor/linux/zclient.cxx +++ b/core/sqf/monitor/linux/zclient.cxx @@ -506,7 +506,7 @@ const char* CZClient::WaitForAndReturnMaster( bool doWait ) string masterMonitor( ss.str( ) ); // wait for 3 minutes for giving up. 
- while ( (!found) && (retries < 180)) + while ( (GetState() != ZC_SHUTDOWN) && (!found) && (retries < 180)) { if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) { http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/sqenvcom.sh ---------------------------------------------------------------------- diff --git a/core/sqf/sqenvcom.sh b/core/sqf/sqenvcom.sh index cd2ea37..0bac47c 100644 --- a/core/sqf/sqenvcom.sh +++ b/core/sqf/sqenvcom.sh @@ -681,11 +681,14 @@ export SQ_STARTUP=r # (meaning that mpirun is the parent process of the monitor process) # AGENT - monitor process runs in agent mode versus MPI collective # -# Uncomment the next four environment variables -#export SQ_MON_CREATOR=MPIRUN -#export SQ_MON_RUN_MODE=AGENT -#export MONITOR_COMM_PORT=23390 -#export MONITOR_SYNC_PORT=23380 +# Uncomment the next environment variable +export SQ_MON_CREATOR=MPIRUN +if [[ "$SQ_MON_CREATOR" == "MPIRUN" ]]; then + export SQ_MON_RUN_MODE=${SQ_MON_RUN_MODE:-AGENT} + export MONITOR_COMM_PORT=${MONITOR_COMM_PORT:-23390} + export MONITOR_SYNC_PORT=${MONITOR_SYNC_PORT:-23380} + export TRAF_SCALING_FACTOR=${TRAF_SCALING_FACTOR:-0.75} +fi # # NAME-SERVER - to disable process replication and enable the name-server @@ -743,6 +746,11 @@ fi # set to 0 to disable phandle verifier export SQ_PHANDLE_VERIFIER=1 +# set to 0 to disable process name long format in clusters larger than 256 nodes +#export SQ_MON_PROCESS_NAME_FORMAT_LONG=0 +# short format: '$Zxxpppp' xx = nid, pppp = pid +# long format: '$Zxxxxpppppp' xxxx = nid, pppppp = pid (default) + # set to 0 to disable or 1 to enable configuration of DTM as a persistent process # must re-execute 'sqgen' to effect change export SQ_DTM_PERSISTENT_PROCESS=1 http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/sql/scripts/gomon.cold ---------------------------------------------------------------------- diff --git a/core/sqf/sql/scripts/gomon.cold b/core/sqf/sql/scripts/gomon.cold index 6055490..f963e29 
100755 --- a/core/sqf/sql/scripts/gomon.cold +++ b/core/sqf/sql/scripts/gomon.cold @@ -90,15 +90,24 @@ eof echo `date`" - Continuing with Startup ..." echo fi +fi + +if ( + [[ $TRAF_AGENT == "CM" ]] || + [[ $SQ_MON_RUN_MODE == "AGENT" ]] + ) +then + export TRAF_SCALING_FACTOR=${TRAF_SCALING_FACTOR:-0.75} # Set the number of nodes configured let node_count=`trafconf -nid-count` + #echo "***" #echo "*** node_count = ${node_count}" - #echo "*** TRAF_SCALING_FACTOR = $TRAF_SCALING_FACTOR" + #echo "*** TRAF_SCALING_FACTOR = ${TRAF_SCALING_FACTOR}" # allow time for other nodes to integrate, scaled to cluster size # scaling factor may be non-integer, so use awk to evaluate - start_delay=$( echo "${node_count} $TRAF_SCALING_FACTOR" | awk '{print $1 * $2}') + start_delay=$( echo "${node_count} ${TRAF_SCALING_FACTOR}" | awk '{print $1 * $2}') echo "***" echo "***" %`date`" - Waiting ${start_delay} seconds for Monitor processes to integrate" echo "***"