More monitor p2p fixes to address Seabed test failures in virtual and real cluster testing.
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/251f3f57 Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/251f3f57 Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/251f3f57 Branch: refs/heads/master Commit: 251f3f57d62915b14ad365af85afe29083940076 Parents: 7d6c0dd Author: Zalo Correa <[email protected]> Authored: Tue Apr 10 17:54:38 2018 -0700 Committer: Zalo Correa <[email protected]> Committed: Tue Apr 10 17:54:38 2018 -0700 ---------------------------------------------------------------------- core/sqf/monitor/linux/cluster.cxx | 10 ++ core/sqf/monitor/linux/internal.h | 14 +++ core/sqf/monitor/linux/monitor.cxx | 13 ++ core/sqf/monitor/linux/msgdef.h | 2 + core/sqf/monitor/linux/nameserver.cxx | 11 +- core/sqf/monitor/linux/nscommacceptmon.cxx | 2 + core/sqf/monitor/linux/nsreqdelproc.cxx | 46 ++++--- core/sqf/monitor/linux/nsreqnewproc.cxx | 2 +- core/sqf/monitor/linux/pnode.h | 3 + core/sqf/monitor/linux/process.cxx | 97 +++++++++++---- core/sqf/monitor/linux/process.h | 2 +- core/sqf/monitor/linux/replicate.cxx | 76 ++++++++++- core/sqf/monitor/linux/replicate.h | 26 ++++ core/sqf/monitor/linux/reqqueue.cxx | 159 ++++++++++++++++++++++-- core/sqf/monitor/linux/reqqueue.h | 30 +++++ core/sqf/monitor/test/monitor.env | 2 +- 16 files changed, 438 insertions(+), 57 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/cluster.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/cluster.cxx b/core/sqf/monitor/linux/cluster.cxx index aa3062c..514771d 100644 --- a/core/sqf/monitor/linux/cluster.cxx +++ b/core/sqf/monitor/linux/cluster.cxx @@ -1752,6 +1752,7 @@ int CCluster::SoftNodeUpPrepare( int pnid ) { SMSIntegrating = true; #ifndef NAMESERVER_PROCESS + node->SetSoftNodeUp( ); Monitor->StartPrimitiveProcesses(); #endif // Let other monitors know this node is preparing to soft up @@ -2289,7 +2290,11 @@ void CCluster::HandleOtherNodeMsg (struct internal_msg_def *recv_msg, case InternalType_Exit: if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) trace_printf("%s@%d - Internal exit request for %s (%d, %d)\n", method_name, __LINE__, recv_msg->u.exit.name, recv_msg->u.exit.nid, recv_msg->u.exit.pid); +#ifndef NAMESERVER_PROCESS ReqQueue.enqueueExitReq( &recv_msg->u.exit ); +#else + ReqQueue.enqueueExitNsReq( &recv_msg->u.exit_ns ); +#endif break; #ifndef NAMESERVER_PROCESS @@ -2880,6 +2885,11 @@ void CCluster::HandleMyNodeMsg (struct internal_msg_def *recv_msg, case InternalType_Exit: // Final process exit logic is done in Process_Exit, not here // as in the past. + if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) + trace_printf("%s@%d - Internal exit request for %s (%d, %d)\n", method_name, __LINE__, recv_msg->u.exit_ns.name, recv_msg->u.exit_ns.nid, recv_msg->u.exit_ns.pid); +#ifdef NAMESERVER_PROCESS + ReqQueue.enqueueExitNsReq( &recv_msg->u.exit_ns ); +#endif break; #ifndef NAMESERVER_PROCESS http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/internal.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/internal.h b/core/sqf/monitor/linux/internal.h index 505e1c3..6d28b29 100644 --- a/core/sqf/monitor/linux/internal.h +++ b/core/sqf/monitor/linux/internal.h @@ -92,6 +92,7 @@ typedef enum { State_Default=0, State_Quiesce, // node quiesce state while going down State_SoftDown, // node soft down on DTM abort -> restart + State_SoftUp, // node soft up on DTM restart State_Ready_To_Exit } IntNodeState; @@ -176,6 +177,18 @@ struct exit_def bool abended; }; +struct exit_ns_def +{ + int nid; // Node id of process exiting + int pid; // Process id of process exiting + Verifier_t verifier; // Verifier of the process exiting + char name[MAX_PROCESS_NAME]; // Name of process exiting + bool abended; + int sockFd; // monitor socket fd to reply + int origPNid; // pnid of nameserver which processed request + struct message_def *msg; // requester's request buffer to reply +}; + struct event_def { int nid; // Nid id of process to receive event @@ -433,6 +446,7 @@ struct internal_msg_def struct down_def down; struct dump_def dump; struct exit_def exit; + struct exit_ns_def exit_ns; struct event_def event; ioData_t iodata; struct kill_def kill; http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/monitor.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/monitor.cxx b/core/sqf/monitor/linux/monitor.cxx index d97a3f0..578d158 100755 --- a/core/sqf/monitor/linux/monitor.cxx +++ b/core/sqf/monitor/linux/monitor.cxx @@ -1791,7 +1791,20 @@ int main (int argc, char *argv[]) #endif if (monitorPort) { +#ifdef NAMESERVER_PROCESS + if ( IsRealCluster ) + { + strcpy( IntegratingMonitorPort, MasterMonitorName); + } + else + { + char localHost[MAX_PROCESSOR_NAME]; + gethostname( localHost, MAX_PROCESSOR_NAME ); + strcpy( IntegratingMonitorPort, localHost); + } +#else strcpy( IntegratingMonitorPort, MasterMonitorName); +#endif strcat( IntegratingMonitorPort, ":"); strcat( IntegratingMonitorPort, monitorPort); } http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/msgdef.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/msgdef.h b/core/sqf/monitor/linux/msgdef.h index 98f11c8..4b34557 100644 --- a/core/sqf/monitor/linux/msgdef.h +++ b/core/sqf/monitor/linux/msgdef.h @@ -332,6 +332,7 @@ typedef enum { // types, add any new message types // before this one } MSGTYPE; + typedef TcProcessType_t PROCESSTYPE; typedef enum { @@ -369,6 +370,7 @@ struct DelProcessNs_def int target_pid; // Process id of process to delete Verifier_t target_verifier; // Process verifier of processes to delete char target_process_name[MAX_PROCESS_NAME]; // Name of process to delete + bool target_abended; // True if process aborted }; struct DelProcessNs_reply_def http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/nameserver.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/nameserver.cxx b/core/sqf/monitor/linux/nameserver.cxx index 1815409..93f99b3 100644 --- a/core/sqf/monitor/linux/nameserver.cxx +++ b/core/sqf/monitor/linux/nameserver.cxx @@ -513,6 +513,7 @@ int CNameServer::ProcessDelete(CProcess* process ) msgdel->target_pid = msgdel->pid; msgdel->target_verifier = msgdel->verifier; strcpy( msgdel->target_process_name, msgdel->process_name ); + msgdel->target_abended = process->IsAbended(); int error = SendReceive(&msg ); @@ -636,13 +637,14 @@ int CNameServer::ProcessShutdown( void ) int CNameServer::SendReceive( struct message_def* msg ) { const char method_name[] = "CNameServer::SendReceive"; - char desc[100]; + char desc[256]; char* descp; struct DelProcessNs_def *msgdel; struct NewProcessNs_def *msgnew; struct ShutdownNs_def *msgshutdown; struct NameServerStart_def *msgstart; struct NameServerStop_def *msgstop; + struct ProcessInfo_def *msginfo; TRACE_ENTRY; @@ -675,7 +677,12 @@ int CNameServer::SendReceive( struct message_def* msg ) size += sizeof(msg->u.request.u.new_process_ns); break; case ReqType_ProcessInfo: - descp = (char *) "process-info"; + msginfo = &msg->u.request.u.process_info; + sprintf( desc, "process-info (nid=%d, pid=%d, verifier=%d, name=%s)\n" + "\ttarget (nid=%d, pid=%d, verifier=%d, name=%s, type=%d)\n", + msginfo->nid, msginfo->pid, msginfo->verifier, msginfo->process_name, + msginfo->target_nid, msginfo->target_pid, msginfo->target_verifier, + msginfo->target_process_name, msginfo->type ); size += sizeof(msg->u.request.u.process_info); break; case ReqType_ProcessInfoCont: http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/nscommacceptmon.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/nscommacceptmon.cxx b/core/sqf/monitor/linux/nscommacceptmon.cxx index 763d324..3482608 100644 --- a/core/sqf/monitor/linux/nscommacceptmon.cxx +++ b/core/sqf/monitor/linux/nscommacceptmon.cxx @@ -85,6 +85,7 @@ void CCommAcceptMon::monReqDeleteProcess( struct message_def* msg, int sockFd ) " msg.del_process_ns.target_pid=%d\n" " msg.del_process_ns.target_verifier=%d\n" " msg.del_process_ns.target_process_name=%s\n" + " msg.del_process_ns.target_abended=%d\n" , method_name, __LINE__ , msg->u.request.u.del_process_ns.nid , msg->u.request.u.del_process_ns.pid @@ -94,6 +95,7 @@ void CCommAcceptMon::monReqDeleteProcess( struct message_def* msg, int sockFd ) , msg->u.request.u.del_process_ns.target_pid , msg->u.request.u.del_process_ns.target_verifier , msg->u.request.u.del_process_ns.target_process_name + , msg->u.request.u.del_process_ns.target_abended ); } http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/nsreqdelproc.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/nsreqdelproc.cxx b/core/sqf/monitor/linux/nsreqdelproc.cxx index 8de0c6a..05585cf 100644 --- a/core/sqf/monitor/linux/nsreqdelproc.cxx +++ b/core/sqf/monitor/linux/nsreqdelproc.cxx @@ -56,7 +56,7 @@ CExtDelProcessNsReq::~CExtDelProcessNsReq() void CExtDelProcessNsReq::populateRequestString( void ) { - char strBuf[MON_STRING_BUF_SIZE/2] = { 0 }; + char strBuf[MON_STRING_BUF_SIZE] = { 0 }; snprintf( strBuf, sizeof(strBuf), "ExtReq(%s) req #=%ld " @@ -93,7 +93,7 @@ void CExtDelProcessNsReq::performRequest() if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) { trace_printf( "%s@%d request #%ld: Delete, requester %s (%d, %d:%d), " - "target %s (%d, %d:%d)\n", + "target %s (%d, %d:%d), abended=%d\n", method_name, __LINE__, id_, msg_->u.request.u.del_process_ns.process_name, msg_->u.request.u.del_process_ns.nid, @@ -102,13 +102,15 @@ void CExtDelProcessNsReq::performRequest() msg_->u.request.u.del_process_ns.target_process_name, msg_->u.request.u.del_process_ns.target_nid, msg_->u.request.u.del_process_ns.target_pid, - msg_->u.request.u.del_process_ns.target_verifier); + msg_->u.request.u.del_process_ns.target_verifier, + msg_->u.request.u.del_process_ns.target_abended); } nid_ = msg_->u.request.u.del_process_ns.nid; verifier_ = msg_->u.request.u.del_process_ns.verifier; processName_ = msg_->u.request.u.del_process_ns.process_name; + bool target_abended = msg_->u.request.u.del_process_ns.target_abended; int target_nid = -1; int target_pid = -1; string target_process_name; @@ -128,18 +130,28 @@ void CExtDelProcessNsReq::performRequest() if (process) { CNode * node = Nodes->GetLNode (process->GetNid())->GetNode(); - node->DelFromNameMap ( process ); - node->DelFromPidMap ( process ); - process->SetState (State_Stopped); - // Replicate the exit to other nodes - CReplExit *repl = new CReplExit(process->GetNid(), - process->GetPid(), - process->GetVerifier(), - process->GetName(), - process->IsAbended()); - Replicator.addItem(repl); - process->SetDeletePending ( true ); - node->DeleteFromList( process ); + + // Note: process object is deletes by Exit_Process, so use + // target_* values to replicate + node->Exit_Process( process, target_abended, -1 ); + // Replicate the exit to other name servers + CReplExitNs *repl = new CReplExitNs(target_nid, + target_pid, + target_verifier, + target_process_name.c_str(), + target_abended, + msg_, + sockFd_, + MyPNID ); + if (repl) + { + // we will not reply at this time ... but wait for + // exit request to be processed in CIntExitNsReq + + msg_->noreply = true; + + Replicator.addItem(repl); + } msg_->u.reply.type = ReplyType_DelProcessNs; msg_->u.reply.u.del_process_ns.nid = msg_->u.request.u.del_process_ns.target_nid; @@ -172,10 +184,8 @@ void CExtDelProcessNsReq::performRequest() msg_->u.reply.u.del_process_ns.return_code = MPI_ERR_NAME; if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) trace_printf("%s@%d - unsuccessful\n", method_name, __LINE__); + monreply(msg_, sockFd_); } - // Send reply to requester - monreply(msg_, sockFd_); - TRACE_EXIT; } http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/nsreqnewproc.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/nsreqnewproc.cxx b/core/sqf/monitor/linux/nsreqnewproc.cxx index 682823b..2231fc0 100644 --- a/core/sqf/monitor/linux/nsreqnewproc.cxx +++ b/core/sqf/monitor/linux/nsreqnewproc.cxx @@ -148,7 +148,7 @@ void CExtNewProcNsReq::performRequest() if (repl) { // we will not reply at this time ... but wait for - // node add to be processed in CIntNodeAddReq + // new process request to be processed in CIntCloneProcNsReq // Retain reference to requester's request buffer so can // send completion message. http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/pnode.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/pnode.h b/core/sqf/monitor/linux/pnode.h index eb4829e..b13fbef 100644 --- a/core/sqf/monitor/linux/pnode.h +++ b/core/sqf/monitor/linux/pnode.h @@ -270,12 +270,14 @@ public: inline bool IsRankFailure( void ) { return( rankFailure_ ); } inline bool IsSpareNode( void ) { return( spareNode_ ); } inline bool IsSoftNodeDown( void ) { return( internalState_ == State_SoftDown ); } + inline bool IsSoftNodeUp( void ) { return( internalState_ == State_SoftUp ); } CNode *Link( CNode *entry ); void MoveLNodes( CNode *targetNode ); inline void ResetSpareNode( void ) { spareNode_ = false; } void ResetWatchdogTimer( void ); inline void ResetSoftNodeDown( void ) { internalState_ = State_Default; } + inline void ResetSoftNodeUp( void ) { internalState_ = State_Default; } inline void SetActivatingSpare( int activatingSpare ) { activatingSpare_ = activatingSpare; } void SetAffinity( int nid, pid_t pid, PROCESSTYPE type ); void SetAffinity( CProcess *process ); @@ -306,6 +308,7 @@ public: inline void SetNumCores( int numCores ) { numCores_ = numCores; } inline void SetPhase( NodePhase phase ) { phase_ = phase; } inline void SetSoftNodeDown( void ) { internalState_ = State_SoftDown; } + inline void SetSoftNodeUp( void ) { internalState_ = State_SoftUp; } inline void SetSparePNids( PNidVector &sparePNids ) { sparePNids_ = sparePNids; } inline void SetRank( int rank ) { rank_ = rank; } inline void SetRankFailure( bool failed ) { rankFailure_ = failed; http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/process.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/process.cxx b/core/sqf/monitor/linux/process.cxx index 34ec848..fce323c 100644 --- a/core/sqf/monitor/linux/process.cxx +++ b/core/sqf/monitor/linux/process.cxx @@ -2915,6 +2915,7 @@ struct message_def * CProcess::DeathMessage( ) } #endif +#ifndef NAMESERVER_PROCESS void CProcess::Exit( CProcess *parent ) { char la_buf[MON_STRING_BUF_SIZE]; @@ -2922,12 +2923,10 @@ void CProcess::Exit( CProcess *parent ) const char method_name[] = "CProcess::Exit"; TRACE_ENTRY; -#ifndef NAMESERVER_PROCESS if ( DumpState != Dump_Ready ) { DumpEnd( Dump_Failed, (char *)corefile_.c_str() ); } -#endif if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) trace_printf( "%s@%d" " - Process %s (%d,%d:%d) is exiting, parent process %s (%d,%d:%d)\n" @@ -2952,7 +2951,6 @@ void CProcess::Exit( CProcess *parent ) : !node->IsKillingNode(); } -#ifndef NAMESERVER_PROCESS if( NoticeHead && !MyNode->IsKillingNode() && !(Type == ProcessType_DTM && IsAbended()) && @@ -2966,7 +2964,6 @@ void CProcess::Exit( CProcess *parent ) // Notify all local registered processes of this process' death NoticeHead->NotifyAll(); } -#endif if ( !Clone && !Paired ) { @@ -2974,14 +2971,11 @@ void CProcess::Exit( CProcess *parent ) { case ProcessType_TSE: case ProcessType_ASE: -#ifndef NAMESERVER_PROCESS MyNode->delFromQuiesceExitPids( GetPid(), GetVerifier() ); -#endif if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL | TRACE_REQUEST_DETAIL)) trace_printf("%s%d: pid %d deleted from quiesce exit list\n", method_name, __LINE__, GetPid()); -#ifndef NAMESERVER_PROCESS if (MyNode->isInQuiesceState()) { if (MyNode->isQuiesceExitPidsEmpty()) @@ -2989,12 +2983,9 @@ void CProcess::Exit( CProcess *parent ) HealthCheck.setState(MON_SCHED_NODE_DOWN); // schedule a node down req } } -#endif else { // unmount volumes only if node is not quiescing. -#ifndef NAMESERVER_PROCESS Devices->UnMountVolume( Name, Backup ); -#endif } break; case ProcessType_DTM: @@ -3022,7 +3013,6 @@ void CProcess::Exit( CProcess *parent ) } else { -#ifndef NAMESERVER_PROCESS if ( Monitor->GetTmLeader() == MyPNID ) { // set the clean shutdown condition @@ -3032,7 +3022,6 @@ void CProcess::Exit( CProcess *parent ) strcpy(value,"True"); Config->GetClusterGroup()->Set( key, value ); } -#endif } } break; @@ -3100,12 +3089,10 @@ void CProcess::Exit( CProcess *parent ) trace_printf("%s@%d: Queueing death notice for SSMP process for %s (%d, %d:%d)\n", method_name, __LINE__, Name, Nid, Pid, Verifier); -#ifndef NAMESERVER_PROCESS ssmpProcess->ssmpNoticesLock_.lock(); ssmpProcess->ssmpNotices_.push_back( DeathMessage() ); ssmpProcess->ssmpNoticesLock_.unlock(); SQ_theLocalIOToClient->nudgeNotifier (); -#endif } else { @@ -3146,15 +3133,14 @@ void CProcess::Exit( CProcess *parent ) // Check if we need to output a entry into the process id map log file if ( PidMap ) { -#ifndef NAMESERVER_PROCESS Monitor->writeProcessMapEnd( Name, Nid, Pid, Verifier, parent ? parent->GetNid() : -1, parent ? parent->GetPid() : -1, parent ? parent->GetVerifier() : -1, program() ); -#endif } } + if ( Clone && Pid != -1 ) { if ( Type == ProcessType_SPX && @@ -3168,12 +3154,10 @@ void CProcess::Exit( CProcess *parent ) CProcess *spxProcess = lnode->GetProcessLByType( ProcessType_SPX ); if ( spxProcess && MyNode->GetState() == State_Up ) { -#ifndef NAMESERVER_PROCESS SQ_theLocalIOToClient->putOnNoticeQueue( spxProcess->Pid , spxProcess->Verifier , DeathMessage() , NULL); -#endif if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) trace_printf( "%s@%d" " - Sending death message of %s (%d,%d:%d) to %s (%d,%d:%d)\n" @@ -3209,12 +3193,10 @@ void CProcess::Exit( CProcess *parent ) CProcess *tmProcess = lnode->GetProcessLByType( ProcessType_DTM ); if ( tmProcess && MyNode->GetState() == State_Up ) { -#ifndef NAMESERVER_PROCESS SQ_theLocalIOToClient->putOnNoticeQueue( tmProcess->Pid , tmProcess->Verifier , DeathMessage() , NULL); -#endif if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) trace_printf( "%s@%d" " - Sending death message of %s (%d,%d:%d) to %s (%d,%d:%d)\n" @@ -3246,12 +3228,10 @@ void CProcess::Exit( CProcess *parent ) parent->GetType() == ProcessType_DTM) && supplyProcessDeathNotices ) { -#ifndef NAMESERVER_PROCESS SQ_theLocalIOToClient->putOnNoticeQueue( parent->Pid , parent->Verifier , DeathMessage() , NULL); -#endif if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) trace_printf( "%s@%d" " - Sending death message of %s (%d,%d:%d) to %s (%d,%d:%d) \n" @@ -3267,7 +3247,6 @@ void CProcess::Exit( CProcess *parent ) } } -#ifndef NAMESERVER_PROCESS if (NameServerEnabled) { if ( parent ) @@ -3304,10 +3283,10 @@ void CProcess::Exit( CProcess *parent ) } procExitNotifierNodes(); } -#endif TRACE_EXIT; } +#endif #ifndef NAMESERVER_PROCESS void CProcess::GenerateEvent( int event_id, int length, char *data ) @@ -5371,6 +5350,72 @@ void CProcessContainer::Exit_Process (CProcess *process, bool abend, int downNod } #endif +#ifdef NAMESERVER_PROCESS +void CProcessContainer::Exit_Process (CProcess *process, bool abend, int downNode) +{ + const char method_name[] = "CProcessContainer::Exit_Process(process)"; + TRACE_ENTRY; + + char la_buf[MON_STRING_BUF_SIZE]; + CProcess *parent = NULL; + + if (process) + { + if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) + trace_printf( "%s@%d - Process %s (abended=%d) is exiting, abend=%d, downNode=%d\n" + , method_name, __LINE__ + , process->GetName() + , process->IsAbended() + , abend + , downNode ); + + if ( process->GetState() == State_Down && abend && !process->IsAbended() ) + { + process->SetAbended( abend ); + } + if (process->GetNid() == downNode && !process->IsAbended() ) + { + process->SetAbended( abend ); + } + + if ( numProcs_ <= 0 ) + { + snprintf(la_buf, sizeof(la_buf), + "[%s], Node's process count is invalid, aborting\n", + method_name); + mon_log_write(MON_PROCESSCONT_EXITPROCESS_1, SQ_LOG_ERR, la_buf); + abort(); + } + + if ( process->GetState() == State_Stopped ) + { + if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) + trace_printf("%s@%d" " - Process " "%s" " already exited." "\n", method_name, __LINE__, process->GetName()); + return; + } + + if ( parent == NULL) + { + parent = Nodes->GetProcess( process->GetParentNid(), + process->GetParentPid() ); + } + + // Handle the process termination + process->Switch( parent ); // switch process pair roles if needed + process->SetDeletePending ( true ); + + CNode *node; + node = Nodes->GetLNode(process->GetNid())->GetNode(); + node->DelFromNameMap ( process ); + node->DelFromPidMap ( process ); + node->DeleteFromList( process ); + } + TRACE_EXIT; + + return; +} +#endif + CProcess *CProcessContainer::GetProcess (int pid) { const char method_name[] = "CProcessContainer::GetProcess (pid)"; @@ -5822,7 +5867,7 @@ void CProcessContainer::KillAllDownSoft() } // valid for virtual cluster or soft node down only. - if ( type != ProcessType_DTM ) + if ( type != ProcessType_DTM && type != ProcessType_NameServer ) { // Delete pid map entry DelFromPidMap ( process ); @@ -5859,7 +5904,7 @@ void CProcessContainer::KillAllDownSoft() nextProc = process->GetNext(); PROCESSTYPE type = process->GetType(); - if ( type != ProcessType_DTM ) + if ( type != ProcessType_DTM && type != ProcessType_NameServer ) { // Delete pid map entry DelFromPidMap ( process ); http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/process.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/process.h b/core/sqf/monitor/linux/process.h index 90954f3..f6f22cb 100644 --- a/core/sqf/monitor/linux/process.h +++ b/core/sqf/monitor/linux/process.h @@ -127,9 +127,9 @@ class CProcessContainer ); bool Dump_Process( CProcess *dumper, CProcess *process, char *core_path ); void DumpCallback( int nid, pid_t pid, int status ); + void Exit_Process( CProcess *process, bool abend, int downNode ); #ifndef NAMESERVER_PROCESS static CProcess *ParentNewProcReply ( CProcess *process, int result ); - void Exit_Process( CProcess *process, bool abend, int downNode ); #else static CProcess *MonReply ( CProcess *process, int result ); #endif http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/replicate.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/replicate.cxx b/core/sqf/monitor/linux/replicate.cxx index c22c799..0b6fadb 100644 --- a/core/sqf/monitor/linux/replicate.cxx +++ b/core/sqf/monitor/linux/replicate.cxx @@ -91,7 +91,7 @@ int CReplObj::calcAllocSize() sizeof(CReplExit)), sizeof(CReplKill)), #ifdef NAMESERVER_PROCESS - sizeof(dummy_sizeof_def)), + sizeof(CReplExitNs)), #else sizeof(CReplDevice)), #endif @@ -929,6 +929,80 @@ bool CReplExit::replicate(struct internal_msg_def *&msg) return true; } +CReplExitNs::CReplExitNs( int nid + , int pid + , Verifier_t verifier + , const char *name + , bool abended + , struct message_def *msg + , int sockFd + , int origPNid ) + : nid_(nid) + , pid_(pid) + , verifier_(verifier) + , abended_(abended) + , msg_(msg) + , sockFd_(sockFd) + , origPNid_(origPNid) +{ + // Add eyecatcher sequence as a debugging aid + memcpy(&eyecatcher_, "RPLJ", 4); + + strcpy(name_, name); + + // Compute message size (adjust if needed to conform to + // internal_msg_def structure alignment). + replSize_ = (MSG_HDR_SIZE + sizeof ( exit_ns_def ) + msgAlignment_) + & ~msgAlignment_; + + if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL)) + { + const char method_name[] = "CReplExitNs::CReplExitNs"; + trace_printf( "%s@%d - Queuing replicating process exit %s (%d, %d:%d)," + " abended=%d, msg=%p, sockFd=%d, origPNid=%d\n" + , method_name, __LINE__ + , name_, nid_, pid_, verifier_, abended_ + , msg_, sockFd_, origPNid_ ); + } +} + +CReplExitNs::~CReplExitNs() +{ + // Alter eyecatcher sequence as a debugging aid to identify deleted object + memcpy(&eyecatcher_, "rplj", 4); +} + + +bool CReplExitNs::replicate(struct internal_msg_def *&msg) +{ + const char method_name[] = "CReplExitNs::replicate"; + TRACE_ENTRY; + + if (trace_settings & (TRACE_SYNC | TRACE_PROCESS)) + trace_printf("%s@%d" " - Replicating process exit %s (%d, %d:%d)," + " abended=%d\n", method_name, __LINE__, + name_, nid_, pid_, verifier_, abended_); + + // Build message to replicate this process exit to other nodes + msg->type = InternalType_Exit; + msg->u.exit_ns.nid = nid_; + msg->u.exit_ns.pid = pid_; + msg->u.exit_ns.verifier = verifier_; + strcpy(msg->u.exit_ns.name, name_); + msg->u.exit_ns.abended = abended_; + msg->u.exit_ns.msg = msg_; + msg->u.exit_ns.sockFd = sockFd_; + msg->u.exit_ns.origPNid = origPNid_; + + // Advance sync buffer pointer + Nodes->AddMsg( msg, replSize() ); + + TRACE_EXIT; + + return true; +} + + CReplKill::CReplKill( int nid , int pid , Verifier_t verifier http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/replicate.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/replicate.h b/core/sqf/monitor/linux/replicate.h index 966c69e..5d4a7cc 100644 --- a/core/sqf/monitor/linux/replicate.h +++ b/core/sqf/monitor/linux/replicate.h @@ -210,6 +210,32 @@ private: bool abended_; }; +class CReplExitNs: public CReplObj +{ +public: + CReplExitNs( int nid + , int pid + , Verifier_t verifier + , const char *name + , bool abended + , struct message_def *msg + , int sockFd + , int origPNid ); + virtual ~CReplExitNs(); + + bool replicate(struct internal_msg_def *& msg); + +private: + int nid_; + int pid_; + Verifier_t verifier_; + char name_[MAX_PROCESS_NAME]; + bool abended_; + struct message_def *msg_; + int sockFd_; + int origPNid_; +}; + class CReplKill: public CReplObj { public: http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/reqqueue.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqqueue.cxx b/core/sqf/monitor/linux/reqqueue.cxx index 530b096..6825652 100644 --- a/core/sqf/monitor/linux/reqqueue.cxx +++ b/core/sqf/monitor/linux/reqqueue.cxx @@ -1017,6 +1017,7 @@ void CIntDeviceReq::performRequest() } #endif +#ifndef NAMESERVER_PROCESS CIntExitReq::CIntExitReq( ) : CInternalReq() , nid_(0) @@ -1101,12 +1102,136 @@ void CIntExitReq::performRequest() { lnode->GetNode()->DelFromNameMap ( process ); lnode->GetNode()->DelFromPidMap ( process ); + lnode->GetNode()->Exit_Process (process, abended_, -1); + } + } + else + { + char buf[MON_STRING_BUF_SIZE]; + sprintf(buf, "[%s], Can't find process %s (%d, %d) for processing " + "exit.\n", method_name, name_, nid_, pid_); + mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_5, SQ_LOG_ERR, buf); + } -#ifdef NAMESERVER_PROCESS - lnode->GetNode()->DeleteFromList( process ); + TRACE_EXIT; +} #else - lnode->GetNode()->Exit_Process (process, abended_, -1); -#endif +CIntExitNsReq::CIntExitNsReq( ) + : CInternalReq() + , nid_(0) + , pid_(0) + , verifier_(-1) +{ + // Add eyecatcher sequence as a debugging aid + memcpy(&eyecatcher_, "RQIE", 4); + + name_[0] = '\0'; +} + +CIntExitNsReq::~CIntExitNsReq( ) +{ + // Alter eyecatcher sequence as a debugging aid to identify deleted object + memcpy(&eyecatcher_, "rqie", 4); +} + +void CIntExitNsReq::populateRequestString( void ) +{ + char strBuf[MON_STRING_BUF_SIZE/2]; + sprintf( strBuf, "IntReq(%s) req #=%ld (name=%s/nid=%d/pid=%d/verifier=%d)" + "(msg=%p, sockFd=%d, origPNid=%d)" + , CReqQueue::intReqType[InternalType_Exit] + , getId(), name_, nid_, pid_, verifier_ + , msg_, sockFd_, origPNid_ ); + requestString_.assign( strBuf ); +} + +void CIntExitNsReq::prepRequest( struct exit_ns_def *exitDef ) +{ + const char method_name[] = "CIntExitNsReq::prepRequest"; + TRACE_ENTRY; + + nid_ = exitDef->nid; + pid_ = exitDef->pid; + verifier_ = exitDef->verifier; + strcpy( name_, exitDef->name ); + abended_ = exitDef->abended; + msg_ = exitDef->msg; + sockFd_ = exitDef->sockFd; + origPNid_ = exitDef->origPNid; + + TRACE_EXIT; +} + +void CIntExitNsReq::performRequest() +{ + const char method_name[] = "CIntExitReq::performRequest"; + TRACE_ENTRY; + + CProcess *process = NULL; + CLNode *lnode; + + // Check if this name server is handling monitor request + // from CExtDelProcessNsReq::performRequest() + if (origPNid_ == MyPNID) + { + msg_->noreply = false; + msg_->u.reply.type = ReplyType_DelProcessNs; + msg_->u.reply.u.del_process_ns.nid = nid_; + msg_->u.reply.u.del_process_ns.pid = pid_; + msg_->u.reply.u.del_process_ns.verifier = verifier_; + strncpy(msg_->u.reply.u.del_process_ns.process_name, name_, MAX_PROCESS_NAME); + msg_->u.reply.u.del_process_ns.return_code = MPI_SUCCESS; + + if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) + { + trace_printf("%s@%d - Replying to monitor sockFd_=%d, msg_=%p, deleted %s (%d, %d:%d)\n", + method_name, __LINE__, + sockFd_, + msg_, + name_, + nid_, + pid_, + verifier_ ); + } + + // Send reply to requesting monitor + monreply( msg_, sockFd_ ); + return; + } + + lnode = Nodes->GetLNode( nid_ ); + if ( lnode ) + { + process = lnode->GetNode()->GetProcess( pid_ ); + + if ( ! process ) + { + // Could not locate process by process id. If the exit + // occurred due to an early process termination on another + // node we won't have the process id. Try the look up by + // name instead. + process = lnode->GetNode()->GetProcess( name_, false ); + } + } + + if ( process ) + { + if ( (verifier_ != -1) && (verifier_ != process->GetVerifier()) ) + { + if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) + { + trace_printf("%s@%d - Exit %s (%d, %d:%d) failed -- verifier mismatch (%d)\n", + method_name, __LINE__, + name_, + nid_, + pid_, + verifier_, + process->GetVerifier()); + } + } + else + { + lnode->GetNode()->Exit_Process( process, abended_, -1 ); } } else @@ -1119,6 +1244,7 @@ void CIntExitReq::performRequest() TRACE_EXIT; } +#endif #ifndef NAMESERVER_PROCESS CIntKillReq::CIntKillReq( struct kill_def *killDef ) @@ -3583,10 +3709,17 @@ void CIntCreatePrimitiveReq::performRequest() { startNs = true; } - if ( startNs ) + if ( !MyNode->IsSoftNodeUp() ) + { // Don't restart the name server on a soft node up + if ( startNs ) + { + NameServer->SetLocalHost(); + MyNode->StartNameServerProcess(); + } + } + else { - NameServer->SetLocalHost(); - MyNode->StartNameServerProcess(); + MyNode->ResetSoftNodeUp(); } } MyNode->StartWatchdogProcess(); @@ -4246,6 +4379,7 @@ void CReqQueue::enqueueUpReq( int pnid, char *node_name, int merge_lead ) enqueueReq ( request ); } +#ifndef NAMESERVER_PROCESS void CReqQueue::enqueueExitReq( struct exit_def *exitDef ) { CIntExitReq * request; @@ -4255,6 +4389,17 @@ void CReqQueue::enqueueExitReq( struct exit_def *exitDef ) enqueueReq ( request ); } +#else +void CReqQueue::enqueueExitNsReq( struct exit_ns_def *exitDef ) +{ + CIntExitNsReq * request; + + request = new CIntExitNsReq ( ); + request->prepRequest( exitDef ); + + enqueueReq ( request ); +} +#endif #ifndef NAMESERVER_PROCESS //void CReqQueue::enqueueKillReq( int nid, int pid, bool abort ) http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/reqqueue.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqqueue.h b/core/sqf/monitor/linux/reqqueue.h index 7fa69fd..995912d 100644 --- a/core/sqf/monitor/linux/reqqueue.h +++ b/core/sqf/monitor/linux/reqqueue.h @@ -926,6 +926,7 @@ private: }; #endif +#ifndef NAMESERVER_PROCESS class CIntExitReq: public CInternalReq { public: @@ -944,6 +945,31 @@ private: bool abended_; char name_[MAX_PROCESS_NAME]; }; +#endif + +#ifdef NAMESERVER_PROCESS +class CIntExitNsReq: public CInternalReq +{ +public: + CIntExitNsReq(); + virtual ~CIntExitNsReq(); + + void prepRequest( struct exit_ns_def *exitDef ); + void performRequest(); + +private: + void populateRequestString( void ); + + int nid_; + int pid_; + Verifier_t verifier_; + bool abended_; + char name_[MAX_PROCESS_NAME]; + struct message_def *msg_; + int sockFd_; + int origPNid_; +}; +#endif #ifdef NAMESERVER_PROCESS class CExtDelProcessNsReq: public CExternalReq @@ -1471,7 +1497,11 @@ class CReqQueue #ifndef NAMESERVER_PROCESS void enqueueDeviceReq( char *ldevName ); #endif +#ifndef NAMESERVER_PROCESS void enqueueExitReq( struct exit_def *exitDef ); +#else + void enqueueExitNsReq( struct exit_ns_def *exitDef ); +#endif #ifdef NAMESERVER_PROCESS void enqueueDeleteReq( struct delete_def *deleteDef ); #endif http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/test/monitor.env ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/test/monitor.env b/core/sqf/monitor/test/monitor.env index 303dbf3..52e2341 100644 --- a/core/sqf/monitor/test/monitor.env +++ b/core/sqf/monitor/test/monitor.env @@ -54,7 +54,7 @@ MON_TRACE_NS=1 #MON_TRACE_MLIO=1 #MON_TRACE_REQUEST_DETAIL=1 -#MON_TRACE_PROCESS_DETAIL=1 +MON_TRACE_PROCESS_DETAIL=1 #MON_TRACE_NOTICE_DETAIL=1 #MON_TRACE_SYNC_DETAIL=1 #MON_TRACE_MLIO_DETAIL=1
