More fixes for process management when the Name Server is enabled.
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/65ac5563 Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/65ac5563 Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/65ac5563 Branch: refs/heads/master Commit: 65ac55633c34dc8ee12de5da4c6fbc91cd6b9093 Parents: 8e4f2c7 Author: Zalo Correa <[email protected]> Authored: Fri May 25 14:44:54 2018 -0700 Committer: Zalo Correa <[email protected]> Committed: Fri May 25 14:44:54 2018 -0700 ---------------------------------------------------------------------- .../export/include/common/evl_sqlog_eventnum.h | 34 ++ core/sqf/monitor/linux/cluster.cxx | 11 +- core/sqf/monitor/linux/notice.cxx | 13 +- core/sqf/monitor/linux/nsreqprocinfons.cxx | 6 +- core/sqf/monitor/linux/pnode.cxx | 105 +++-- core/sqf/monitor/linux/pnode.h | 2 +- core/sqf/monitor/linux/process.cxx | 406 +++++++++++++++++-- core/sqf/monitor/linux/process.h | 11 + core/sqf/monitor/linux/reqkill.cxx | 23 +- core/sqf/monitor/linux/reqnewproc.cxx | 21 +- core/sqf/monitor/linux/reqnotify.cxx | 23 +- core/sqf/monitor/linux/reqprocinfo.cxx | 26 +- core/sqf/monitor/linux/reqqueue.cxx | 117 +++--- core/sqf/monitor/linux/tmsync.cxx | 17 + 14 files changed, 679 insertions(+), 136 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/export/include/common/evl_sqlog_eventnum.h ---------------------------------------------------------------------- diff --git a/core/sqf/export/include/common/evl_sqlog_eventnum.h b/core/sqf/export/include/common/evl_sqlog_eventnum.h index 1f4d166..de3e60e 100644 --- a/core/sqf/export/include/common/evl_sqlog_eventnum.h +++ b/core/sqf/export/include/common/evl_sqlog_eventnum.h @@ -254,6 +254,8 @@ #define MON_CLUSTER_ASSIGNMONITORLEADER_3 101015303 #define MON_CLUSTER_ASSIGNMONITORLEADER_4 101015304 +#define MON_CLUSTER_CHECKIFDONE_1 101015401 + /* 
Module: monitor.cxx = 02 */ #define MON_MONITOR_MAIN_1 101020101 @@ -298,6 +300,7 @@ #define MON_MONITOR_STARTPROCESS_12 101020512 #define MON_MONITOR_STARTPROCESS_13 101020513 #define MON_MONITOR_STARTPROCESS_14 101020514 +#define MON_MONITOR_STARTPROCESS_15 101020515 #define MON_MONITOR_PROCESSINFO 101020601 #define MON_MONITOR_PROCESSREQUEST_1 101020701 #define MON_MONITOR_PROCESSREQUEST_2 101020702 @@ -382,6 +385,8 @@ #define MON_PROCESS_CREATE_9 101031409 #define MON_PROCESS_CREATE_10 101031410 #define MON_PROCESS_CREATE_11 101031411 +#define MON_PROCESS_CREATE_12 101031412 +#define MON_PROCESS_CREATE_13 101031413 #define MON_PROCESS_SETPROCESSSTATE_1 101031501 #define MON_PROCESS_PIDHANGUPCHECK_1 101031601 #define MON_PROCESS_PIDHANGUPCHECK_2 101031602 @@ -416,6 +421,14 @@ #define MON_PROCESSCONT_KILLALLDOWN_1 101032301 #define MON_PROCESS_SETSTATE_1 101032401 #define MON_PROCESS_SETSTATE_2 101032402 +#define MON_PROCESS_COMPLETESTARTUP_1 101032501 +#define MON_PROCESS_COMPLETESTARTUP_2 101032502 +#define MON_PROCESS_COMPLETESTARTUP_3 101032503 +#define MON_PROCESSCONT_CHILDEXIT_1 101032601 +#define MON_PROCESS_PROCEXITNOTIFIERNODES_1 101032701 +#define MON_PROCESS_PROCEXIT_1 101032801 +#define MON_PROCESS_PROCEXIT_2 101032802 +#define MON_PROCESS_PROCEXITUNREGALL_1 101032901 /* Module: pnode.cxx = 04 */ @@ -450,6 +463,19 @@ #define MON_NODE_GETPROCESSNS_2 101041502 #define MON_NODE_GETPROCESSNS_3 101041503 #define MON_NODE_GETPROCESSNS_4 101041504 +#define MON_NODE_GETPROCESSNS_5 101041505 +#define MON_NODE_GETPROCESSNS_6 101041506 +#define MON_NODE_GETSTRINGID_1 101041601 +#define MON_NODE_GETSTRINGID_2 101041602 +#define MON_NODE_CLONEPROCESSNS_1 101041701 +#define MON_NODE_CLONEPROCESSNS_2 101041702 +#define MON_NODE_CLONEPROCESSNS_3 101041703 +#define MON_NODE_CLONEPROCESSNS_4 101041704 +#define MON_NODE_CLONEPROCESSNS_5 101041705 +#define MON_NODE_CLONEPROCESSNS_6 101041706 +#define MON_NODE_GETPROCESSLBYTYPENS_1 101041801 +#define 
MON_NODE_GETPROCESSLBYTYPENS_2 101041802 +#define MON_NODE_GETPROCESSLBYTYPENS_3 101041803 /* Module: config.cxx = 05 */ @@ -747,6 +773,11 @@ #define MON_REQ_IODATA_1 101182201 #define MON_REQ_STDIN_1 101182301 #define MON_REQ_STDIN_2 101182302 +#define MON_REQ_KILL_1 101182401 +#define MON_REQ_NOTIFY_1 101182501 +#define MON_REQ_PROCINFO_1 101182601 +#define MON_REQ_PROCINFOCONT_1 101182701 +#define MON_INTREQ_CHILDDEATH_1 101182801 /* Module: clio.cxx = 19 */ #define MON_CLIO_ACQUIRE_MSG_1 101190101 @@ -1048,6 +1079,9 @@ #define PTP_COMMACCEPT_7 101940107 #define PTP_COMMACCEPT_8 101940108 +/* Module notice.cxx = 95 */ +#define NOTICE_NOTIFYREMOTE_1 101950101 + /**********************************************/ /*********** Seabed ***********/ http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/cluster.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/cluster.cxx b/core/sqf/monitor/linux/cluster.cxx index 8768d2e..69647ec 100644 --- a/core/sqf/monitor/linux/cluster.cxx +++ b/core/sqf/monitor/linux/cluster.cxx @@ -7786,7 +7786,16 @@ bool CCluster::checkIfDone ( ) Nodes->ProcessCount(), MyNode->ProcessCount()); waitForNameServerExit_ = true; - NameServer->ProcessShutdown(); + int rc = NameServer->ProcessShutdown(); + if (rc) + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Shutdown request to Name Server failed, node going down\n" + , method_name ); + mon_log_write( MON_CLUSTER_CHECKIFDONE_1, SQ_LOG_ERR, la_buf ); + ReqQueue.enqueueDownReq( MyPNID ); + } } } else http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/notice.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/notice.cxx b/core/sqf/monitor/linux/notice.cxx index f478f45..64778d7 100644 --- a/core/sqf/monitor/linux/notice.cxx +++ b/core/sqf/monitor/linux/notice.cxx @@ -421,7 +421,18 
@@ void CNotice::NotifyRemote( void ) , targetLNode->GetNode()->GetName() ); if (rc) { - // TODO: Error handling + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process exit " + "for process %s (%d, %d) " + "to target node %s, nid=%d\n" + , method_name + , Process->GetName() + , Process->GetNid() + , Process->GetPid() + , targetLNode->GetNode()->GetName() + , targetLNode->GetNid() ); + mon_log_write(NOTICE_NOTIFYREMOTE_1, SQ_LOG_ERR, la_buf); } nidQueue->pop(); } http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/nsreqprocinfons.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/nsreqprocinfons.cxx b/core/sqf/monitor/linux/nsreqprocinfons.cxx index 562fdb6..aa53437 100644 --- a/core/sqf/monitor/linux/nsreqprocinfons.cxx +++ b/core/sqf/monitor/linux/nsreqprocinfons.cxx @@ -59,7 +59,7 @@ CExtProcInfoNsReq::~CExtProcInfoNsReq() // Copy information for a specific process into the reply message buffer. 
void CExtProcInfoNsReq::copyInfo(CProcess *process, ProcessInfoNs_reply_def &process_info_ns) { - const char method_name[] = "CNameServer::SendReceive"; + const char method_name[] = "CExtProcInfoNsReq::copyInfo"; TRACE_ENTRY; CProcess *parent; @@ -92,8 +92,8 @@ void CExtProcInfoNsReq::copyInfo(CProcess *process, ProcessInfoNs_reply_def &pro process_info_ns.unhooked = process->IsUnhooked(); process_info_ns.event_messages = process->IsEventMessages(); process_info_ns.system_messages = process->IsSystemMessages(); - strncpy( process_info_ns.path, process->path(), MAX_PROCESS_PATH ); - strncpy( process_info_ns.ldpath, process->ldpath(), MAX_PROCESS_PATH ); + strncpy( process_info_ns.path, process->path(), MAX_SEARCH_PATH ); + strncpy( process_info_ns.ldpath, process->ldpath(), MAX_SEARCH_PATH ); strncpy( process_info_ns.program, process->program(), MAX_PROCESS_PATH ); // process_info_ns.pathStrId = process->pathStrId(); // process_info_ns.ldpathStrId = process->ldPathStrId(); http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/pnode.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/pnode.cxx b/core/sqf/monitor/linux/pnode.cxx index c044100..73b0246 100644 --- a/core/sqf/monitor/linux/pnode.cxx +++ b/core/sqf/monitor/linux/pnode.cxx @@ -1035,7 +1035,7 @@ bool CNode::GetSchedulingData( void ) } -strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode ) +strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode, bool clone ) { const char method_name[] = "CNode::GetStringId"; strId_t id; @@ -1059,21 +1059,33 @@ strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode ) #ifndef NAMESERVER_PROCESS if (NameServerEnabled) { - if (targetLNode != NULL && + if (targetLNode != NULL && !clone && !MyNode->IsMyNode(targetLNode->GetNid())) { // Forward the unique string to the target node - PtpClient->AddUniqStr( id.nid - , id.id - , candidate - , 
targetLNode->GetNid() - , targetLNode->GetNode()->GetName()); + int rc = PtpClient->AddUniqStr( id.nid + , id.id + , candidate + , targetLNode->GetNid() + , targetLNode->GetNode()->GetName() ); + if (rc) + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send unique string " + "to target node %s, nid=%d\n" + , method_name + , targetLNode->GetNode()->GetName() + , targetLNode->GetNid() ); + mon_log_write(MON_NODE_GETSTRINGID_1, SQ_LOG_ERR, la_buf); + } } } else #endif { #ifdef NAMESERVER_PROCESS + clone = clone; // Make compiler happy! targetLNode = targetLNode; // Make compiler happy! #endif CReplUniqStr *repl = new CReplUniqStr ( id.nid, id.id, candidate ); @@ -1092,15 +1104,26 @@ strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode ) #ifndef NAMESERVER_PROCESS if (NameServerEnabled) { - if (targetLNode != NULL && + if (targetLNode != NULL && !clone && !MyNode->IsMyNode(targetLNode->GetNid())) { // Forward the unique string to the target node - PtpClient->AddUniqStr( id.nid - , id.id - , candidate - , targetLNode->GetNid() - , targetLNode->GetNode()->GetName()); + int rc = PtpClient->AddUniqStr( id.nid + , id.id + , candidate + , targetLNode->GetNid() + , targetLNode->GetNode()->GetName()); + if (rc) + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send unique string " + "to target node %s, nid=%d\n" + , method_name + , targetLNode->GetNode()->GetName() + , targetLNode->GetNid() ); + mon_log_write(MON_NODE_GETSTRINGID_2, SQ_LOG_ERR, la_buf); + } } } #endif @@ -1721,9 +1744,9 @@ CProcess *CNodeContainer::AddCloneProcess( ProcessInfoNs_reply_def *processInfo CLNode *lnode = Nodes->GetLNode(processInfo->nid); CNode *node = lnode->GetNode(); - strId_t pathStrId = MyNode->GetStringId ( processInfo->path, lnode ); - strId_t ldpathStrId = MyNode->GetStringId (processInfo->ldpath, lnode ); - strId_t programStrId = MyNode->GetStringId ( processInfo->program, lnode ); + strId_t 
pathStrId = MyNode->GetStringId ( processInfo->path, lnode, true ); + strId_t ldpathStrId = MyNode->GetStringId (processInfo->ldpath, lnode, true ); + strId_t programStrId = MyNode->GetStringId ( processInfo->program, lnode, true ); CProcess *process = node->CloneProcess( processInfo->nid , processInfo->type @@ -2535,7 +2558,7 @@ CProcess *CNodeContainer::CloneProcessNs( int nid snprintf( buf, sizeof(buf), "[%s] ProcessInfo failed, rc=%d\n" , method_name, msg.u.reply.u.process_info_ns.return_code ); - mon_log_write( MON_NODE_GETPROCESSNS_1, SQ_LOG_ERR, buf ); + mon_log_write( MON_NODE_CLONEPROCESSNS_1, SQ_LOG_ERR, buf ); } } else @@ -2545,9 +2568,17 @@ CProcess *CNodeContainer::CloneProcessNs( int nid "[%s], Invalid MsgType(%d)/ReplyType(%d) for " "ProcessInfoNs\n" , method_name, msg.type, msg.u.reply.type ); - mon_log_write( MON_NODE_GETPROCESSNS_2, SQ_LOG_ERR, buf ); + mon_log_write( MON_NODE_CLONEPROCESSNS_2, SQ_LOG_ERR, buf ); } } + else + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Process info request to Name Server failed\n" + , method_name ); + mon_log_write( MON_NODE_CLONEPROCESSNS_3, SQ_LOG_ERR, la_buf ); + } TRACE_EXIT; return( process ); @@ -2596,7 +2627,7 @@ CProcess *CNodeContainer::CloneProcessNs( const char *name, Verifier_t verifier snprintf( buf, sizeof(buf), "[%s] ProcessInfo failed, rc=%d\n" , method_name, msg.u.reply.u.process_info_ns.return_code ); - mon_log_write( MON_NODE_GETPROCESSNS_3, SQ_LOG_ERR, buf ); + mon_log_write( MON_NODE_CLONEPROCESSNS_4, SQ_LOG_ERR, buf ); } } else @@ -2606,9 +2637,17 @@ CProcess *CNodeContainer::CloneProcessNs( const char *name, Verifier_t verifier "[%s], Invalid MsgType(%d)/ReplyType(%d) for " "ProcessInfo\n" , method_name, msg.type, msg.u.reply.type ); - mon_log_write( MON_NODE_GETPROCESSNS_4, SQ_LOG_ERR, buf ); + mon_log_write( MON_NODE_CLONEPROCESSNS_5, SQ_LOG_ERR, buf ); } } + else + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - 
Process info request to Name Server failed\n" + , method_name ); + mon_log_write( MON_NODE_CLONEPROCESSNS_6, SQ_LOG_ERR, la_buf ); + } TRACE_EXIT; return( process ); @@ -3243,6 +3282,11 @@ int CNodeContainer::GetProcessInfoNs( int nid } else { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Process info request to Name Server failed\n" + , method_name ); + mon_log_write( MON_NODE_GETPROCESSNS_3, SQ_LOG_ERR, la_buf ); rc = MPI_ERR_OP; } @@ -3293,7 +3337,7 @@ int CNodeContainer::GetProcessInfoNs( const char *name snprintf( buf, sizeof(buf), "[%s] ProcessInfo failed, rc=%d\n" , method_name, msg.u.reply.u.process_info_ns.return_code ); - mon_log_write( MON_NODE_GETPROCESSNS_3, SQ_LOG_ERR, buf ); + mon_log_write( MON_NODE_GETPROCESSNS_4, SQ_LOG_ERR, buf ); } rc = msg.u.reply.u.process_info_ns.return_code; } @@ -3304,12 +3348,17 @@ int CNodeContainer::GetProcessInfoNs( const char *name "[%s], Invalid MsgType(%d)/ReplyType(%d) for " "ProcessInfo\n" , method_name, msg.type, msg.u.reply.type ); - mon_log_write( MON_NODE_GETPROCESSNS_4, SQ_LOG_ERR, buf ); + mon_log_write( MON_NODE_GETPROCESSNS_5, SQ_LOG_ERR, buf ); rc = MPI_ERR_OP; } } else { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Process info request to Name Server failed\n" + , method_name ); + mon_log_write( MON_NODE_GETPROCESSNS_6, SQ_LOG_ERR, la_buf ); rc = MPI_ERR_OP; } @@ -3383,7 +3432,7 @@ CProcess *CNodeContainer::GetProcessLByTypeNs( int nid, PROCESSTYPE type ) snprintf( buf, sizeof(buf), "[%s] ProcessInfo failed, rc=%d\n" , method_name, msg.u.reply.u.process_info_ns.return_code ); - mon_log_write( MON_NODE_GETPROCESSNS_3, SQ_LOG_ERR, buf ); + mon_log_write( MON_NODE_GETPROCESSLBYTYPENS_1, SQ_LOG_ERR, buf ); } } else @@ -3393,9 +3442,17 @@ CProcess *CNodeContainer::GetProcessLByTypeNs( int nid, PROCESSTYPE type ) "[%s], Invalid MsgType(%d)/ReplyType(%d) for " "ProcessInfo\n" , method_name, msg.type, msg.u.reply.type ); - mon_log_write( 
MON_NODE_GETPROCESSNS_4, SQ_LOG_ERR, buf ); + mon_log_write( MON_NODE_GETPROCESSLBYTYPENS_2, SQ_LOG_ERR, buf ); } } + else + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Process info request to Name Server failed\n" + , method_name ); + mon_log_write( MON_NODE_GETPROCESSLBYTYPENS_3, SQ_LOG_ERR, la_buf ); + } TRACE_EXIT; return( process ); http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/pnode.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/pnode.h b/core/sqf/monitor/linux/pnode.h index fbfddf4..b04e387 100644 --- a/core/sqf/monitor/linux/pnode.h +++ b/core/sqf/monitor/linux/pnode.h @@ -269,7 +269,7 @@ public: // If candidate string has not been seen before assign a unique // id and store it in the config database. In either case return // the unique id as the value of the method. - strId_t GetStringId( char *candidate, CLNode *targetLNode = NULL ); + strId_t GetStringId( char *candidate, CLNode *targetLNode = NULL, bool clone = false ); inline int GetTmSyncNid( void ) { return( tmSyncNid_ ); } inline SyncState GetTmSyncState( void ) { return( tmSyncState_ ); } http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/process.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/process.cxx b/core/sqf/monitor/linux/process.cxx index 5e7b792..3a3e699 100644 --- a/core/sqf/monitor/linux/process.cxx +++ b/core/sqf/monitor/linux/process.cxx @@ -651,14 +651,24 @@ void CProcess::procExitNotifierNodes( void ) { if (NameServerEnabled && targetNode->GetPNid() != MyPNID) { - int rc = -1; // Forward the process exit to the target node - rc = PtpClient->ProcessExit( this - , targetLNode->GetNid() - , targetNode->GetName() ); + int rc = PtpClient->ProcessExit( this + , targetLNode->GetNid() + , targetNode->GetName() ); if (rc) { - // TODO: Error handling + 
char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process exit " + "for process %s (%d, %d) " + "to target node %s, nid=%d\n" + , method_name + , GetName() + , GetNid() + , GetPid() + , targetLNode->GetNode()->GetName() + , targetLNode->GetNid() ); + mon_log_write(MON_PROCESS_PROCEXITNOTIFIERNODES_1, SQ_LOG_ERR, la_buf); } } } @@ -709,7 +719,18 @@ void CProcess::procExitUnregAll ( _TM_Txid_External transId ) , targetLNode->GetNode()->GetName() ); if (rc) { - // TODO: Error handling + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process notify request " + "for process %s (%d, %d) " + "to target node %s, nid=%d\n" + , method_name + , targetProcess->GetName() + , targetProcess->GetNid() + , targetProcess->GetPid() + , targetLNode->GetNode()->GetName() + , targetLNode->GetNid() ); + mon_log_write(MON_PROCESS_PROCEXITUNREGALL_1, SQ_LOG_ERR, la_buf); } } @@ -726,6 +747,7 @@ void CProcess::procExitUnregAll ( _TM_Txid_External transId ) } #endif +#ifndef NAMESERVER_PROCESS void CProcess::childAdd ( int nid, int pid ) { const char method_name[] = "CProcess::childAdd"; @@ -795,6 +817,81 @@ bool CProcess::childRemoveFirst ( nidPid_t & child) return result; } +void CProcess::childUnHookedAdd( int nid, int pid ) +{ + const char method_name[] = "CProcess::childUnHookedAdd"; + TRACE_ENTRY; + + if (trace_settings & (TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL)) + trace_printf( "%s@%d adding unhooked child (%d:%d)\n" + , method_name, __LINE__ + , nid, pid ); + + nidPid_t child = { nid, pid }; + childrenListLock_.lock(); + childrenUnHooked_.push_back ( child ); + childrenListLock_.unlock(); + + TRACE_EXIT; +} + +int CProcess::childUnHookedCount( void ) +{ + const char method_name[] = "CProcess::childUnHookedCount"; + TRACE_ENTRY; + + childrenListLock_.lock(); + int count = childrenUnHooked_.size(); + childrenListLock_.unlock(); + + TRACE_EXIT; + return(count); +} + +void 
CProcess::childUnHookedRemove( int nid, int pid ) +{ + const char method_name[] = "CProcess::childUnHookedRemove"; + TRACE_ENTRY; + + nidPidList_t::iterator it; + + childrenListLock_.lock(); + for ( it = childrenUnHooked_.begin(); it != childrenUnHooked_.end(); ++it) + { + if (it->nid == nid && it->pid == pid ) + { + childrenUnHooked_.erase ( it ); + break; + } + } + childrenListLock_.unlock(); + + TRACE_EXIT; +} + +bool CProcess::childUnHookedRemoveFirst( nidPid_t & child) +{ + const char method_name[] = "CProcess::childUnHookedRemoveFirst"; + TRACE_ENTRY; + + bool result = false; + + childrenListLock_.lock(); + if ( !childrenUnHooked_.empty() ) + { + child = childrenUnHooked_.front (); + childrenUnHooked_.pop_front (); + result = true; + + } + childrenListLock_.unlock(); + + TRACE_EXIT; + + return result; +} +#endif + #ifndef NAMESERVER_PROCESS void CProcess::CompleteDump(DUMPSTATUS status, char *core_file) { @@ -882,7 +979,16 @@ void CProcess::CompleteProcessStartup (char *port, int os_pid, bool event_messag rc = NameServer->ProcessNew(this); // in reqQueue thread (CExtStartupReq) if (rc) { - // TODO: Error handling + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't register new process " + "%s (%d, %d) " + "to Name Server process\n" + , method_name + , GetName() + , GetNid() + , GetPid() ); + mon_log_write(MON_PROCESS_COMPLETESTARTUP_1, SQ_LOG_ERR, la_buf); } if (Parent_Nid != -1) @@ -893,14 +999,22 @@ void CProcess::CompleteProcessStartup (char *port, int os_pid, bool event_messag rc = PtpClient->ProcessClone(this); if (rc) { - // TODO: Error handling + char la_buf[MON_STRING_BUF_SIZE]; + CLNode *parentLNode = NULL; + parentLNode = Nodes->GetLNode( GetParentNid() ); + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process clone request" + "for process %s (%d, %d) " + "to parent node %s, nid=%d\n" + , method_name + , GetName() + , GetNid() + , GetPid() + , parentLNode->GetNode()->GetName() + , parentLNode->GetNid() 
); + mon_log_write(MON_PROCESS_COMPLETESTARTUP_2, SQ_LOG_ERR, la_buf); } } - else - { - // TODO: Generate internal clone request? - // to update local parent? - } } } else @@ -929,14 +1043,22 @@ void CProcess::CompleteProcessStartup (char *port, int os_pid, bool event_messag rc = PtpClient->ProcessClone(this); if (rc) { - // TODO: Error handling + char la_buf[MON_STRING_BUF_SIZE]; + CLNode *parentLNode = NULL; + parentLNode = Nodes->GetLNode( GetParentNid() ); + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process clone request" + "for process %s (%d, %d) " + "to parent node %s, nid=%d\n" + , method_name + , GetName() + , GetNid() + , GetPid() + , parentLNode->GetNode()->GetName() + , parentLNode->GetNid() ); + mon_log_write(MON_PROCESS_COMPLETESTARTUP_3, SQ_LOG_ERR, la_buf); } } - else - { - // TODO: Generate internal clone request? - // to update local parent? - } } } else @@ -1616,6 +1738,7 @@ bool CProcess::Create (CProcess *parent, void* tag, int & result) int i; int j; int rc = -1; + int rc2 = -1; char *env; char **argv; char *childEnv[MAX_CHILD_ENV_VARS + 1]; @@ -2339,10 +2462,27 @@ bool CProcess::Create (CProcess *parent, void* tag, int & result) // Send actual pid and process name back to parent // STDIO Redirection requires that clone process in parent node // have the actual pid - PtpClient->ProcessInit( this - , tag - , 0 - , parent->Nid ); + rc2 = PtpClient->ProcessInit( this + , tag + , 0 + , parent->Nid ); + if (rc2) + { + char la_buf[MON_STRING_BUF_SIZE]; + CLNode *parentLNode = NULL; + parentLNode = Nodes->GetLNode( parent->Nid ); + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process create success " + "for process %s (%d, %d) " + "to parent node %s, nid=%d\n" + , method_name + , GetName() + , GetNid() + , GetPid() + , parentLNode->GetNode()->GetName() + , parentLNode->GetNid() ); + mon_log_write(MON_PROCESS_CREATE_12, SQ_LOG_ERR, la_buf); + } } if (trace_settings & (TRACE_PROCESS | TRACE_REDIRECTION)) @@ -2708,6 +2848,31 @@ 
bool CProcess::Create (CProcess *parent, void* tag, int & result) successful = false; result = MPI_ERR_SPAWN; + if (NameServerEnabled) + { + rc2 = PtpClient->ProcessInit( this + , tag + , result + , parent->Nid ); + if (rc2) + { + char la_buf[MON_STRING_BUF_SIZE]; + CLNode *parentLNode = NULL; + parentLNode = Nodes->GetLNode( parent->Nid ); + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process create failure " + "for process %s (%d, %d) " + "result to parent node %s, nid=%d, result=%d\n" + , method_name + , GetName() + , GetNid() + , GetPid() + , parentLNode->GetNode()->GetName() + , parentLNode->GetNid(), result ); + mon_log_write(MON_PROCESS_CREATE_13, SQ_LOG_ERR, la_buf); + } + } + char buf[MON_STRING_BUF_SIZE]; snprintf(buf, sizeof(buf), "[CProcess::Create], Failed to start process %s path= %s.\n", Name, path.c_str()); mon_log_write(MON_PROCESS_CREATE_11, SQ_LOG_ERR, buf); @@ -3231,6 +3396,10 @@ void CProcess::Exit( CProcess *parent ) if ( (parent != NULL) && (parent->GetState() == State_Up) ) { parent->childRemove( Nid, Pid); + if (NameServerEnabled) + { + parent->childUnHookedRemove( Nid, Pid); + } } // Check if we need to output a entry into the process id map log file @@ -3364,7 +3533,18 @@ void CProcess::Exit( CProcess *parent ) , targetLNode->GetNode()->GetName() ); if (rc) { - // TODO: Error handling + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process exit " + "for process %s (%d, %d) " + "to parent node %s, nid=%d\n" + , method_name + , GetName() + , GetNid() + , GetPid() + , targetLNode->GetNode()->GetName() + , targetLNode->GetNid() ); + mon_log_write(MON_PROCESS_PROCEXIT_1, SQ_LOG_ERR, la_buf); } } } @@ -3380,7 +3560,18 @@ void CProcess::Exit( CProcess *parent ) , targetLNode->GetNode()->GetName() ); if (rc) { - // TODO: Error handling + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process exit " + "for process %s (%d, %d) " + "to parent node 
%s, nid=%d\n" + , method_name + , GetName() + , GetNid() + , GetPid() + , targetLNode->GetNode()->GetName() + , targetLNode->GetNid() ); + mon_log_write(MON_PROCESS_PROCEXIT_2, SQ_LOG_ERR, la_buf); } } } @@ -4703,18 +4894,36 @@ void CProcessContainer::Child_Exit ( CProcess * parent ) { if (NameServerEnabled) { - CNode *childNode = NULL; - childNode = childNode->GetNode(); - // Forward the process create to the target node - PtpClient->ProcessKill( process - , process->GetAbort() - , childLNode->GetNid() - , childNode->GetName()); + CNode* childNode = childLNode->GetNode(); + // Forward the process kill to the target node + int rc = PtpClient->ProcessKill( process + , process->GetAbort() + , childLNode->GetNid() + , childNode->GetName() ); + if (rc) + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process kill " + "request for child process %s (%d, %d) " + "to child node %s, nid=%d\n" + , method_name + , process->GetName() + , process->GetNid() + , process->GetPid() + , childNode->GetName() + , childLNode->GetNid() ); + mon_log_write(MON_PROCESSCONT_CHILDEXIT_1, SQ_LOG_ERR, la_buf); + } } } if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) - trace_printf("%s@%d - Completed kill for child process %s (%d, %d)\n", method_name, __LINE__, process->GetName(), process->GetNid(), process->GetPid()); + trace_printf( "%s@%d - Completed kill for child process %s (%d, %d)\n" + , method_name, __LINE__ + , process->GetName() + , process->GetNid() + , process->GetPid()); } else { @@ -4736,6 +4945,90 @@ void CProcessContainer::Child_Exit ( CProcess * parent ) } TRACE_EXIT; } + +void CProcessContainer::ChildUnHooked_Exit( CProcess* parent ) +{ + const char method_name[] = "CProcessContainer::ChildUnHooked_Exit"; + TRACE_ENTRY; + + CProcess *process; + + if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS)) + trace_printf( "%s@%d with parent %s (%d,%d:%d)\n" + , method_name, __LINE__ + , parent->GetName() + , 
parent->GetNid() + , parent->GetPid() + , parent->GetVerifier() ); + + if (NameServerEnabled) + { + if ( parent && !parent->IsClone() + && ((MyNode->GetState() != State_Shutdown + && MyNode->GetShutdownLevel() == ShutdownLevel_Undefined)) ) + { + CProcess::nidPid_t child; + CLNode* childLNode; + + while ( parent->childUnHookedRemoveFirst( child )) + { + childLNode = Nodes->GetLNode( child.nid ); + process = (childLNode != NULL ) + ? childLNode->GetNode()->GetProcess( child.pid ) : NULL; + if (process) + { + if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) + { + trace_printf( "%s@%d - Telling unhooked child process %s (%d,%d:%d) " + "of parent death %s (%d,%d:%d)\n" + , method_name, __LINE__ + , process->GetName() + , process->GetNid() + , process->GetPid() + , process->GetVerifier() + , parent->GetName() + , parent->GetNid() + , parent->GetPid() + , parent->GetVerifier() ); + } + + CNode* childNode = childLNode->GetNode(); + // Forward the parent's process exit to the child's node + int rc = PtpClient->ProcessExit( parent + , childLNode->GetNid() + , childNode->GetName() ); + if (rc) + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process exit " + "request for parent process %s (%d,%d:%d) " + "to child's node %s, nid=%d\n" + , method_name + , parent->GetName() + , parent->GetNid() + , parent->GetPid() + , parent->GetVerifier() + , childNode->GetName() + , childLNode->GetNid() ); + mon_log_write(MON_PROCESSCONT_CHILDEXIT_1, SQ_LOG_ERR, la_buf); + } + else + { + if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS)) + trace_printf( "%s@%d - Completed kill for parent process %s (%d,%d:%d)\n" + , method_name, __LINE__ + , parent->GetName() + , parent->GetNid() + , parent->GetPid() + , parent->GetVerifier() ); + } + } + } + } + } + TRACE_EXIT; +} #endif void CProcessContainer::CleanUpProcesses( void ) @@ -4922,7 +5215,6 @@ CProcess *CProcessContainer::CompleteProcessStartup (char 
*process_name, // exits abnormally. int parentNid; int parentPid; - CProcess * parent; if ( ! process->IsBackup() ) { parentNid = process->GetParentNid(); @@ -4934,8 +5226,10 @@ CProcess *CProcessContainer::CompleteProcessStartup (char *process_name, parentPid = process->GetPairParentPid(); } +#ifndef NAMESERVER_PROCESS if ( parentNid != -1 && parentPid != -1 ) { + CProcess* parent; parent = Nodes->GetLNode ( parentNid ) ->GetProcessL( parentPid ); if ( parent && !process->IsBackup() ) @@ -4945,7 +5239,43 @@ CProcess *CProcessContainer::CompleteProcessStartup (char *process_name, parent->childAdd ( process->GetNid(), os_pid ); } } +#endif + } +#ifndef NAMESERVER_PROCESS + if (NameServerEnabled) + { + if (process->IsUnhooked()) + { // Parent process object keeps track of child processes + // created. Needed when parent process exits to clean up + // parent clone process object in remote nodes. + int parentNid; + int parentPid; + CProcess* parent; + if ( !process->IsBackup() ) + { + parentNid = process->GetParentNid(); + parentPid = process->GetParentPid(); + } + else + { + parentNid = process->GetPairParentNid(); + parentPid = process->GetPairParentPid(); + } + + if ( parentNid != -1 && parentPid != -1 ) + { + parent = Nodes->GetLNode(parentNid)->GetProcessL(parentPid); + if ( parent && !parent->IsClone() && !process->IsBackup() ) + { + parent->childUnHookedRemove( process->GetNid() + , process->GetPid() ); + parent->childUnHookedAdd( process->GetNid() + , os_pid ); + } + } + } } +#endif // Process id changed from when we started the process. So // remap using the new pid. 
[This could happen if, for example, // a shell script was the originally started process and it @@ -5366,6 +5696,14 @@ void CProcessContainer::Exit_Process (CProcess *process, bool abend, int downNod Child_Exit(process); } + if (!process->IsClone() && NameServerEnabled) + { + if (process->childUnHookedCount() > 0) + { + ChildUnHooked_Exit(process); + } + } + if ( parent == NULL) { parent = Nodes->GetProcess( process->GetParentNid(), http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/process.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/process.h b/core/sqf/monitor/linux/process.h index 3c813bb..3cde3e5 100644 --- a/core/sqf/monitor/linux/process.h +++ b/core/sqf/monitor/linux/process.h @@ -73,6 +73,7 @@ class CProcessContainer , _TM_Txid_External trans_id ); #ifndef NAMESERVER_PROCESS void Child_Exit ( CProcess * parent ); + void ChildUnHooked_Exit ( CProcess * parent ); #endif void CleanUpProcesses( void ); CProcess *CloneProcess( int nid, @@ -428,11 +429,18 @@ class CProcess void SetHangupTime () { clock_gettime(CLOCK_REALTIME, &hangupTime_); } time_t GetHangupTime () { return hangupTime_.tv_sec; } +#ifndef NAMESERVER_PROCESS void childAdd ( int nid, int pid ); int childCount ( void ); void childRemove ( int nid, int pid ); bool childRemoveFirst ( nidPid_t & child ); + void childUnHookedAdd( int nid, int pid ); + int childUnHookedCount( void ); + void childUnHookedRemove( int nid, int pid ); + bool childUnHookedRemoveFirst( nidPid_t & child ); +#endif + struct message_def * GetDeathNotice ( void ); void PutDeathNotice( struct message_def * ); @@ -558,17 +566,20 @@ private: enum { MAX_CHILD_ENV_VARS = 300 }; +#ifndef NAMESERVER_PROCESS // Container to keep track of this process' children created on // the local node. Needed because if this process abornmally terminates // the children will be terminated too. 
typedef list<nidPid_t> nidPidList_t; nidPidList_t children_; + nidPidList_t childrenUnHooked_; // only used with Name Server enabled // Lock for children_ list. Temporarily using a lock but should // be able to eliminate for better performance. Once lioCleanupThread // and syncThread uniformly queue requests to be processed by worker // thread this lock should not be necessary. CLock childrenListLock_; +#endif // Container to hold dead process info to be sent as death notices // to an ssmp process. This is a NULL list except when the CProcess http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/reqkill.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqkill.cxx b/core/sqf/monitor/linux/reqkill.cxx index e8cad71..b59cae2 100644 --- a/core/sqf/monitor/linux/reqkill.cxx +++ b/core/sqf/monitor/linux/reqkill.cxx @@ -94,10 +94,25 @@ void CExtKillReq::Kill( CProcess *process ) if (NameServerEnabled) { // Forward the process create to the target node - PtpClient->ProcessKill( process - , process->GetAbort() - , lnode->GetNid() - , node->GetName()); + int rc = PtpClient->ProcessKill( process + , process->GetAbort() + , lnode->GetNid() + , node->GetName()); + if (rc) + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process kill " + "request for child process %s (%d, %d) " + "to child node %s, nid=%d\n" + , method_name + , process->GetName() + , process->GetNid() + , process->GetPid() + , node->GetName() + , lnode->GetNid() ); + mon_log_write(MON_REQ_KILL_1, SQ_LOG_ERR, la_buf); + } } else { http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/reqnewproc.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqnewproc.cxx b/core/sqf/monitor/linux/reqnewproc.cxx index a6da01e..972d785 100644 --- a/core/sqf/monitor/linux/reqnewproc.cxx +++ 
b/core/sqf/monitor/linux/reqnewproc.cxx @@ -532,9 +532,24 @@ void CExtNewProcReq::performRequest() if (NameServerEnabled) { // Forward the process create to the target node - PtpClient->ProcessNew( process - , lnode->GetNid() - , lnode->GetNode()->GetName()); + int rc = PtpClient->ProcessNew( process + , lnode->GetNid() + , lnode->GetNode()->GetName()); + if (rc) + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process create " + "request for process %s (%d, %d) " + "to target node %s, nid=%d\n" + , method_name + , process->GetName() + , process->GetNid() + , process->GetPid() + , lnode->GetNode()->GetName() + , lnode->GetNid() ); + mon_log_write(MON_MONITOR_STARTPROCESS_15, SQ_LOG_ERR, la_buf); + } } else #endif http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/reqnotify.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqnotify.cxx b/core/sqf/monitor/linux/reqnotify.cxx index 5900f01..4d278ce 100644 --- a/core/sqf/monitor/linux/reqnotify.cxx +++ b/core/sqf/monitor/linux/reqnotify.cxx @@ -274,16 +274,6 @@ void CExtNotifyReq::performRequest() } else { -#if 0 - if ( msg_->u.request.u.notify.cancel ) - { // Unregister interest in death of target process - status = targetProcess->CancelDeathNotification( nid_ - , pid - , verifier_ - , msg_->u.request.u.notify.trans_id); - } - else if (sourceProcess) -#endif if (sourceProcess) { // Register interest in death of target process if (NameServerEnabled && targetProcess->IsClone()) @@ -304,7 +294,18 @@ void CExtNotifyReq::performRequest() , targetLNode->GetNode()->GetName() ); if (rc) { - // TODO: Error handling + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Can't send process notify request " + "for process %s (%d, %d) " + "to target node %s, nid=%d\n" + , method_name + , sourceProcess->GetName() + , sourceProcess->GetNid() + , 
sourceProcess->GetPid() + , targetLNode->GetNode()->GetName() + , targetLNode->GetNid() ); + mon_log_write(MON_REQ_NOTIFY_1, SQ_LOG_ERR, la_buf); } } http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/reqprocinfo.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqprocinfo.cxx b/core/sqf/monitor/linux/reqprocinfo.cxx index 84dc3a7..d3f04e2 100644 --- a/core/sqf/monitor/linux/reqprocinfo.cxx +++ b/core/sqf/monitor/linux/reqprocinfo.cxx @@ -44,7 +44,7 @@ extern CNameServer *NameServer; // Copy information for a specific process into the reply message buffer. void CExtProcInfoBase::ProcessInfo_CopyData(CProcess *process, ProcessInfoState &procState) { - const char method_name[] = "CNameServer::SendReceive"; + const char method_name[] = "CExtProcInfoBase::ProcessInfo_CopyData"; CProcess *parent; TRACE_ENTRY; @@ -356,7 +356,17 @@ void CExtProcInfoReq::performRequest() } if ( NameServerEnabled && !getMonitorInfo ) - NameServer->ProcessInfo(msg_); // in reqQueue thread (CExternalReq) + { + int rc = NameServer->ProcessInfo(msg_); // in reqQueue thread (CExternalReq) + if (rc) + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Process info request to Name Server failed\n" + , method_name ); + mon_log_write(MON_REQ_PROCINFO_1, SQ_LOG_ERR, la_buf); + } + } #endif #ifndef NAMESERVER_PROCESS @@ -642,7 +652,17 @@ void CExtProcInfoContReq::performRequest() } if ( NameServerEnabled && !getMonitorInfo ) - NameServer->ProcessInfoCont(msg_); // in reqQueue thread (CExternalReq) + { + int rc = NameServer->ProcessInfoCont(msg_); // in reqQueue thread (CExternalReq) + if (rc) + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Process info continue request to Name Server failed\n" + , method_name ); + mon_log_write(MON_REQ_PROCINFOCONT_1, SQ_LOG_ERR, la_buf); + } + } #endif #ifndef NAMESERVER_PROCESS 
http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/reqqueue.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqqueue.cxx b/core/sqf/monitor/linux/reqqueue.cxx index 508a325..c7af4ca 100644 --- a/core/sqf/monitor/linux/reqqueue.cxx +++ b/core/sqf/monitor/linux/reqqueue.cxx @@ -1577,7 +1577,7 @@ void CIntNewProcReq::performRequest() else { if (NameServerEnabled) - { // Name Server find by nid,pid:verifier + { if (trace_settings & TRACE_REQUEST) trace_printf( "%s@%d" " - Getting parent process from Name Server (%d,%d:%d)\n" , method_name, __LINE__ @@ -1656,14 +1656,7 @@ void CIntNewProcReq::performRequest() { // Process creation failure, relay error code to node // that requested process creation. - if (NameServerEnabled) - { - PtpClient->ProcessInit( newProcess - , reqTag_ - , result - , parentNid_ ); - } - else + if (!NameServerEnabled) { CReplProcInit *repl = new CReplProcInit(newProcess, reqTag_, result, parentNid_); @@ -2285,9 +2278,10 @@ void CIntProcInitReq::performRequest() Nodes->GetLNode( process_->GetNid() )->GetNode()->AddToPidMap(process_->GetPid(), process_); Nodes->GetLNode( process_->GetNid() )->GetNode()->AddToNameMap(process_); + CProcess* parent; + if (process_->IsBackup()) { - CProcess * parent; parent = Nodes->GetProcess(process_->GetParentNid(), process_->GetParentPid(), false); if (parent) @@ -2295,53 +2289,59 @@ void CIntProcInitReq::performRequest() // this backup process object. 
if (trace_settings & (TRACE_SYNC | TRACE_PROCESS)) { - trace_printf("%s@%d - For backup process (%d, %d)" - ", for parent (%d, %d) setting " - "parent's Parent_Nid/Parent_Pid=" - "(%d, %d).\n", - method_name, __LINE__, process_->GetNid(), - process_->GetPid(), parent->GetNid(), - parent->GetPid(), - process_->GetNid(), process_->GetPid()); + trace_printf( "%s@%d - For backup process %s (%d,%d:%d)" + ", for parent %s (%d,%d:%d) setting " + "parent's Parent_Nid/Parent_Pid=" + "(%d,%d).\n" + , method_name, __LINE__ + , process_->GetName() + , process_->GetNid() + , process_->GetPid() + , process_->GetVerifier() + , parent->GetName() + , parent->GetNid() + , parent->GetPid() + , parent->GetVerifier() + , process_->GetNid() + , process_->GetPid()); } parent->SetParentNid ( process_->GetNid() ); parent->SetParentPid ( process_->GetPid() ); } } - - -#ifdef QUICK_WAITED_NEWPROCESS_REPLY -// Following allows reply to a "waited" new process request before we -// get the "startup" message from the process. This make the process -// creation appear to complete more quickly. However there are potential -// problems if the requester immediately tries to open the new process -// because it is not ready yet. So need to handle quick "open" of this -// type before re-enabling this code section. 
- if (!process->IsNowait()) - { // new process request was a "waited" request - if (process->GetParentNid() == -1) - { - parent = NULL; - } - else - { - parent = - LNode[process->GetParentNid()]-> - GetProcessL(process->GetParentPid()); - } - - if (parent) - { - reply_msg = process->parentContext(); - if ( reply_msg ) - { - // the parent gets a new_process reply - parent->ReplyNewProcess ( reply_msg, process ); - - process->parentContext (NULL); - } +#ifndef NAMESERVER_PROCESS + if (NameServerEnabled) + { + if (process_->IsUnhooked()) + { + if ( process_->GetParentNid() != -1 && process_->GetParentPid() != -1 ) + { + parent = Nodes->GetProcess(process_->GetParentNid(), + process_->GetParentPid(), false); + if (parent && !parent->IsClone()) + { // Parent process object keeps track of child processes + // created. Needed when parent process exits to clean up + // parent clone process object in remote nodes. + if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL + | TRACE_PROCESS_DETAIL)) + trace_printf( "%s@%d - Adding unhooked child process %s (%d,%d:%d) to " + "parent %s (%d,%d:%d)\n" + , method_name, __LINE__ + , process_->GetName() + , process_->GetNid() + , process_->GetPid() + , process_->GetVerifier() + , parent->GetName() + , parent->GetNid() + , parent->GetPid() + , parent->GetVerifier() ); + + parent->childUnHookedAdd( process_->GetNid() + , process_->GetPid() ); } } + } + } #endif } @@ -2599,7 +2599,22 @@ void CIntChildDeathReq::performRequest() } #ifndef NAMESERVER_PROCESS if ( NameServerEnabled ) - NameServer->ProcessDelete(process_); // in reqQueue thread (CIntChildDeathReq) + { + int rc = NameServer->ProcessDelete(process_); // in reqQueue thread (CIntChildDeathReq) + if (rc) + { + char la_buf[MON_STRING_BUF_SIZE]; + snprintf( la_buf, sizeof(la_buf) + , "[%s] - Process delete request to Name Server failed" + "for child process %s (%d, %d:%d)\n" + , method_name + , process_->GetName() + , process_->GetNid() + , process_->GetPid() + , 
process_->GetVerifier() ); + mon_log_write(MON_INTREQ_CHILDDEATH_1, SQ_LOG_ERR, la_buf); + } + } #endif MyNode->DelFromNameMap ( process_ ); MyNode->DelFromPidMap ( process_ ); http://git-wip-us.apache.org/repos/asf/trafodion/blob/65ac5563/core/sqf/monitor/linux/tmsync.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/tmsync.cxx b/core/sqf/monitor/linux/tmsync.cxx index e6e3a76..548ae81 100644 --- a/core/sqf/monitor/linux/tmsync.cxx +++ b/core/sqf/monitor/linux/tmsync.cxx @@ -1010,6 +1010,23 @@ void CTmSync_Container::SendUnsolicitedMessages (void) delete msg; msg = NULL; } + if (NameServerEnabled) + { + if (!MyNode->IsMyNode( tm->GetNid() )) + { + if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC)) + { + trace_printf( "%s@%d - Deleting clone process %s, (%d,%d:%d)\n" + , method_name, __LINE__ + , tm->GetName() + , tm->GetNid() + , tm->GetPid() + , tm->GetVerifier() ); + } + Nodes->DeleteCloneProcess( tm ); + } + + } } else {
