Repository: trafodion Updated Branches: refs/heads/master 8b6a6bbeb -> 0c049d784
http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/montest_run.virtual ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/montest_run.virtual b/core/sqf/monitor/linux/montest_run.virtual index 424d38d..e6a7463 100755 --- a/core/sqf/monitor/linux/montest_run.virtual +++ b/core/sqf/monitor/linux/montest_run.virtual @@ -23,24 +23,26 @@ #!/bin/sh +ARCH=`arch` # Cleanup -cd $TRAF_HOME/monitor/linux/Linux-x86_64/dbg +cd ${TRAF_HOME}/monitor/linux/Linux-${ARCH}/dbg echo $PWD rm -f core* *.log *.lst test*sub* rm -f $MPI_TMPDIR/monitor.port.* +exit 0 # Setup monitor test files cd $TRAF_HOME/monitor/linux echo $PWD echo Copying monitor test files to execution directory -echo cp -p ./test*sub* ./Linux-x86_64/dbg -cp -p ./test*sub* ./Linux-x86_64/dbg +echo cp -p ./test*sub* ./Linux-${ARCH}/dbg +cp -p ./test*sub* ./Linux-${ARCH}/dbg # Establish SQ virtual cluster parameters export SQ_VIRTUAL_NODES=6 export SQ_VIRTUAL_NID=0 -cd $TRAF_HOME/monitor/linux/Linux-x86_64/dbg +cd ${TRAF_HOME}/monitor/linux/Linux-${ARCH}/dbg echo $PWD shell <<eof http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/pnode.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/pnode.cxx b/core/sqf/monitor/linux/pnode.cxx index 57ce0a6..00f1271 100644 --- a/core/sqf/monitor/linux/pnode.cxx +++ b/core/sqf/monitor/linux/pnode.cxx @@ -1133,10 +1133,10 @@ void CNode::StartWatchdogProcess( void ) } //Displays the startup and keep alive timer values in use for a given run. - if (trace_settings & TRACE_INIT) + if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) trace_printf("%s@%d" " - KeepAlive Timer in seconds =%d\n", method_name, __LINE__, (wdtKeepAliveTimerValue_)); - if (trace_settings & TRACE_INIT) + if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) trace_printf("%s@%d" " - Creating Watchdog Process\n", method_name, __LINE__); strcpy(path,getenv("PATH")); @@ -1959,7 +1959,7 @@ int CNodeContainer::PackNodeMappings( intBuffPtr_t &buffer ) ++count; - if (trace_settings & ( TRACE_INIT || TRACE_RECOVERY || TRACE_REQUEST_DETAIL) ) + if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) trace_printf("%s@%d - Packing node mapping, pnidConfig=%d, pnid=%d \n", method_name, __LINE__, pnidConfig, pnid); } @@ -1982,7 +1982,7 @@ void CNodeContainer::UnpackNodeMappings( intBuffPtr_t &buffer, int nodeMapCount pnidConfig = *buffer++; pnid = *buffer++; - if (trace_settings & ( TRACE_INIT || TRACE_RECOVERY || TRACE_REQUEST_DETAIL) ) + if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) trace_printf("%s@%d - Unpacking node mapping, pnidConfig=%d, pnid=%d \n", method_name, __LINE__, pnidConfig, pnid); http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/pnodeconfig.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/pnodeconfig.cxx b/core/sqf/monitor/linux/pnodeconfig.cxx index 4d5ee99..03f52c9 100644 --- a/core/sqf/monitor/linux/pnodeconfig.cxx +++ b/core/sqf/monitor/linux/pnodeconfig.cxx @@ -569,9 +569,10 @@ int CPNodeConfigContainer::hostnamecmp( const char *p_str1, const char *p_str2 ) if ( !p_str1 ) return 1; if ( !p_str2 ) return 1; + // Compare the string passed in int lv_ret = strcmp( p_str1, p_str2 ); if ( lv_ret == 0 ) - { + { // Got a match! return lv_ret; } if ( sb_strict_hostname_check ) @@ -586,23 +587,45 @@ int CPNodeConfigContainer::hostnamecmp( const char *p_str1, const char *p_str2 ) char *lp_str1_dot = strchr( (char *) p_str1, '.' ); if ( lp_str1_dot ) - { + { // Found '.', copy up to one char before '.' memcpy( lv_str1_to_cmp, p_str1, lp_str1_dot - p_str1 ); } else - { + { // Copy entire string strcpy( lv_str1_to_cmp, p_str1 ); } char *lp_str2_dot = strchr( (char *) p_str2, '.' ); if ( lp_str2_dot ) - { + { // Found '.', copy up to one char before '.' memcpy( lv_str2_to_cmp, p_str2, lp_str2_dot - p_str2 ); } else - { + { // Copy entire string strcpy( lv_str2_to_cmp, p_str2 ); } + // Ignore case + NormalizeCase( lv_str1_to_cmp ); + NormalizeCase( lv_str2_to_cmp ); return strcmp( lv_str1_to_cmp, lv_str2_to_cmp ); } + +char *CPNodeConfigContainer::NormalizeCase( char *token ) +{ + char *ptr = token; + + const char method_name[] = "CPNodeConfigContainer::NormalizeCase"; + TRACE_ENTRY; + + while ( *ptr ) + { + *ptr = tolower( *ptr ); + if ( *ptr == '\n' ) *ptr = '\0'; + ptr++; + } + + TRACE_EXIT; + return token; +} + http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/pnodeconfig.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/pnodeconfig.h b/core/sqf/monitor/linux/pnodeconfig.h index 1916797..5deccc8 100644 --- a/core/sqf/monitor/linux/pnodeconfig.h +++ b/core/sqf/monitor/linux/pnodeconfig.h @@ -74,6 +74,8 @@ protected: int nextPNid_; // next physical node id available private: + static char *NormalizeCase( char *token ); + int pnodesConfigMax_; // maximum number of physical nodes PNodesConfigList_t spareNodesConfigList_; // configured spare nodes list CPNodeConfig *head_; // head of physical nodes linked list http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/process.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/process.cxx b/core/sqf/monitor/linux/process.cxx index 8a35c4d..6a8e08b 100644 --- a/core/sqf/monitor/linux/process.cxx +++ b/core/sqf/monitor/linux/process.cxx @@ -327,13 +327,19 @@ CProcess::~CProcess (void) delete [] userArgv_; if (fd_stdin_ != -1 && !Clone) - Redirector.tryShutdownPipeFd(Pid, fd_stdin_); + { + Redirector.tryShutdownPipeFd(Pid, fd_stdin_, false); + } if (fd_stdout_ != -1) - Redirector.tryShutdownPipeFd(Pid, fd_stdout_); + { + Redirector.tryShutdownPipeFd(Pid, fd_stdout_, true); + } if (fd_stderr_ != -1) - Redirector.tryShutdownPipeFd(Pid, fd_stderr_); + { + Redirector.tryShutdownPipeFd(Pid, fd_stderr_, false); + } // Remove the fifos associated with this process (if any) if (fifo_stdin_.size() != 0) @@ -911,9 +917,10 @@ bool CProcess::PickStdfile(PickStdFile_t whichStdfile, { ancestor = node->GetProcessL(nextPid); if ( ancestor && - (ancestor->CreationTime.tv_sec < earlyCreationTime.tv_sec || - (ancestor->CreationTime.tv_sec == earlyCreationTime.tv_sec && - ancestor->CreationTime.tv_nsec < earlyCreationTime.tv_nsec)) ) + (( ! MyNode->IsMyNode(ancestor->GetNid())) || + (ancestor->CreationTime.tv_sec < earlyCreationTime.tv_sec || + (ancestor->CreationTime.tv_sec == earlyCreationTime.tv_sec && + ancestor->CreationTime.tv_nsec < earlyCreationTime.tv_nsec))) ) { earlyCreationTime.tv_sec = ancestor->CreationTime.tv_sec; earlyCreationTime.tv_nsec = ancestor->CreationTime.tv_nsec; @@ -3853,71 +3860,91 @@ void CProcessContainer::AttachProcessCheck ( struct message_def *msg ) if ( ! MyNode->IsSpareNode() ) { int nid = MyNode->AssignNid(); - strId_t progStrId = MyNode->GetStringId( msg->u.request.u.startup.program ); - strId_t nullStrId = { -1, -1 }; - process = - new CProcess (NULL, nid, msg->u.request.u.startup.os_pid, ProcessType_Generic, 0, 0, false, true, (char *) "", - nullStrId, nullStrId, progStrId, (char *) "", (char *) ""); - if (process == NULL) + if ( (nid == -1) && (MyNode->GetState() != State_Up) ) { - //TODO: Log event - abort(); - } - if ( process ) - { - char user_argv[MAX_ARGS][MAX_ARG_SIZE]; - process->userArgs ( 0, user_argv ); - } - if ( msg->u.request.u.startup.process_name[0] == '\0') - { // Create a name for the process and place it in the - // Name member of the process object); - char pname[MAX_KEY_NAME]; - MyNode->BuildOurName(nid, process->GetPid(), pname ); - process->SetName( pname ); + snprintf( la_buf, sizeof(la_buf), + "[%s], Can't attach the pid %d (program: %s) - the monitor is not up yet (curr state: %d).\n", + method_name, + msg->u.request.u.startup.os_pid, + msg->u.request.u.startup.program, + MyNode->GetState() ); + mon_log_write( MON_PROCESSCONT_ATTACHPCHECK_4, SQ_LOG_ERR, la_buf ); + + msg->u.reply.type = ReplyType_Generic; + msg->u.reply.u.generic.nid = -1; + msg->u.reply.u.generic.pid = -1; + msg->u.reply.u.generic.verifier = -1; + msg->u.reply.u.generic.process_name[0] = '\0'; + msg->u.reply.u.generic.return_code = MPI_ERR_NAME; } else { - process->SetName ( - MyNode->NormalizeName(msg->u.request.u.startup.process_name) ); + strId_t progStrId = MyNode->GetStringId( msg->u.request.u.startup.program ); + strId_t nullStrId = { -1, -1 }; + process = + new CProcess( NULL, nid, msg->u.request.u.startup.os_pid, ProcessType_Generic, 0, 0, false, true, (char *) "", + nullStrId, nullStrId, progStrId, (char *) "", (char *) "" ); + if ( process == NULL ) + { + //TODO: Log event + abort(); + } + if ( process ) + { + char user_argv[MAX_ARGS][MAX_ARG_SIZE]; + process->userArgs( 0, user_argv ); + } + if ( msg->u.request.u.startup.process_name[0] == '\0' ) + { // Create a name for the process and place it in the + // Name member of the process object); + char pname[MAX_KEY_NAME]; + MyNode->BuildOurName( nid, process->GetPid( ), pname ); + process->SetName( pname ); + } + else + { + process->SetName( + MyNode->NormalizeName( msg->u.request.u.startup.process_name ) ); + } + process->SetAttached( true ); + process->SetupFifo( process->GetNid( ), msg->u.request.u.startup.os_pid ); + process->SetCreationTime( msg->u.request.u.startup.os_pid ); + process->SetVerifier( ); + AddToList( process ); + process->CompleteProcessStartup( msg->u.request.u.startup.port_name, + msg->u.request.u.startup.os_pid, + msg->u.request.u.startup.event_messages, + msg->u.request.u.startup.system_messages, + false, + NULL ); + + msg->u.reply.type = ReplyType_Startup; + msg->u.reply.u.startup_info.nid = process->GetNid( ); + msg->u.reply.u.startup_info.pid = process->GetPid( ); + msg->u.reply.u.startup_info.verifier = process->GetVerifier( ); + strcpy( msg->u.reply.u.startup_info.process_name, process->GetName( ) ); + msg->u.reply.u.startup_info.return_code = MPI_SUCCESS; + STRCPY( msg->u.reply.u.startup_info.fifo_stdin, + process->fifo_stdin() ); + STRCPY( msg->u.reply.u.startup_info.fifo_stdout, + process->fifo_stdout() ); + STRCPY( msg->u.reply.u.startup_info.fifo_stderr, + process->fifo_stderr() ); + + Monitor->writeProcessMapBegin( process->GetName( ) + , process->GetNid( ) + , process->GetPid( ) + , process->GetVerifier( ) + , -1, -1, -1 + , msg->u.request.u.startup.program ); } - process->SetAttached ( true ); - process->SetupFifo(process->GetNid(), msg->u.request.u.startup.os_pid); - process->SetCreationTime(msg->u.request.u.startup.os_pid); - process->SetVerifier(); - AddToList( process ); - process->CompleteProcessStartup ( msg->u.request.u.startup.port_name, - msg->u.request.u.startup.os_pid, - msg->u.request.u.startup.event_messages, - msg->u.request.u.startup.system_messages, - false, - NULL ); - - msg->u.reply.type = ReplyType_Startup; - msg->u.reply.u.startup_info.nid = process->GetNid(); - msg->u.reply.u.startup_info.pid = process->GetPid(); - msg->u.reply.u.startup_info.verifier = process->GetVerifier(); - strcpy (msg->u.reply.u.startup_info.process_name, process->GetName()); - msg->u.reply.u.startup_info.return_code = MPI_SUCCESS; - STRCPY(msg->u.reply.u.startup_info.fifo_stdin, - process->fifo_stdin()); - STRCPY(msg->u.reply.u.startup_info.fifo_stdout, - process->fifo_stdout()); - STRCPY(msg->u.reply.u.startup_info.fifo_stderr, - process->fifo_stderr()); - - Monitor->writeProcessMapBegin( process->GetName() - , process->GetNid() - , process->GetPid() - , process->GetVerifier() - , -1, -1, -1 - , msg->u.request.u.startup.program ); } else { - snprintf(la_buf, sizeof(la_buf), - "[%s], Can't attach, node is a spare node!\n", - method_name); - mon_log_write(MON_PROCESSCONT_ATTACHPCHECK_3, SQ_LOG_ERR, la_buf); + snprintf( la_buf, sizeof(la_buf), + "[%s], Can't attach, node is a spare node!\n", + method_name ); + mon_log_write( MON_PROCESSCONT_ATTACHPCHECK_3, SQ_LOG_ERR, la_buf ); msg->u.reply.type = ReplyType_Startup; msg->u.reply.u.startup_info.nid = -1; @@ -3930,10 +3957,10 @@ void CProcessContainer::AttachProcessCheck ( struct message_def *msg ) else { // Find the duplicate process - snprintf(la_buf, sizeof(la_buf), + snprintf( la_buf, sizeof(la_buf), "[%s], Can't attach duplicate process %s!\n", - method_name, msg->u.request.u.startup.process_name); - mon_log_write(MON_PROCESSCONT_ATTACHPCHECK_4, SQ_LOG_ERR, la_buf); + method_name, msg->u.request.u.startup.process_name ); + mon_log_write( MON_PROCESSCONT_ATTACHPCHECK_4, SQ_LOG_ERR, la_buf ); msg->u.reply.type = ReplyType_Generic; msg->u.reply.u.generic.nid = -1; @@ -3941,7 +3968,7 @@ void CProcessContainer::AttachProcessCheck ( struct message_def *msg ) msg->u.reply.u.generic.verifier = -1; msg->u.reply.u.generic.process_name[0] = '\0'; msg->u.reply.u.generic.return_code = MPI_ERR_NAME; - } + } } // complete a monitor child process startup else http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/redirector.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/redirector.cxx b/core/sqf/monitor/linux/redirector.cxx index b3780cc..de27211 100644 --- a/core/sqf/monitor/linux/redirector.cxx +++ b/core/sqf/monitor/linux/redirector.cxx @@ -1288,7 +1288,7 @@ void CRedirectStderr::handleOutput(ssize_t count, char *buffer) buf[size-1] = '\n'; } } - mon_log_write(MON_REDIR_STDERR, SQ_LOG_INFO, buf); + mon_log_write(MON_REDIR_STDERR, SQ_LOG_DEBUG, buf); delete [] buf; } @@ -1764,7 +1764,7 @@ void CRedirector::stdinOn(int fd) TRACE_EXIT; } -void CRedirector::tryShutdownPipeFd(int pid, int fd) +void CRedirector::tryShutdownPipeFd(int pid, int fd, bool pv_delete_redirect) { const char method_name[] = "CRedirector::tryShutdownPipeFd"; TRACE_ENTRY; @@ -1784,9 +1784,12 @@ void CRedirector::tryShutdownPipeFd(int pid, int fd) redirect = iter->second; // bugcatcher, temp call - redirect->validateObj(); + if (redirect->pid() != 0) + redirect->validateObj(); - if (!redirect->active() && (pid == redirect->pid())) + if (((pv_delete_redirect) || + (!redirect->active())) && + (pid == redirect->pid())) { if (trace_settings & TRACE_REDIRECTION) trace_printf("%s@%d invoking shutdownPipeFd for fd=%d\n", http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/redirector.h ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/redirector.h b/core/sqf/monitor/linux/redirector.h index 1babca6..2bea30a 100644 --- a/core/sqf/monitor/linux/redirector.h +++ b/core/sqf/monitor/linux/redirector.h @@ -254,7 +254,7 @@ public: void stdinOff(int fd); void stdinOn(int fd); - void tryShutdownPipeFd(int pid, int fd); + void tryShutdownPipeFd(int pid, int fd, bool pv_delete_redirect); void disposeIoData(int fd, int count, char *buffer); http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/reqexit.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqexit.cxx b/core/sqf/monitor/linux/reqexit.cxx index 4cfdec5..8a487e8 100644 --- a/core/sqf/monitor/linux/reqexit.cxx +++ b/core/sqf/monitor/linux/reqexit.cxx @@ -96,6 +96,8 @@ void CExtExitReq::populateRequestString( void ) void CExtExitReq::performRequest() { bool status = FAILURE; + int target_nid = -1; + CLNode *target_lnode = NULL; const char method_name[] = "CExtExitReq::performRequest"; TRACE_ENTRY; @@ -115,8 +117,9 @@ void CExtExitReq::performRequest() , msg_->u.request.u.exit.verifier ); } - if ((msg_->u.request.u.exit.nid < 0) || - (msg_->u.request.u.exit.nid >= Nodes->GetLNodesConfigMax())) + target_nid = msg_->u.request.u.exit.nid; + target_lnode = Nodes->GetLNode( target_nid ); + if ( target_lnode == NULL ) { char buf[MON_STRING_BUF_SIZE]; sprintf(buf, "[CMonitor::ExitProcess], Invalid Node ID!\n"); http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/reqnewproc.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqnewproc.cxx b/core/sqf/monitor/linux/reqnewproc.cxx index 7cd35ea..afe1f38 100644 --- a/core/sqf/monitor/linux/reqnewproc.cxx +++ b/core/sqf/monitor/linux/reqnewproc.cxx @@ -86,6 +86,7 @@ void CExtNewProcReq::performRequest() CProcess *process = NULL; CNode *node = NULL; CLNode *lnode = NULL; + CLNode *target_lnode = NULL; CLNode *zone_lnode = NULL; char la_buf[MON_STRING_BUF_SIZE]; int result; @@ -116,10 +117,10 @@ void CExtNewProcReq::performRequest() if ( requester ) { target_nid = msg_->u.request.u.new_process.nid; + target_lnode = Nodes->GetLNode( target_nid ); if ( msg_->u.request.u.new_process.type == ProcessType_SSMP ) { - if (( msg_->u.request.u.new_process.nid < 0 || - msg_->u.request.u.new_process.nid >= Nodes->GetLNodesConfigMax() ) ) + if ( target_lnode == NULL ) { // Nid must be specified msg_->u.reply.type = ReplyType_NewProcess; @@ -150,8 +151,7 @@ void CExtNewProcReq::performRequest() } if ( msg_->u.request.u.new_process.type == ProcessType_DTM ) { - if (( msg_->u.request.u.new_process.nid < 0 || - msg_->u.request.u.new_process.nid >= Nodes->GetLNodesConfigMax() ) ) + if ( target_lnode == NULL ) { // Nid must be specified msg_->u.reply.type = ReplyType_NewProcess; @@ -189,8 +189,7 @@ void CExtNewProcReq::performRequest() } if ( msg_->u.request.u.new_process.type == ProcessType_SPX ) { - if (( msg_->u.request.u.new_process.nid < 0 || - msg_->u.request.u.new_process.nid >= Nodes->GetLNodesConfigMax() ) ) + if ( target_lnode == NULL ) { // Nid must be specified msg_->u.reply.type = ReplyType_NewProcess; @@ -350,9 +349,7 @@ void CExtNewProcReq::performRequest() } } } - else if (( msg_->u.request.u.new_process.type == ProcessType_DTM ) && - (( msg_->u.request.u.new_process.nid < 0 ) || - ( msg_->u.request.u.new_process.nid >= Nodes->GetLNodesConfigMax() ) ) ) + else if ( target_lnode == NULL ) { msg_->u.reply.type = ReplyType_NewProcess; msg_->u.reply.u.new_process.return_code = MPI_ERR_SPAWN; @@ -365,21 +362,6 @@ void CExtNewProcReq::performRequest() return; } - else if (( msg_->u.request.u.new_process.type != ProcessType_DTM ) && - (( msg_->u.request.u.new_process.nid < 0 ) || - ( msg_->u.request.u.new_process.nid >= Nodes->GetLNodesConfigMax() ) ) ) - { - msg_->u.reply.type = ReplyType_NewProcess; - msg_->u.reply.u.new_process.return_code = MPI_ERR_SPAWN; - // Send reply to requester - lioreply(msg_, pid_); - - sprintf(la_buf, "[%s], Invalid Node ID (%d).\n", method_name, - target_nid); - mon_log_write(MON_MONITOR_STARTPROCESS_7, SQ_LOG_ERR, la_buf); - - return; - } else { if( msg_->u.request.u.new_process.backup ) http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/reqopen.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqopen.cxx b/core/sqf/monitor/linux/reqopen.cxx index 56b13b2..494a0e3 100644 --- a/core/sqf/monitor/linux/reqopen.cxx +++ b/core/sqf/monitor/linux/reqopen.cxx @@ -189,13 +189,17 @@ bool CExtOpenReq::prepare() const char method_name[] = "CExtOpenReq::prepare"; TRACE_ENTRY; + int target_nid = -1; + CLNode *target_lnode = NULL; + if ( prepared_ == true ) { // Already did the prepare work earlier. return true; } - if ((msg_->u.request.u.open.nid < 0) || - (msg_->u.request.u.open.nid >= Nodes->GetLNodesConfigMax())) + target_nid = msg_->u.request.u.open.nid; + target_lnode = Nodes->GetLNode( target_nid ); + if ( target_lnode == NULL ) { char buf[MON_STRING_BUF_SIZE]; sprintf(buf, "%s, Invalid Node ID (%d)\n", method_name, http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/reqqueue.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/reqqueue.cxx b/core/sqf/monitor/linux/reqqueue.cxx index 764966f..becb0cd 100644 --- a/core/sqf/monitor/linux/reqqueue.cxx +++ b/core/sqf/monitor/linux/reqqueue.cxx @@ -1971,6 +1971,11 @@ CIntDownReq::CIntDownReq( int pnid ) { // Add eyecatcher sequence as a debugging aid memcpy(&eyecatcher_, "RQIP", 4); + + if ( pnid == MyPNID ) + { + SetReviveFlag(1); // allow this request to be processed during revive + } } CIntDownReq::~CIntDownReq() http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/shell.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/shell.cxx b/core/sqf/monitor/linux/shell.cxx index 7bbc6e3..5037dd3 100644 --- a/core/sqf/monitor/linux/shell.cxx +++ b/core/sqf/monitor/linux/shell.cxx @@ -44,6 +44,7 @@ using namespace std; #include <sys/stat.h> #include <sys/types.h> #include <sys/wait.h> +#include <string> #include "msgdef.h" #include "props.h" @@ -541,6 +542,102 @@ bool update_cluster_state( bool displayState, bool checkSpareColdStandby = true return( true ); } +bool update_node_state( char *nodeName, bool checkSpareColdStandby = true ) +{ + if ( strlen(nodeName) == 0 ) + { + return( false ); + } + + int rc, rc2; + char pnodename[MPI_MAX_PROCESSOR_NAME]; + CPhysicalNode *physicalNode; + PhysicalNodeNameMap_t::iterator it; + CCmsh cmshcmd( "sqnodestatus" ); + + strncpy(pnodename, nodeName, MPI_MAX_PROCESSOR_NAME); + pnodename[MPI_MAX_PROCESSOR_NAME-1] = '\0'; + + // Look up name + it = PhysicalNodeMap.find( pnodename ); + + if (it != PhysicalNodeMap.end()) + { + physicalNode = it->second; + } + else + { + printf( "[%s] Error: Internal error while looking up physical node map, node name does not exist, node name=%s\n", MyName, pnodename ); + return( false ); + } + + // save, close and restore stdin when executing ssh command + // because ssh, by design, would consume contents of stdin. + int savedStdIn = dup(STDIN_FILENO); + if ( savedStdIn == -1 ) + { + fprintf(stderr, "[%s] Error: dup() failed for STDIN_FILENO: %s (%d)\n", MyName, strerror(errno), errno ); + exit(1); + } + close(STDIN_FILENO); + + rc = cmshcmd.GetNodeState( nodeName, physicalNode ); + rc2 = dup2(savedStdIn, STDIN_FILENO); + if ( rc2 == -1 ) + { + fprintf(stderr, "[%s] Error: dup2() failed for STDIN_FILENO: %s (%d)\n", MyName, strerror(errno), errno ); + exit(1); + } + close(savedStdIn); + + if ( rc == -1 ) + { + return( false ); + } + + NodeState_t nodeState; + CPNodeConfig *pnodeConfig = ClusterConfig.GetPNodeConfig( nodeName ); + if ( pnodeConfig ) + { + if ( get_pnode_state( PNode[pnodeConfig->GetPNid()], nodeState ) ) + { + if ( nodeState == StateUp ) + { + if ( checkSpareColdStandby && SpareNodeColdStandby ) + { + if ( pnodeConfig && pnodeConfig->IsSpareNode() ) + { + ++NumDown; + NodeState[pnodeConfig->GetPNid()] = false; + nodeState = StateDown; + set_pnode_state( PNode[pnodeConfig->GetPNid()], nodeState ); + } + else + { + NodeState[pnodeConfig->GetPNid()] = true; + } + } + else + { + NodeState[pnodeConfig->GetPNid()] = true; + } + } + else + { + NodeState[pnodeConfig->GetPNid()] = false; + ++NumDown; + } + } + } + else + { + printf( "[%s] Physical node configuration does not exist, node name=%s\n", MyName, nodeName ); + return( false ); + } + + return( true ); +} + int mon_log_write(int pv_event_type, posix_sqlog_severity_t pv_severity, char *pp_string) { pv_event_type = pv_event_type; @@ -3770,8 +3867,8 @@ int node_up( int nid, char *node_name, bool nowait ) // If this is a real cluster if ( nid == -1 ) { - // Get current physical state of all nodes - if ( !update_cluster_state( true, false ) ) + // Get current physical state of target nodes + if ( !update_node_state( node_name, false ) ) { return( rc ) ; } http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/tcdbsqlite.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/tcdbsqlite.cxx b/core/sqf/monitor/linux/tcdbsqlite.cxx index d53f602..ff18cdd 100644 --- a/core/sqf/monitor/linux/tcdbsqlite.cxx +++ b/core/sqf/monitor/linux/tcdbsqlite.cxx @@ -2507,10 +2507,6 @@ int CTcdbSqlite::GetUniqueString( int nid, int id, const char *uniqStr ) } else { - if ( prepStmt != NULL ) - { - sqlite3_finalize( prepStmt ); - } char buf[TC_LOG_BUF_SIZE]; snprintf( buf, sizeof(buf) , "[%s] (%s) failed, nid=%d, id=%d, error: %s\n" @@ -2631,10 +2627,6 @@ int CTcdbSqlite::GetUniqueStringId( int nid } else { - if ( prepStmt != NULL ) - { - sqlite3_finalize( prepStmt ); - } char buf[TC_LOG_BUF_SIZE]; snprintf( buf, sizeof(buf) , "[%s] (%s) failed, nid=%d, id=%d, error: %s\n" http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/tmsync.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/tmsync.cxx b/core/sqf/monitor/linux/tmsync.cxx index 3e72241..60d9f40 100644 --- a/core/sqf/monitor/linux/tmsync.cxx +++ b/core/sqf/monitor/linux/tmsync.cxx @@ -661,31 +661,41 @@ void CTmSync_Container::ProcessTmSyncReply( struct message_def * msg ) if (trace_settings & (TRACE_REQUEST | TRACE_TMSYNC)) trace_printf("%s@%d - Unsolicited TmSync reply, handle=%d\n", method_name, __LINE__, tmsync_req->Handle); - tmsync_req->Completed = true; - UnsolicitedComplete( msg ); - if ( msg->u.reply.u.unsolicited_tm_sync.return_code != MPI_SUCCESS ) + if (msg->u.reply.u.unsolicited_tm_sync.return_code == MPI_SUCCESS) { TmSyncReplyCode |= msg->u.reply.u.unsolicited_tm_sync.return_code; - } - if ( TmSyncPNid == MyPNID ) - { - if (trace_settings & (TRACE_REQUEST | TRACE_TMSYNC)) - trace_printf("%s@%d - Local Unsolicited TmSync reply, handle=" - "%d\n", method_name, __LINE__, - tmsync_req->Handle); - if ( GetTmSyncReplies() == GetTotalSlaveTmSyncCount() ) + tmsync_req->Completed = true; + UnsolicitedComplete( msg ); + if ( TmSyncPNid == MyPNID ) { - UpdateTmSyncState( TmSyncReplyCode ); - UnsolicitedCompleteDone(); + if (trace_settings & (TRACE_REQUEST | TRACE_TMSYNC)) + trace_printf("%s@%d - Local Unsolicited TmSync reply, handle=" + "%d\n", method_name, __LINE__, + tmsync_req->Handle); + if ( GetTmSyncReplies() == GetTotalSlaveTmSyncCount() ) + { + UpdateTmSyncState( TmSyncReplyCode ); + UnsolicitedCompleteDone(); + } } - } - else - { - if ( GetTmSyncReplies() == GetTotalSlaveTmSyncCount() ) + else { - CommitTmDataBlock(TmSyncReplyCode); + if ( GetTmSyncReplies() == GetTotalSlaveTmSyncCount() ) + { + CommitTmDataBlock(TmSyncReplyCode); + } } } + else + { // The Seabed callback has not been registered, try again + if (trace_settings & (TRACE_REQUEST | TRACE_TMSYNC)) + trace_printf("%s@%d - Retrying Local Unsolicited TmSync, handle=" + "%d\n", method_name, __LINE__, + tmsync_req->Handle); + PendingSlaveTmSyncCount--; + tmsync_req->Completed = false; + SendUnsolicitedMessages(); + } } else { http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/monitor/linux/zclient.cxx ---------------------------------------------------------------------- diff --git a/core/sqf/monitor/linux/zclient.cxx b/core/sqf/monitor/linux/zclient.cxx index 23dca8a..36a0600 100644 --- a/core/sqf/monitor/linux/zclient.cxx +++ b/core/sqf/monitor/linux/zclient.cxx @@ -650,7 +650,7 @@ void CZClient::HandleExpiredZNode( void ) monZnode.assign( znodeQueue_.front() ); - if (trace_settings) + if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) { trace_printf("%s@%d" " - znodePath=%s, znodeQueue_.size=%ld\n" , method_name, __LINE__ @@ -659,10 +659,6 @@ void CZClient::HandleExpiredZNode( void ) znodeQueue_.pop_front(); - trace_printf( "%s@%d" " - Checking znode=%s\n" - , method_name, __LINE__ - , monZnode.c_str() ); - strcpy( pathStr, monZnode.c_str() ); tknStart++; // skip the first '/' http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/sql/scripts/sqnodestatus ---------------------------------------------------------------------- diff --git a/core/sqf/sql/scripts/sqnodestatus b/core/sqf/sql/scripts/sqnodestatus index 44dc93f..56511fc 100755 --- a/core/sqf/sql/scripts/sqnodestatus +++ b/core/sqf/sql/scripts/sqnodestatus @@ -34,25 +34,34 @@ my $node_context=readpipe("trafconf -name"); my %node_hash=(); my $sq_mon_ssh_options=readpipe("echo -n \$SQ_MON_SSH_OPTIONS"); my $json=$ARGV[0]; +my $node_name=$ARGV[1]; &main(); sub main() { - #$node_context=~s/-w//ig; - #print "node_context=${node_context}"; - chomp($node_context); - my @nodes=split(' ',$node_context); - foreach my $node(@nodes) + #print "json=${json}\n"; + #print "node_name=${node_name}\n"; + if ($ARGV[0] ne '-n') { - $check_flag=check_node_status($node); + #print "node_context=${node_context}"; + chomp($node_context); + my @nodes=split(' ',$node_context); + foreach my $node(@nodes) + { + $check_flag=check_node_status($node); + } + } + else + { + $check_flag=check_node_status($node_name); } print_node_status(); } sub print_node_status() { - if ($json) + if ($json eq '-json') { $comma=""; print "["; http://git-wip-us.apache.org/repos/asf/trafodion/blob/e832d827/core/sqf/src/seabed/src/msmon.cpp ---------------------------------------------------------------------- diff --git a/core/sqf/src/seabed/src/msmon.cpp b/core/sqf/src/seabed/src/msmon.cpp index 311e099..cad7146 100644 --- a/core/sqf/src/seabed/src/msmon.cpp +++ b/core/sqf/src/seabed/src/msmon.cpp @@ -6609,7 +6609,7 @@ void msg_mon_recv_unsol_msg_loc_cbt(Mon_Msg_Type *pp_msg, int) { } else { if (gv_ms_trace_mon) trace_where_printf(WHERE, "no tmsync callback, replying with error\n"); - lv_handle = -1; + lv_handle = pp_msg->u.request.u.unsolicited_tm_sync.handle; lv_cbret = 1; // set error } lv_err = gp_local_mon_io->acquire_msg(&lp_msg);
