More monitor p2p fixes to address Seabed test failures in virtual and real 
cluster testing.


Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/251f3f57
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/251f3f57
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/251f3f57

Branch: refs/heads/master
Commit: 251f3f57d62915b14ad365af85afe29083940076
Parents: 7d6c0dd
Author: Zalo Correa <[email protected]>
Authored: Tue Apr 10 17:54:38 2018 -0700
Committer: Zalo Correa <[email protected]>
Committed: Tue Apr 10 17:54:38 2018 -0700

----------------------------------------------------------------------
 core/sqf/monitor/linux/cluster.cxx         |  10 ++
 core/sqf/monitor/linux/internal.h          |  14 +++
 core/sqf/monitor/linux/monitor.cxx         |  13 ++
 core/sqf/monitor/linux/msgdef.h            |   2 +
 core/sqf/monitor/linux/nameserver.cxx      |  11 +-
 core/sqf/monitor/linux/nscommacceptmon.cxx |   2 +
 core/sqf/monitor/linux/nsreqdelproc.cxx    |  46 ++++---
 core/sqf/monitor/linux/nsreqnewproc.cxx    |   2 +-
 core/sqf/monitor/linux/pnode.h             |   3 +
 core/sqf/monitor/linux/process.cxx         |  97 +++++++++++----
 core/sqf/monitor/linux/process.h           |   2 +-
 core/sqf/monitor/linux/replicate.cxx       |  76 ++++++++++-
 core/sqf/monitor/linux/replicate.h         |  26 ++++
 core/sqf/monitor/linux/reqqueue.cxx        | 159 ++++++++++++++++++++++--
 core/sqf/monitor/linux/reqqueue.h          |  30 +++++
 core/sqf/monitor/test/monitor.env          |   2 +-
 16 files changed, 438 insertions(+), 57 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/cluster.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/cluster.cxx b/core/sqf/monitor/linux/cluster.cxx
index aa3062c..514771d 100644
--- a/core/sqf/monitor/linux/cluster.cxx
+++ b/core/sqf/monitor/linux/cluster.cxx
@@ -1752,6 +1752,7 @@ int CCluster::SoftNodeUpPrepare( int pnid )
     {
         SMSIntegrating = true;
 #ifndef NAMESERVER_PROCESS
+        node->SetSoftNodeUp( );
         Monitor->StartPrimitiveProcesses();
 #endif
         // Let other monitors know this node is preparing to soft up
@@ -2289,7 +2290,11 @@ void CCluster::HandleOtherNodeMsg (struct internal_msg_def *recv_msg,
     case InternalType_Exit:
         if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
             trace_printf("%s@%d - Internal exit request for %s (%d, %d)\n", 
method_name, __LINE__, recv_msg->u.exit.name, recv_msg->u.exit.nid, 
recv_msg->u.exit.pid);
+#ifndef NAMESERVER_PROCESS
         ReqQueue.enqueueExitReq( &recv_msg->u.exit );
+#else
+        ReqQueue.enqueueExitNsReq( &recv_msg->u.exit_ns );
+#endif
         break;
 
 #ifndef NAMESERVER_PROCESS
@@ -2880,6 +2885,11 @@ void CCluster::HandleMyNodeMsg (struct internal_msg_def *recv_msg,
     case InternalType_Exit:
         // Final process exit logic is done in Process_Exit, not here
         // as in the past.
+        if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
+            trace_printf("%s@%d - Internal exit request for %s (%d, %d)\n", 
method_name, __LINE__, recv_msg->u.exit_ns.name, recv_msg->u.exit_ns.nid, 
recv_msg->u.exit_ns.pid);
+#ifdef NAMESERVER_PROCESS
+        ReqQueue.enqueueExitNsReq( &recv_msg->u.exit_ns );
+#endif
         break;
 
 #ifndef NAMESERVER_PROCESS

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/internal.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/internal.h b/core/sqf/monitor/linux/internal.h
index 505e1c3..6d28b29 100644
--- a/core/sqf/monitor/linux/internal.h
+++ b/core/sqf/monitor/linux/internal.h
@@ -92,6 +92,7 @@ typedef enum {
     State_Default=0,
     State_Quiesce,                  // node quiesce state while going down
     State_SoftDown,                 // node soft down on DTM abort -> restart
+    State_SoftUp,                   // node soft up on DTM restart
     State_Ready_To_Exit
 } IntNodeState; 
 
@@ -176,6 +177,18 @@ struct exit_def
     bool abended;
 };
 
+struct exit_ns_def
+{
+    int nid;                        // Node id of process exiting
+    int pid;                        // Process id of process exiting
+    Verifier_t verifier;            // Verifier of the process exiting
+    char name[MAX_PROCESS_NAME];    // Name of process exiting
+    bool abended;
+    int sockFd;                     // monitor socket fd to reply
+    int origPNid;                   // pnid of nameserver which processed request
+    struct message_def *msg;        // requester's request buffer to reply
+};
+
 struct event_def
 {
     int  nid;                        // Nid id of process to receive event
@@ -433,6 +446,7 @@ struct internal_msg_def
         struct down_def    down;
         struct dump_def    dump;
         struct exit_def    exit;
+        struct exit_ns_def exit_ns;
         struct event_def   event;
         ioData_t           iodata;
         struct kill_def    kill;

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/monitor.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/monitor.cxx b/core/sqf/monitor/linux/monitor.cxx
index d97a3f0..578d158 100755
--- a/core/sqf/monitor/linux/monitor.cxx
+++ b/core/sqf/monitor/linux/monitor.cxx
@@ -1791,7 +1791,20 @@ int main (int argc, char *argv[])
 #endif
                 if (monitorPort)
                 {
+#ifdef NAMESERVER_PROCESS
+                    if ( IsRealCluster )
+                    {
+                        strcpy( IntegratingMonitorPort, MasterMonitorName);
+                    }
+                    else
+                    {
+                        char localHost[MAX_PROCESSOR_NAME];
+                        gethostname( localHost, MAX_PROCESSOR_NAME );
+                        strcpy( IntegratingMonitorPort, localHost);
+                    }
+#else
                     strcpy( IntegratingMonitorPort, MasterMonitorName);
+#endif
                     strcat( IntegratingMonitorPort, ":");
                     strcat( IntegratingMonitorPort, monitorPort);
                 }

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/msgdef.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/msgdef.h b/core/sqf/monitor/linux/msgdef.h
index 98f11c8..4b34557 100644
--- a/core/sqf/monitor/linux/msgdef.h
+++ b/core/sqf/monitor/linux/msgdef.h
@@ -332,6 +332,7 @@ typedef enum {
                                             // types, add any new message types
                                             // before this one
 } MSGTYPE;
+
 typedef TcProcessType_t PROCESSTYPE;
 
 typedef enum {
@@ -369,6 +370,7 @@ struct DelProcessNs_def
     int  target_pid;                        // Process id of process to delete
     Verifier_t target_verifier;             // Process verifier of processes to delete
     char target_process_name[MAX_PROCESS_NAME];    // Name of process to delete
+    bool target_abended;                    // True if process aborted
 };
 
 struct DelProcessNs_reply_def

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/nameserver.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nameserver.cxx b/core/sqf/monitor/linux/nameserver.cxx
index 1815409..93f99b3 100644
--- a/core/sqf/monitor/linux/nameserver.cxx
+++ b/core/sqf/monitor/linux/nameserver.cxx
@@ -513,6 +513,7 @@ int CNameServer::ProcessDelete(CProcess* process )
     msgdel->target_pid = msgdel->pid;
     msgdel->target_verifier = msgdel->verifier;
     strcpy( msgdel->target_process_name, msgdel->process_name );
+    msgdel->target_abended = process->IsAbended();
 
     int error = SendReceive(&msg );
 
@@ -636,13 +637,14 @@ int CNameServer::ProcessShutdown( void )
 int CNameServer::SendReceive( struct message_def* msg )
 {
     const char method_name[] = "CNameServer::SendReceive";
-    char desc[100];
+    char desc[256];
     char* descp;
     struct DelProcessNs_def *msgdel;
     struct NewProcessNs_def *msgnew;
     struct ShutdownNs_def *msgshutdown;
     struct NameServerStart_def *msgstart;
     struct NameServerStop_def *msgstop;
+    struct ProcessInfo_def *msginfo;
 
     TRACE_ENTRY;
 
@@ -675,7 +677,12 @@ int CNameServer::SendReceive( struct message_def* msg )
         size += sizeof(msg->u.request.u.new_process_ns);
         break;
     case ReqType_ProcessInfo:
-        descp = (char *) "process-info";
+        msginfo = &msg->u.request.u.process_info;
+        sprintf( desc, "process-info (nid=%d, pid=%d, verifier=%d, name=%s)\n"
+                       "\ttarget (nid=%d, pid=%d, verifier=%d, name=%s, 
type=%d)\n",
+                msginfo->nid, msginfo->pid, msginfo->verifier, 
msginfo->process_name,
+                msginfo->target_nid, msginfo->target_pid, 
msginfo->target_verifier, 
+                msginfo->target_process_name, msginfo->type );
         size += sizeof(msg->u.request.u.process_info);
         break;
     case ReqType_ProcessInfoCont:

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/nscommacceptmon.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nscommacceptmon.cxx b/core/sqf/monitor/linux/nscommacceptmon.cxx
index 763d324..3482608 100644
--- a/core/sqf/monitor/linux/nscommacceptmon.cxx
+++ b/core/sqf/monitor/linux/nscommacceptmon.cxx
@@ -85,6 +85,7 @@ void CCommAcceptMon::monReqDeleteProcess( struct message_def* msg, int sockFd )
                       "        msg.del_process_ns.target_pid=%d\n"
                       "        msg.del_process_ns.target_verifier=%d\n"
                       "        msg.del_process_ns.target_process_name=%s\n"
+                      "        msg.del_process_ns.target_abended=%d\n"
                     , method_name, __LINE__
                     , msg->u.request.u.del_process_ns.nid
                     , msg->u.request.u.del_process_ns.pid
@@ -94,6 +95,7 @@ void CCommAcceptMon::monReqDeleteProcess( struct message_def* msg, int sockFd )
                     , msg->u.request.u.del_process_ns.target_pid
                     , msg->u.request.u.del_process_ns.target_verifier
                     , msg->u.request.u.del_process_ns.target_process_name
+                    , msg->u.request.u.del_process_ns.target_abended
                     );
     }
 

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/nsreqdelproc.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nsreqdelproc.cxx b/core/sqf/monitor/linux/nsreqdelproc.cxx
index 8de0c6a..05585cf 100644
--- a/core/sqf/monitor/linux/nsreqdelproc.cxx
+++ b/core/sqf/monitor/linux/nsreqdelproc.cxx
@@ -56,7 +56,7 @@ CExtDelProcessNsReq::~CExtDelProcessNsReq()
 
 void CExtDelProcessNsReq::populateRequestString( void )
 {
-    char strBuf[MON_STRING_BUF_SIZE/2] = { 0 };
+    char strBuf[MON_STRING_BUF_SIZE] = { 0 };
 
     snprintf( strBuf, sizeof(strBuf), 
               "ExtReq(%s) req #=%ld "
@@ -93,7 +93,7 @@ void CExtDelProcessNsReq::performRequest()
     if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
     {
         trace_printf( "%s@%d request #%ld: Delete, requester %s (%d, %d:%d), "
-                      "target %s (%d, %d:%d)\n",
+                      "target %s (%d, %d:%d), abended=%d\n",
                       method_name, __LINE__, id_,
                       msg_->u.request.u.del_process_ns.process_name,
                       msg_->u.request.u.del_process_ns.nid,
@@ -102,13 +102,15 @@ void CExtDelProcessNsReq::performRequest()
                       msg_->u.request.u.del_process_ns.target_process_name,
                       msg_->u.request.u.del_process_ns.target_nid,
                       msg_->u.request.u.del_process_ns.target_pid,
-                      msg_->u.request.u.del_process_ns.target_verifier);
+                      msg_->u.request.u.del_process_ns.target_verifier,
+                      msg_->u.request.u.del_process_ns.target_abended);
     }
 
     nid_ = msg_->u.request.u.del_process_ns.nid;
     verifier_ = msg_->u.request.u.del_process_ns.verifier;
     processName_ = msg_->u.request.u.del_process_ns.process_name;
 
+    bool      target_abended = msg_->u.request.u.del_process_ns.target_abended;
     int       target_nid = -1;
     int       target_pid = -1;
     string    target_process_name;
@@ -128,18 +130,28 @@ void CExtDelProcessNsReq::performRequest()
     if (process)
     {
         CNode * node = Nodes->GetLNode (process->GetNid())->GetNode();
-        node->DelFromNameMap ( process );
-        node->DelFromPidMap ( process );
-        process->SetState (State_Stopped);
-        // Replicate the exit to other nodes
-        CReplExit *repl = new CReplExit(process->GetNid(),
-                                        process->GetPid(),
-                                        process->GetVerifier(),
-                                        process->GetName(),
-                                        process->IsAbended());
-        Replicator.addItem(repl);
-        process->SetDeletePending ( true );
-        node->DeleteFromList( process );
+
+        // Note: process object is deleted by Exit_Process, so use 
+        //       target_* values to replicate
+        node->Exit_Process( process, target_abended, -1 );
+        // Replicate the exit to other name servers 
+        CReplExitNs *repl = new CReplExitNs(target_nid,
+                                            target_pid,
+                                            target_verifier,
+                                            target_process_name.c_str(),
+                                            target_abended,
+                                            msg_,
+                                            sockFd_,
+                                            MyPNID );
+        if (repl)
+        {
+            // we will not reply at this time ... but wait for 
+            // exit request to be processed in CIntExitNsReq
+
+            msg_->noreply = true;
+
+            Replicator.addItem(repl);
+        }
 
         msg_->u.reply.type = ReplyType_DelProcessNs;
         msg_->u.reply.u.del_process_ns.nid = 
msg_->u.request.u.del_process_ns.target_nid;
@@ -172,10 +184,8 @@ void CExtDelProcessNsReq::performRequest()
         msg_->u.reply.u.del_process_ns.return_code = MPI_ERR_NAME;
         if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
            trace_printf("%s@%d - unsuccessful\n", method_name, __LINE__);
+        monreply(msg_, sockFd_);
     }
 
-    // Send reply to requester
-    monreply(msg_, sockFd_);
-
     TRACE_EXIT;
 }

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/nsreqnewproc.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nsreqnewproc.cxx b/core/sqf/monitor/linux/nsreqnewproc.cxx
index 682823b..2231fc0 100644
--- a/core/sqf/monitor/linux/nsreqnewproc.cxx
+++ b/core/sqf/monitor/linux/nsreqnewproc.cxx
@@ -148,7 +148,7 @@ void CExtNewProcNsReq::performRequest()
             if (repl)
             {
                 // we will not reply at this time ... but wait for 
-                // node add to be processed in CIntNodeAddReq
+                // new process request to be processed in CIntCloneProcNsReq
     
                 // Retain reference to requester's request buffer so can
                 // send completion message.

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/pnode.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/pnode.h b/core/sqf/monitor/linux/pnode.h
index eb4829e..b13fbef 100644
--- a/core/sqf/monitor/linux/pnode.h
+++ b/core/sqf/monitor/linux/pnode.h
@@ -270,12 +270,14 @@ public:
     inline bool  IsRankFailure( void ) { return( rankFailure_ ); }
     inline bool  IsSpareNode( void ) { return( spareNode_ ); }
     inline bool  IsSoftNodeDown( void ) { return( internalState_ == 
State_SoftDown ); }
+    inline bool  IsSoftNodeUp( void ) { return( internalState_ == State_SoftUp 
); }
 
     CNode  *Link( CNode *entry );
     void    MoveLNodes( CNode *targetNode );
     inline void ResetSpareNode( void ) { spareNode_ = false; }
     void    ResetWatchdogTimer( void );
     inline void ResetSoftNodeDown( void ) { internalState_ = State_Default; }
+    inline void ResetSoftNodeUp( void ) { internalState_ = State_Default; }
     inline void SetActivatingSpare( int activatingSpare ) { activatingSpare_ = 
activatingSpare; }
     void    SetAffinity( int nid, pid_t pid, PROCESSTYPE type );
     void    SetAffinity( CProcess *process );
@@ -306,6 +308,7 @@ public:
     inline void SetNumCores( int numCores ) { numCores_ = numCores; }
     inline void SetPhase( NodePhase phase ) { phase_ = phase; }
     inline void SetSoftNodeDown( void ) { internalState_ = State_SoftDown; }
+    inline void SetSoftNodeUp( void ) { internalState_ = State_SoftUp; }
     inline void SetSparePNids( PNidVector &sparePNids ) { sparePNids_ = 
sparePNids; }
     inline void SetRank( int rank ) { rank_ = rank; }
     inline void SetRankFailure( bool failed ) { rankFailure_ = failed; 

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/process.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/process.cxx b/core/sqf/monitor/linux/process.cxx
index 34ec848..fce323c 100644
--- a/core/sqf/monitor/linux/process.cxx
+++ b/core/sqf/monitor/linux/process.cxx
@@ -2915,6 +2915,7 @@ struct message_def * CProcess::DeathMessage( )
 }
 #endif
 
+#ifndef NAMESERVER_PROCESS
 void CProcess::Exit( CProcess *parent )
 {
     char la_buf[MON_STRING_BUF_SIZE];
@@ -2922,12 +2923,10 @@ void CProcess::Exit( CProcess *parent )
     const char method_name[] = "CProcess::Exit";
     TRACE_ENTRY;
 
-#ifndef NAMESERVER_PROCESS
     if ( DumpState != Dump_Ready )
     {
         DumpEnd( Dump_Failed, (char *)corefile_.c_str() );
     }
-#endif
 
     if (trace_settings & (TRACE_SYNC | TRACE_REQUEST | TRACE_PROCESS))
        trace_printf( "%s@%d" " - Process %s (%d,%d:%d) is exiting, parent 
process %s (%d,%d:%d)\n"
@@ -2952,7 +2951,6 @@ void CProcess::Exit( CProcess *parent )
                                         : !node->IsKillingNode();
     }
 
-#ifndef NAMESERVER_PROCESS
     if(  NoticeHead &&
         !MyNode->IsKillingNode() &&
         !(Type == ProcessType_DTM && IsAbended()) &&
@@ -2966,7 +2964,6 @@ void CProcess::Exit( CProcess *parent )
         // Notify all local registered processes of this process' death
         NoticeHead->NotifyAll();
     }
-#endif
 
     if ( !Clone && !Paired )
     {
@@ -2974,14 +2971,11 @@ void CProcess::Exit( CProcess *parent )
         {
             case ProcessType_TSE:
             case ProcessType_ASE:
-#ifndef NAMESERVER_PROCESS
                 MyNode->delFromQuiesceExitPids( GetPid(), GetVerifier() );
-#endif
 
                 if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL 
| TRACE_REQUEST_DETAIL))
                     trace_printf("%s%d: pid %d deleted from quiesce exit 
list\n", method_name, __LINE__, GetPid());
 
-#ifndef NAMESERVER_PROCESS
                 if (MyNode->isInQuiesceState())
                 {
                     if (MyNode->isQuiesceExitPidsEmpty())
@@ -2989,12 +2983,9 @@ void CProcess::Exit( CProcess *parent )
                         HealthCheck.setState(MON_SCHED_NODE_DOWN);  // 
schedule a node down req
                     }
                 }
-#endif
                 else
                 {   // unmount volumes only if node is not quiescing.
-#ifndef NAMESERVER_PROCESS
                     Devices->UnMountVolume( Name, Backup );
-#endif
                 }
                 break;
             case ProcessType_DTM:
@@ -3022,7 +3013,6 @@ void CProcess::Exit( CProcess *parent )
                     }
                     else
                     {
-#ifndef NAMESERVER_PROCESS
                         if ( Monitor->GetTmLeader() == MyPNID )
                         {
                             // set the clean shutdown condition
@@ -3032,7 +3022,6 @@ void CProcess::Exit( CProcess *parent )
                             strcpy(value,"True");
                             Config->GetClusterGroup()->Set( key, value );
                         }
-#endif
                     }
                 }
                 break;
@@ -3100,12 +3089,10 @@ void CProcess::Exit( CProcess *parent )
                                 trace_printf("%s@%d: Queueing death notice for 
SSMP process for %s (%d, %d:%d)\n",
                                              method_name, __LINE__, Name, Nid, 
Pid, Verifier);
 
-#ifndef NAMESERVER_PROCESS
                             ssmpProcess->ssmpNoticesLock_.lock();
                             ssmpProcess->ssmpNotices_.push_back( 
DeathMessage() );
                             ssmpProcess->ssmpNoticesLock_.unlock();
                             SQ_theLocalIOToClient->nudgeNotifier ();
-#endif
                         }
                         else
                         {
@@ -3146,15 +3133,14 @@ void CProcess::Exit( CProcess *parent )
         // Check if we need to output a entry into the process id map log file
         if ( PidMap )
         {
-#ifndef NAMESERVER_PROCESS
             Monitor->writeProcessMapEnd( Name, Nid, Pid, Verifier,
                                          parent ? parent->GetNid() : -1,
                                          parent ? parent->GetPid() : -1,
                                          parent ? parent->GetVerifier() : -1,
                                          program() );
-#endif
         }
     }
+
     if ( Clone && Pid != -1 )
     {
         if ( Type == ProcessType_SPX &&
@@ -3168,12 +3154,10 @@ void CProcess::Exit( CProcess *parent )
                 CProcess *spxProcess = lnode->GetProcessLByType( 
ProcessType_SPX );
                 if ( spxProcess && MyNode->GetState() == State_Up )
                 {
-#ifndef NAMESERVER_PROCESS
                     SQ_theLocalIOToClient->putOnNoticeQueue( spxProcess->Pid
                                                            , 
spxProcess->Verifier
                                                            , DeathMessage()
                                                            , NULL);
-#endif
 
                     if (trace_settings & (TRACE_SYNC_DETAIL | 
TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                        trace_printf( "%s@%d" " - Sending death message of %s 
(%d,%d:%d) to %s (%d,%d:%d)\n"
@@ -3209,12 +3193,10 @@ void CProcess::Exit( CProcess *parent )
                 CProcess *tmProcess = lnode->GetProcessLByType( 
ProcessType_DTM );
                 if ( tmProcess && MyNode->GetState() == State_Up )
                 {
-#ifndef NAMESERVER_PROCESS
                     SQ_theLocalIOToClient->putOnNoticeQueue( tmProcess->Pid
                                                            , 
tmProcess->Verifier
                                                            , DeathMessage()
                                                            , NULL);
-#endif
 
                     if (trace_settings & (TRACE_SYNC_DETAIL | 
TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
                        trace_printf( "%s@%d" " - Sending death message of %s 
(%d,%d:%d) to %s (%d,%d:%d)\n"
@@ -3246,12 +3228,10 @@ void CProcess::Exit( CProcess *parent )
                parent->GetType()  == ProcessType_DTM) &&
              supplyProcessDeathNotices )
         {
-#ifndef NAMESERVER_PROCESS
             SQ_theLocalIOToClient->putOnNoticeQueue( parent->Pid
                                                    , parent->Verifier
                                                    , DeathMessage()
                                                    , NULL);
-#endif
 
             if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | 
TRACE_PROCESS_DETAIL))
                trace_printf( "%s@%d" " - Sending death message of %s 
(%d,%d:%d) to %s (%d,%d:%d) \n"
@@ -3267,7 +3247,6 @@ void CProcess::Exit( CProcess *parent )
         }
     }
 
-#ifndef NAMESERVER_PROCESS
     if (NameServerEnabled)
     {
         if ( parent )
@@ -3304,10 +3283,10 @@ void CProcess::Exit( CProcess *parent )
         }
         procExitNotifierNodes();
     }
-#endif
 
     TRACE_EXIT;
 }
+#endif
 
 #ifndef NAMESERVER_PROCESS
 void CProcess::GenerateEvent( int event_id, int length, char *data )
@@ -5371,6 +5350,72 @@ void CProcessContainer::Exit_Process (CProcess *process, bool abend, int downNod
 }
 #endif
 
+#ifdef NAMESERVER_PROCESS
+void CProcessContainer::Exit_Process (CProcess *process, bool abend, int 
downNode)
+{
+    const char method_name[] = "CProcessContainer::Exit_Process(process)";
+    TRACE_ENTRY;
+
+    char la_buf[MON_STRING_BUF_SIZE];
+    CProcess *parent = NULL;
+
+    if (process)
+    {
+        if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | 
TRACE_PROCESS_DETAIL))
+            trace_printf( "%s@%d - Process %s (abended=%d) is exiting, 
abend=%d, downNode=%d\n"
+                        , method_name, __LINE__
+                        , process->GetName()
+                        , process->IsAbended()
+                        , abend
+                        , downNode );
+
+        if ( process->GetState() == State_Down && abend && 
!process->IsAbended() )
+        {
+            process->SetAbended( abend );
+        }
+        if (process->GetNid() == downNode && !process->IsAbended() )
+        {
+            process->SetAbended( abend );
+        }
+
+        if ( numProcs_ <= 0 )
+        {
+            snprintf(la_buf, sizeof(la_buf),
+                     "[%s], Node's process count is invalid, aborting\n",
+                     method_name);
+            mon_log_write(MON_PROCESSCONT_EXITPROCESS_1, SQ_LOG_ERR, la_buf);
+            abort();
+        }
+
+        if ( process->GetState() == State_Stopped )
+        {
+            if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | 
TRACE_PROCESS_DETAIL))
+                trace_printf("%s@%d" " - Process " "%s" " already exited." 
"\n", method_name, __LINE__, process->GetName());
+            return;
+        }
+
+        if ( parent == NULL)
+        {
+            parent = Nodes->GetProcess( process->GetParentNid(),
+                                        process->GetParentPid() );
+        }
+
+        // Handle the process termination
+        process->Switch( parent ); // switch process pair roles if needed
+        process->SetDeletePending ( true );
+
+        CNode *node;
+        node = Nodes->GetLNode(process->GetNid())->GetNode();
+        node->DelFromNameMap ( process );
+        node->DelFromPidMap ( process );
+        node->DeleteFromList( process );
+    }
+    TRACE_EXIT;
+
+    return;
+}
+#endif
+
 CProcess *CProcessContainer::GetProcess (int pid)
 {
     const char method_name[] = "CProcessContainer::GetProcess (pid)";
@@ -5822,7 +5867,7 @@ void CProcessContainer::KillAllDownSoft()
         }
 
         // valid for virtual cluster or soft node down only.
-        if ( type != ProcessType_DTM )
+        if ( type != ProcessType_DTM && type != ProcessType_NameServer )
         {
             // Delete pid map entry
             DelFromPidMap ( process );
@@ -5859,7 +5904,7 @@ void CProcessContainer::KillAllDownSoft()
         nextProc = process->GetNext();
 
         PROCESSTYPE type = process->GetType();
-        if ( type != ProcessType_DTM )
+        if ( type != ProcessType_DTM && type != ProcessType_NameServer )
         {
             // Delete pid map entry
             DelFromPidMap ( process );

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/process.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/process.h b/core/sqf/monitor/linux/process.h
index 90954f3..f6f22cb 100644
--- a/core/sqf/monitor/linux/process.h
+++ b/core/sqf/monitor/linux/process.h
@@ -127,9 +127,9 @@ class CProcessContainer
                              );
     bool Dump_Process( CProcess *dumper, CProcess *process, char *core_path );
     void DumpCallback( int nid, pid_t pid, int status );
+    void Exit_Process( CProcess *process, bool abend, int downNode );
 #ifndef NAMESERVER_PROCESS
     static CProcess *ParentNewProcReply ( CProcess *process, int result );
-    void Exit_Process( CProcess *process, bool abend, int downNode );
 #else
     static CProcess *MonReply ( CProcess *process, int result );
 #endif

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/replicate.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/replicate.cxx b/core/sqf/monitor/linux/replicate.cxx
index c22c799..0b6fadb 100644
--- a/core/sqf/monitor/linux/replicate.cxx
+++ b/core/sqf/monitor/linux/replicate.cxx
@@ -91,7 +91,7 @@ int CReplObj::calcAllocSize()
                                                 sizeof(CReplExit)),
                                             sizeof(CReplKill)),
 #ifdef NAMESERVER_PROCESS
-                                        sizeof(dummy_sizeof_def)),
+                                        sizeof(CReplExitNs)),
 #else
                                         sizeof(CReplDevice)),
 #endif
@@ -929,6 +929,80 @@ bool CReplExit::replicate(struct internal_msg_def *&msg)
     return true;
 }
 
+CReplExitNs::CReplExitNs( int nid
+                        , int pid
+                        , Verifier_t verifier
+                        , const char *name
+                        , bool abended
+                        , struct message_def *msg
+                        , int  sockFd
+                        , int  origPNid )
+            : nid_(nid)
+            , pid_(pid)
+            , verifier_(verifier)
+            , abended_(abended)
+            , msg_(msg)
+            , sockFd_(sockFd)
+            , origPNid_(origPNid)
+{
+    // Add eyecatcher sequence as a debugging aid
+    memcpy(&eyecatcher_, "RPLJ", 4);
+
+    strcpy(name_, name);
+
+    // Compute message size (adjust if needed to conform to
+    // internal_msg_def structure alignment).
+    replSize_ = (MSG_HDR_SIZE + sizeof ( exit_ns_def ) + msgAlignment_)
+                & ~msgAlignment_;
+
+    if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_PROCESS_DETAIL))
+    {
+        const char method_name[] = "CReplExitNs::CReplExitNs";
+        trace_printf( "%s@%d  - Queuing replicating process exit %s (%d, 
%d:%d),"
+                      " abended=%d, msg=%p, sockFd=%d, origPNid=%d\n"
+                    , method_name, __LINE__
+                    , name_, nid_, pid_, verifier_, abended_
+                    , msg_, sockFd_, origPNid_ );
+    }
+}
+
+CReplExitNs::~CReplExitNs()
+{
+    // Alter eyecatcher sequence as a debugging aid to identify deleted object
+    memcpy(&eyecatcher_, "rplj", 4);
+}
+
+
+bool CReplExitNs::replicate(struct internal_msg_def *&msg)
+{
+    const char method_name[] = "CReplExitNs::replicate";
+    TRACE_ENTRY;
+
+    if (trace_settings & (TRACE_SYNC | TRACE_PROCESS))
+        trace_printf("%s@%d" " - Replicating process exit %s (%d, %d:%d),"
+                     " abended=%d\n", method_name, __LINE__,
+                     name_, nid_, pid_, verifier_, abended_);
+
+    // Build message to replicate this process exit to other nodes
+    msg->type = InternalType_Exit;
+    msg->u.exit_ns.nid = nid_;
+    msg->u.exit_ns.pid = pid_;
+    msg->u.exit_ns.verifier = verifier_;
+    strcpy(msg->u.exit_ns.name, name_);
+    msg->u.exit_ns.abended = abended_;
+    msg->u.exit_ns.msg = msg_;
+    msg->u.exit_ns.sockFd = sockFd_;
+    msg->u.exit_ns.origPNid = origPNid_;
+
+    // Advance sync buffer pointer
+    Nodes->AddMsg( msg, replSize() );
+
+    TRACE_EXIT;
+
+    return true;
+}
+
+
 CReplKill::CReplKill( int nid
                     , int pid
                     , Verifier_t verifier

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/replicate.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/replicate.h b/core/sqf/monitor/linux/replicate.h
index 966c69e..5d4a7cc 100644
--- a/core/sqf/monitor/linux/replicate.h
+++ b/core/sqf/monitor/linux/replicate.h
@@ -210,6 +210,32 @@ private:
     bool abended_;
 };
 
+// Replication object describing a process exit.  replicate() copies every
+// field below into the exit_ns payload of an InternalType_Exit message.
+class CReplExitNs: public CReplObj
+{
+public:
+    CReplExitNs( int nid
+               , int pid
+               , Verifier_t verifier
+               , const char *name
+               , bool abended
+               , struct message_def *msg
+               , int  sockFd
+               , int  origPNid );
+    virtual ~CReplExitNs();
+
+    // Fills in msg and hands it to Nodes->AddMsg(); always returns true.
+    bool replicate(struct internal_msg_def *& msg);
+
+private:
+    int nid_;                       // node id of the exiting process
+    int pid_;                       // process id of the exiting process
+    Verifier_t verifier_;           // verifier of the exiting process
+    char name_[MAX_PROCESS_NAME];   // name of the exiting process
+    bool abended_;                  // abended flag copied into the message
+    struct message_def *msg_;       // message pointer copied into the message
+    int  sockFd_;                   // socket descriptor copied into the message
+    int  origPNid_;                 // originating pnid copied into the message
+};
+
 class CReplKill: public CReplObj
 {
 public:

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/reqqueue.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqqueue.cxx 
b/core/sqf/monitor/linux/reqqueue.cxx
index 530b096..6825652 100644
--- a/core/sqf/monitor/linux/reqqueue.cxx
+++ b/core/sqf/monitor/linux/reqqueue.cxx
@@ -1017,6 +1017,7 @@ void CIntDeviceReq::performRequest()
 }
 #endif
 
+#ifndef NAMESERVER_PROCESS
 CIntExitReq::CIntExitReq( )
             : CInternalReq()
             , nid_(0)
@@ -1101,12 +1102,136 @@ void CIntExitReq::performRequest()
         {
             lnode->GetNode()->DelFromNameMap ( process );
             lnode->GetNode()->DelFromPidMap ( process );
+            lnode->GetNode()->Exit_Process (process, abended_, -1);
+        }
+    }
+    else
+    {
+        char buf[MON_STRING_BUF_SIZE];
+        sprintf(buf, "[%s], Can't find process %s (%d, %d) for processing "
+                "exit.\n", method_name, name_, nid_, pid_);
+        mon_log_write(MON_CLUSTER_HANDLEOTHERNODE_5, SQ_LOG_ERR, buf);
+    }
 
-#ifdef NAMESERVER_PROCESS
-            lnode->GetNode()->DeleteFromList( process );
+    TRACE_EXIT;
+}
 #else
-            lnode->GetNode()->Exit_Process (process, abended_, -1);
-#endif
+// Constructs an empty exit request.  Every member is given a defined value
+// so that populateRequestString() and performRequest() never read
+// indeterminate data if they run before prepRequest() has been called.
+CIntExitNsReq::CIntExitNsReq( )
+              : CInternalReq()
+              , nid_(0)
+              , pid_(0)
+              , verifier_(-1)
+              , abended_(false)
+              , msg_(NULL)
+              , sockFd_(-1)
+              , origPNid_(-1)
+{
+    // Add eyecatcher sequence as a debugging aid
+    memcpy(&eyecatcher_, "RQIE", 4);
+
+    name_[0] = '\0';
+}
+
+// Destructor: only re-stamps the eyecatcher; no other cleanup is done here.
+CIntExitNsReq::~CIntExitNsReq( )
+{
+    // Overwrite the eyecatcher with "rqie" as a debugging aid so that a
+    // deleted object can be identified
+    memcpy(&eyecatcher_, "rqie", 4);
+}
+
+// Formats a one-line description of this request (type, id, process
+// identity and reply context) and stores it in requestString_.
+void CIntExitNsReq::populateRequestString( void )
+{
+    char strBuf[MON_STRING_BUF_SIZE/2];
+    // snprintf bounds the write to strBuf: an over-long description is
+    // truncated instead of overrunning the stack buffer
+    snprintf( strBuf, sizeof(strBuf)
+            , "IntReq(%s) req #=%ld (name=%s/nid=%d/pid=%d/verifier=%d)"
+              "(msg=%p, sockFd=%d, origPNid=%d)"
+            , CReqQueue::intReqType[InternalType_Exit]
+            , getId(), name_, nid_, pid_, verifier_
+            , msg_, sockFd_, origPNid_ );
+    requestString_.assign( strBuf );
+}
+
+// Copies the contents of an exit_ns_def replication record into this
+// request so that performRequest() can act on it later.
+//   exitDef - source record; dereferenced without a null check
+void CIntExitNsReq::prepRequest( struct exit_ns_def *exitDef )
+{
+    const char method_name[] = "CIntExitNsReq::prepRequest";
+    TRACE_ENTRY;
+
+    nid_ = exitDef->nid;
+    pid_ = exitDef->pid;
+    verifier_ = exitDef->verifier;
+    // Bounded copy: never write past name_, and always leave it terminated
+    strncpy( name_, exitDef->name, sizeof(name_) - 1 );
+    name_[sizeof(name_) - 1] = '\0';
+    abended_ = exitDef->abended;
+    msg_ = exitDef->msg;
+    sockFd_ = exitDef->sockFd;
+    origPNid_ = exitDef->origPNid;
+
+    TRACE_EXIT;
+}
+
+void CIntExitNsReq::performRequest()
+{
+    const char method_name[] = "CIntExitReq::performRequest";
+    TRACE_ENTRY;
+
+    CProcess *process = NULL;
+    CLNode  *lnode;
+
+    // Check if this name server is handling monitor request
+    // from CExtDelProcessNsReq::performRequest()
+    if (origPNid_ == MyPNID)
+    {
+        msg_->noreply = false;
+        msg_->u.reply.type = ReplyType_DelProcessNs;
+        msg_->u.reply.u.del_process_ns.nid = nid_;
+        msg_->u.reply.u.del_process_ns.pid = pid_;
+        msg_->u.reply.u.del_process_ns.verifier = verifier_;
+        strncpy(msg_->u.reply.u.del_process_ns.process_name, name_, 
MAX_PROCESS_NAME);
+        msg_->u.reply.u.del_process_ns.return_code = MPI_SUCCESS;
+
+        if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+        {
+           trace_printf("%s@%d - Replying to monitor sockFd_=%d, msg_=%p, 
deleted %s (%d, %d:%d)\n",
+                        method_name, __LINE__,
+                        sockFd_,
+                        msg_,
+                        name_,
+                        nid_,
+                        pid_,
+                        verifier_ );
+        }
+
+        // Send reply to requesting monitor
+        monreply( msg_, sockFd_ );
+        return;
+    }
+
+    lnode = Nodes->GetLNode( nid_ );
+    if ( lnode )
+    {
+        process = lnode->GetNode()->GetProcess( pid_ );
+
+        if ( ! process )
+        {
+            // Could not locate process by process id.  If the exit
+            // occurred due to an early process termination on another
+            // node we won't have the process id.  Try the look up by
+            // name instead.
+            process = lnode->GetNode()->GetProcess( name_, false );
+        }
+    }
+
+    if ( process )
+    {
+        if ( (verifier_ != -1) && (verifier_ != process->GetVerifier()) )
+        {
+            if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+            {
+               trace_printf("%s@%d - Exit %s (%d, %d:%d) failed -- verifier 
mismatch (%d)\n",
+                            method_name, __LINE__,
+                            name_,
+                            nid_,
+                            pid_,
+                            verifier_,
+                            process->GetVerifier());
+            }
+        }
+        else
+        {
+            lnode->GetNode()->Exit_Process( process, abended_, -1 );
         }
     }
     else
@@ -1119,6 +1244,7 @@ void CIntExitReq::performRequest()
 
     TRACE_EXIT;
 }
+#endif
 
 #ifndef NAMESERVER_PROCESS
 CIntKillReq::CIntKillReq( struct kill_def *killDef )
@@ -3583,10 +3709,17 @@ void CIntCreatePrimitiveReq::performRequest()
             {
                 startNs = true;
             }
-            if ( startNs )
+            if ( !MyNode->IsSoftNodeUp() )
+            {  // Don't restart the name server on a soft node up
+                if ( startNs )
+                {
+                    NameServer->SetLocalHost();
+                    MyNode->StartNameServerProcess();
+                }
+            }
+            else
             {
-                NameServer->SetLocalHost();
-                MyNode->StartNameServerProcess();
+                MyNode->ResetSoftNodeUp();
             }
         }
         MyNode->StartWatchdogProcess();
@@ -4246,6 +4379,7 @@ void CReqQueue::enqueueUpReq( int pnid, char *node_name, 
int merge_lead )
     enqueueReq ( request );
 }
 
+#ifndef NAMESERVER_PROCESS
 void CReqQueue::enqueueExitReq( struct exit_def *exitDef )
 {
     CIntExitReq * request;
@@ -4255,6 +4389,17 @@ void CReqQueue::enqueueExitReq( struct exit_def *exitDef 
)
 
     enqueueReq ( request );
 }
+#else
+// Builds a CIntExitNsReq from the given exit record and places it on the
+// request queue via enqueueReq().
+void CReqQueue::enqueueExitNsReq( struct exit_ns_def *exitDef )
+{
+    CIntExitNsReq *exitNsReq = new CIntExitNsReq( );
+
+    // Load the record's fields into the request before it is queued
+    exitNsReq->prepRequest( exitDef );
+    enqueueReq( exitNsReq );
+}
+#endif
 
 #ifndef NAMESERVER_PROCESS
 //void CReqQueue::enqueueKillReq( int nid, int pid, bool abort )

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/linux/reqqueue.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqqueue.h 
b/core/sqf/monitor/linux/reqqueue.h
index 7fa69fd..995912d 100644
--- a/core/sqf/monitor/linux/reqqueue.h
+++ b/core/sqf/monitor/linux/reqqueue.h
@@ -926,6 +926,7 @@ private:
 };
 #endif
 
+#ifndef NAMESERVER_PROCESS
 class CIntExitReq: public CInternalReq
 {
 public:
@@ -944,6 +945,31 @@ private:
     bool abended_;
     char name_[MAX_PROCESS_NAME];
 };
+#endif
+
+#ifdef NAMESERVER_PROCESS
+// Internal request that processes a replicated process exit.
+// prepRequest() loads the fields from an exit_ns_def record;
+// performRequest() either replies to the requesting monitor (when
+// origPNid_ equals MyPNID) or looks up the process and calls Exit_Process().
+class CIntExitNsReq: public CInternalReq
+{
+public:
+    CIntExitNsReq();
+    virtual ~CIntExitNsReq();
+
+    // Copies every field of exitDef into this request.
+    void prepRequest( struct exit_ns_def *exitDef );
+    // Carries out the exit handling described above.
+    void performRequest();
+
+private:
+    // Formats a description of this request into requestString_.
+    void populateRequestString( void );
+
+    int nid_;                       // node id of the exiting process
+    int pid_;                       // process id of the exiting process
+    Verifier_t verifier_;           // expected verifier; -1 skips the verifier check
+    bool abended_;                  // passed through to Exit_Process()
+    char name_[MAX_PROCESS_NAME];   // process name; fallback lookup key when pid lookup fails
+    struct message_def *msg_;       // buffer filled in as the ReplyType_DelProcessNs reply
+    int  sockFd_;                   // socket passed to monreply() for that reply
+    int  origPNid_;                 // compared with MyPNID to decide whether to reply
+};
+#endif
 
 #ifdef NAMESERVER_PROCESS
 class CExtDelProcessNsReq: public CExternalReq
@@ -1471,7 +1497,11 @@ class CReqQueue
 #ifndef NAMESERVER_PROCESS
     void enqueueDeviceReq( char *ldevName );
 #endif
+#ifndef NAMESERVER_PROCESS
     void enqueueExitReq( struct exit_def *exitDef );
+#else
+    void enqueueExitNsReq( struct exit_ns_def *exitDef );
+#endif
 #ifdef NAMESERVER_PROCESS
     void enqueueDeleteReq( struct delete_def *deleteDef );
 #endif

http://git-wip-us.apache.org/repos/asf/trafodion/blob/251f3f57/core/sqf/monitor/test/monitor.env
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/test/monitor.env 
b/core/sqf/monitor/test/monitor.env
index 303dbf3..52e2341 100644
--- a/core/sqf/monitor/test/monitor.env
+++ b/core/sqf/monitor/test/monitor.env
@@ -54,7 +54,7 @@ MON_TRACE_NS=1
 #MON_TRACE_MLIO=1
 
 #MON_TRACE_REQUEST_DETAIL=1
-#MON_TRACE_PROCESS_DETAIL=1
+MON_TRACE_PROCESS_DETAIL=1
 #MON_TRACE_NOTICE_DETAIL=1
 #MON_TRACE_SYNC_DETAIL=1
 #MON_TRACE_MLIO_DETAIL=1

Reply via email to