Fixes for problems during shutdown when the Name Server is enabled.

Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/db656603
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/db656603
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/db656603

Branch: refs/heads/master
Commit: db656603237af4f376a2cca27fe0500b7461380a
Parents: a528474
Author: Zalo Correa <[email protected]>
Authored: Wed Apr 18 11:41:21 2018 -0700
Committer: Zalo Correa <[email protected]>
Committed: Wed Apr 18 11:41:21 2018 -0700

----------------------------------------------------------------------
 core/sqf/monitor/linux/cluster.cxx         | 214 ++++++++++++++++++---
 core/sqf/monitor/linux/cluster.h           |   8 +-
 core/sqf/monitor/linux/nameserver.cxx      |   2 +-
 core/sqf/monitor/linux/nscommacceptmon.cxx |   5 +-
 core/sqf/monitor/linux/nscommacceptmon.h   |   2 +
 core/sqf/monitor/linux/nsreqprocinfons.cxx | 160 +++++++++++-----
 core/sqf/monitor/linux/nsreqshutdown.cxx   |  18 +-
 core/sqf/monitor/linux/pnode.cxx           | 239 +++++++++++++++++++++++-
 core/sqf/monitor/linux/pnode.h             |  16 ++
 core/sqf/monitor/linux/process.cxx         |  24 +++
 core/sqf/monitor/linux/reqtmleader.cxx     |  36 ++++
 11 files changed, 635 insertions(+), 89 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/trafodion/blob/db656603/core/sqf/monitor/linux/cluster.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/cluster.cxx 
b/core/sqf/monitor/linux/cluster.cxx
index e2f7dbf..4646433 100644
--- a/core/sqf/monitor/linux/cluster.cxx
+++ b/core/sqf/monitor/linux/cluster.cxx
@@ -66,6 +66,8 @@ using namespace std;
 #include "meas.h"
 #ifdef NAMESERVER_PROCESS
 #include "nscommacceptmon.h"
+#else
+#include "nameserver.h"
 #endif
 
 extern bool IAmIntegrating;
@@ -85,6 +87,7 @@ extern char MySyncPort[MPI_MAX_PORT_NAME];
 extern CCommAcceptMon CommAcceptMon;
 extern char MyMon2NsPort[MPI_MAX_PORT_NAME];
 #else
+extern CNameServer *NameServer;
 extern bool NameServerEnabled;
 extern char MyPtPPort[MPI_MAX_PORT_NAME];
 #endif
@@ -120,6 +123,8 @@ extern CReplicate Replicator;
 
 extern char *ErrorMsg (int error_code);
 
+extern const char *ProcessTypeString( PROCESSTYPE type );
+
 const char *JoiningPhaseString( JOINING_PHASE phase);
 const char *StateString( STATE state);
 #ifndef NAMESERVER_PROCESS
@@ -538,6 +543,13 @@ void CCluster::AssignTmLeader( int pnid, bool checkProcess 
)
 
     int TmLeaderPNid = LNode[tmLeaderNid_]->GetNode()->GetPNid();
 
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | 
TRACE_SYNC | TRACE_TMSYNC))
+    {
+        trace_printf( "%s@%d - pnid=%d, checkProcess=%d, tmLeaderNid_=%d, 
TmLeaderPNid=%d\n"
+                    , method_name, __LINE__
+                    , pnid, checkProcess, tmLeaderNid_, TmLeaderPNid );
+    }
+
     if (TmLeaderPNid != pnid)
     {
         node = LNode[tmLeaderNid_]->GetNode();
@@ -561,6 +573,36 @@ void CCluster::AssignTmLeader( int pnid, bool checkProcess 
)
                 }
                 return;
             }
+            else
+            {
+                if (NameServerEnabled)
+                {
+                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | 
TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+                    {
+                        trace_printf( "%s@%d - Getting process from Name 
Server, nid=%d, type%s\n"
+                                    , method_name, __LINE__
+                                    , tmLeaderNid_, 
ProcessTypeString(ProcessType_DTM) );
+                    }
+                
+                    process = Nodes->GetProcessLByTypeNs( tmLeaderNid_, 
ProcessType_DTM );
+                    if (process)
+                    {
+                        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | 
TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+                        {
+                            if (node)
+                                trace_printf( "%s@%d - Node pnid=%d (%s), 
phase=%s, "
+                                              "isSoftNodeDown=%d, 
checkProcess=%d\n"
+                                            , method_name, __LINE__
+                                            , node->GetPNid()
+                                            , node->GetName()
+                                            , NodePhaseString(node->GetPhase())
+                                            , node->IsSoftNodeDown()
+                                            , checkProcess );
+                        }
+                        return;
+                    }
+                }
+            }
         }
         else
         {
@@ -698,6 +740,7 @@ CCluster::CCluster (void)
       reconnectSeqNum_(0),
       seqNum_(1),
       waitForWatchdogExit_(false)
+      ,waitForNameServerExit_(false)
       ,checkSeqNum_(false)
       ,validateNodeDown_(false)
       ,enqueuedDown_(false)
@@ -1455,8 +1498,8 @@ int CCluster::HardNodeUp( int pnid, char *node_name )
     TRACE_ENTRY;
 
     if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
-       trace_printf( "%s@%d - pnid=%d, name=%s (MyPNID = %d)\n"
-                   , method_name, __LINE__, pnid, node_name, MyPNID );
+       trace_printf( "%s@%d - pnid=%d, name=%s (MyPNID = %d), 
currentNodes_=%d\n"
+                   , method_name, __LINE__, pnid, node_name, MyPNID, 
currentNodes_ );
 
     if ( pnid == -1 )
     {
@@ -3170,13 +3213,19 @@ void CCluster::InitializeConfigCluster( void )
         char *nodes = getenv( "SQ_VIRTUAL_NODES" );
         worldSize = atoi(nodes);
         if ( worldSize <= 0 )
+        {
             worldSize = 1;
+        }
     }
 #endif
     int rankToPnid[worldSize];
     CClusterConfig *clusterConfig = Nodes->GetClusterConfig();
 
+#ifdef NAMESERVER_PROCESS
+    currentNodes_ = 1;  // non-master Name Servers join set through master 
Name Server
+#else
     currentNodes_ = worldSize;
+#endif
 
     if ( IsRealCluster )
     {
@@ -7361,7 +7410,38 @@ bool CCluster::checkIfDone (  )
     const char method_name[] = "CCluster::checkIfDone";
     TRACE_ENTRY;
 
-    if (trace_settings & TRACE_SYNC_DETAIL)
+#ifdef NAMESERVER_PROCESS
+    int nameServerCount = 0;
+    CClusterConfig *clusterConfig = Nodes->GetClusterConfig();
+    CNameServerConfigContainer *nameServerConfig = NULL;
+
+    if (clusterConfig)
+    {
+        nameServerConfig = Nodes->GetNameServerConfig();
+        if (nameServerConfig)
+        {
+            nameServerCount = nameServerConfig->GetCount();
+        }
+    }
+
+    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
+        trace_printf("%s@%d - Node %d shutdown level=%d, state=%s.  Process "
+                     "count=%d, internal state=%d, currentNodes_=%d, "
+                     "local process count=%d, shutdownNameServer=%d, "
+                     "nameServerCount=%d\n",
+                     method_name, __LINE__, 
+                     MyNode->GetPNid(),
+                     MyNode->GetShutdownLevel(),
+                     StateString(MyNode->GetState()),
+                     Nodes->ProcessCount(),
+                     MyNode->getInternalState(),
+                     currentNodes_, 
+                     MyNode->GetNumProcs(),
+                     MyNode->IsShutdownNameServer(),
+                     nameServerCount );
+
+#else
+    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | TRACE_SYNC))
         trace_printf("%s@%d - Node %d shutdown level=%d, state=%s.  Process "
                      "count=%d, internal state=%d, currentNodes_=%d, "
                      "local process count=%d\n",
@@ -7372,45 +7452,117 @@ bool CCluster::checkIfDone (  )
                      MyNode->getInternalState(),
                      currentNodes_, MyNode->GetNumProcs());
 
+#endif            
     // Check if we are also done
     if (( MyNode->GetState() != State_Down    ) &&
         ( MyNode->GetState() != State_Stopped )   )
     {
         if ( MyNode->GetShutdownLevel() != ShutdownLevel_Undefined )
         {
-            if ( Nodes->ProcessCount() == 0 )  // all WDTs exited
-            {
-                if (trace_settings & TRACE_SYNC)
-                   trace_printf("%s@%d - Monitor signaled to exit.\n", 
method_name, __LINE__);
+#ifdef NAMESERVER_PROCESS
+            if ( (Nodes->ProcessCount() <= nameServerCount )   // only Name 
Servers alive
+                 && (MyNode->GetNumProcs() <= MAX_PRIMITIVES ) // only My Name 
Server alive
+                 && MyNode->IsShutdownNameServer()   // monitor shutdown Name 
Server received
+                 && !MyNode->isInQuiesceState() )    // post-quiescing will
+                                                     // expire WDG (cluster)
+            {
+                if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | 
TRACE_SYNC))
+                   trace_printf("%s@%d - Name Server signaled to exit.\n", 
method_name, __LINE__);
                 MyNode->SetState( State_Stopped );
                 MyNode->SetInternalState(State_Ready_To_Exit);
 
                 // we need to sync one more time so other nodes see our state
                 return false;
             }
-            else if ( (Nodes->ProcessCount() <=
-                      (currentNodes_*MAX_PRIMITIVES))        // only WDGs alive
-                      && !MyNode->isInQuiesceState()    // post-quiescing will
-                                                        // expire WDG (cluster)
-                      && !waitForWatchdogExit_ )        // WDG not yet exiting
+#else
+            if ( NameServerEnabled )
             {
-                if (trace_settings & TRACE_SYNC)
-                   trace_printf("%s@%d - Stopping watchdog process.\n",
-                                method_name, __LINE__);
-
-                waitForWatchdogExit_ = true;
-                // stop the watchdog timer first
-                HealthCheck.setState(MON_STOP_WATCHDOG);
-                // let the watchdog process exit
-                HealthCheck.setState(MON_EXIT_PRIMITIVES);
+                
+                if ( Nodes->ProcessCount() == 0 )  // all Name Servers exited
+                {
+                    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL 
| TRACE_SYNC))
+                       trace_printf("%s@%d - Monitor signaled to exit.\n", 
method_name, __LINE__);
+                    MyNode->SetState( State_Stopped );
+                    MyNode->SetInternalState(State_Ready_To_Exit);
+    
+                    // we need to sync one more time so other nodes see our 
state
+                    return false;
+                }
+                else if ( (Nodes->ProcessCount() <= 
+                            (currentNodes_ * (MAX_PRIMITIVES+1)) ) // only 
WDGs and Name Servers alive
+                          && (MyNode->GetNumProcs() <=
+                            (MAX_PRIMITIVES+1) )                   // only 
WDGs and Name Servers alive
+                          && !MyNode->isInQuiesceState()    // post-quiescing 
will
+                                                            // expire WDG 
(cluster)
+                          && !waitForWatchdogExit_ )        // WDG not yet 
exiting
+                {
+                    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL 
| TRACE_SYNC))
+                       trace_printf("%s@%d - Stopping watchdog process. "
+                                    "(process count: cluster=%d, MyNode=%d)\n",
+                                    method_name, __LINE__,
+                                    Nodes->ProcessCount(), 
MyNode->ProcessCount());
+    
+                    waitForWatchdogExit_ = true;
+                    // stop the watchdog timer first
+                    HealthCheck.setState(MON_STOP_WATCHDOG);
+                    // let the watchdog process exit
+                    HealthCheck.setState(MON_EXIT_PRIMITIVES);
+                }
+                else if ( (Nodes->ProcessCount() <= 
+                            (currentNodes_ * (MAX_PRIMITIVES)) ) // only Name 
Servers alive
+                          && (MyNode->GetNumProcs() <=
+                            (MAX_PRIMITIVES) )                   // only Name 
Servers alive
+                          && !MyNode->isInQuiesceState()    // post-quiescing 
will
+                                                            // expire WDG 
(cluster)
+                          && !waitForNameServerExit_ )      // Name Server not 
yet exiting
+                {
+                    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL 
| TRACE_SYNC))
+                       trace_printf("%s@%d - Stopping Name Server process. "
+                                    "(process count: cluster=%d, MyNode=%d)\n",
+                                    method_name, __LINE__,
+                                    Nodes->ProcessCount(), 
MyNode->ProcessCount());
+    
+                    waitForNameServerExit_ = true;
+                    NameServer->ProcessShutdown();
+                }
+            }
+            else
+            {
+                if ( Nodes->ProcessCount() == 0 )  // all WDTs exited
+                {
+                    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL 
| TRACE_SYNC))
+                       trace_printf("%s@%d - Monitor signaled to exit.\n", 
method_name, __LINE__);
+                    MyNode->SetState( State_Stopped );
+                    MyNode->SetInternalState(State_Ready_To_Exit);
+    
+                    // we need to sync one more time so other nodes see our 
state
+                    return false;
+                }
+                else if ( (Nodes->ProcessCount() <=
+                          (currentNodes_*MAX_PRIMITIVES))        // only WDGs 
alive
+                          && !MyNode->isInQuiesceState()    // post-quiescing 
will
+                                                            // expire WDG 
(cluster)
+                          && !waitForWatchdogExit_ )        // WDG not yet 
exiting
+                {
+                    if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL 
| TRACE_SYNC))
+                       trace_printf("%s@%d - Stopping watchdog process.\n",
+                                    method_name, __LINE__);
+    
+                    waitForWatchdogExit_ = true;
+                    // stop the watchdog timer first
+                    HealthCheck.setState(MON_STOP_WATCHDOG);
+                    // let the watchdog process exit
+                    HealthCheck.setState(MON_EXIT_PRIMITIVES);
+                }
             }
+#endif
         }
     }
     else if ( MyNode->GetShutdownLevel() != ShutdownLevel_Undefined
               && MyNode->GetState() == State_Down
               && MyNode->GetNumProcs() == 0)
     {
-        if (trace_settings & TRACE_SYNC)
+        if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL | 
TRACE_SYNC))
             trace_printf("%s@%d - No processes remaining, monitor exiting.\n",
                          method_name, __LINE__);
 
@@ -8483,13 +8635,15 @@ void CCluster::InitServerSock( void )
                 , (int)((unsigned char *)addr)[3]
                 , mon2nsPort );
         MyNode->SetMon2NsPort( MyMon2NsPort );
+        MyNode->SetMon2NsSocketPort( mon2nsPort );
 
         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
             trace_printf( "%s@%d Initialized my mon2ns comm socket port, "
-                          "pnid=%d (%s:%s) (Mon2NsCommPort=%s)\n"
+                          "pnid=%d (%s:%s) (Mon2NsPort=%s, 
Mon2NsSocketPort=%d)\n"
                         , method_name, __LINE__
                         , MyPNID, MyNode->GetName(), MyMon2NsPort
-                        , MyNode->GetMon2NsPort() );
+                        , MyNode->GetMon2NsPort()
+                        , MyNode->GetMon2NsSocketPort() );
 
     }
 #else
@@ -8883,7 +9037,17 @@ int CCluster::Connect( const char *portName )
     return ( sock );
 }
 
-#ifndef NAMESERVER_PROCESS
+#ifdef NAMESERVER_PROCESS
+void CCluster::ConnectToMon2NsCommSelf( void )
+{
+    const char method_name[] = "CCluster::ConnectToMon2NsCommSelf";
+    TRACE_ENTRY;
+
+    Connect( MyNode->GetMon2NsSocketPort() );
+
+    TRACE_EXIT;
+}
+#else
 void CCluster::ConnectToPtPCommSelf( void )
 {
     const char method_name[] = "CCluster::ConnectToPtPCommSelf";

http://git-wip-us.apache.org/repos/asf/trafodion/blob/db656603/core/sqf/monitor/linux/cluster.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/cluster.h b/core/sqf/monitor/linux/cluster.h
index f4f9147..49939ac 100644
--- a/core/sqf/monitor/linux/cluster.h
+++ b/core/sqf/monitor/linux/cluster.h
@@ -113,9 +113,14 @@ public:
 #endif
     int  Connect( const char *portName );
     void Connect( int socketPort );
-#ifndef NAMESERVER_PROCESS
+#ifdef NAMESERVER_PROCESS
+    void ConnectToMon2NsCommSelf( void );
+#else
     void ConnectToPtPCommSelf( void );
 #endif
+#ifdef NAMESERVER_PROCESS
+    void ConnectToMonCommSelf( void );
+#endif
     void ConnectToSelf( void );
     int  SetKeepAliveSockOpt( int sock );
     int  MkCltSock( const char *portName );
@@ -318,6 +323,7 @@ private:
     int cumulativeDelaySec_;
 
     bool waitForWatchdogExit_;    // set when watchdog exit has already been 
issued
+    bool waitForNameServerExit_;  // set when Name Server exit has already 
been issued
 
     typedef struct state_def
     {

http://git-wip-us.apache.org/repos/asf/trafodion/blob/db656603/core/sqf/monitor/linux/nameserver.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nameserver.cxx 
b/core/sqf/monitor/linux/nameserver.cxx
index ef67562..d367085 100644
--- a/core/sqf/monitor/linux/nameserver.cxx
+++ b/core/sqf/monitor/linux/nameserver.cxx
@@ -629,7 +629,7 @@ int CNameServer::ProcessShutdown( void )
     msgshutdown->nid = -1;
     msgshutdown->pid = -1;
     //msgshutdown->level = msgIn->u.request.u.shutdown.level;
-    msgshutdown->level = ShutdownLevel_Abrupt;
+    msgshutdown->level = ShutdownLevel_Normal;
 
     int error = SendReceive(&msg );
 

http://git-wip-us.apache.org/repos/asf/trafodion/blob/db656603/core/sqf/monitor/linux/nscommacceptmon.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nscommacceptmon.cxx 
b/core/sqf/monitor/linux/nscommacceptmon.cxx
index cc91eba..6282b0a 100644
--- a/core/sqf/monitor/linux/nscommacceptmon.cxx
+++ b/core/sqf/monitor/linux/nscommacceptmon.cxx
@@ -53,6 +53,7 @@ CCommAcceptMon::CCommAcceptMon()
            : accepting_(false)
            , shutdown_(false)
            , thread_id_(0)
+           , process_thread_id_(0)
 {
     const char method_name[] = "CCommAcceptMon::CCommAcceptMon";
     TRACE_ENTRY;
@@ -668,7 +669,7 @@ void CCommAcceptMon::processNewSock( int joinFd )
     Context *ctx = new Context();
     ctx->this_ = this;
     ctx->pendingFd_ = joinFd;
-    rc = pthread_create(&thread_id_, NULL, mon2nsProcess, ctx);
+    rc = pthread_create(&process_thread_id_, NULL, mon2nsProcess, ctx);
     if (rc != 0)
     {
         char buf[MON_STRING_BUF_SIZE];
@@ -777,7 +778,7 @@ void CCommAcceptMon::shutdownWork(void)
 
     // Set flag that tells the commAcceptor thread to exit
     shutdown_ = true;
-    Monitor->ConnectToSelf();
+    Monitor->ConnectToMon2NsCommSelf();
     CLock::wakeOne();
 
     if ( trace_settings & ( TRACE_NS ) )

http://git-wip-us.apache.org/repos/asf/trafodion/blob/db656603/core/sqf/monitor/linux/nscommacceptmon.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nscommacceptmon.h 
b/core/sqf/monitor/linux/nscommacceptmon.h
index 30daa92..41b2c9b 100644
--- a/core/sqf/monitor/linux/nscommacceptmon.h
+++ b/core/sqf/monitor/linux/nscommacceptmon.h
@@ -70,6 +70,8 @@ private:
 
     // commAccept thread's id
     pthread_t                      thread_id_;
+    // mon2nsProcess thread's id (created by processNewSock)
+    pthread_t                      process_thread_id_;
 
     enum { HEURISTIC_COUNT = 10 };
 };

http://git-wip-us.apache.org/repos/asf/trafodion/blob/db656603/core/sqf/monitor/linux/nsreqprocinfons.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nsreqprocinfons.cxx 
b/core/sqf/monitor/linux/nsreqprocinfons.cxx
index 3be8ddb..dbbf80c 100644
--- a/core/sqf/monitor/linux/nsreqprocinfons.cxx
+++ b/core/sqf/monitor/linux/nsreqprocinfons.cxx
@@ -39,6 +39,8 @@ extern CNodeContainer *Nodes;
 extern CReplicate Replicator;
 extern int MyPNID;
 
+extern const char *ProcessTypeString( PROCESSTYPE type );
+
 CExtProcInfoNsReq::CExtProcInfoNsReq( reqQueueMsg_t msgType,
                                       int nid, int pid, int sockFd,
                                       struct message_def *msg )
@@ -55,48 +57,110 @@ CExtProcInfoNsReq::~CExtProcInfoNsReq()
 }
 
 // Copy information for a specific process into the reply message buffer.
-void CExtProcInfoNsReq::copyInfo(CProcess *process, ProcessInfoNs_reply_def 
&processInfo)
+void CExtProcInfoNsReq::copyInfo(CProcess *process, ProcessInfoNs_reply_def 
&process_info_ns)
 {
-    CProcess *parent;
+    const char method_name[] = "CNameServer::SendReceive";
+    TRACE_ENTRY;
 
-    processInfo.nid = process->GetNid();
-    processInfo.pid = process->GetPid();
-    processInfo.verifier = process->GetVerifier();
-    strncpy( processInfo.process_name, process->GetName(), MAX_PROCESS_NAME );
-    processInfo.type = process->GetType();
+    CProcess *parent;
 
-    parent = process->GetParent();
+    process_info_ns.nid = process->GetNid();
+    process_info_ns.pid = process->GetPid();
+    process_info_ns.verifier = process->GetVerifier();
+    strncpy( process_info_ns.process_name, process->GetName(), 
MAX_PROCESS_NAME );
+    process_info_ns.type = process->GetType();
+    parent = (process->GetParentNid() == -1 ? 
+              NULL : 
+              Nodes->GetLNode(process->GetParentNid())
+                 ->GetProcessL(process->GetParentPid()));
     if (parent)
     {
-        processInfo.parent_nid = parent->GetNid();
-        processInfo.parent_pid = parent->GetPid();
-        processInfo.parent_verifier = parent->GetVerifier();
-//        strncpy(processInfo.parent_name, parent->GetName(), MAX_PROCESS_NAME 
);
+        process_info_ns.parent_nid = parent->GetNid();
+        process_info_ns.parent_pid = parent->GetPid();
+        process_info_ns.parent_verifier = parent->GetVerifier();
     }
     else
     {
-        processInfo.parent_nid = -1;
-        processInfo.parent_pid = -1;
-        processInfo.parent_verifier = -1;
-//        processInfo.parent_name[0] = '\0';
+        process_info_ns.parent_nid = -1;
+        process_info_ns.parent_pid = -1;
+        process_info_ns.parent_verifier = -1;
     }
 
-    processInfo.priority = process->GetPriority();
-    processInfo.backup = process->IsBackup();
-    processInfo.state = process->GetState();
-    processInfo.unhooked = process->IsUnhooked();
-    processInfo.event_messages = process->IsEventMessages();
-    processInfo.system_messages = process->IsSystemMessages();
-    strncpy( processInfo.program, process->program(), MAX_PROCESS_PATH );
-    processInfo.pathStrId = process->pathStrId();
-    processInfo.ldpathStrId = process->ldPathStrId();
-    processInfo.programStrId = process->programStrId();
-    strncpy( processInfo.port_name, process->GetPort(), MPI_MAX_PORT_NAME );
-    processInfo.argc = process->argc();
-    memcpy( processInfo.argv, process->userArgv(), process->userArgvLen() );
-    strncpy( processInfo.infile, process->infile(), MAX_PROCESS_PATH );
-    strncpy( processInfo.outfile, process->outfile(), MAX_PROCESS_PATH );
-    processInfo.creation_time = process->GetCreationTime();
+    process_info_ns.priority = process->GetPriority();
+    process_info_ns.backup = process->IsBackup();
+    process_info_ns.state = process->GetState();
+    process_info_ns.unhooked = process->IsUnhooked();
+    process_info_ns.event_messages = process->IsEventMessages();
+    process_info_ns.system_messages = process->IsSystemMessages();
+    strncpy( process_info_ns.program, process->program(), MAX_PROCESS_PATH );
+    process_info_ns.pathStrId = process->pathStrId();
+    process_info_ns.ldpathStrId = process->ldPathStrId();
+    process_info_ns.programStrId = process->programStrId();
+    strncpy( process_info_ns.port_name, process->GetPort(), MPI_MAX_PORT_NAME 
);
+    process_info_ns.argc = process->argc();
+    memcpy( process_info_ns.argv, process->userArgv(), process->userArgvLen() 
);
+    strncpy( process_info_ns.infile, process->infile(), MAX_PROCESS_PATH );
+    strncpy( process_info_ns.outfile, process->outfile(), MAX_PROCESS_PATH );
+    process_info_ns.creation_time = process->GetCreationTime();
+    if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+    {
+        char desc[2048];
+        char* descp = desc;
+        sprintf( desc, 
+                 "process-info-ns reply:\n"
+                 "        process_info_ns.nid=%d\n"
+                 "        process_info_ns.pid=%d\n"
+                 "        process_info_ns.verifier=%d\n"
+                 "        process_info_ns.process_name=%s\n"
+                 "        process_info_ns.type=%d\n"
+                 "        process_info_ns.parent_nid=%d\n"
+                 "        process_info_ns.parent_pid=%d\n"
+                 "        process_info_ns.parent_verifier=%d\n"
+                 "        process_info_ns.priority=%d\n"
+                 "        process_info_ns.backup=%d\n"
+                 "        process_info_ns.state=%d\n"
+                 "        process_info_ns.unhooked=%d\n"
+                 "        process_info_ns.event_messages=%d\n"
+                 "        process_info_ns.system_messages=%d\n"
+                 "        process_info_ns.program=%s\n"
+                 "        process_info_ns.pathStrId=%d:%d\n"
+                 "        process_info_ns.ldpathStrId=%d:%d\n"
+                 "        process_info_ns.programStrId=%d:%d\n"
+                 "        process_info_ns.port_name=%s\n"
+                 "        process_info_ns.argc=%d\n"
+                 "        process_info_ns.infile=%s\n"
+                 "        process_info_ns.outfile=%s\n"
+                 "        process_info_ns.return_code=%d"
+                 , process_info_ns.nid
+                 , process_info_ns.pid
+                 , process_info_ns.verifier
+                 , process_info_ns.process_name
+                 , process_info_ns.type
+                 , process_info_ns.parent_nid
+                 , process_info_ns.parent_pid
+                 , process_info_ns.parent_verifier
+                 , process_info_ns.priority
+                 , process_info_ns.backup
+                 , process_info_ns.state
+                 , process_info_ns.unhooked
+                 , process_info_ns.event_messages
+                 , process_info_ns.system_messages
+                 , process_info_ns.program
+                 , process_info_ns.pathStrId.nid
+                 , process_info_ns.pathStrId.id
+                 , process_info_ns.ldpathStrId.nid
+                 , process_info_ns.ldpathStrId.id
+                 , process_info_ns.programStrId.nid
+                 , process_info_ns.programStrId.id
+                 , process_info_ns.port_name
+                 , process_info_ns.argc
+                 , process_info_ns.infile
+                 , process_info_ns.outfile
+                 , process_info_ns.return_code );
+        trace_printf( "%s@%d - %s\n"
+                    , method_name, __LINE__, descp );
+    }
+    TRACE_EXIT;
 }
 
 void CExtProcInfoNsReq::populateRequestString( void )
@@ -148,30 +212,42 @@ void CExtProcInfoNsReq::performRequest()
     target_process_name = (const char *) 
msg_->u.request.u.process_info.target_process_name;
     target_verifier  = msg_->u.request.u.process_info.target_verifier;
 
+    PROCESSTYPE target_type  = msg_->u.request.u.process_info.type;
+
     if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
     {
         trace_printf( "%s@%d request #%ld: ProcessInfoNs, for (%d, %d:%d), "
-                      "process type=%d\n"
+                      "process type=%s\n"
                     , method_name, __LINE__, id_
                     , target_nid, target_pid, target_verifier
-                    , msg_->u.request.u.process_info.type);
+                    , ProcessTypeString(target_type));
     }
 
-    if ( target_process_name.size() )
+    if (target_process_name.size())
     { // find by name (don't check node state, don't check process state, not 
backup)
         process = Nodes->GetProcess( target_process_name.c_str()
                                    , target_verifier
                                    , false, false, false );
     }
     else
-    { // find by nid (don't check node state, don't check process state, 
backup is Ok)
-        process = Nodes->GetProcess( target_nid
-                                   , target_pid
-                                   , target_verifier
-                                   , false, false, true );
+    {
+        if (target_pid != -1)
+        { // find by nid,pid (don't check node state, don't check process 
state, backup is Ok)
+            process = Nodes->GetProcess( target_nid
+                                       , target_pid
+                                       , target_verifier
+                                       , false, false, true );
+        }
+        else
+        {
+            CLNode *lnode = Nodes->GetLNode( target_nid );
+            if (lnode)
+            {
+                process = lnode->GetProcessLByType( target_type );
+            }
+        }
     }
 
-
     if (process)
     {
         msg_->u.reply.type = ReplyType_ProcessInfoNs;
@@ -182,7 +258,7 @@ void CExtProcInfoNsReq::performRequest()
     {
         if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
         {
-           trace_printf( "%s@%d - Kill %s (%d, %d:%d) -- can't find target 
process\n"
+           trace_printf( "%s@%d - ProcessInfoNs %s (%d, %d:%d) -- can't find 
target process\n"
                        , method_name, __LINE__
                        , msg_->u.request.u.process_info.target_process_name
                        , msg_->u.request.u.process_info.target_nid

http://git-wip-us.apache.org/repos/asf/trafodion/blob/db656603/core/sqf/monitor/linux/nsreqshutdown.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nsreqshutdown.cxx 
b/core/sqf/monitor/linux/nsreqshutdown.cxx
index 0fd6896..e5888d2 100644
--- a/core/sqf/monitor/linux/nsreqshutdown.cxx
+++ b/core/sqf/monitor/linux/nsreqshutdown.cxx
@@ -84,22 +84,12 @@ void CExtShutdownNsReq::performRequest()
                      msg_->u.request.u.shutdown.level);
     }
 
-    if ( msg_->u.request.u.shutdown.level == ShutdownLevel_Abrupt )
+    if (( MyNode->GetState() != State_Down    ) &&
+        ( MyNode->GetState() != State_Stopped )   )
     {
-        // Replicate a shutdown request so that all nodes begin to shutdown 
locally.
-        CReplShutdown *repl = new 
CReplShutdown(msg_->u.request.u.shutdown.level);
-        Replicator.addItem(repl);
-    }
-    else
-    {
-        // normal shutdown
-        // propagate the shutdown level before killing any processes.
+        MyNode->SetShutdownNameServer( true );
         MyNode->SetShutdownLevel( msg_->u.request.u.shutdown.level );
-
-        if (MyNode->GetState() == State_Up)
-        {
-            MyNode->SetState( State_Shutdown );
-        }
+        MyNode->SetState( State_Shutdown );
     }
 
     msg_->u.reply.u.generic.return_code = MPI_SUCCESS;

http://git-wip-us.apache.org/repos/asf/trafodion/blob/db656603/core/sqf/monitor/linux/pnode.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/pnode.cxx b/core/sqf/monitor/linux/pnode.cxx
index b5fa4ed..033a19a 100644
--- a/core/sqf/monitor/linux/pnode.cxx
+++ b/core/sqf/monitor/linux/pnode.cxx
@@ -169,15 +169,22 @@ CNode::CNode( char *name, int pnid, int rank )
       ,tmSyncState_(SyncState_Null)
 #endif
       ,shutdownLevel_(ShutdownLevel_Undefined)
+      ,shutdownNameServer_(false)
       ,wdtKeepAliveTimerValue_(WDT_KeepAliveTimerDefault)
       ,zid_(pnid)
+      ,commPort_("")
+      ,syncPort_("")
 #ifdef NAMESERVER_PROCESS
+      ,mon2NsPort_("")
+      ,mon2NsSocketPort_(-1)
       ,monConnCount_(0)
 #else
+      ,ptpPort_("")
       ,ptpSocketPort_(-1)
 #endif
       ,commSocketPort_(-1)
       ,syncSocketPort_(-1)
+      ,uniqStrId_(-1)
       ,procStatFile_(NULL)
       ,procMeminfoFile_(-1)
 {
@@ -302,8 +309,22 @@ CNode::CNode( char *name
       ,tmSyncNid_(-1)
       ,tmSyncState_(SyncState_Suspended)
       ,shutdownLevel_(ShutdownLevel_Undefined)
+      ,shutdownNameServer_(false)
       ,wdtKeepAliveTimerValue_(WDT_KeepAliveTimerDefault)
       ,zid_(-1)
+      ,commPort_("")
+      ,syncPort_("")
+#ifdef NAMESERVER_PROCESS
+      ,mon2NsPort_("")
+      ,mon2NsSocketPort_(-1)
+      ,monConnCount_(-1)
+#else
+      ,ptpPort_("")
+      ,ptpSocketPort_(-1)
+#endif
+      ,commSocketPort_(-1)
+      ,syncSocketPort_(-1)
+      ,uniqStrId_(-1)
       ,procStatFile_(NULL)
       ,procMeminfoFile_(-1)
 {
@@ -560,8 +581,6 @@ void CNode::CheckShutdownProcessing( void )
         sprintf(buf, "Broadcasting shutdown notice, level = %d\n", 
shutdownLevel_);
         mon_log_write(MON_NODE_SHUTDOWN_1, SQ_LOG_WARNING, buf);
         Bcast (msg);
-        if ( NameServerEnabled )
-            NameServer->ProcessShutdown();
         delete msg;
     }
 #endif
@@ -2458,6 +2477,20 @@ void CNodeContainer::CancelDeathNotification( int nid
 }
 #endif
    
+// Detaches a clone process object from the node that hosts it by calling,
+// in order, the node's DelFromNameMap, DelFromPidMap and DeleteFromList.
+//
+//   process - the clone to discard; its nid selects the owning node.
+void CNodeContainer::DeleteCloneProcess( CProcess *process )
+{
+    const char method_name[] = "CNodeContainer::DeleteCloneProcess";
+    TRACE_ENTRY;
+
+    // The owning node is reached through the logical node that matches
+    // the process' nid.
+    CNode *owner = Nodes->GetLNode( process->GetNid() )->GetNode();
+
+    // Keep this order: both map removals come before the list removal.
+    owner->DelFromNameMap( process );
+    owner->DelFromPidMap( process );
+    owner->DeleteFromList( process );
+
+    TRACE_EXIT;
+}
+
 void CNodeContainer::DeletedNode( CNode *node )
 {
     const char method_name[] = "CNodeContainer::DeletedNode";
@@ -3009,6 +3042,200 @@ CProcess *CNodeContainer::GetProcessByName( const char *name, bool checkstate )
 }
 
 #ifndef NAMESERVER_PROCESS
+// Queries the Name Server for the process identified by (nid, pid, verifier).
+//
+//   nid, pid, verifier - identity of the target process
+//   processInfo        - receives a copy of the reply; it is written only
+//                        when the reply's return code is MPI_SUCCESS
+//
+// Returns the return code carried in the Name Server reply, or MPI_ERR_OP
+// when the request itself fails or the reply has an unexpected
+// message/reply type.
+int CNodeContainer::GetProcessInfoNs( int nid
+                                    , int pid
+                                    , Verifier_t verifier
+                                    , ProcessInfoNs_reply_def *processInfo )
+{
+    const char method_name[] = "CNodeContainer::GetProcessInfoNs";
+    TRACE_ENTRY;
+
+    // Any path that does not produce a well-formed reply reports MPI_ERR_OP.
+    int rc = MPI_ERR_OP;
+
+    struct message_def msg;
+    msg.type = MsgType_Service;
+    msg.noreply = false;
+    msg.reply_tag = REPLY_TAG;
+    msg.u.request.type = ReqType_ProcessInfoNs;
+
+    // Requester fields are left unset (-1 / empty); only the target
+    // nid/pid/verifier select the process.
+    struct ProcessInfo_def *request = &msg.u.request.u.process_info;
+    request->nid = -1;
+    request->pid = -1;
+    request->verifier = -1;
+    request->process_name[0] = 0;
+    request->target_nid = nid;
+    request->target_pid = pid;
+    request->target_verifier = verifier;
+    request->target_process_name[0] = 0;
+    request->target_process_pattern[0] = 0;
+    request->type = ProcessType_Undefined;
+
+    // in reqQueue thread (CExternalReq); msg is reused to hold the reply
+    if ( NameServer->ProcessInfoNs(&msg) == 0 )
+    {
+        const bool replyIsValid = (msg.type == MsgType_Service)
+                               && (msg.u.reply.type == ReplyType_ProcessInfoNs);
+        if ( !replyIsValid )
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf),
+                      "[%s], Invalid MsgType(%d)/ReplyType(%d) for "
+                      "ProcessInfoNs\n"
+                    , method_name, msg.type, msg.u.reply.type );
+            mon_log_write( MON_NODE_GETPROCESSNS_2, SQ_LOG_ERR, buf );
+        }
+        else
+        {
+            const ProcessInfoNs_reply_def &reply = msg.u.reply.u.process_info_ns;
+            if ( reply.return_code == MPI_SUCCESS )
+            {
+                *processInfo = reply;
+            }
+            else
+            {
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf),
+                          "[%s] ProcessInfo failed, rc=%d\n"
+                        , method_name, reply.return_code );
+                mon_log_write( MON_NODE_GETPROCESSNS_1, SQ_LOG_ERR, buf );
+            }
+            // The Name Server's verdict is handed back to the caller as is.
+            rc = reply.return_code;
+        }
+    }
+
+    TRACE_EXIT;
+    return( rc );
+}
+
+// Queries the Name Server for the process identified by (name, verifier).
+//
+//   name, verifier - identity of the target process
+//   processInfo    - receives a copy of the reply; it is written only when
+//                    the reply's return code is MPI_SUCCESS
+//
+// Returns the return code carried in the Name Server reply, or MPI_ERR_OP
+// when the request itself fails or the reply has an unexpected
+// message/reply type.
+int CNodeContainer::GetProcessInfoNs( const char *name
+                                    , Verifier_t verifier
+                                    , ProcessInfoNs_reply_def *processInfo )
+{
+    const char method_name[] = "CNodeContainer::GetProcessInfoNs";
+    TRACE_ENTRY;
+
+    // Any path that does not produce a well-formed reply reports MPI_ERR_OP.
+    int rc = MPI_ERR_OP;
+
+    struct message_def msg;
+    msg.type = MsgType_Service;
+    msg.noreply = false;
+    msg.reply_tag = REPLY_TAG;
+    msg.u.request.type = ReqType_ProcessInfoNs;
+
+    // Requester fields and the target nid/pid are left unset (-1 / empty);
+    // the target is selected by name and verifier.
+    struct ProcessInfo_def *request = &msg.u.request.u.process_info;
+    request->nid = -1;
+    request->pid = -1;
+    request->verifier = -1;
+    request->process_name[0] = 0;
+    request->target_nid = -1;
+    request->target_pid = -1;
+    request->target_verifier = verifier;
+    STRCPY( request->target_process_name, name);
+    request->target_process_pattern[0] = 0;
+    request->type = ProcessType_Undefined;
+
+    // in reqQueue thread (CExternalReq); msg is reused to hold the reply
+    if ( NameServer->ProcessInfoNs(&msg) == 0 )
+    {
+        const bool replyIsValid = (msg.type == MsgType_Service)
+                               && (msg.u.reply.type == ReplyType_ProcessInfoNs);
+        if ( !replyIsValid )
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf),
+                      "[%s], Invalid MsgType(%d)/ReplyType(%d) for "
+                      "ProcessInfo\n"
+                    , method_name, msg.type, msg.u.reply.type );
+            mon_log_write( MON_NODE_GETPROCESSNS_4, SQ_LOG_ERR, buf );
+        }
+        else
+        {
+            const ProcessInfoNs_reply_def &reply = msg.u.reply.u.process_info_ns;
+            if ( reply.return_code == MPI_SUCCESS )
+            {
+                *processInfo = reply;
+            }
+            else
+            {
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf),
+                          "[%s] ProcessInfo failed, rc=%d\n"
+                        , method_name, reply.return_code );
+                mon_log_write( MON_NODE_GETPROCESSNS_3, SQ_LOG_ERR, buf );
+            }
+            // The Name Server's verdict is handed back to the caller as is.
+            rc = reply.return_code;
+        }
+    }
+
+    TRACE_EXIT;
+    return( rc );
+}
+
+// Asks the Name Server for a process of the given type on logical node
+// 'nid' and, on a successful reply, passes that reply to AddCloneProcess.
+//
+//   nid  - target logical node id
+//   type - process type to look for
+//
+// Returns the result of AddCloneProcess for a successful reply; NULL when
+// the request fails, the reply is malformed, or the reply's return code is
+// not MPI_SUCCESS.
+CProcess *CNodeContainer::GetProcessLByTypeNs( int nid, PROCESSTYPE type )
+{
+    const char method_name[] = "CNodeContainer::GetProcessLByTypeNs";
+    TRACE_ENTRY;
+
+    CProcess *clone = NULL;
+
+    struct message_def msg;
+    msg.type = MsgType_Service;
+    msg.noreply = false;
+    msg.reply_tag = REPLY_TAG;
+    msg.u.request.type = ReqType_ProcessInfoNs;
+
+    // Only the target nid and the process type are filled in; every other
+    // selector is left unset (-1 / empty).
+    struct ProcessInfo_def *request = &msg.u.request.u.process_info;
+    request->nid = -1;
+    request->pid = -1;
+    request->verifier = -1;
+    request->process_name[0] = 0;
+    request->target_nid = nid;
+    request->target_pid = -1;
+    request->target_verifier = -1;
+    request->target_process_name[0] = 0;
+    request->target_process_pattern[0] = 0;
+    request->type = type;
+
+    // in reqQueue thread (CExternalReq); msg is reused to hold the reply
+    if ( NameServer->ProcessInfoNs(&msg) == 0 )
+    {
+        // NOTE(review): both log calls below use event ids that the by-name
+        // GetProcessInfoNs overload also uses -- confirm whether distinct
+        // ids are intended.
+        if ( (msg.type != MsgType_Service) ||
+             (msg.u.reply.type != ReplyType_ProcessInfoNs) )
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf),
+                      "[%s], Invalid MsgType(%d)/ReplyType(%d) for "
+                      "ProcessInfo\n"
+                    , method_name, msg.type, msg.u.reply.type );
+            mon_log_write( MON_NODE_GETPROCESSNS_4, SQ_LOG_ERR, buf );
+        }
+        else if ( msg.u.reply.u.process_info_ns.return_code != MPI_SUCCESS )
+        {
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf),
+                      "[%s] ProcessInfo failed, rc=%d\n"
+                    , method_name, msg.u.reply.u.process_info_ns.return_code );
+            mon_log_write( MON_NODE_GETPROCESSNS_3, SQ_LOG_ERR, buf );
+        }
+        else
+        {
+            clone = AddCloneProcess( &msg.u.reply.u.process_info_ns );
+        }
+    }
+
+    TRACE_EXIT;
+    return( clone );
+}
+
 CProcess *CNodeContainer::GetProcessNs( int nid
                                       , int pid
                                       , Verifier_t verifier )
@@ -3019,11 +3246,11 @@ CProcess *CNodeContainer::GetProcessNs( int nid
     CProcess *process = NULL;
 
     struct message_def msg;
-    memset(&msg, 0, sizeof(msg) ); // TODO: remove!
     msg.type = MsgType_Service;
     msg.noreply = false;
     msg.reply_tag = REPLY_TAG;
     msg.u.request.type = ReqType_ProcessInfoNs;
+
     struct ProcessInfo_def *processInfo = &msg.u.request.u.process_info;
     processInfo->nid = -1;
     processInfo->pid = -1;
@@ -3033,6 +3260,8 @@ CProcess *CNodeContainer::GetProcessNs( int nid
     processInfo->target_pid = pid;
     processInfo->target_verifier = verifier;
     processInfo->target_process_name[0] = 0;
+    processInfo->target_process_pattern[0] = 0;
+    processInfo->type = ProcessType_Undefined;
     
     int error = NameServer->ProcessInfoNs(&msg); // in reqQueue thread 
(CExternalReq)
     if (error == 0)
@@ -3076,11 +3305,11 @@ CProcess *CNodeContainer::GetProcessNs( const char *name, Verifier_t verifier )
     CProcess *process = NULL;
 
     struct message_def msg;
-    memset(&msg, 0, sizeof(msg) ); // TODO: remove!
     msg.type = MsgType_Service;
     msg.noreply = false;
     msg.reply_tag = REPLY_TAG;
     msg.u.request.type = ReqType_ProcessInfoNs;
+
     struct ProcessInfo_def *processInfo = &msg.u.request.u.process_info;
     processInfo->nid = -1;
     processInfo->pid = -1;
@@ -3090,6 +3319,8 @@ CProcess *CNodeContainer::GetProcessNs( const char *name, Verifier_t verifier )
     processInfo->target_pid = -1;
     processInfo->target_verifier = verifier;
     STRCPY( processInfo->target_process_name, name);
+    processInfo->target_process_pattern[0] = 0;
+    processInfo->type = ProcessType_Undefined;
 
     int error = NameServer->ProcessInfoNs(&msg); // in reqQueue thread 
(CExternalReq)
     if (error == 0)

http://git-wip-us.apache.org/repos/asf/trafodion/blob/db656603/core/sqf/monitor/linux/pnode.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/pnode.h b/core/sqf/monitor/linux/pnode.h
index b13fbef..44008ea 100644
--- a/core/sqf/monitor/linux/pnode.h
+++ b/core/sqf/monitor/linux/pnode.h
@@ -76,6 +76,7 @@ public:
                                    , int verifier
                                    , _TM_Txid_External trans_id );
     void    ChangedNode( CNode *node );
+    void    DeleteCloneProcess( CProcess *process );
     void    DeletedNode( CNode *node );
     bool    DeleteNode( int pnid );
     void    DeleteNode( CNode *node );
@@ -112,6 +113,15 @@ public:
                         , bool checkstate=true
                         , bool backupOk=false );
     CProcess *GetProcessByName( const char *name, bool checkstate=true );
+    int     GetProcessInfoNs( int nid
+                            , int pid
+                            , Verifier_t verifier
+                            , ProcessInfoNs_reply_def *processInfo );
+    int     GetProcessInfoNs( const char *name
+                            , Verifier_t verifier
+                            , ProcessInfoNs_reply_def *processInfo );
+    CProcess *GetProcessLByTypeNs( int nid
+                                 , PROCESSTYPE type );
     CProcess *GetProcessNs( int nid
                           , int pid
                           , Verifier_t verifier );
@@ -243,6 +253,7 @@ public:
     inline const char *GetSyncPort( void ) { return syncPort_.c_str(); }
 #ifdef NAMESERVER_PROCESS
     inline const char *GetMon2NsPort( void ) { return mon2NsPort_.c_str(); }
+    inline int   GetMon2NsSocketPort( void ) { return( mon2NsSocketPort_ ); }
     inline int GetMonConnCount( void ) { return monConnCount_; }
 #else
     inline const char *GetPtPPort( void ) { return ptpPort_.c_str(); }
@@ -271,6 +282,7 @@ public:
     inline bool  IsSpareNode( void ) { return( spareNode_ ); }
     inline bool  IsSoftNodeDown( void ) { return( internalState_ == 
State_SoftDown ); }
     inline bool  IsSoftNodeUp( void ) { return( internalState_ == State_SoftUp 
); }
+    inline bool  IsShutdownNameServer( void ) { return( shutdownNameServer_ ); 
}
 
     CNode  *Link( CNode *entry );
     void    MoveLNodes( CNode *targetNode );
@@ -318,6 +330,7 @@ public:
     inline void SetSyncPort( char *syncPort) { syncPort_ = syncPort; }
 #ifdef NAMESERVER_PROCESS
     inline void SetMon2NsPort( char *mon2NsPort) { mon2NsPort_ = mon2NsPort; }
+    inline void SetMon2NsSocketPort( int mon2NsSocketPort) { mon2NsSocketPort_ 
= mon2NsSocketPort; }
 #else
     inline void SetPtPPort( char *ptpPort) { ptpPort_ = ptpPort; }  
     inline void SetPtPSocketPort( int ptpSocketPort) { ptpSocketPort_ = 
ptpSocketPort; }
@@ -326,6 +339,7 @@ public:
     inline void SetCommSocketPort( int commSocketPort) { commSocketPort_ = 
commSocketPort; }
     inline void SetSyncSocketPort( int syncSocketPort) { syncSocketPort_ = 
syncSocketPort; }
     inline void SetSpareNode( void ) { spareNode_ = true; }
+    inline void SetShutdownNameServer( bool shutdown ) { shutdownNameServer_ = 
shutdown; }
     inline void SetShutdownLevel( ShutdownLevel level ) { shutdownLevel_ = 
level; }
     void SetState( STATE state );
     inline void SetTmSyncNid( int nid ) { tmSyncNid_ = nid; }
@@ -403,6 +417,7 @@ private:
     int           tmSyncNid_;    // Logical Node of TM that initiated sync
     SyncState     tmSyncState_;  // Sync operation state with TMs
     ShutdownLevel shutdownLevel_;
+    bool          shutdownNameServer_; // true when monitor shutdown Name 
Server request is received
     int           wdtKeepAliveTimerValue_; // expiration time
     struct timeval todStart_;    // time of last watchdog reset
 
@@ -417,6 +432,7 @@ private:
     string        syncPort_;          // monitor socket allgather port
 #ifdef NAMESERVER_PROCESS
     string        mon2NsPort_;        // monitor to ns port
+    int           mon2NsSocketPort_;  // monitor to ns socket port
     int           monConnCount_;      // monitor connections
 #else
     string        ptpPort_;

http://git-wip-us.apache.org/repos/asf/trafodion/blob/db656603/core/sqf/monitor/linux/process.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/process.cxx b/core/sqf/monitor/linux/process.cxx
index fce323c..5ecaf50 100644
--- a/core/sqf/monitor/linux/process.cxx
+++ b/core/sqf/monitor/linux/process.cxx
@@ -2939,6 +2939,30 @@ void CProcess::Exit( CProcess *parent )
 
     SetState(State_Stopped);
 
+    if (parent && NameServerEnabled)
+    {
+        ProcessInfoNs_reply_def processInfo;
+        int rc = Nodes->GetProcessInfoNs( parent->GetNid()
+                                        , parent->GetPid()
+                                        , parent->GetVerifier()
+                                        , &processInfo);
+        if (rc == MPI_ERR_NAME)
+        {
+            if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST 
| TRACE_SYNC | TRACE_TMSYNC))
+            {
+                trace_printf( "%s@%d - Deleting clone process %s, (%d,%d:%d)\n"
+                            , method_name, __LINE__
+                            , parent->GetName()
+                            , parent->GetNid()
+                            , parent->GetPid()
+                            , parent->GetVerifier() );
+            }
+            Nodes->DeleteCloneProcess( parent );
+            parent = NULL;
+        }
+    
+    }
+
     // if the env is set to not deliver death messages upon node down,
     // check the state of the process' node.
     bool supplyProcessDeathNotices = true;

http://git-wip-us.apache.org/repos/asf/trafodion/blob/db656603/core/sqf/monitor/linux/reqtmleader.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqtmleader.cxx b/core/sqf/monitor/linux/reqtmleader.cxx
index 9cfe6ea..9d4fb96 100644
--- a/core/sqf/monitor/linux/reqtmleader.cxx
+++ b/core/sqf/monitor/linux/reqtmleader.cxx
@@ -28,11 +28,13 @@
 #include "montrace.h"
 #include "monsonar.h"
 #include "monlogging.h"
+#include "nameserver.h"
 
 extern CMonStats *MonStats;
 extern CNode *MyNode;
 extern CNodeContainer *Nodes;
 extern CMonitor *Monitor;
+extern bool NameServerEnabled;
 
 CExtTmLeaderReq::CExtTmLeaderReq (reqQueueMsg_t msgType, int pid,
                                   struct message_def *msg )
@@ -117,11 +119,27 @@ void CExtTmLeaderReq::performRequest()
             Monitor->ExitSyncCycle();
         }
 
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | 
TRACE_SYNC | TRACE_TMSYNC))
+        {
+            trace_printf( "%s@%d - tmLeaderNid=%d\n"
+                        , method_name, __LINE__, tmLeaderNid );
+        }
+
         if ( MyNode->GetShutdownLevel() == ShutdownLevel_Undefined )
         {
             CProcess *process;
 
             process = Nodes->GetLNode(tmLeaderNid)->GetProcessLByType( 
ProcessType_DTM );
+            if (!process && NameServerEnabled)
+            {
+                if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | 
TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+                {
+                    trace_printf( "%s@%d - Getting process from Name Server, 
nid=%d, type=ProcessType_DTM\n"
+                                , method_name, __LINE__, tmLeaderNid );
+                }
+            
+                process = Nodes->GetProcessLByTypeNs( tmLeaderNid, 
ProcessType_DTM );
+            }
 
             if (!process)
             {
@@ -150,6 +168,24 @@ void CExtTmLeaderReq::performRequest()
             msg_->u.reply.u.generic.pid = process->GetPid();
             msg_->u.reply.u.generic.verifier = process->GetVerifier();
             strcpy (msg_->u.reply.u.generic.process_name, process->GetName());
+
+            if (process && NameServerEnabled)
+            {
+                if (!MyNode->IsMyNode( process->GetNid() ))
+                {
+                    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | 
TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+                    {
+                        trace_printf( "%s@%d - Deleting clone process %s, 
(%d,%d:%d)\n"
+                                    , method_name, __LINE__
+                                    , process->GetName()
+                                    , process->GetNid()
+                                    , process->GetPid()
+                                    , process->GetVerifier() );
+                    }
+                    Nodes->DeleteCloneProcess( process );
+                }
+            
+            }
         }
         else
         {

Reply via email to