Multiple fixes with Name Server enabled logic.
- More than one Name Server is now supported
- Node failures are handled both with and without a Name Server in the node
- Long process names are now the default and support clusters larger than 1000 
nodes


Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/32fe8565
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/32fe8565
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/32fe8565

Branch: refs/heads/master
Commit: 32fe856510ed9c741de13bdd2a3b80561c6b0e2f
Parents: 7184f8f
Author: Zalo Correa <[email protected]>
Authored: Wed Jul 25 16:24:18 2018 -0700
Committer: Zalo Correa <[email protected]>
Committed: Wed Jul 25 16:24:18 2018 -0700

----------------------------------------------------------------------
 .../export/include/common/evl_sqlog_eventnum.h  |  49 +-
 core/sqf/monitor/linux/cluster.cxx              | 221 ++++-
 core/sqf/monitor/linux/cluster.h                |   7 +
 core/sqf/monitor/linux/commaccept.cxx           |   8 +-
 core/sqf/monitor/linux/config.cxx               |   8 +-
 core/sqf/monitor/linux/healthcheck.cxx          |   3 +-
 core/sqf/monitor/linux/internal.h               |   5 +
 core/sqf/monitor/linux/lnode.cxx                |  12 +-
 core/sqf/monitor/linux/makefile                 |   1 +
 core/sqf/monitor/linux/monitor.cxx              |  18 +-
 core/sqf/monitor/linux/msgdef.h                 |   2 +-
 core/sqf/monitor/linux/nameserver.cxx           | 470 ++++++++--
 core/sqf/monitor/linux/nameserver.h             |  18 +-
 core/sqf/monitor/linux/nscommacceptmon.cxx      |  92 +-
 core/sqf/monitor/linux/nscommacceptmon.h        |   5 +-
 core/sqf/monitor/linux/nsreqprocinfons.cxx      |  13 +-
 core/sqf/monitor/linux/nsreqshutdown.cxx        |   6 +-
 core/sqf/monitor/linux/nsreqstop.cxx            |   2 +-
 core/sqf/monitor/linux/pnode.cxx                |  90 +-
 core/sqf/monitor/linux/process.cxx              | 217 ++++-
 core/sqf/monitor/linux/process.h                |   9 +-
 core/sqf/monitor/linux/ptpclient.cxx            | 934 +++++++++++--------
 core/sqf/monitor/linux/ptpclient.h              |  66 +-
 core/sqf/monitor/linux/ptpcommaccept.cxx        | 328 ++++---
 core/sqf/monitor/linux/ptpcommaccept.h          |  12 +-
 core/sqf/monitor/linux/redirector.cxx           |  53 +-
 core/sqf/monitor/linux/reqdump.cxx              |  16 +-
 core/sqf/monitor/linux/reqevent.cxx             |  16 +-
 core/sqf/monitor/linux/reqkill.cxx              |  26 +-
 core/sqf/monitor/linux/reqnotify.cxx            |  18 +-
 core/sqf/monitor/linux/reqopen.cxx              |  20 +-
 core/sqf/monitor/linux/reqprocinfo.cxx          |  15 +-
 core/sqf/monitor/linux/reqqueue.cxx             |  43 +-
 core/sqf/monitor/linux/reqqueue.h               |  19 +
 core/sqf/monitor/linux/shell.cxx                |  67 +-
 core/sqf/monitor/linux/tmsync.cxx               |   4 +-
 core/sqf/monitor/linux/zclient.cxx              |   2 +-
 core/sqf/sqenvcom.sh                            |  18 +-
 core/sqf/sql/scripts/gomon.cold                 |  13 +-
 39 files changed, 2026 insertions(+), 900 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/export/include/common/evl_sqlog_eventnum.h
----------------------------------------------------------------------
diff --git a/core/sqf/export/include/common/evl_sqlog_eventnum.h 
b/core/sqf/export/include/common/evl_sqlog_eventnum.h
index 8930ccc..10268d8 100644
--- a/core/sqf/export/include/common/evl_sqlog_eventnum.h
+++ b/core/sqf/export/include/common/evl_sqlog_eventnum.h
@@ -84,6 +84,7 @@
 #define MON_CLUSTER_MARKDOWN_1              101011101
 #define MON_CLUSTER_MARKDOWN_2              101011102
 #define MON_CLUSTER_MARKDOWN_3              101011103
+#define MON_CLUSTER_MARKDOWN_4              101011104
 //#define MON_CLUSTER_MARKUP                  101011201
 #define MON_CLUSTER_NODE_TM_READY_1         101011301
 #define MON_CLUSTER_NODE_TM_READY_2         101011302
@@ -253,8 +254,16 @@
 #define MON_CLUSTER_ASSIGNMONITORLEADER_2   101015302
 #define MON_CLUSTER_ASSIGNMONITORLEADER_3   101015303
 #define MON_CLUSTER_ASSIGNMONITORLEADER_4   101015304
+
 #define MON_CLUSTER_CHECKIFDONE_1           101015401
 
+#define MON_CLUSTER_HARDNODEUPNS_1          101015501
+#define MON_CLUSTER_HARDNODEUPNS_2          101015502
+
+#define MON_CLUSTER_RECEIVESOCK_1           101015601
+
+#define MON_CLUSTER_SENDSOCK_1              101015701
+
 /* Module: monitor.cxx = 02 */
 
 #define MON_MONITOR_MAIN_1                  101020101
@@ -1016,24 +1025,30 @@
 #define ZCONFIG_DELETECONFIGZNODE_3         101381003
 
 /* Module nameserver.cxx = 39 */
-#define MON_NAMESERVER_MKCLTSOCK_1          101390101
-#define MON_NAMESERVER_MKCLTSOCK_2          101390102
-#define MON_NAMESERVER_MKCLTSOCK_3          101390103
-#define MON_NAMESERVER_MKCLTSOCK_4          101390104
-#define MON_NAMESERVER_MKCLTSOCK_5          101390105
-#define MON_NAMESERVER_MKCLTSOCK_6          101390106
+#define NAMESERVER_CLIENTSOCKCREATE_1       101390101
+#define NAMESERVER_CLIENTSOCKCREATE_2       101390102
+#define NAMESERVER_CLIENTSOCKCREATE_3       101390103
+#define NAMESERVER_CLIENTSOCKCREATE_4       101390104
+#define NAMESERVER_CLIENTSOCKCREATE_5       101390105
+#define NAMESERVER_CLIENTSOCKCREATE_6       101390106
+#define NAMESERVER_SENDTONS_1               101390201
+#define NAMESERVER_SENDTONS_2               101390202
+#define NAMESERVER_SOCKRECEIVE_1            101390301
+#define NAMESERVER_SOCKSEND_1               101390401
+#define NAMESERVER_GETM2NPORT_1             101390501
+#define NAMESERVER_CHOOSENEXTNS_1           101390601
 
 /* Module nscommaccept.cxx = 40 */
-#define NS_COMMACCEPT_1                     101400101
-#define NS_COMMACCEPT_3                     101400102
-#define NS_COMMACCEPT_2                     101400103
-#define NS_COMMACCEPT_4                     101400104
-#define NS_COMMACCEPT_5                     101400105
-#define NS_COMMACCEPT_6                     101400106
-#define NS_COMMACCEPT_7                     101400107
-#define NS_COMMACCEPT_8                     101400108
-#define NS_COMMACCEPT_9                     101400109
-#define NS_COMMACCEPT_10                    101400110
+#define NS_COMMACCEPT_PROCESSMONREQS_1      101400101
+#define NS_COMMACCEPT_PROCESSMONREQS_2      101400102
+#define NS_COMMACCEPT_PROCESSMONREQS_3      101400103
+#define NS_COMMACCEPT_PROCESSMONREQS_4      101400104
+#define NS_COMMACCEPT_PROCESSMONREQS_5      101400105
+#define NS_COMMACCEPT_PROCESSNEWSOCK_1      101400201
+#define NS_COMMACCEPT_COMMACCEPTORSOCK_1    101400301
+#define NS_COMMACCEPT_MON2NSACCEPTMON_1     101400401
+#define NS_COMMACCEPT_MON2NSPROCESS_1       101400501
+#define NS_COMMACCEPT_START_1               101400601
 
 /* Module: reqnodedown.cxx = 41 */
 #define MON_EXT_NAMESERVERDOWN_REQ          101410101
@@ -1067,6 +1082,8 @@
 #define PTPCLIENT_STDINREQ_2                101930202
 #define PTPCLIENT_STDIODATA_1               101930301
 #define PTPCLIENT_STDIODATA_2               101930302
+#define PTPCLIENT_SENDTOMON_1               101930401
+#define PTPCLIENT_SENDTOMON_2               101930402
 
 /* Module ptpcommaccept.cxx = 94 */
 #define PTP_COMMACCEPT_1                    101940101

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/cluster.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/cluster.cxx 
b/core/sqf/monitor/linux/cluster.cxx
index 4d0d3c2..762c1a0 100644
--- a/core/sqf/monitor/linux/cluster.cxx
+++ b/core/sqf/monitor/linux/cluster.cxx
@@ -69,6 +69,7 @@ using namespace std;
 #include "nscommacceptmon.h"
 #else
 #include "nameserver.h"
+#include "ptpclient.h"
 #endif
 
 extern bool IAmIntegrating;
@@ -88,7 +89,9 @@ extern char MySyncPort[MPI_MAX_PORT_NAME];
 extern CCommAcceptMon CommAcceptMon;
 extern char MyMon2NsPort[MPI_MAX_PORT_NAME];
 #else
+extern CProcess *NameServerProcess;
 extern CNameServer *NameServer;
+extern CPtpClient *PtpClient;
 extern bool NameServerEnabled;
 extern char MyPtPPort[MPI_MAX_PORT_NAME];
 #endif
@@ -1069,12 +1072,11 @@ unsigned long long 
CCluster::EnsureAndGetSeqNum(cluster_state_def_t nodestate[])
 }
 
 
+#ifndef NAMESERVER_PROCESS
 void CCluster::HardNodeDown (int pnid, bool communicate_state)
 {
-#ifndef NAMESERVER_PROCESS
     char port_fname[MAX_PROCESS_PATH];
     char temp_fname[MAX_PROCESS_PATH];
-#endif
     CNode  *node;
     CLNode *lnode;
     char    buf[MON_STRING_BUF_SIZE];
@@ -1130,7 +1132,6 @@ void CCluster::HardNodeDown (int pnid, bool 
communicate_state)
         return;
     }
 
-#ifndef NAMESERVER_PROCESS
     if ( !Emulate_Down )
     {
         if( !IsRealCluster )
@@ -1161,7 +1162,6 @@ void CCluster::HardNodeDown (int pnid, bool 
communicate_state)
         remove(temp_fname);
         rename(port_fname, temp_fname);
     }
-#endif
 
     if (node->GetState() != State_Down || !node->isInQuiesceState())
     {
@@ -1194,9 +1194,7 @@ void CCluster::HardNodeDown (int pnid, bool 
communicate_state)
                 if ( ! Emulate_Down )
                 {
                     // make sure no processes are alive if in the middle of 
re-integration
-#ifndef NAMESERVER_PROCESS
                     node->KillAllDown();
-#endif
                     snprintf(buf, sizeof(buf),
                              "[CCluster::HardNodeDown], Node %s (%d)is 
down.\n",
                              node->GetName(), node->GetPNid());
@@ -1212,29 +1210,29 @@ void CCluster::HardNodeDown (int pnid, bool 
communicate_state)
         }
         else
         {
-            if ( node->GetPNid() == integratingPNid_ )
-            {
-                ResetIntegratingPNid();
-            }
-#ifndef NAMESERVER_PROCESS
-            node->KillAllDown();
-#endif
-            node->SetState( State_Down );
-            // Send node down message to local node's processes
-            lnode = node->GetFirstLNode();
-            for ( ; lnode; lnode = lnode->GetNextP() )
+            if (node->GetState() != State_Down)
             {
-                lnode->Down();
-            }
-            if ( ZClientEnabled )
-            {
-                ZClient->WatchNodeDelete( node->GetName() );
-                ZClient->WatchNodeMasterDelete( node->GetName() );
+                if ( node->GetPNid() == integratingPNid_ )
+                {
+                    ResetIntegratingPNid();
+                }
+                node->KillAllDown();
+                node->SetState( State_Down );
+                // Send node down message to local node's processes
+                lnode = node->GetFirstLNode();
+                for ( ; lnode; lnode = lnode->GetNextP() )
+                {
+                    lnode->Down();
+                }
+                if ( ZClientEnabled )
+                {
+                    ZClient->WatchNodeDelete( node->GetName() );
+                    ZClient->WatchNodeMasterDelete( node->GetName() );
+                }
             }
         }
     }
 
-#ifndef NAMESERVER_PROCESS
     // we need to abort any active TmSync
     if (( MyNode->GetTmSyncState() == SyncState_Start    ) ||
         ( MyNode->GetTmSyncState() == SyncState_Continue ) ||
@@ -1245,21 +1243,79 @@ void CCluster::HardNodeDown (int pnid, bool 
communicate_state)
         if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | 
TRACE_TMSYNC))
            trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated 
(%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, 
MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ));
     }
-#endif
 
-#ifndef NAMESERVER_PROCESS
     if ( Emulate_Down )
     {
         AssignTmLeader(pnid, false);
     }
     else
-#endif
     {
         AssignLeaders(pnid, node->GetName(), false);
     }
 
     TRACE_EXIT;
 }
+#endif
+
+#ifdef NAMESERVER_PROCESS
+void CCluster::HardNodeDownNs( int pnid )
+{
+    CNode  *node;
+    char    buf[MON_STRING_BUF_SIZE];
+
+    const char method_name[] = "CCluster::HardNodeDownNs";
+    TRACE_ENTRY;
+
+    node = Nodes->GetNode(pnid);
+
+    if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
+       trace_printf( "%s@%d - pnid=%d, state=%s, isInQuiesceState=%d,"
+                     " (local pnid=%d, state=%s, isInQuiesceState=%d, "
+                     "shutdown level=%d)\n", method_name, __LINE__,
+                     pnid, StateString(node->GetState()),
+                     node->isInQuiesceState(),
+                     MyPNID, StateString(MyNode->GetState()),
+                     MyNode->isInQuiesceState(), MyNode->GetShutdownLevel() );
+
+    if (( MyPNID == pnid              ) &&
+        ( MyNode->GetState() == State_Down ||
+          MyNode->IsKillingNode() ) )
+    {
+        // we are coming down ... don't process it
+        if ( !IsRealCluster && MyNode->isInQuiesceState())
+        {
+          // in virtual env, this would be called after node quiescing,
+          // so continue with mark down processing.
+        }
+        else
+        {
+          return;
+        }
+    }
+
+    if (node->GetState() != State_Down)
+    {
+        snprintf( buf, sizeof(buf)
+                , "[%s], Node %s (%d) is going down.\n"
+                 , method_name, node->GetName(), node->GetPNid());
+        mon_log_write(MON_CLUSTER_MARKDOWN_4, SQ_LOG_INFO, buf);
+
+        node->SetKillingNode( true );
+        node->DeleteAllDown();
+        node->SetState( State_Down );
+
+        if ( ZClientEnabled )
+        {
+            //ZClient->WatchNodeDelete( node->GetName() );
+            ZClient->WatchNodeMasterDelete( node->GetName() );
+        }
+    }
+
+    AssignLeaders(pnid, node->GetName(), false);
+
+    TRACE_EXIT;
+}
+#endif
 
 void CCluster::SoftNodeDown( int pnid )
 {
@@ -1651,8 +1707,10 @@ int CCluster::HardNodeUp( int pnid, char *node_name )
         if ( nodeState == State_Down )
         {
             node->SetKillingNode( false );
+#ifndef NAMESERVER_PROCESS
             if ( Emulate_Down )
             {
+#endif
                 // Any DTMs running?
                 for ( int i=0; !tmCount && i < Nodes->GetPNodesCount(); i++ )
                 {
@@ -1706,6 +1764,7 @@ int CCluster::HardNodeUp( int pnid, char *node_name )
                         }
                     }
                 }
+#ifndef NAMESERVER_PROCESS
             }
             else
             {
@@ -1714,6 +1773,7 @@ int CCluster::HardNodeUp( int pnid, char *node_name )
                                   method_name, __LINE__ );
 
             }
+#endif
         }
         else if ( nodeState == State_Merged )
         {
@@ -1866,6 +1926,74 @@ int CCluster::HardNodeUp( int pnid, char *node_name )
     return( rc );
 }
 
+#ifdef NAMESERVER_PROCESS
+int CCluster::HardNodeUpNs( int pnid )
+{
+    int     rc = 0;
+    CNode  *node;
+    STATE   nodeState;
+
+    const char method_name[] = "CCluster::HardNodeUpNs";
+    TRACE_ENTRY;
+
+    if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
+       trace_printf( "%s@%d - pnid=%d, MyPNID = %d, currentNodes_=%d\n"
+                   , method_name, __LINE__, pnid, MyPNID, currentNodes_ );
+
+    node = Nodes->GetNode( pnid );
+    if ( node == NULL )
+    {
+        if ( rc )
+        {   // Handle error
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], Invalid node, pnid=%d\n"
+                    , method_name, pnid );
+            mon_log_write(MON_CLUSTER_HARDNODEUPNS_1, SQ_LOG_ERR, buf);
+            return( -1 );
+        }
+    }
+
+    nodeState = node->GetState();
+
+    if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
+       trace_printf( "%s@%d" " - Node state=%s" "\n"
+                   , method_name, __LINE__, StateString( nodeState ) );
+
+    if ( nodeState != State_Up )
+    {
+        if ( nodeState == State_Down )
+        {
+            node->SetKillingNode( false );
+            // We need to remove any old process objects before we restart the 
node.
+            node->CleanUpProcesses();
+            node->SetState( State_Up );
+            if ( MyPNID != pnid )
+            {
+                // Let other monitors know this node is up
+                CReplNodeUp *repl = new CReplNodeUp(pnid);
+                Replicator.addItem(repl);
+            }
+        }
+    }
+    else
+    {   // Handle error
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], Invalid node state, node %s, pnid=%d, state=%s\n"
+                , method_name
+                , node->GetName()
+                , node->GetPNid()
+                , StateString( nodeState ) );
+        mon_log_write(MON_CLUSTER_HARDNODEUPNS_2, SQ_LOG_ERR, buf);
+        return( -1 );
+    }
+
+    TRACE_EXIT;
+    return( rc );
+}
+#endif
+
 int CCluster::SoftNodeUpPrepare( int pnid )
 {
     char    buf[MON_STRING_BUF_SIZE];
@@ -7456,7 +7584,10 @@ void CCluster::UpdateClusterState( bool &doShutdown,
         case State_Unknown:
            break;
         case State_Down:
-            doShutdown = true;
+            if (IsRealCluster)
+            {
+                doShutdown = true;
+            }
             break;
         case State_Stopped:
         case State_Shutdown:
@@ -7780,19 +7911,23 @@ bool CCluster::checkIfDone (  )
                     // let the watchdog process exit
                     HealthCheck.setState(MON_EXIT_PRIMITIVES);
                 }
-                else if ( (MyNode->GetNumProcs() <=         // only My Name 
Server alive
-                            myNameServerCount )
+                else if ( NameServerProcess != NULL
+                          && myNameServerCount > 0
+                          && (MyNode->GetNumProcs() <= myNameServerCount ) // 
only My Name Server alive
                           && !MyNode->isInQuiesceState()    // post-quiescing 
will
                                                             // expire WDG 
(cluster)
                           && !waitForNameServerExit_ )      // Name Server not 
yet exiting
                 {
                     if (trace_settings & (TRACE_PROCESS | TRACE_PROCESS_DETAIL 
| TRACE_SYNC))
-                       trace_printf("%s@%d - Stopping Name Server process. "
-                                    "(process count: cluster=%d, MyNode=%d)\n",
-                                    method_name, __LINE__,
-                                    Nodes->ProcessCount(), 
MyNode->ProcessCount());
-    
+                    {
+                        trace_printf("%s@%d - Stopping Name Server process. "
+                                     "(process count: cluster=%d, 
MyNode=%d)\n",
+                                     method_name, __LINE__,
+                                     Nodes->ProcessCount(), 
MyNode->ProcessCount());
+                    }
+
                     waitForNameServerExit_ = true;
+                    MyNode->SetProcessState( NameServerProcess, State_Down, 
false );
                     int rc = NameServer->ProcessShutdown();
                     if (rc)
                     {
@@ -10196,6 +10331,14 @@ int CCluster::ReceiveSock(char *buf, int size, int 
sockFd, const char *desc)
             if ( errno != EINTR)
             {
                 error = errno;
+                char la_buf[MON_STRING_BUF_SIZE];
+                sprintf( la_buf, "[%s], recv(), received=%d, sock=%d, 
error=%d(%s), desc=%s\n"
+                       , method_name
+                       , received
+                       , sockFd
+                       , error, strerror(error)
+                       , desc );
+                mon_log_write(MON_CLUSTER_RECEIVESOCK_1, SQ_LOG_ERR, la_buf);
                 readAgain = false;
             }
             else
@@ -10264,6 +10407,14 @@ int CCluster::SendSock(char *buf, int size, int 
sockFd, const char *desc)
             if ( errno != EINTR)
             {
                 error = errno;
+                char la_buf[MON_STRING_BUF_SIZE];
+                sprintf( la_buf, "[%s], send(), sent=%d, sock=%d, 
error=%d(%s), desc=%s\n"
+                       , method_name
+                       , sent
+                       , sockFd
+                       , error, strerror(error)
+                       , desc );
+                mon_log_write(MON_CLUSTER_SENDSOCK_1, SQ_LOG_ERR, la_buf);
                 sendAgain = false;
             }
             else

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/cluster.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/cluster.h b/core/sqf/monitor/linux/cluster.h
index 970cf4c..2dfbfb8 100644
--- a/core/sqf/monitor/linux/cluster.h
+++ b/core/sqf/monitor/linux/cluster.h
@@ -163,7 +163,11 @@ public:
     int GetConfigPNodesMax() { return configPNodesMax_; }
     bool ImAlive( bool needed=false, struct sync_def *sync = NULL );
     int  MapRank( int current_rank );
+#ifndef NAMESERVER_PROCESS
     void HardNodeDown( int nid, bool communicate_state=false );
+#else
+    void HardNodeDownNs( int nid );
+#endif
     void SoftNodeDown( int pnid );
     int  SoftNodeUpPrepare( int pnid );
     bool CheckSpareSet( int pnid );
@@ -174,6 +178,9 @@ public:
     void ResetIntegratingPNid( void );
     void SetIntegratingPNid( int pnid );
     int HardNodeUp( int pnid, char *node_name );
+#ifdef NAMESERVER_PROCESS
+    int HardNodeUpNs( int pnid );
+#endif
     inline CNode *GetIntegratingNode() { return Node[integratingPNid_]; }
     inline CNode *GetNode( int pnid ) { return Node[pnid]; }
     static char *Timestamp( void );

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/commaccept.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/commaccept.cxx 
b/core/sqf/monitor/linux/commaccept.cxx
index 5c4e3a5..13c2ebd 100644
--- a/core/sqf/monitor/linux/commaccept.cxx
+++ b/core/sqf/monitor/linux/commaccept.cxx
@@ -220,12 +220,18 @@ bool CCommAccept::sendNodeInfoSock( int sockFd )
                             , i, node->GetPNid(), node->GetName());
             }
 
-            nodeInfo[i].pnid = -1;
             nodeInfo[i].nodeName[0] = '\0';
             nodeInfo[i].commPort[0] = '\0';
             nodeInfo[i].syncPort[0] = '\0';
+            nodeInfo[i].pnid = -1;
             nodeInfo[i].creatorPNid = -1;
         }
+        nodeInfo[i].creatorShellPid = -1;
+        nodeInfo[i].creatorShellVerifier = -1;
+        nodeInfo[i].creator = false;
+        nodeInfo[i].ping = false;
+        nodeInfo[i].nsPid = -1;
+        nodeInfo[i].nsPNid = -1;
     }
 
     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/config.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/config.cxx 
b/core/sqf/monitor/linux/config.cxx
index 4ef9b72..9794199 100644
--- a/core/sqf/monitor/linux/config.cxx
+++ b/core/sqf/monitor/linux/config.cxx
@@ -1193,7 +1193,7 @@ int CConfigContainer::PackRegistry( char *&buffer, 
ConfigType type )
         regClusterEntry->valueLength = strlen (regClusterConfig[i].value);
         if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
         {
-            trace_printf ("%s%d pack type %d, scope %s (%d), key %s (%d), 
value %s(%d)\n",method_name, __LINE__,
+            trace_printf ("%s@%d pack type %d, scope %s (%d), key %s (%d), 
value %s(%d)\n",method_name, __LINE__,
                            regClusterEntry->type, regClusterConfig[i].scope, 
                            
regClusterEntry->scopeLength,regClusterConfig[i].key,regClusterEntry->keyLength,
  
                            regClusterConfig[i].value, 
regClusterEntry->valueLength);
@@ -1226,7 +1226,7 @@ int CConfigContainer::PackRegistry( char *&buffer, 
ConfigType type )
     
     if (regClusterConfig)
     {
-         delete regClusterConfig; 
+         delete [] regClusterConfig; 
     }
     return numberOfEntries;
     
@@ -1258,7 +1258,7 @@ void CConfigContainer::UnpackRegistry( char *&buffer, int 
count )
 
         if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
         {
-            trace_printf ("%s%d scope length %d, key length %d, value length 
%d\n", method_name, __LINE__,
+            trace_printf ("%s@%d scope length %d, key length %d, value length 
%d\n", method_name, __LINE__,
                           clusterObj2->scopeLength, 
                           clusterObj2->keyLength, clusterObj2->valueLength);
         }
@@ -1317,7 +1317,7 @@ int CConfigContainer::PackUniqueStrings( char *&buffer )
                  stringObj->stringLength = strlen(unique_string);
                  if (trace_settings & (TRACE_INIT | TRACE_REQUEST))
                  {
-                      trace_printf ("%s%d  packing nid %d, unique id %d, 
stringt %s (length %d)\n", method_name, __LINE__,
+                      trace_printf ("%s@%d  packing nid %d, unique id %d, 
stringt %s (length %d)\n", method_name, __LINE__,
                                      pnid, maxId, 
unique_string,stringObj->stringLength );
                  } 
                  stringObj->unique_id = maxId;

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/healthcheck.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/healthcheck.cxx 
b/core/sqf/monitor/linux/healthcheck.cxx
index 1f344fc..203465c 100644
--- a/core/sqf/monitor/linux/healthcheck.cxx
+++ b/core/sqf/monitor/linux/healthcheck.cxx
@@ -54,7 +54,6 @@ using namespace std;
 #include "redirector.h"
 #include "replicate.h"
 
-
 extern CReqQueue ReqQueue;
 extern CMonitor *Monitor;
 extern CNode *MyNode;
@@ -64,6 +63,7 @@ extern CRedirector Redirector;
 extern CHealthCheck HealthCheck;
 extern CReplicate Replicator;
 extern int MyPNID;
+extern bool IsRealCluster;
 
 // constructor
 CHealthCheck::CHealthCheck()
@@ -229,7 +229,6 @@ void CHealthCheck::healthCheckThread()
     TRACE_ENTRY;
 
     HealthCheckStates state;
-
     struct timespec ts;
 
     if (trace_settings & TRACE_HEALTH)

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/internal.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/internal.h 
b/core/sqf/monitor/linux/internal.h
index 35fa32a..5dc6456 100644
--- a/core/sqf/monitor/linux/internal.h
+++ b/core/sqf/monitor/linux/internal.h
@@ -537,5 +537,10 @@ struct sync_buffer_def
     char msg[MAX_SYNC_SIZE];
 };
 
+typedef struct ptpMsgInfo
+{
+    int pnid;                           // Current offset into the msg buffer
+    int size;                           // Number if messages to replicate
+} ptpMsgInfo_t;
 
 #endif

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/lnode.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/lnode.cxx b/core/sqf/monitor/linux/lnode.cxx
index 69a186d..bbe5ac4 100644
--- a/core/sqf/monitor/linux/lnode.cxx
+++ b/core/sqf/monitor/linux/lnode.cxx
@@ -41,6 +41,7 @@ using namespace std;
 #include "lnode.h"
 #include "pnode.h"
 #include "mlio.h"
+#include "nameserver.h"
 
 extern bool IsRealCluster;
 extern CommType_t CommType;
@@ -50,6 +51,10 @@ extern CMonitor *Monitor;
 extern CMonStats *MonStats;
 extern bool usingCpuAffinity;
 extern bool usingTseCpuAffinity;
+#ifndef NAMESERVER_PROCESS
+extern CNameServer *NameServer;
+extern bool NameServerEnabled;
+#endif
 
 void CoreMaskString( char *str, cpu_set_t coreMask, int totalCores )
 {
@@ -396,7 +401,12 @@ void CLNode::Down( void )
                         , method_name, __LINE__, GetNid()
                         , GetNode()->GetName(), msg->u.request.u.down.takeover 
);
         }
-        
+#ifndef NAMESERVER_PROCESS
+        if ( NameServerEnabled )
+        {
+            NameServer->ProcessNodeDown( Nid, msg->u.request.u.down.node_name 
);
+        }
+#endif
         MyNode->Bcast( msg );
         delete msg;
     }

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/makefile
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/makefile b/core/sqf/monitor/linux/makefile
index 73127e4..3d16bab 100644
--- a/core/sqf/monitor/linux/makefile
+++ b/core/sqf/monitor/linux/makefile
@@ -265,6 +265,7 @@ NSOBJS += $(OUTDIR)/nsreqqueue.o
 NSOBJS += $(OUTDIR)/nsreqdelproc.o
 NSOBJS += $(OUTDIR)/nsreqstop.o
 NSOBJS += $(OUTDIR)/nsreqnewproc.o
+NSOBJS += $(OUTDIR)/nsreqnodedown.o
 NSOBJS += $(OUTDIR)/nsreqprocinfo.o
 NSOBJS += $(OUTDIR)/nsreqprocinfons.o
 NSOBJS += $(OUTDIR)/nsreqstart.o

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/monitor.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/monitor.cxx 
b/core/sqf/monitor/linux/monitor.cxx
index d588945..2ad0528 100755
--- a/core/sqf/monitor/linux/monitor.cxx
+++ b/core/sqf/monitor/linux/monitor.cxx
@@ -1367,7 +1367,8 @@ int main (int argc, char *argv[])
     env = getenv("SQ_NAMESERVER_ENABLED");
     if ( env && isdigit(*env) )
     {
-        NameServerEnabled = atoi(env);
+        int val = atoi(env);
+        NameServerEnabled = (val != 0) ? true : false;
     }
 #endif
 
@@ -1605,6 +1606,7 @@ int main (int argc, char *argv[])
     }
     setlinebuf(stdout);
 
+#ifndef NAMESERVER_PROCESS
     // Send stderr output to same file as stdout.  (Note: the monitor does
     // not write to stderr but perhaps there could be components included in
     // the monitor build that do write to stderr.)
@@ -1612,6 +1614,10 @@ int main (int argc, char *argv[])
     {
         printf ( "dup2 failed for stderr: %s (%d)\n", strerror(errno), errno);
     }
+#else
+    // Name Server is a child process of the monitor, the process create logic
+    // will establish IO redirection between the monitor process and the child.
+#endif
 
     switch( CommType )
     {
@@ -2052,13 +2058,15 @@ int main (int argc, char *argv[])
 #ifdef NAMESERVER_PROCESS
         Monitor = new CMonitor ();
 #else
-        Monitor = new CMonitor (procTermSig);
-#endif
-#ifndef NAMESERVER_PROCESS
         if (NameServerEnabled)
         {
+            PtpClient  = new CPtpClient ();
+            Monitor    = new CMonitor (procTermSig);
             NameServer = new CNameServer ();
-            PtpClient = new CPtpClient ();
+        }
+        else
+        {
+            Monitor = new CMonitor (procTermSig);
         }
 #endif
 

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/msgdef.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/msgdef.h b/core/sqf/monitor/linux/msgdef.h
index 639c15c..8218a8e 100644
--- a/core/sqf/monitor/linux/msgdef.h
+++ b/core/sqf/monitor/linux/msgdef.h
@@ -89,7 +89,7 @@
 #define MAX_PROCINFO_LIST 64
 #define MAX_PROC_CONTEXT 5
 #define MAX_PROCESS_NAME MAX_KEY_NAME
-#define MAX_PROCESS_NAME_STR 12
+#define MAX_PROCESS_NAME_STR 13
 #define MAX_PROCESS_PATH 256
 #define MAX_PROCESSOR_NAME 128
 #define MAX_RECONN_PING_WAIT_TIMEOUT 5

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nameserver.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nameserver.cxx 
b/core/sqf/monitor/linux/nameserver.cxx
index e9f1900..ad024f6 100644
--- a/core/sqf/monitor/linux/nameserver.cxx
+++ b/core/sqf/monitor/linux/nameserver.cxx
@@ -44,6 +44,7 @@ using namespace std;
 #include <limits.h>
 #include <unistd.h>
 
+#include "trafconf/trafconfig.h"
 #include "lnode.h"
 #include "pnode.h"
 #include "nameserver.h"
@@ -51,21 +52,24 @@ using namespace std;
 #include "montrace.h"
 #include "nameserverconfig.h"
 #include "meas.h"
+#include "reqqueue.h"
 
 extern CNode *MyNode;
 extern CProcess *NameServerProcess;
 extern CNodeContainer *Nodes;
+extern CReqQueue ReqQueue;
 extern bool IsRealCluster;
 extern int MyPNID;
 extern CNameServerConfigContainer *NameServerConfig;
 extern CMeas Meas;
 
+#define NAMESERVER_IO_RETRIES 3
+
 CNameServer::CNameServer( void )
-: mon2nsSock_(-1)
-, nsConfigInx_(-1)
-, nsStartupComplete_(false)
-, seqNum_(0)
-, shutdown_(false)
+           : mon2nsSock_(-1)
+           , nsStartupComplete_(false)
+           , seqNum_(0)
+           , shutdown_(false)
 {
     const char method_name[] = "CNameServer::CNameServer";
     TRACE_ENTRY;
@@ -84,7 +88,7 @@ CNameServer::~CNameServer( void )
     TRACE_EXIT;
 }
 
-void CNameServer::ChooseNextNs( void )
+int CNameServer::ChooseNextNs( void )
 {
     const char method_name[] = "CNameServer::ChooseNextNs";
     TRACE_ENTRY;
@@ -103,17 +107,58 @@ void CNameServer::ChooseNextNs( void )
     {
         config = config->GetNext();
     }
-    strcpy( mon2nsHost_, config->GetName() );
-    if ( trace_settings & TRACE_NS )
+    CNode *node = Nodes->GetNode( (char*) config->GetName() );
+    if (node && node->GetState() == State_Up)
     {
-        trace_printf( "%s@%d - nameserver=%s, rnd=%d, cnt=%d\n"
-                    , method_name, __LINE__
-                    , mon2nsHost_
-                    , rnd
-                    , cnt );
+        strcpy( mon2nsHost_, config->GetName() );
+        if ( trace_settings & TRACE_NS )
+        {
+            trace_printf( "%s@%d - nameserver=%s, rnd=%d, cnt=%d\n"
+                        , method_name, __LINE__
+                        , mon2nsHost_
+                        , rnd
+                        , cnt );
+        }
+    }
+    else
+    {
+        config = 
config->GetNext()?config->GetNext():NameServerConfig->GetFirstConfig();
+        while (config)
+        {
+            node = Nodes->GetNode( (char*) config->GetName() );
+            if (node && node->GetState() != State_Up)
+            {
+                config = config->GetNext();
+                continue;
+            }
+            
+            strcpy( mon2nsHost_, config->GetName() );
+            if ( trace_settings & TRACE_NS )
+            {
+                trace_printf( "%s@%d - selected alternate nameserver=%s\n"
+                            , method_name, __LINE__
+                            , mon2nsHost_ );
+            }
+            break;
+        }
+    }
+
+    if (strlen(mon2nsHost_) == 0)
+    {
+        char la_buf[MON_STRING_BUF_SIZE];
+        sprintf( la_buf
+               , "[%s], No Name Server nodes available.\n"
+                 "Scheduling shutdown (abrupt)!\n"
+               , method_name );
+        mon_log_write(NAMESERVER_CHOOSENEXTNS_1, SQ_LOG_CRIT, la_buf ); 
+        ReqQueue.enqueueShutdownReq( ShutdownLevel_Abrupt );
+
+        TRACE_EXIT;
+        return( -2 );
     }
 
     TRACE_EXIT;
+    return( 0 );
 }
 
 int CNameServer::ConnectToNs( bool *retry )
@@ -123,21 +168,32 @@ int CNameServer::ConnectToNs( bool *retry )
 
     int err = 0;
 
+reconnect:
+
     if ( !mon2nsPort_[0] )
-        CNameServer::GetM2NPort( -1 );
-    if ( !mon2nsHost_[0] )
-        ChooseNextNs();
+    {
+        err = GetM2NPort( -1 );
+    }
+    if ( err == 0 && !mon2nsHost_[0] )
+    {
+        err = ChooseNextNs();
+    }
 
     int sock = 0;
 
     if ( shutdown_ )
+    {
         err = -1;
+    }
 
     if ( err == 0 )
     {
-        sock = SockCreate();
+        sock = ClientSockCreate();
         if ( sock < 0 )
+        {
             err = sock;
+            goto reconnect;
+        }
     }
     if ( err == 0 )
     {
@@ -191,7 +247,7 @@ int CNameServer::ConnectToNs( bool *retry )
                         , nodeId.ping );
         }
         err = SockSend( ( char *) &nodeId, sizeof(nodeId) );
-        if ( err == 0 )
+        if (err == 0)
         {
             if ( trace_settings & TRACE_NS )
             {
@@ -252,7 +308,7 @@ int CNameServer::ConnectToNs( bool *retry )
                     if ( IsRealCluster )
                     {
                         CNode *node = Nodes->GetNode( nodeId.nsPNid );
-                        if ( node )
+                        if (node && node->GetState() == State_Up)
                         {
                             strcpy( mon2nsHost_, node->GetName() );
                             GetM2NPort( nodeId.nsPNid );
@@ -273,50 +329,123 @@ int CNameServer::ConnectToNs( bool *retry )
     return err;
 }
 
-void CNameServer::GetM2NPort( int PNid )
+int CNameServer::GetM2NPort( int nsPNid ) // Fills mon2nsPort_; returns 0, or -2 when no name server node is up.
 {
+    const char method_name[] = "CNameServer::GetM2NPort";
+    TRACE_ENTRY;
+
+    bool done = false; // set once an up name server node has been chosen
     int port;
     char *p = getenv( "NS_M2N_COMM_PORT" );
     if ( p )
+    { // base port comes from the NS_M2N_COMM_PORT environment variable
         port = atoi(p);
+    }
     else
+    { // variable not set: base port is 0
         port = 0;
+    }
     if ( !IsRealCluster )
-        port += PNid < 0 ? MyPNID : PNid;
+    {
+        // choose initial port
+        int nsMax = NameServerConfig->GetCount();
+        int candidatePNid = nsPNid < 0 ? MyPNID : nsPNid; // negative nsPNid means "start from my own node"
+        int chosenPNid = 
+                candidatePNid < nsMax ? candidatePNid : candidatePNid%nsMax;
+        int lastChosenPNid = chosenPNid; // scan start; NOTE(review): assumes nsMax > 0 and candidatePNid >= 0 - confirm
+        while (!done)
+        {
+            // check that corresponding node is UP
+            // node is up, chosen is good to go
+            // not up,
+            //   round-robin on other name server nodes and choose 1st up node
+            //   no name server nodes available
+            //      log event and schedule an abrupt shutdown
+            CNode *node = Nodes->GetNode( chosenPNid );
+            if (node && node->GetState() == State_Up)
+            {
+                port += chosenPNid;
+        
+                if ( trace_settings & TRACE_NS )
+                {
+                    trace_printf( "%s@%d - nsMax=%d, nsPNid=%d, MyPNID=%d, "
+                                  "candidatePNid=%d, chosenPNid=%d, port=%d\n"
+                                , method_name, __LINE__
+                                , nsMax
+                                , nsPNid
+                                , MyPNID
+                                , candidatePNid
+                                , chosenPNid
+                                , port );
+                }
+                done = true;
+            }
+            else
+            {
+                chosenPNid = (chosenPNid+1) < nsMax ? (chosenPNid+1) : 0;
+                if (chosenPNid == lastChosenPNid)
+                {
+                    char la_buf[MON_STRING_BUF_SIZE];
+                    sprintf( la_buf
+                           , "[%s], No Name Server nodes available, "
+                             "chosenPNid=%d, lastChosenPNid=%d.\n"
+                             "Scheduling shutdown (abrupt)!\n"
+                           , method_name
+                           , chosenPNid, lastChosenPNid );
+                    mon_log_write(NAMESERVER_GETM2NPORT_1, SQ_LOG_CRIT, la_buf 
); 
+                    ReqQueue.enqueueShutdownReq( ShutdownLevel_Abrupt );
+                    TRACE_EXIT;
+                    return( -2 ); // full lap completed: no candidate node is up
+                }
+                // Not back at the starting node yet: loop and test the next
+                // candidate. The port offset is added only for an up node.
+            }
+        }
+    }
     sprintf( mon2nsPort_, "%d", port );
-}
 
-void CNameServer::SetLocalHost( void )
-{
-    gethostname( mon2nsHost_, MAX_PROCESSOR_NAME );
+    TRACE_EXIT;
+    return( 0 );
 }
 
-void CNameServer::SetShutdown( bool shutdown )
+bool CNameServer::IsNameServerConfigured( int pnid ) // Reports whether physical node pnid is configured as a Name Server node.
 {
-    const char method_name[] = "CNameServer::SetShutdown";
+    const char method_name[] = "CNameServer::IsNameServerConfigured";
     TRACE_ENTRY;
 
-    if ( trace_settings & TRACE_NS )
-        trace_printf( "%s@%d - set shutdown_=%d\n"
-                    , method_name, __LINE__, shutdown );
-    shutdown_ = shutdown;
+    bool rs = false;    // result; stays false when the node or its config entry is not found
 
-    TRACE_EXIT;
-}
+    if ( IsRealCluster ) // real cluster: look up the node's name in NameServerConfig
+    {
+        CNameServerConfig *config;
+        CNode *node = Nodes->GetNode( pnid );
+        if ( node )
+        {
+            config = NameServerConfig->GetConfig( node->GetName() );
+            if ( config )
+            {
+                rs = true;
+            }
+        }
+    }
+    else // otherwise: decided by comparing pnid with the configured name server count
+    {
+        rs = pnid < NameServerConfig->GetCount() ? true : false; // NOTE(review): a negative pnid also yields true - confirm callers never pass one
+    }
 
-void CNameServer::SockClose( void )
-{
-    const char method_name[] = "CNameServer::SockClose";
-    TRACE_ENTRY;
+    if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+    {
+        trace_printf( "%s@%d - pnid=%d, configured=%s\n"
+                    , method_name, __LINE__, pnid, rs?"True":"False" );
+    }
 
-    close( mon2nsSock_ );
-    mon2nsSock_ = -1;
     TRACE_EXIT;
+    return(rs);
 }
 
-int CNameServer::SockCreate( void )
+int CNameServer::ClientSockCreate( void )
 {
-    const char method_name[] = "CNameServer::SockCreate";
+    const char method_name[] = "CNameServer::ClientSockCreate";
     TRACE_ENTRY;
 
     int    sock;        // socket
@@ -363,7 +492,8 @@ int CNameServer::SockCreate( void )
             snprintf( la_buf, sizeof(la_buf) 
                     , "[%s], socket() failed! errno=%d (%s)\n"
                     , method_name, err, strerror(err) );
-            mon_log_write( MON_NAMESERVER_MKCLTSOCK_1, SQ_LOG_ERR, la_buf ); 
+            mon_log_write( NAMESERVER_CLIENTSOCKCREATE_1, SQ_LOG_ERR, la_buf 
); 
+            TRACE_EXIT;
             return ( -1 );
         }
 
@@ -375,8 +505,9 @@ int CNameServer::SockCreate( void )
             snprintf( la_buf, sizeof(la_buf ), 
                       "[%s] gethostbyname(%s) failed! errno=%d (%s)\n"
                     , method_name, host, err, strerror(err) );
-            mon_log_write(MON_NAMESERVER_MKCLTSOCK_2, SQ_LOG_ERR, la_buf ); 
+            mon_log_write(NAMESERVER_CLIENTSOCKCREATE_2, SQ_LOG_ERR, la_buf ); 
             close( sock );
+            TRACE_EXIT;
             return ( -1 );
         }
 
@@ -418,7 +549,7 @@ int CNameServer::SockCreate( void )
                 int err = errno;
                 sprintf( la_buf, "[%s], connect() failed! errno=%d (%s)\n"
                        , method_name, err, strerror(err) );
-                mon_log_write(MON_NAMESERVER_MKCLTSOCK_3, SQ_LOG_ERR, la_buf 
); 
+                mon_log_write(NAMESERVER_CLIENTSOCKCREATE_3, SQ_LOG_ERR, 
la_buf ); 
                 struct timespec req, rem;
                 req.tv_sec = 0;
                 req.tv_nsec = 500000000L; // 500,000,000
@@ -439,8 +570,9 @@ int CNameServer::SockCreate( void )
                 char la_buf[MON_STRING_BUF_SIZE];
                 sprintf( la_buf, "[%s], connect() exceeded retries! count=%d\n"
                        , method_name, retries );
-                mon_log_write(MON_NAMESERVER_MKCLTSOCK_4, SQ_LOG_ERR, la_buf 
); 
+                mon_log_write(NAMESERVER_CLIENTSOCKCREATE_4, SQ_LOG_ERR, 
la_buf ); 
                 close( sock );
+                TRACE_EXIT;
                 return ( -1 );
             }
             struct timespec req, rem;
@@ -449,6 +581,8 @@ int CNameServer::SockCreate( void )
             nanosleep( &req, &rem );
         }
         close( sock );
+        TRACE_EXIT;
+        return( -1 );
     }
 
     if ( trace_settings & TRACE_NS )
@@ -470,8 +604,9 @@ int CNameServer::SockCreate( void )
         int err = errno;
         sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n"
                , method_name, err, strerror(err) );
-        mon_log_write(MON_NAMESERVER_MKCLTSOCK_5, SQ_LOG_ERR, la_buf );
+        mon_log_write(NAMESERVER_CLIENTSOCKCREATE_5, SQ_LOG_ERR, la_buf );
         close( sock );
+        TRACE_EXIT;
         return ( -2 );
     }
 
@@ -481,8 +616,9 @@ int CNameServer::SockCreate( void )
         int err = errno;
         sprintf( la_buf, "[%s], setsockopt() failed! errno=%d (%s)\n"
                , method_name, err, strerror(err) );
-        mon_log_write(MON_NAMESERVER_MKCLTSOCK_6, SQ_LOG_ERR, la_buf ); 
+        mon_log_write(NAMESERVER_CLIENTSOCKCREATE_6, SQ_LOG_ERR, la_buf ); 
         close( sock );
+        TRACE_EXIT;
         return ( -2 );
     }
 
@@ -490,6 +626,19 @@ int CNameServer::SockCreate( void )
     return ( sock );
 }
 
+void CNameServer::NameServerExited( void ) // Discards the cached name server host, port, startup flag and socket.
+{
+    const char method_name[] = "CNameServer::NameServerExited";
+    TRACE_ENTRY;
+
+    mon2nsHost_[0] = '\0'; // empty host makes ConnectToNs() call ChooseNextNs() again
+    mon2nsPort_[0] = '\0'; // empty port makes ConnectToNs() call GetM2NPort() again
+    nsStartupComplete_ = false;
+    SockClose(); // closes mon2nsSock_ if it is open and resets it to -1
+
+    TRACE_EXIT;
+}
+
 int CNameServer::NameServerStop( struct message_def* msg )
 {
     const char method_name[] = "CNameServer::NameServerStop";
@@ -599,9 +748,6 @@ int CNameServer::ProcessNew(CProcess* process )
     msgnew->unhooked = process->IsUnhooked();
     msgnew->event_messages = process->IsEventMessages();
     msgnew->system_messages = process->IsSystemMessages();
-//    msgnew->pathStrId = process->pathStrId();
-//    msgnew->ldpathStrId = process->ldPathStrId();
-//    msgnew->programStrId = process->programStrId();
     strcpy( msgnew->path, process->path() );
     strcpy( msgnew->ldpath, process->ldpath() );
     strcpy( msgnew->program, process->program() );
@@ -682,6 +828,45 @@ int CNameServer::ProcessNew(CProcess* process )
     return error;
 }
 
+int CNameServer::ProcessNodeDown( int nid, char *nodeName ) // Sends a ReqType_NodeDown request for nid/nodeName to the name server.
+{
+    const char method_name[] = "CNameServer::ProcessNodeDown";
+    TRACE_ENTRY;
+
+    int error = 0; // returned as-is (0) when no request is sent
+    CProcess *process = MyNode->GetProcessByType( ProcessType_NameServer );
+    if (process) // request is sent only if MyNode->GetProcessByType() returned a name server process
+    {
+        struct message_def msg;
+        memset(&msg, 0, sizeof(msg) ); // TODO: remove!
+        msg.type = MsgType_Service;
+        msg.noreply = false;
+        msg.reply_tag = seqNum_++; // each request consumes the next sequence number
+        msg.u.request.type = ReqType_NodeDown;
+        struct NodeDown_def *msgdown = &msg.u.request.u.down;
+        msgdown->nid = nid;
+        strcpy( msgdown->node_name, nodeName ); // NOTE(review): unbounded copy; assumes nodeName fits node_name - confirm
+        msgdown->takeover = 0;
+        msgdown->reason[0] = 0; // empty reason string
+    
+        if ( trace_settings & TRACE_NS )
+        {
+            trace_printf( "%s@%d - sending node-down request to 
nameserver=%s:%s\n"
+                          "        msg.down.nid=%d\n"
+                          "        msg.down.node_name=%s\n"
+                        , method_name, __LINE__
+                        , mon2nsHost_, mon2nsPort_ 
+                        , msgdown->nid
+                        , msgdown->node_name );
+        }
+
+        error = SendReceive(&msg ); // result of the request/reply exchange becomes the return value
+    }
+
+    TRACE_EXIT;
+    return error;
+}
+
 int CNameServer::ProcessShutdown( void )
 {
     const char method_name[] = "CNameServer::ProcessShutdown";
@@ -696,7 +881,6 @@ int CNameServer::ProcessShutdown( void )
     struct ShutdownNs_def *msgshutdown = &msg.u.request.u.shutdown_ns;
     msgshutdown->nid = -1;
     msgshutdown->pid = -1;
-    //msgshutdown->level = msgIn->u.request.u.shutdown.level;
     msgshutdown->level = ShutdownLevel_Normal;
 
     int error = SendReceive(&msg );
@@ -711,16 +895,20 @@ int CNameServer::ProcessShutdown( void )
 int CNameServer::SendReceive( struct message_def* msg )
 {
     const char method_name[] = "CNameServer::SendReceive";
+    TRACE_ENTRY;
+
+    int retryCount = 0;
     char desc[256];
     char* descp;
-    struct DelProcessNs_def *msgdel;
-    struct NewProcessNs_def *msgnew;
-    struct ShutdownNs_def *msgshutdown;
-    struct NameServerStart_def *msgstart;
-    struct NameServerStop_def *msgstop;
-    struct ProcessInfo_def *msginfo;
-
-    TRACE_ENTRY;
+    struct DelProcessNs_def* msgdel;
+    struct NameServerStart_def* msgstart;
+    struct NameServerStop_def* msgstop;
+    struct NewProcessNs_def* msgnew;
+    struct NodeDown_def* msgdown;
+    struct ProcessInfo_def* msginfo;
+    struct ShutdownNs_def* msgshutdown;
+    struct message_def msg_reply;
+    struct message_def* pmsg_reply = &msg_reply;
 
     descp = desc;
     int size = offsetof(struct message_def, u.request.u);
@@ -750,6 +938,13 @@ int CNameServer::SendReceive( struct message_def* msg )
                 msgnew->nid, msgnew->pid, msgnew->verifier, 
msgnew->process_name );
         size += sizeof(msg->u.request.u.new_process_ns);
         break;
+    case ReqType_NodeDown:
+        msgdown = &msg->u.request.u.down;
+        sprintf( desc, "node-down (nid=%d, node-name=%s, takeover=%d, 
reason=%s)",
+                msgdown->nid, msgdown->node_name,
+                msgdown->takeover, msgdown->reason );
+        size += sizeof(msg->u.request.u.down);
+        break;
     case ReqType_ProcessInfo:
         msginfo = &msg->u.request.u.process_info;
         sprintf( desc, "process-info (nid=%d, pid=%d, verifier=%d, name=%s)\n"
@@ -774,7 +969,7 @@ int CNameServer::SendReceive( struct message_def* msg )
         break;
     case ReqType_ShutdownNs:
         msgshutdown = &msg->u.request.u.shutdown_ns;
-        sprintf( desc, "shutdown (nid=%d, pid=%d, level=%d)",
+        sprintf( desc, "shutdown-ns (nid=%d, pid=%d, level=%d)",
                 msgshutdown->nid, msgshutdown->pid, msgshutdown->level );
         size += sizeof(msg->u.request.u.shutdown_ns);
         break;
@@ -783,13 +978,16 @@ int CNameServer::SendReceive( struct message_def* msg )
         break;
     }
 
+retryIO:
+
     int error = SendToNs( descp, msg, size );
     if ( error == 0 )
         error = SockReceive( (char *) &size, sizeof(size ) );
     if ( error == 0 )
-        error = SockReceive( (char *) msg, size );
+        error = SockReceive( (char *) pmsg_reply, size );
     if ( error == 0 )
     {
+        memcpy( msg, pmsg_reply, size );
         if ( trace_settings & ( TRACE_NS | TRACE_PROCESS ) )
         {
             char desc[2048];
@@ -827,7 +1025,6 @@ int CNameServer::SendReceive( struct message_def* msg )
                          msg->u.reply.u.process_info.more_data );
                 break;
             case ReplyType_ProcessInfoNs:
-//                int argvLen = sizeof(msg->u.reply.u.process_info_ns.argv);
                 sprintf( desc, 
                          "process-info-ns reply:\n"
                          "        process_info_ns.nid=%d\n"
@@ -847,18 +1044,14 @@ int CNameServer::SendReceive( struct message_def* msg )
                          "        process_info_ns.path=%s\n"
                          "        process_info_ns.ldpath=%s\n"
                          "        process_info_ns.program=%s\n"
-//                         "        process_info_ns.pathStrId=%d:%d\n"
-//                         "        process_info_ns.ldpathStrId=%d:%d\n"
-//                         "        process_info_ns.programStrId=%d:%d\n"
                          "        process_info_ns.port_name=%s\n"
                          "        process_info_ns.argc=%d\n"
-//                         "        process_info_ns.argv=[%.*s]\n"
                          "        process_info_ns.infile=%s\n"
                          "        process_info_ns.outfile=%s\n"
-//#if 0
-//                         "        process_info_ns.creation_time=%ld(secs)\n",
-//                         "        
process_info_ns.creation_time=%ld(secs):%ld(nsecs)\n",
-//#endif
+#if 0
+                         "        process_info_ns.creation_time=%ld(secs)\n",
+                         "        
process_info_ns.creation_time=%ld(secs):%ld(nsecs)\n",
+#endif
                          "        process_info_ns.return_code=%d"
                          , msg->u.reply.u.process_info_ns.nid
                          , msg->u.reply.u.process_info_ns.pid
@@ -877,21 +1070,14 @@ int CNameServer::SendReceive( struct message_def* msg )
                          , msg->u.reply.u.process_info_ns.path
                          , msg->u.reply.u.process_info_ns.ldpath
                          , msg->u.reply.u.process_info_ns.program
-//                         , msg->u.reply.u.process_info_ns.pathStrId.nid
-//                         , msg->u.reply.u.process_info_ns.pathStrId.id
-//                         , msg->u.reply.u.process_info_ns.ldpathStrId.nid
-//                         , msg->u.reply.u.process_info_ns.ldpathStrId.id
-//                         , msg->u.reply.u.process_info_ns.programStrId.nid
-//                         , msg->u.reply.u.process_info_ns.programStrId.id
                          , msg->u.reply.u.process_info_ns.port_name
                          , msg->u.reply.u.process_info_ns.argc
-//                         , &msg->u.reply.u.process_info_ns.argv
                          , msg->u.reply.u.process_info_ns.infile
                          , msg->u.reply.u.process_info_ns.outfile
-//#if 0
-//                         , 
msg->u.reply.u.process_info_ns.creation_time.tv_sec
-//                         , 
msg->u.reply.u.process_info_ns.creation_time.tv_nsec
-//#endif
+#if 0
+                         , msg->u.reply.u.process_info_ns.creation_time.tv_sec
+                         , msg->u.reply.u.process_info_ns.creation_time.tv_nsec
+#endif
                          , msg->u.reply.u.process_info_ns.return_code );
                 break;
             default:
@@ -905,7 +1091,20 @@ int CNameServer::SendReceive( struct message_def* msg )
                         );
         }
     }
-    else
+    else if ( error != -2 && retryCount < NAMESERVER_IO_RETRIES )
+    {
+        retryCount++;
+        if ( trace_settings & TRACE_NS )
+        {
+            trace_printf( "%s@%d - retrying IO (%d) to nameserver=%s:%s\n"
+                        , method_name, __LINE__
+                        , retryCount
+                        , mon2nsHost_, mon2nsPort_ );
+        }
+        goto retryIO;
+    }
+
+    if ( error )
     {
         // create a synthetic reply
         msg->u.reply.u.generic.nid = -1;
@@ -943,9 +1142,10 @@ int CNameServer::SendToNs( const char *reqType, struct 
message_def *msg, int siz
 
     if ( trace_settings & TRACE_NS )
     {
-        trace_printf( "%s@%d - sending %s REQ to nameserver=%s:%s, sock=%d, 
shutdown=%d\n"
+        trace_printf( "%s@%d - sending %s\tREQ (size=%d) to nameserver=%s:%s, 
sock=%d, shutdown=%d\n"
                     , method_name, __LINE__
                     , reqType
+                    , size
                     , mon2nsHost_
                     , mon2nsPort_
                     , mon2nsSock_ 
@@ -967,15 +1167,72 @@ int CNameServer::SendToNs( const char *reqType, struct 
message_def *msg, int siz
             error = ConnectToNs( &retry );
         }
     }
+
     if ( error == 0 )
+    {
         error = SockSend( (char *) &size, sizeof(size) );
-    if ( error == 0 )
-        error = SockSend( (char *) msg, size );
+        if (error)
+        {
+            int err = error;
+            char buf[MON_STRING_BUF_SIZE];
+            snprintf( buf, sizeof(buf)
+                    , "[%s], unable to send %s request size %d to "
+                      "nameserver=%s:%s, error: %d(%s)\n"
+                    , method_name, reqType, size, mon2nsHost_, mon2nsPort_, 
err, strerror(err) );
+            mon_log_write(NAMESERVER_SENDTONS_1, SQ_LOG_ERR, buf);    
+        }
+        else
+        {
+            error = SockSend( (char *) msg, size );
+            if (error)
+            {
+                int err = error;
+                char buf[MON_STRING_BUF_SIZE];
+                snprintf( buf, sizeof(buf)
+                        , "[%s], unable to send %s request to "
+                          "nameserver=%s:%s, error: %d(%s)\n"
+                        , method_name, reqType,  mon2nsHost_, mon2nsPort_, 
err, strerror(err) );
+                mon_log_write(NAMESERVER_SENDTONS_2, SQ_LOG_ERR, buf);    
+            }
+        }
+    }
 
     TRACE_EXIT;
     return error;
 }
 
+void CNameServer::SetLocalHost( void ) // Stores this machine's host name in mon2nsHost_.
+{
+    gethostname( mon2nsHost_, MAX_PROCESSOR_NAME ); // NOTE(review): return value is not checked; on failure mon2nsHost_ may be left unset - confirm
+}
+
+void CNameServer::SetShutdown( bool shutdown ) // Records the shutdown flag, tracing the new value when TRACE_NS is on.
+{
+    const char method_name[] = "CNameServer::SetShutdown";
+    TRACE_ENTRY;
+
+    if ( trace_settings & TRACE_NS )
+        trace_printf( "%s@%d - set shutdown_=%d\n"
+                    , method_name, __LINE__, shutdown );
+    shutdown_ = shutdown; // while set, ConnectToNs() sets err to -1 and skips creating a socket
+
+    TRACE_EXIT;
+}
+
+void CNameServer::SockClose( void ) // Closes the monitor-to-name-server socket if one is open.
+{
+    const char method_name[] = "CNameServer::SockClose";
+    TRACE_ENTRY;
+
+    if (mon2nsSock_ != -1) // -1 means no open socket, so repeated calls do nothing
+    {
+        close( mon2nsSock_ );
+        mon2nsSock_ = -1; // mark the socket as closed
+    }
+
+    TRACE_EXIT;
+}
+
 int CNameServer::SockReceive( char *buf, int size )
 {
     const char method_name[] = "CNameServer::SockReceive";
@@ -1045,9 +1302,29 @@ int CNameServer::SockReceive( char *buf, int size )
                     , error, strerror(error) );
     }
 
-    if ( error )
+    if (error)
+    {
         SockClose();
 
+        int err = error;
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], unable to receive request size %d to "
+                  "nameserver=%s:%s, error: %d(%s)\n"
+                , method_name, size, mon2nsHost_, mon2nsPort_, err, 
strerror(err) );
+        mon_log_write(NAMESERVER_SOCKRECEIVE_1, SQ_LOG_ERR, buf);    
+
+        // Choose another name server on IO retry
+        if (IsRealCluster)
+        {
+            mon2nsHost_[0] = 0;
+        }
+        else
+        {
+            mon2nsPort_[0] = 0;
+        }
+    }
+
     TRACE_EXIT;
     return error;
 }
@@ -1112,9 +1389,28 @@ int CNameServer::SockSend( char *buf, int size )
                     , error, strerror(error) );
     }
 
-    if ( error )
+    if (error)
+    {
         SockClose();
 
+        int err = error;
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf( buf, sizeof(buf)
+                , "[%s], unable to send request size %d to "
+                  "nameserver=%s:%s, error: %d(%s)\n"
+                , method_name, size, mon2nsHost_, mon2nsPort_, err, 
strerror(err) );
+        mon_log_write(NAMESERVER_SOCKSEND_1, SQ_LOG_ERR, buf);    
+        // Choose another name server on IO retry
+        if (IsRealCluster)
+        {
+            mon2nsHost_[0] = 0;
+        }
+        else
+        {
+            mon2nsPort_[0] = 0;
+        }
+    }
+
     TRACE_EXIT;
     return error;
 }

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nameserver.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nameserver.h 
b/core/sqf/monitor/linux/nameserver.h
index a8ccb4b..009eced 100644
--- a/core/sqf/monitor/linux/nameserver.h
+++ b/core/sqf/monitor/linux/nameserver.h
@@ -40,12 +40,15 @@ public:
     CNameServer( void );
     virtual ~CNameServer( void );
 
+    bool IsNameServerConfigured( int pnid );
+    void NameServerExited( void );
     int  NameServerStop( struct message_def* msg );
     int  ProcessDelete(CProcess* process );
     int  ProcessInfo( struct message_def* msg );
     int  ProcessInfoCont( struct message_def* msg );
     int  ProcessInfoNs( struct message_def* msg );
     int  ProcessNew(CProcess* process );
+    int  ProcessNodeDown( int nid, char* nodeName );
     int  ProcessShutdown( void );
     void SetLocalHost( void );
 
@@ -53,21 +56,20 @@ private:
     char mon2nsHost_[MAX_PROCESSOR_NAME];
     char mon2nsPort_[10];
     int  mon2nsSock_;
-    int  nsConfigInx_;
     bool nsStartupComplete_;
     int  seqNum_;
     bool shutdown_;
 
-    void ChooseNextNs( void );
-    int  ConnectToNs( bool *retry );
-    void GetM2NPort( int PNid );
+    int  ChooseNextNs( void );
+    int  ClientSockCreate();
+    int  ConnectToNs( bool* retry );
+    int  GetM2NPort( int PNid );
     int  SendReceive( struct message_def* msg );
-    int  SendToNs( const char *reqType, struct message_def *msg, int size );
+    int  SendToNs( const char* reqType, struct message_def* msg, int size );
     void SetShutdown( bool shutdown );
     void SockClose( void );
-    int  SockCreate();
-    int  SockReceive( char *buf, int size );
-    int  SockSend( char *buf, int size );
+    int  SockReceive( char* buf, int size );
+    int  SockSend( char* buf, int size );
 };
 
 #endif

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nscommacceptmon.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nscommacceptmon.cxx 
b/core/sqf/monitor/linux/nscommacceptmon.cxx
index edddca7..0857cc9 100644
--- a/core/sqf/monitor/linux/nscommacceptmon.cxx
+++ b/core/sqf/monitor/linux/nscommacceptmon.cxx
@@ -155,6 +155,36 @@ void CCommAcceptMon::monReqNameServerStop( struct 
message_def* msg, int sockFd )
     TRACE_EXIT;
 }
 
+void CCommAcceptMon::monReqNodeDown( struct message_def* msg, int sockFd ) // Traces a monitor node-down request and places it on ReqQueue.
+{
+    const char method_name[] = "CCommAcceptMon::monReqNodeDown";
+    TRACE_ENTRY;
+
+    if ( trace_settings & ( TRACE_NS | TRACE_REQUEST) )
+    {
+        trace_printf( "%s@%d - Received monitor node-down request.\n"
+                      "        msg.down.nid=%d\n"
+                      "        msg.down.node_name=%s\n"
+                      "        msg.down.takeover=%d\n"
+                      "        msg.down.reason=%s\n"
+                    , method_name, __LINE__
+                    , msg->u.request.u.down.nid
+                    , msg->u.request.u.down.node_name
+                    , msg->u.request.u.down.takeover
+                    , msg->u.request.u.down.reason
+                    );
+    }
+
+    CExternalReq::reqQueueMsg_t msgType;
+    msgType = CExternalReq::NonStartupMsg;
+    int nid = msg->u.request.u.down.nid; // nid of the node reported down, taken from the request
+    int pid = -1; // -1 is passed as the pid for this request
+    // Place new request on request queue
+    ReqQueue.enqueueReq(msgType, nid, pid, sockFd, msg); // sockFd is passed along with the request
+
+    TRACE_EXIT;
+}
+
 void CCommAcceptMon::monReqProcessInfo( struct message_def* msg, int sockFd )
 {
     const char method_name[] = "CCommAcceptMon::monReqProcessInfo";
@@ -412,12 +442,12 @@ void CCommAcceptMon::monReqUnknown( struct message_def* 
msg, int sockFd )
 void CCommAcceptMon::processMonReqs( int sockFd )
 {
     const char method_name[] = "CCommAcceptMon::processMonReqs";
+    TRACE_ENTRY;
+
     int rc;
     nodeId_t nodeId;
     struct message_def msg;
 
-    TRACE_ENTRY;
-
     if ( trace_settings & ( TRACE_NS ) )
     {
         trace_printf( "%s@%d - Accepted connection sock=%d\n"
@@ -435,7 +465,7 @@ void CCommAcceptMon::processMonReqs( int sockFd )
         char buf[MON_STRING_BUF_SIZE];
         snprintf(buf, sizeof(buf), "[%s], unable to obtain node id from new "
                  "monitor: %s.\n", method_name, ErrorMsg(rc));
-        mon_log_write(NS_COMMACCEPT_2, SQ_LOG_ERR, buf);
+        mon_log_write(NS_COMMACCEPT_PROCESSMONREQS_1, SQ_LOG_ERR, buf);
         return;
     }
 
@@ -462,6 +492,36 @@ void CCommAcceptMon::processMonReqs( int sockFd )
                     , nodeId.ping );
     }
 
+    CNode  *node;
+    node = Nodes->GetNode( nodeId.pnid );
+    if ( node != NULL )
+    {
+        if ( node->GetState() != State_Up )
+        {
+            if ( trace_settings & ( TRACE_NS ) )
+            {
+                trace_printf( "%s@%d - Bringing node up, node=%s, pnid=%d\n"
+                            , method_name, __LINE__
+                            , node->GetName(), node->GetPNid() );
+            }
+            rc = Monitor->HardNodeUpNs( node->GetPNid() );
+            if ( rc )
+            {   // Handle error
+                close( sockFd );
+                return;
+            }
+        }
+    }
+    else
+    {   // Handle error
+        close( sockFd );
+        char buf[MON_STRING_BUF_SIZE];
+        snprintf(buf, sizeof(buf), "[%s], invalid physical node id, "
+                 "pnid: %d\n", method_name, nodeId.pnid );
+        mon_log_write(NS_COMMACCEPT_PROCESSMONREQS_2, SQ_LOG_ERR, buf);
+        return;
+    }
+
     strcpy(nodeId.nodeName, MyNode->GetName());
     strcpy(nodeId.commPort, MyNode->GetCommPort());
     strcpy(nodeId.syncPort, MyNode->GetSyncPort());
@@ -504,7 +564,7 @@ void CCommAcceptMon::processMonReqs( int sockFd )
         char buf[MON_STRING_BUF_SIZE];
         snprintf(buf, sizeof(buf), "[%s], unable to send node id from new "
                  "monitor: %s.\n", method_name, ErrorMsg(rc));
-        mon_log_write(NS_COMMACCEPT_3, SQ_LOG_ERR, buf);
+        mon_log_write(NS_COMMACCEPT_PROCESSMONREQS_3, SQ_LOG_ERR, buf);
         return;
     }
 
@@ -517,9 +577,9 @@ void CCommAcceptMon::processMonReqs( int sockFd )
         {   // Handle error
             close( sockFd );
             char buf[MON_STRING_BUF_SIZE];
-            snprintf(buf, sizeof(buf), "[%s], unable to obtain node id from new "
+            snprintf(buf, sizeof(buf), "[%s], unable to obtain message size from "
                      "monitor: %s.\n", method_name, ErrorMsg(rc));
-            mon_log_write(NS_COMMACCEPT_4, SQ_LOG_ERR, buf);
+            mon_log_write(NS_COMMACCEPT_PROCESSMONREQS_4, SQ_LOG_ERR, buf);
             return;
         }
 
@@ -528,9 +588,9 @@ void CCommAcceptMon::processMonReqs( int sockFd )
         {   // Handle error
             close( sockFd );
             char buf[MON_STRING_BUF_SIZE];
-            snprintf(buf, sizeof(buf), "[%s], unable to obtain node id from new "
+            snprintf(buf, sizeof(buf), "[%s], unable to obtain message from "
                      "monitor: %s.\n", method_name, ErrorMsg(rc));
-            mon_log_write(NS_COMMACCEPT_5, SQ_LOG_ERR, buf);
+            mon_log_write(NS_COMMACCEPT_PROCESSMONREQS_5, SQ_LOG_ERR, buf);
             return;
         }
         if ( trace_settings & ( TRACE_NS ) )
@@ -591,6 +651,10 @@ void CCommAcceptMon::processMonReqs( int sockFd )
             monReqNameServerStop(&msg, sockFd);
             break;
 
+        case ReqType_NodeDown:
+            monReqNodeDown(&msg, sockFd);
+            break;
+
         case ReqType_ProcessInfo:
             monReqProcessInfo(&msg, sockFd);
             break;
@@ -663,9 +727,9 @@ void CCommAcceptMon::processNewSock( int joinFd )
     if (rc != 0)
     {
         char buf[MON_STRING_BUF_SIZE];
-        snprintf(buf, sizeof(buf), "[%s], thread create error=%d\n",
+        snprintf(buf, sizeof(buf), "[%s], mon2nsProcess thread create error=%d\n",
                  method_name, rc);
-        mon_log_write(NS_COMMACCEPT_6, SQ_LOG_ERR, buf);
+        mon_log_write(NS_COMMACCEPT_PROCESSNEWSOCK_1, SQ_LOG_ERR, buf);
     }
 
     TRACE_EXIT;
@@ -743,7 +807,7 @@ void CCommAcceptMon::commAcceptorSock()
             char buf[MON_STRING_BUF_SIZE];
             snprintf(buf, sizeof(buf), "[%s], cannot accept new monitor: %s.\n",
                      method_name, strerror(errno));
-            mon_log_write(NS_COMMACCEPT_7, SQ_LOG_ERR, buf);
+            mon_log_write(NS_COMMACCEPT_COMMACCEPTORSOCK_1, SQ_LOG_ERR, buf);
 
         }
         else
@@ -800,7 +864,7 @@ static void *mon2nsAcceptMon(void *arg)
         char buf[MON_STRING_BUF_SIZE];
         snprintf(buf, sizeof(buf), "[%s], pthread_sigmask error=%d\n",
                  method_name, rc);
-        mon_log_write(NS_COMMACCEPT_8, SQ_LOG_ERR, buf);
+        mon_log_write(NS_COMMACCEPT_MON2NSACCEPTMON_1, SQ_LOG_ERR, buf);
     }
 
     // Enter thread processing loop
@@ -830,7 +894,7 @@ static void *mon2nsProcess(void *arg)
         char buf[MON_STRING_BUF_SIZE];
         snprintf(buf, sizeof(buf), "[%s], pthread_sigmask error=%d\n",
                  method_name, rc);
-        mon_log_write(NS_COMMACCEPT_9, SQ_LOG_ERR, buf);
+        mon_log_write(NS_COMMACCEPT_MON2NSPROCESS_1, SQ_LOG_ERR, buf);
     }
 
     MyNode->AddMonConnCount(1);
@@ -858,7 +922,7 @@ void CCommAcceptMon::start()
         char buf[MON_STRING_BUF_SIZE];
         snprintf(buf, sizeof(buf), "[%s], thread create error=%d\n",
                  method_name, rc);
-        mon_log_write(NS_COMMACCEPT_10, SQ_LOG_ERR, buf);
+        mon_log_write(NS_COMMACCEPT_START_1, SQ_LOG_ERR, buf);
     }
 
     TRACE_EXIT;

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nscommacceptmon.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nscommacceptmon.h b/core/sqf/monitor/linux/nscommacceptmon.h
index 41b2c9b..1749b16 100644
--- a/core/sqf/monitor/linux/nscommacceptmon.h
+++ b/core/sqf/monitor/linux/nscommacceptmon.h
@@ -46,6 +46,7 @@ public:
     void monReqExec( CExternalReq * request );
     void monReqNameServerStop( struct message_def* msg, int sockFd );
     void monReqNewProcess( struct message_def* msg, int sockFd );
+    void monReqNodeDown( struct message_def* msg, int sockFd );
     void monReqProcessInfo( struct message_def* msg, int sockFd );
     void monReqProcessInfoCont( struct message_def* msg, int sockFd );
     void monReqProcessInfoNs( struct message_def* msg, int sockFd );
@@ -68,9 +69,9 @@ private:
     bool accepting_;
     bool shutdown_;
 
-    // commAccept thread's id
+    // mon2nsAcceptMon thread's id
     pthread_t                      thread_id_;
-    // commAccept thread's id
+    // mon2nsProcess thread's id
     pthread_t                      process_thread_id_;
 
     enum { HEURISTIC_COUNT = 10 };

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nsreqprocinfons.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nsreqprocinfons.cxx b/core/sqf/monitor/linux/nsreqprocinfons.cxx
index aa53437..37c09f6 100644
--- a/core/sqf/monitor/linux/nsreqprocinfons.cxx
+++ b/core/sqf/monitor/linux/nsreqprocinfons.cxx
@@ -222,18 +222,21 @@ void CExtProcInfoNsReq::performRequest()
 
     if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
     {
-        trace_printf( "%s@%d request #%ld: ProcessInfoNs, for (%d, %d:%d), "
+        trace_printf( "%s@%d request #%ld: ProcessInfoNs, for %s (%d, %d:%d), "
                       "process type=%s\n"
                     , method_name, __LINE__, id_
-                    , target_nid, target_pid, target_verifier
+                    , target_process_name.c_str(), target_nid, target_pid, target_verifier
                     , ProcessTypeString(target_type));
     }
 
     if (target_process_name.size())
     { // find by name (don't check node state, don't check process state, not backup)
-        process = Nodes->GetProcess( target_process_name.c_str()
-                                   , target_verifier
-                                   , false, false, false );
+        if (msg_->u.request.u.process_info.target_process_name[0] == '$' )
+        {
+            process = Nodes->GetProcess( target_process_name.c_str()
+                                       , target_verifier
+                                       , false, false, false );
+        }
     }
     else
     {

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nsreqshutdown.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nsreqshutdown.cxx b/core/sqf/monitor/linux/nsreqshutdown.cxx
index e5888d2..63d9400 100644
--- a/core/sqf/monitor/linux/nsreqshutdown.cxx
+++ b/core/sqf/monitor/linux/nsreqshutdown.cxx
@@ -30,12 +30,10 @@
 #include "montrace.h"
 #include "monsonar.h"
 #include "monlogging.h"
-#include "replicate.h"
 
 extern CMonStats *MonStats;
 extern CNode *MyNode;
 extern CNodeContainer *Nodes;
-extern CReplicate Replicator;
 
 CExtShutdownNsReq::CExtShutdownNsReq (reqQueueMsg_t msgType,
                                       int nid, int pid, int sockFd,
@@ -43,7 +41,7 @@ CExtShutdownNsReq::CExtShutdownNsReq (reqQueueMsg_t msgType,
     : CExternalReq(msgType, nid, pid, sockFd, msg)
 {
     // Add eyecatcher sequence as a debugging aid
-    memcpy(&eyecatcher_, "RQER", 4); // TODO
+    memcpy(&eyecatcher_, "RqER", 4);
 
     priority_    = High;
 }
@@ -51,7 +49,7 @@ CExtShutdownNsReq::CExtShutdownNsReq (reqQueueMsg_t msgType,
 CExtShutdownNsReq::~CExtShutdownNsReq()
 {
     // Alter eyecatcher sequence as a debugging aid to identify deleted object
-    memcpy(&eyecatcher_, "rqer", 4); // TODO
+    memcpy(&eyecatcher_, "rQer", 4);
 }
 
 void CExtShutdownNsReq::populateRequestString( void )

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/nsreqstop.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/nsreqstop.cxx b/core/sqf/monitor/linux/nsreqstop.cxx
index 4ff46ce..a20e9bd 100644
--- a/core/sqf/monitor/linux/nsreqstop.cxx
+++ b/core/sqf/monitor/linux/nsreqstop.cxx
@@ -90,7 +90,7 @@ void CExtNameServerStopNsReq::performRequest()
         int nid = atoi( msg_->u.request.u.nameserver_stop.node_name );
         node = Nodes->GetLNode( nid )->GetNode();
     }
-    Monitor->HardNodeDown( node->GetPNid(), true );
+    Monitor->HardNodeDownNs( node->GetPNid() );
 
     char la_buf[MON_STRING_BUF_SIZE*2];
     snprintf( la_buf, sizeof(la_buf)

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/pnode.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/pnode.cxx b/core/sqf/monitor/linux/pnode.cxx
index 4a4b8c4..364837b 100644
--- a/core/sqf/monitor/linux/pnode.cxx
+++ b/core/sqf/monitor/linux/pnode.cxx
@@ -1065,11 +1065,11 @@ strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode, bool clone )
                 !MyNode->IsMyNode(targetLNode->GetNid()))
             {
                 // Forward the unique string to the target node
-                int rc = PtpClient->AddUniqStr( id.nid
-                                              , id.id
-                                              , candidate
-                                              , targetLNode->GetNid()
-                                              , targetLNode->GetNode()->GetName() );
+                int rc = PtpClient->ProcessAddUniqStr( id.nid
+                                                     , id.id
+                                                     , candidate
+                                                     , targetLNode->GetNid()
+                                                     , targetLNode->GetNode()->GetName() );
                 if (rc)
                 {
                     char la_buf[MON_STRING_BUF_SIZE];
@@ -1110,11 +1110,11 @@ strId_t CNode::GetStringId( char *candidate, CLNode *targetLNode, bool clone )
                 !MyNode->IsMyNode(targetLNode->GetNid()))
             {
                 // Forward the unique string to the target node
-                int rc = PtpClient->AddUniqStr( id.nid
-                                              , id.id
-                                              , candidate
-                                              , targetLNode->GetNid()
-                                              , targetLNode->GetNode()->GetName());
+                int rc = PtpClient->ProcessAddUniqStr( id.nid
+                                                     , id.id
+                                                     , candidate
+                                                     , targetLNode->GetNid()
+                                                     , targetLNode->GetNode()->GetName());
                 if (rc)
                 {
                     char la_buf[MON_STRING_BUF_SIZE];
@@ -1240,6 +1240,16 @@ void CNode::StartNameServerProcess( void )
     const char method_name[] = "CNode::StartNameServerProcess";
     TRACE_ENTRY;
 
+    if ( !NameServer->IsNameServerConfigured( MyPNID ) )
+    {
+        if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+        {
+            trace_printf( "%s@%d" " - NameServer is not configured in my node\n"
+                        , method_name, __LINE__);
+        }
+        return;
+    }
+
     char path[MAX_SEARCH_PATH];
     char *ldpath = NULL; // = getenv("LD_LIBRARY_PATH");
     char filename[MAX_PROCESS_PATH];
@@ -1250,7 +1260,9 @@ void CNode::StartNameServerProcess( void )
     snprintf( stdout, sizeof(stdout), "stdout_TNS%d", MyNode->GetZone() );
 
     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-       trace_printf("%s@%d" " - Creating NameService Process\n", method_name, __LINE__);
+    {
+        trace_printf("%s@%d" " - Creating NameServer Process\n", method_name, __LINE__);
+    }
 
     strcpy(path,getenv("PATH"));
     strcat(path,":");
@@ -1281,12 +1293,14 @@ void CNode::StartNameServerProcess( void )
     if ( NameServerProcess )
     {
         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
-           trace_printf("%s@%d" " - NameService Process created\n", method_name, __LINE__);
+        {
+            trace_printf("%s@%d" " - NameServer Process created\n", method_name, __LINE__);
+        }
     }
     else
     {
         char la_buf[MON_STRING_BUF_SIZE];
-        sprintf(la_buf, "[%s], NameService Process creation failed.\n", method_name);
+        sprintf(la_buf, "[%s], NameServer Process creation failed.\n", method_name);
         mon_log_write( MON_NODE_STARTNAMESERVER_1, SQ_LOG_ERR, la_buf );
     }
 
@@ -2556,11 +2570,27 @@ CProcess *CNodeContainer::CloneProcessNs( int nid
             }
             else
             {
-                char buf[MON_STRING_BUF_SIZE];
-                snprintf( buf, sizeof(buf),
-                          "[%s] ProcessInfo failed, rc=%d\n"
-                        , method_name, msg.u.reply.u.process_info_ns.return_code );
-                mon_log_write( MON_NODE_CLONEPROCESSNS_1, SQ_LOG_ERR, buf );
+                if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+                {
+                   trace_printf( "%s@%d - ProcessInfoNs(%d, %d:%d) -- can't find target process\n"
+                               , method_name, __LINE__
+                               , msg.u.reply.u.process_info_ns.nid
+                               , msg.u.reply.u.process_info_ns.pid
+                               , msg.u.reply.u.process_info_ns.verifier);
+                }
+
+                if ( msg.u.reply.u.process_info_ns.return_code != MPI_ERR_NAME )
+                {
+                    char buf[MON_STRING_BUF_SIZE];
+                    snprintf( buf, sizeof(buf),
+                              "[%s] ProcessInfo(%d, %d:%d) failed, rc=%d\n"
+                            , method_name
+                            , msg.u.reply.u.process_info_ns.nid
+                            , msg.u.reply.u.process_info_ns.pid
+                            , msg.u.reply.u.process_info_ns.verifier
+                            , msg.u.reply.u.process_info_ns.return_code );
+                    mon_log_write( MON_NODE_CLONEPROCESSNS_1, SQ_LOG_ERR, buf );
+                }
             }
         }
         else
@@ -2625,11 +2655,25 @@ CProcess *CNodeContainer::CloneProcessNs( const char *name, Verifier_t verifier
             }
             else
             {
-                char buf[MON_STRING_BUF_SIZE];
-                snprintf( buf, sizeof(buf),
-                          "[%s] ProcessInfo failed, rc=%d\n"
-                        , method_name, msg.u.reply.u.process_info_ns.return_code );
-                mon_log_write( MON_NODE_CLONEPROCESSNS_4, SQ_LOG_ERR, buf );
+                if (trace_settings & (TRACE_REQUEST | TRACE_PROCESS))
+                {
+                   trace_printf( "%s@%d - ProcessInfoNs(%s:%d) -- can't find target process\n"
+                               , method_name, __LINE__
+                               , msg.u.reply.u.process_info_ns.process_name
+                               , msg.u.reply.u.process_info_ns.verifier);
+                }
+
+                if ( msg.u.reply.u.process_info_ns.return_code != MPI_ERR_NAME )
+                {
+                    char buf[MON_STRING_BUF_SIZE];
+                    snprintf( buf, sizeof(buf),
+                              "[%s] ProcessInfo(%s:%d) failed, rc=%d\n"
+                            , method_name
+                            , msg.u.reply.u.process_info_ns.process_name
+                            , msg.u.reply.u.process_info_ns.verifier
+                            , msg.u.reply.u.process_info_ns.return_code );
+                    mon_log_write( MON_NODE_CLONEPROCESSNS_4, SQ_LOG_ERR, buf );
+                }
             }
         }
         else

Reply via email to