Repository: trafodion
Updated Branches:
  refs/heads/master ee4430046 -> fe87aa15e


http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/reqopen.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqopen.cxx 
b/core/sqf/monitor/linux/reqopen.cxx
index f131a08..f44b8d4 100644
--- a/core/sqf/monitor/linux/reqopen.cxx
+++ b/core/sqf/monitor/linux/reqopen.cxx
@@ -229,8 +229,8 @@ bool CExtOpenReq::prepare()
         return false;
     }
 
-    CProcess * openerProcess;
-    CProcess * openedProcess;
+    CProcess * openerProcess = NULL;
+    CProcess * openedProcess = NULL;
 
     // Get process object for opener process
     if ( msg_->u.request.u.open.process_name[0] )
@@ -263,9 +263,12 @@ bool CExtOpenReq::prepare()
     // Get process object for process to open
     if ( msg_->u.request.u.open.target_process_name[0] ) 
     { // find by name (check node state, don't check process state, backup is 
NOT Ok)
-        openedProcess = Nodes->GetProcess( 
msg_->u.request.u.open.target_process_name
-                                         , 
msg_->u.request.u.open.target_verifier
-                                         , true, false, false );
+        if (msg_->u.request.u.open.target_process_name[0] == '$' )
+        {
+            openedProcess = Nodes->GetProcess( 
msg_->u.request.u.open.target_process_name
+                                             , 
msg_->u.request.u.open.target_verifier
+                                             , true, false, false );
+        }
     }
     else
     { // find by pid (check node state, don't check process state, backup is 
Ok)
@@ -291,8 +294,11 @@ bool CExtOpenReq::prepare()
                                 , method_name, __LINE__
                                 , target_process_name.c_str()
                                 , target_verifier );
-                openedProcess = Nodes->CloneProcessNs( 
target_process_name.c_str()
-                                                     , target_verifier );
+                if (msg_->u.request.u.open.target_process_name[0] == '$' )
+                {
+                    openedProcess = Nodes->CloneProcessNs( 
target_process_name.c_str()
+                                                         , target_verifier );
+                }
             }     
             else
             { // Name Server find by nid,pid:verifier

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/reqprocinfo.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqprocinfo.cxx 
b/core/sqf/monitor/linux/reqprocinfo.cxx
index d3f04e2..c49a877 100644
--- a/core/sqf/monitor/linux/reqprocinfo.cxx
+++ b/core/sqf/monitor/linux/reqprocinfo.cxx
@@ -417,9 +417,6 @@ void CExtProcInfoReq::performRequest()
             requester =
                Nodes->GetProcess( nid_ , pid_ , verifier_
                                 , false, false, true );
-//            CLNode *lnode = Nodes->GetLNode( nid_ );
-//            CNode *node = lnode->GetNode();
-//            requester = node->GetProcess( pid_, verifier_ );
 #else
             requester = MyNode->GetProcess( pid_
                                           , verifier_ );
@@ -483,12 +480,16 @@ void CExtProcInfoReq::performRequest()
                                                          , false, false
                                                          , target_verifier == 
-1 ? false : true );
 #else
+                    CProcess *process = NULL;
                     // find by name (check node state, don't check process 
state, 
                     //               if verifier is -1, backup is NOT Ok, else 
is Ok)
-                    CProcess *process = Nodes->GetProcess( 
target_process_name.c_str()
-                                                         , target_verifier
-                                                         , true, false
-                                                         , target_verifier == 
-1 ? false : true );
+                    if (msg_->u.request.u.process_info.target_process_name[0] 
== '$' )
+                    {
+                        process = Nodes->GetProcess( 
target_process_name.c_str()
+                                                   , target_verifier
+                                                   , true, false
+                                                   , target_verifier == -1 ? 
false : true );
+                    }
 #endif
                     if (process)
                     {

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/reqqueue.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqqueue.cxx 
b/core/sqf/monitor/linux/reqqueue.cxx
index 2238d71..3d425f2 100644
--- a/core/sqf/monitor/linux/reqqueue.cxx
+++ b/core/sqf/monitor/linux/reqqueue.cxx
@@ -64,6 +64,7 @@ extern char *ErrorMsg (int error_code);
 extern CRedirector Redirector;
 extern bool NameServerEnabled;
 extern CPtpClient *PtpClient;
+extern CProcess *NameServerProcess;
 extern CNameServer *NameServer;
 extern CNameServerConfigContainer *NameServerConfig;
 #endif
@@ -1578,15 +1579,18 @@ void CIntNewProcReq::performRequest()
         {
             if (NameServerEnabled)
             {
-                if (trace_settings & TRACE_REQUEST)
-                    trace_printf( "%s@%d" " - Getting parent process from Name 
Server (%d,%d:%d)\n"
-                                , method_name, __LINE__
-                                , parentNid_
-                                , parentPid_
-                                , parentVerifier_ );
-                parentProcess = Nodes->CloneProcessNs( parentNid_
-                                                     , parentPid_
-                                                     , parentVerifier_ );
+                if (parentNid_ != -1 && parentPid_ != -1)
+                {
+                    if (trace_settings & TRACE_REQUEST)
+                        trace_printf( "%s@%d" " - Getting parent process from 
Name Server (%d,%d:%d)\n"
+                                    , method_name, __LINE__
+                                    , parentNid_
+                                    , parentPid_
+                                    , parentVerifier_ );
+                    parentProcess = Nodes->CloneProcessNs( parentNid_
+                                                         , parentPid_
+                                                         , parentVerifier_ );
+                }
             }
         }
     }
@@ -2598,7 +2602,7 @@ void CIntChildDeathReq::performRequest()
                          , process_->GetVerifier() );
         }
 #ifndef NAMESERVER_PROCESS
-        if ( NameServerEnabled )
+        if ( NameServerEnabled && process_ != NameServerProcess)
         {
             int rc = NameServer->ProcessDelete(process_); // in reqQueue 
thread (CIntChildDeathReq)
             if (rc)
@@ -2713,9 +2717,11 @@ void CIntShutdownReq::performRequest()
     else
     {
         // Stop all processes
-        Monitor->HardNodeDown( MyPNID );
 #ifndef NAMESERVER_PROCESS
+        Monitor->HardNodeDown( MyPNID );
         MyNode->EmptyQuiescingPids();
+#else
+        Monitor->HardNodeDownNs( MyPNID );
 #endif
         // now stop the Watchdog process
         HealthCheck.setState(MON_NODE_DOWN);
@@ -3261,7 +3267,11 @@ void CIntDownReq::performRequest()
     if (trace_settings & (TRACE_SYNC | TRACE_REQUEST))
         trace_printf("%s@%d - Node down request, pnid=%d\n",
                      method_name, __LINE__, pnid_);
+#ifndef NAMESERVER_PROCESS
     Monitor->HardNodeDown( pnid_ );
+#else
+    Monitor->HardNodeDownNs( pnid_ );
+#endif
 
     TRACE_EXIT;
 }
@@ -4063,7 +4073,11 @@ void CPostQuiesceReq::performRequest()
     else
     {
         // Stop all processes
+#ifndef NAMESERVER_PROCESS
         Monitor->HardNodeDown( MyPNID );
+#else
+        Monitor->HardNodeDownNs( MyPNID );
+#endif
 #ifndef NAMESERVER_PROCESS
         MyNode->EmptyQuiescingPids();
 #endif
@@ -4241,6 +4255,11 @@ CExternalReq 
*CReqQueue::prepExternalReq(CExternalReq::reqQueueMsg_t msgType,
             request->setConcurrent(reqConcurrent[msg->u.request.type]);
             break;
 
+        case ReqType_NodeDown:
+            request = new CExtNodeDownNsReq(msgType, pid, sockFd, msg);
+            request->setConcurrent(reqConcurrent[msg->u.request.type]);
+            break;
+
         case ReqType_NewProcessNs:
             request = new CExtNewProcNsReq(msgType, nid, pid, sockFd, msg);
             request->setConcurrent(reqConcurrent[msg->u.request.type]);
@@ -5376,7 +5395,7 @@ CRequest* CReqQueue::getRequest()
         }
     }
 
-    if (!request->isShutdown())
+    if (request && !request->isShutdown())
     {
         // Take request out of list
         reqQueue_.erase (it);

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/reqqueue.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqqueue.h 
b/core/sqf/monitor/linux/reqqueue.h
index 2f6f030..b600a0f 100644
--- a/core/sqf/monitor/linux/reqqueue.h
+++ b/core/sqf/monitor/linux/reqqueue.h
@@ -456,6 +456,23 @@ private:
 };
 #endif
 
+#ifdef NAMESERVER_PROCESS
+class CExtNodeDownNsReq: public CExternalReq
+{
+public:
+    CExtNodeDownNsReq( reqQueueMsg_t msgType
+                     , int pid
+                     , int sockFd
+                     , struct message_def *msg );
+    virtual ~CExtNodeDownNsReq();
+
+    void performRequest();
+
+private:
+    void populateRequestString( void );
+};
+#endif
+
 #ifndef NAMESERVER_PROCESS
 class CExtNameServerAddReq: public CExternalReq
 {
@@ -1801,6 +1818,7 @@ private:
       RQEI   CExtNewProcReq
       RqEB   CExtNewProcessNsReq
       RQEJ   CExtNodeDownReq
+      RqEJ   CExtNodeDownNsReq
       RQEK   CExtNodeInfoReq
       RQEK   CExtPNodeInfoReq
       RQEL   CExtNodeUpReq
@@ -1816,6 +1834,7 @@ private:
       RQEP   CExtProcInfoContReq
       RQEQ   CExtSetReq
       RQER   CExtShutdownReq
+      RqER   CExtShutdownNsReq
       RQES   CExtStartupReq
       RQET   CExtTmLeaderReq
       RQEV   CExtTmSyncReq

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/shell.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/shell.cxx b/core/sqf/monitor/linux/shell.cxx
index 6c5f14b..0da8c32 100644
--- a/core/sqf/monitor/linux/shell.cxx
+++ b/core/sqf/monitor/linux/shell.cxx
@@ -80,7 +80,7 @@ char *MyName;
 char LDpath[MAX_SEARCH_PATH];
 char Path[MAX_SEARCH_PATH];
 char Wdir[MAX_SEARCH_PATH];
-char prompt[13];
+char prompt[MAX_PROCESS_NAME];
 int VirtualNodes = 0;
 int VirtualNid = -1;
 int NumNodes = 0;
@@ -394,51 +394,41 @@ bool check_environment( void )
 {
     bool  rs = true;
     bool  isNameServerEnabled = false;
+    bool  isAgentModeEnabled = false;
     char* env;
     char  msgString[MAX_BUFFER] = { 0 };
     int   val = 0;
 
-    env = getenv("MONITOR_COMM_PORT");
-    if ( env )
+    env = getenv("SQ_MON_RUN_MODE");
+    if ( env && (strcmp(env, "AGENT") == 0) )
     {
-        val = atoi(env);
-        if ( val <= 0)
+        isAgentModeEnabled = true;
+    }
+
+    if (isAgentModeEnabled)
+    {
+        env = getenv("MONITOR_COMM_PORT");
+        if ( env )
         {
-            if (VirtualNodes)
+            val = atoi(env);
+            if ( val <= 0)
             {
                 sprintf( msgString, "[%s] Warning: MONITOR_COMM_PORT value is 
invalid (%s)!", MyName, env );
                 write_startup_log( msgString );
                 printf("%s\n", msgString );
             }
-            else
-            {
-                sprintf( msgString, "[%s] Error: MONITOR_COMM_PORT value is 
invalid (%s)! Set MONITOR_COMM_PORT environment variable and try again.", 
MyName, env );
-                write_startup_log( msgString );
-                printf("%s\n", msgString );
-                rs = false;
-            }
         }
-    }
-
-    env = getenv("MONITOR_SYNC_PORT");
-    if ( env )
-    {
-        val = atoi(env);
-        if ( val <= 0)
+    
+        env = getenv("MONITOR_SYNC_PORT");
+        if ( env )
         {
-            if (VirtualNodes)
+            val = atoi(env);
+            if ( val <= 0)
             {
                 sprintf( msgString, "[%s] Warning: MONITOR_SYNC_PORT value is 
invalid (%s)!", MyName, env );
                 write_startup_log( msgString );
                 printf("%s\n", msgString );
             }
-            else
-            {
-                sprintf( msgString, "[%s] Error: MONITOR_SYNC_PORT value is 
invalid (%s)! Set MONITOR_COMM_PORT environment variable and try again.", 
MyName, env );
-                write_startup_log( msgString );
-                printf("%s\n", msgString );
-                rs = false;
-            }
         }
     }
 
@@ -446,10 +436,7 @@ bool check_environment( void )
     if ( env )
     {
         val = atoi(env);
-        if ( val > 0)
-        {
-            isNameServerEnabled = (val != 0);
-        }
+        isNameServerEnabled = (val != 0) ? true : false;
     }
     
     if (isNameServerEnabled)
@@ -2391,8 +2378,8 @@ void get_proc_info( int nid
             {
                 if (displayHeader)
                 {
-                    printf("[%s] NID,PID(os)  PRI TYPE STATES  NAME        
PARENT      PROGRAM\n",MyName);
-                    printf("[%s] ------------ --- ---- ------- ----------- 
----------- ---------------\n",MyName);
+                    printf("[%s] NID,PID(os)  PRI TYPE STATES  NAME         
PARENT       PROGRAM\n",MyName);
+                    printf("[%s] ------------ --- ---- ------- ------------ 
------------ ---------------\n",MyName);
                 }
 
                 show_proc_info();
@@ -5257,7 +5244,7 @@ void show_proc_info( void )
             msg->u.reply.u.process_info.process[i].type
                 = ProcessType_Undefined;
         }
-        printf("%3.3d %-4s %c%c%c%c%c%c%c %-11s %-11s %-15s\n",
+        printf("%3.3d %-4s %c%c%c%c%c%c%c %-12s %-12s %-15s\n",
                msg->u.reply.u.process_info.process[i].priority,
                processTypeStr[msg->u.reply.u.process_info.process[i].type],
                (msg->u.reply.u.process_info.process[i].event_messages?'E':'-'),
@@ -9572,14 +9559,8 @@ int main (int argc, char *argv[])
     env = getenv("SQ_NAMESERVER_ENABLED");
     if ( env && isdigit(*env) )
     {
-        if ( strcmp(env,"0") == 0 )
-        {
-            NameServerEnabled = false;
-        }
-        else
-        {
-            NameServerEnabled = true;
-        }
+        int val = atoi(env);
+        NameServerEnabled = (val != 0) ? true : false;
     }
 
     if ( !VirtualNodes )

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/tmsync.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/tmsync.cxx 
b/core/sqf/monitor/linux/tmsync.cxx
index 548ae81..b56c5f8 100644
--- a/core/sqf/monitor/linux/tmsync.cxx
+++ b/core/sqf/monitor/linux/tmsync.cxx
@@ -1012,7 +1012,8 @@ void CTmSync_Container::SendUnsolicitedMessages (void)
                 }
                 if (NameServerEnabled)
                 {
-                    if (!MyNode->IsMyNode( tm->GetNid() ))
+                    if (!MyNode->IsMyNode( tm->GetNid() )
+                      && (req->GetNext() && req->GetNext()->Nid != 
tm->GetNid() ) )
                     {
                         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | 
TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
                         {
@@ -1024,6 +1025,7 @@ void CTmSync_Container::SendUnsolicitedMessages (void)
                                         , tm->GetVerifier() );
                         }
                         Nodes->DeleteCloneProcess( tm );
+                        tm = NULL;
                     }
                 
                 }

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/monitor/linux/zclient.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/zclient.cxx 
b/core/sqf/monitor/linux/zclient.cxx
index f9bd698..19a7679 100644
--- a/core/sqf/monitor/linux/zclient.cxx
+++ b/core/sqf/monitor/linux/zclient.cxx
@@ -506,7 +506,7 @@ const char* CZClient::WaitForAndReturnMaster( bool doWait )
     string masterMonitor( ss.str( ) );
 
     // wait for 3 minutes for giving up.  
-    while ( (!found) && (retries < 180)) 
+    while ( (GetState() != ZC_SHUTDOWN) && (!found) && (retries < 180)) 
     {
         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
         {

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/sqenvcom.sh
----------------------------------------------------------------------
diff --git a/core/sqf/sqenvcom.sh b/core/sqf/sqenvcom.sh
index cd2ea37..0bac47c 100644
--- a/core/sqf/sqenvcom.sh
+++ b/core/sqf/sqenvcom.sh
@@ -681,11 +681,14 @@ export SQ_STARTUP=r
 #            (meaning that mpirun is the parent process of the monitor process)
 #   AGENT  - monitor process runs in agent mode versus MPI collective
 #
-# Uncomment the next four environment variables
-#export SQ_MON_CREATOR=MPIRUN
-#export SQ_MON_RUN_MODE=AGENT
-#export MONITOR_COMM_PORT=23390
-#export MONITOR_SYNC_PORT=23380
+# Uncomment the next environment variable
+export SQ_MON_CREATOR=MPIRUN
+if [[ "$SQ_MON_CREATOR" == "MPIRUN" ]]; then
+  export SQ_MON_RUN_MODE=${SQ_MON_RUN_MODE:-AGENT}
+  export MONITOR_COMM_PORT=${MONITOR_COMM_PORT:-23390}
+  export MONITOR_SYNC_PORT=${MONITOR_SYNC_PORT:-23380}
+  export TRAF_SCALING_FACTOR=${TRAF_SCALING_FACTOR:-0.75}
+fi
 
 #
 #   NAME-SERVER - to disable process replication and enable the name-server
@@ -743,6 +746,11 @@ fi
 # set to 0 to disable phandle verifier
 export SQ_PHANDLE_VERIFIER=1
 
+# set to 0 to disable process name long format in clusters larger than 256 
nodes
+#export SQ_MON_PROCESS_NAME_FORMAT_LONG=0
+#   short format: '$Zxxpppp'     xx   = nid, pppp   = pid
+#   long  format: '$Zxxxxpppppp' xxxx = nid, pppppp = pid (default)
+
 # set to 0 to disable or 1 to enable configuration of DTM as a persistent 
process
 # must re-execute 'sqgen' to effect change
 export SQ_DTM_PERSISTENT_PROCESS=1

http://git-wip-us.apache.org/repos/asf/trafodion/blob/32fe8565/core/sqf/sql/scripts/gomon.cold
----------------------------------------------------------------------
diff --git a/core/sqf/sql/scripts/gomon.cold b/core/sqf/sql/scripts/gomon.cold
index 6055490..f963e29 100755
--- a/core/sqf/sql/scripts/gomon.cold
+++ b/core/sqf/sql/scripts/gomon.cold
@@ -90,15 +90,24 @@ eof
       echo `date`" - Continuing with Startup ..."
       echo
    fi
+fi
+
+if (
+    [[ $TRAF_AGENT == "CM" ]] ||
+    [[ $SQ_MON_RUN_MODE == "AGENT" ]]
+   )
+then
+   export TRAF_SCALING_FACTOR=${TRAF_SCALING_FACTOR:-0.75}
 
    # Set the number of nodes configured
    let node_count=`trafconf -nid-count`
+   #echo  "***"
    #echo  "*** node_count = ${node_count}"
-   #echo  "*** TRAF_SCALING_FACTOR = $TRAF_SCALING_FACTOR"
+   #echo  "*** TRAF_SCALING_FACTOR = ${TRAF_SCALING_FACTOR}"
 
    # allow time for other nodes to integrate, scaled to cluster size
    # scaling factor may be non-integer, so use awk to evaluate
-   start_delay=$( echo "${node_count} $TRAF_SCALING_FACTOR" | awk '{print $1 * 
$2}')
+   start_delay=$( echo "${node_count} ${TRAF_SCALING_FACTOR}" | awk '{print $1 
* $2}')
    echo  "***"
    echo  "***" %`date`" - Waiting ${start_delay} seconds for Monitor processes 
to integrate"
    echo  "***"

Reply via email to