Victor created TS-3104:
--------------------------

             Summary: traffic_cop can't restart traffic_manager properly
                 Key: TS-3104
                 URL: https://issues.apache.org/jira/browse/TS-3104
             Project: Traffic Server
          Issue Type: Bug
          Components: Cop
            Reporter: Victor


In some cases traffic_cop can't restart traffic_manager properly. We met these 
issues at "Ashmanov and partners" (http://en.ashmanov.com/). There are two 
places in code which in my opinion need corrections:

1) The logic which decides whether to kill process or group.

2) The main traffic_cop loop: it doesn't reinitialize manager API in case of 
failure and this fact leads to constant attempts to connect to manager using 
socket id == -1. 

I have prepared patches for both issues. Please kindly take a look at them and 
let me know your thoughts.


diff --git lib/ts/lockfile.cc lib/ts/lockfile.cc
index f6e9587..dbd7394 100644
--- lib/ts/lockfile.cc
+++ lib/ts/lockfile.cc
@@ -241,6 +241,7 @@ Lockfile::KillGroup(int sig, int initial_sig, const char 
*pname)
   int err;
   pid_t pid;
   pid_t holding_pid;
+  pid_t self = getpid();
 
   err = Open(&holding_pid);
   if (err == 1)                 // success getting the lock file
@@ -252,12 +253,20 @@ Lockfile::KillGroup(int sig, int initial_sig, const char 
*pname)
       pid = getpgid(holding_pid);
     } while ((pid < 0) && (errno == EINTR));
 
-    if ((pid < 0) || (pid == getpid()))
+    if ((pid < 0) || (pid == self)) {
+      // Error getting process group,
+      // or we are the group's owner.
+      // Let's kill just holding_pid
       pid = holding_pid;
-
-    if (pid != 0) {
+    }
+    else if (pid != self) {
+      // We managed to get holding_pid's process group
+      // and this group is not ours.
       // This way, we kill the process_group:
       pid = -pid;
+    }
+
+    if (pid != 0) {
       // In order to get core files from each process, please
       // set your core_pattern appropriately.
       lockfile_kill_internal(holding_pid, initial_sig, pid, pname, sig);




 diff --git cop/TrafficCop.cc cop/TrafficCop.cc
index 307270e..56bc6d2 100644
--- cop/TrafficCop.cc
+++ cop/TrafficCop.cc
@@ -59,6 +59,7 @@ static const char COP_TRACE_FILE[] = "/tmp/traffic_cop.trace";
 
 #define COP_FATAL    LOG_ALERT
 #define COP_WARNING  LOG_ERR
+#define COP_INFO     LOG_INFO
 #define COP_DEBUG    LOG_DEBUG
 
 Diags * g_diags; // link time dependency
@@ -131,6 +132,9 @@ static int child_pid = 0;
 static int child_status = 0;
 static int sem_id = 11452;
 
+// manager API is initialized
+static bool mgmt_init = false;
+
 AppVersionInfo appVersionInfo;
 
 static char const localhost[] = "127.0.0.1";
@@ -1142,6 +1146,7 @@ test_mgmt_cli_port()
 
   if (TSRecordGetString("proxy.config.manager_binary", &val) !=  TS_ERR_OKAY) {
     cop_log(COP_WARNING, "(cli test) unable to retrieve manager_binary\n");
+    mgmt_init = false; 
     ret = -1;
   } else {
     if (strcmp(val, manager_binary) != 0) {
@@ -1544,7 +1549,6 @@ check_no_run()
 static void*
 check(void *arg)
 {
-  bool mgmt_init = false;
   cop_log_trace("Entering check()\n");
 
   for (;;) {
@@ -1593,6 +1597,7 @@ check(void *arg)
 
     // We do this after the first round of checks, since the first "check" 
will spawn traffic_manager
     if (!mgmt_init) {
+      cop_log(COP_INFO, "Initializing manager API\n");
       TSInit(Layout::get()->runtimedir, 
static_cast<TSInitOptionT>(TS_MGMT_OPT_NO_EVENTS | TS_MGMT_OPT_NO_SOCK_TESTS));
       mgmt_init = true;
     }




--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to