[ 
https://issues.apache.org/jira/browse/TS-3104?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Victor updated TS-3104:
-----------------------
    Attachment: ts-0023-cop-reinit-mgr-api-on-failure.patch
                ts-0022-fix-lockfile-killgroup.patch

Patches for described issues.

> traffic_cop can't restart traffic_manager properly
> --------------------------------------------------
>
>                 Key: TS-3104
>                 URL: https://issues.apache.org/jira/browse/TS-3104
>             Project: Traffic Server
>          Issue Type: Bug
>          Components: Cop
>            Reporter: Victor
>         Attachments: ts-0022-fix-lockfile-killgroup.patch, 
> ts-0023-cop-reinit-mgr-api-on-failure.patch
>
>
> In some cases traffic_cop can't restart traffic_manager properly. We met 
> these issues at "Ashmanov and partners" (http://en.ashmanov.com/). There are 
> two places in code which in my opinion need corrections:
> 1) The logic which decides whether to kill process or group.
> 2) The main traffic_cop loop: it doesn't reinitialize manager API in case of 
> failure and this fact leads to constant attempts to connect to manager using 
> socket id == -1. 
> I have prepared patches for both issues. Please kindly take a look at them 
> and let me know your thoughts.
> diff --git lib/ts/lockfile.cc lib/ts/lockfile.cc
> index f6e9587..dbd7394 100644
> --- lib/ts/lockfile.cc
> +++ lib/ts/lockfile.cc
> @@ -241,6 +241,7 @@ Lockfile::KillGroup(int sig, int initial_sig, const char 
> *pname)
>    int err;
>    pid_t pid;
>    pid_t holding_pid;
> +  pid_t self = getpid();
>  
>    err = Open(&holding_pid);
>    if (err == 1)                 // success getting the lock file
> @@ -252,12 +253,20 @@ Lockfile::KillGroup(int sig, int initial_sig, const 
> char *pname)
>        pid = getpgid(holding_pid);
>      } while ((pid < 0) && (errno == EINTR));
>  
> -    if ((pid < 0) || (pid == getpid()))
> +    if ((pid < 0) || (pid == self)) {
> +      // Error getting process group,
> +      // or we are the group's owner.
> +      // Let's kill just holding_pid
>        pid = holding_pid;
> -
> -    if (pid != 0) {
> +    }
> +    else if (pid != self) {
> +      // We managed to get holding_pid's process group
> +      // and this group is not ours.
>        // This way, we kill the process_group:
>        pid = -pid;
> +    }
> +
> +    if (pid != 0) {
>        // In order to get core files from each process, please
>        // set your core_pattern appropriately.
>        lockfile_kill_internal(holding_pid, initial_sig, pid, pname, sig);
>  diff --git cop/TrafficCop.cc cop/TrafficCop.cc
> index 307270e..56bc6d2 100644
> --- cop/TrafficCop.cc
> +++ cop/TrafficCop.cc
> @@ -59,6 +59,7 @@ static const char COP_TRACE_FILE[] = 
> "/tmp/traffic_cop.trace";
>  
>  #define COP_FATAL    LOG_ALERT
>  #define COP_WARNING  LOG_ERR
> +#define COP_INFO     LOG_INFO
>  #define COP_DEBUG    LOG_DEBUG
>  
>  Diags * g_diags; // link time dependency
> @@ -131,6 +132,9 @@ static int child_pid = 0;
>  static int child_status = 0;
>  static int sem_id = 11452;
>  
> +// manager API is initialized
> +static bool mgmt_init = false;
> +
>  AppVersionInfo appVersionInfo;
>  
>  static char const localhost[] = "127.0.0.1";
> @@ -1142,6 +1146,7 @@ test_mgmt_cli_port()
>  
>    if (TSRecordGetString("proxy.config.manager_binary", &val) !=  
> TS_ERR_OKAY) {
>      cop_log(COP_WARNING, "(cli test) unable to retrieve manager_binary\n");
> +    mgmt_init = false; 
>      ret = -1;
>    } else {
>      if (strcmp(val, manager_binary) != 0) {
> @@ -1544,7 +1549,6 @@ check_no_run()
>  static void*
>  check(void *arg)
>  {
> -  bool mgmt_init = false;
>    cop_log_trace("Entering check()\n");
>  
>    for (;;) {
> @@ -1593,6 +1597,7 @@ check(void *arg)
>  
>      // We do this after the first round of checks, since the first "check" 
> will spawn traffic_manager
>      if (!mgmt_init) {
> +      cop_log(COP_INFO, "Initializing manager API\n");
>        TSInit(Layout::get()->runtimedir, 
> static_cast<TSInitOptionT>(TS_MGMT_OPT_NO_EVENTS | 
> TS_MGMT_OPT_NO_SOCK_TESTS));
>        mgmt_init = true;
>      }



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to