Re: [PATCH 2 of 2] Added "max_shutdown_workers" directive

Maxim Dounin Mon, 25 Sep 2023 12:28:34 -0700

Hello!

On Mon, Aug 28, 2023 at 04:30:44PM +0400, Roman Arutyunyan wrote:


> # HG changeset patch
> # User Roman Arutyunyan <a...@nginx.com>
> # Date 1693218974 -14400
> #      Mon Aug 28 14:36:14 2023 +0400
> # Node ID 42c3166b675a6a7624bdf3d78c0d4685ce8172e3
> # Parent  fdad00808cb485ab83e919be59e9211ef06ac56a
> Added "max_shutdown_workers" directive.
> 
> The directive sets maximum number of workers in shutdown state while reloading
> nginx configuration.  Upon reaching this value, workers restart is postponed
> until shutdown workers start exiting.  This allows for lower peak resource
> consumption at the cost of slower reconfiguration.

I can't say I like the name, as the preferred term is "worker 
processes", not "workers".

max_shutdown_processes?

Also note that this doesn't really limit the total resource usage 
during configuration reloads, since cache manager and cache loader 
processes are exempt from the limit, but new processes are started 
on each configuration reload.  It would be nice to actually 
provide a limit which makes multiple/frequent configuration reloads 
completely safe from the resource usage point of view, that is, to 
limit all processes spawned during configuration reloads, and not 
just worker processes.

> 
> diff --git a/src/core/nginx.c b/src/core/nginx.c
> --- a/src/core/nginx.c
> +++ b/src/core/nginx.c
> @@ -83,6 +83,13 @@ static ngx_command_t  ngx_core_commands[
>        0,
>        NULL },
>  
> +    { ngx_string("max_shutdown_workers"),
> +      NGX_MAIN_CONF|NGX_DIRECT_CONF|NGX_CONF_TAKE1,
> +      ngx_conf_set_num_slot,
> +      0,
> +      offsetof(ngx_core_conf_t, max_shutdown_workers),
> +      NULL },
> +
>      { ngx_string("debug_points"),
>        NGX_MAIN_CONF|NGX_DIRECT_CONF|NGX_CONF_TAKE1,
>        ngx_conf_set_enum_slot,
> @@ -1117,6 +1124,7 @@ ngx_core_module_create_conf(ngx_cycle_t 
>      ccf->shutdown_timeout = NGX_CONF_UNSET_MSEC;
>  
>      ccf->worker_processes = NGX_CONF_UNSET;
> +    ccf->max_shutdown_workers = NGX_CONF_UNSET;
>      ccf->debug_points = NGX_CONF_UNSET;
>  
>      ccf->rlimit_nofile = NGX_CONF_UNSET;
> @@ -1146,6 +1154,7 @@ ngx_core_module_init_conf(ngx_cycle_t *c
>      ngx_conf_init_msec_value(ccf->shutdown_timeout, 0);
>  
>      ngx_conf_init_value(ccf->worker_processes, 1);
> +    ngx_conf_init_value(ccf->max_shutdown_workers, 0);
>      ngx_conf_init_value(ccf->debug_points, 0);
>  
>  #if (NGX_HAVE_CPU_AFFINITY)
> diff --git a/src/core/ngx_cycle.h b/src/core/ngx_cycle.h
> --- a/src/core/ngx_cycle.h
> +++ b/src/core/ngx_cycle.h
> @@ -94,6 +94,7 @@ typedef struct {
>      ngx_msec_t                shutdown_timeout;
>  
>      ngx_int_t                 worker_processes;
> +    ngx_int_t                 max_shutdown_workers;
>      ngx_int_t                 debug_points;
>  
>      ngx_int_t                 rlimit_nofile;
> diff --git a/src/os/unix/ngx_process.c b/src/os/unix/ngx_process.c
> --- a/src/os/unix/ngx_process.c
> +++ b/src/os/unix/ngx_process.c
> @@ -216,6 +216,7 @@ ngx_spawn_process(ngx_cycle_t *cycle, ng
>      ngx_processes[s].data = data;
>      ngx_processes[s].name = name;
>      ngx_processes[s].exiting = 0;
> +    ngx_processes[s].old = 0;
>  
>      switch (respawn) {
>  
> diff --git a/src/os/unix/ngx_process.h b/src/os/unix/ngx_process.h
> --- a/src/os/unix/ngx_process.h
> +++ b/src/os/unix/ngx_process.h
> @@ -33,6 +33,7 @@ typedef struct {
>      unsigned            detached:1;
>      unsigned            exiting:1;
>      unsigned            exited:1;
> +    unsigned            old:1;
>  } ngx_process_t;
>  
>  
> diff --git a/src/os/unix/ngx_process_cycle.c b/src/os/unix/ngx_process_cycle.c
> --- a/src/os/unix/ngx_process_cycle.c
> +++ b/src/os/unix/ngx_process_cycle.c
> @@ -15,6 +15,7 @@ static void ngx_start_worker_processes(n
>      ngx_int_t type);
>  static void ngx_start_worker_process(ngx_cycle_t *cycle, ngx_int_t i,
>      ngx_int_t type);
> +static void ngx_restart_worker_processes(ngx_cycle_t *cycle);
>  static void ngx_start_cache_manager_processes(ngx_cycle_t *cycle,
>      ngx_uint_t respawn);
>  static void ngx_pass_open_channel(ngx_cycle_t *cycle);
> @@ -22,6 +23,7 @@ static void ngx_signal_worker_processes(
>  static void ngx_signal_worker_process(ngx_cycle_t *cycle, ngx_int_t i,
>      int signo);
>  static ngx_uint_t ngx_reap_children(ngx_cycle_t *cycle);
> +static ngx_uint_t ngx_restart_worker_process(ngx_cycle_t *cycle);
>  static void ngx_master_process_exit(ngx_cycle_t *cycle);
>  static void ngx_worker_process_cycle(ngx_cycle_t *cycle, void *data);
>  static void ngx_worker_process_init(ngx_cycle_t *cycle, ngx_int_t worker);
> @@ -235,8 +237,8 @@ ngx_master_process_cycle(ngx_cycle_t *cy
>              ngx_cycle = cycle;
>              ccf = (ngx_core_conf_t *) ngx_get_conf(cycle->conf_ctx,
>                                                     ngx_core_module);
> -            ngx_start_worker_processes(cycle, ccf->worker_processes,
> -                                       NGX_PROCESS_JUST_RESPAWN);
> +            ngx_restart_worker_processes(cycle);
> +
>              ngx_start_cache_manager_processes(cycle, 1);
>  
>              /* allow new processes to start */
> @@ -360,6 +362,70 @@ ngx_start_worker_process(ngx_cycle_t *cy
>  
>  
>  static void
> +ngx_restart_worker_processes(ngx_cycle_t *cycle)
> +{
> +    ngx_int_t         i, n, m, worker;
> +    ngx_core_conf_t  *ccf;
> +
> +    ccf = (ngx_core_conf_t *) ngx_get_conf(cycle->conf_ctx, ngx_core_module);
> +
> +    n = 0;
> +    m = ccf->max_shutdown_workers;
> +
> +    if (m == 0) {
> +        goto start;
> +    }
> +
> +    for (i = 0; i < ngx_last_process; i++) {
> +
> +        if (ngx_processes[i].pid == -1
> +            || ngx_processes[i].proc != ngx_worker_process_cycle)
> +        {

Checking "proc" here looks like a layering violation.  For 
example, ngx_signal_worker_process() does try to depend on the 
process entry point, and instead relies on the "detached" flag to 
decide which processes need to be signalled.

> +            continue;
> +        }
> +
> +        ngx_processes[i].old = 0;
> +        worker = (intptr_t) ngx_processes[i].data;
> +
> +        if (!ngx_processes[i].exiting && worker < ccf->worker_processes) {
> +            n++;

It is not clear what we are counting here, and why.  My best guess 
it that it's an attempt to count worker processes we are going to 
restart (and not just stop).

It is not clear why we are doing this though: the limit as defined 
does not differentiate between shutting down processes being 
stopped and processes being restarted.

Further, I don't think it's done correctly: since "worker" is an 
index number of the worker process, there can be at least gaps (if 
worker process failed to start and cannot be respawn).

> +
> +        } else if (m > 0) {
> +            m--;
> +        }
> +    }
> +
> +    n = (n > m) ? n - m : 0;

n = ngx_max(n - m, 0);
?

Also, some better names for the variables and/or some comments 
explaining what is being done might worth the effort.

(As far as I can tell, "n" here is the number of processes we want 
to restart but not allowed to.)

> +
> +    if (n == 0) {
> +        goto start;
> +    }
> +
> +    for (i = 0; i < ngx_last_process; i++) {
> +
> +        if (ngx_processes[i].pid == -1
> +            || ngx_processes[i].proc != ngx_worker_process_cycle)
> +        {
> +            continue;
> +        }
> +
> +        ngx_processes[i].old = 1;
> +        worker = (intptr_t) ngx_processes[i].data;
> +
> +        if (!ngx_processes[i].exiting && worker < n) {

Here "n" is the number of processes we are not allowed to restart 
right now, while "worker" is the index number of the worker 
process in question.  For a given "n", number of worker processes 
matching this condition might be at least less than "n": if some 
worker processes failed to start.

(Just in case, the rule of thumb is: if you are using worker 
process index number for anything, most likely you are doing it 
wrong.)

> +            ngx_processes[i].just_spawn = 1;

The "just_spawn" flag implies the process was just started, and 
so it is ignored by the following ngx_signal_worker_process() 
call.  Abusing it to ignore processes during shutdown might work, 
but I can't say I like this approach.  Using a separate flag might 
be better.

> +        }
> +    }
> +
> +start:
> +
> +    for (i = n; i < ccf->worker_processes; i++) {
> +        ngx_start_worker_process(cycle, i, NGX_PROCESS_JUST_RESPAWN);
> +    }

The order of processes being started looks somewhat 
counter-intuitive here.

> +}
> +
> +
> +static void
>  ngx_start_cache_manager_processes(ngx_cycle_t *cycle, ngx_uint_t respawn)
>  {
>      ngx_uint_t    i, manager, loader;
> @@ -486,15 +552,16 @@ ngx_signal_worker_process(ngx_cycle_t *c
>      ch.fd = -1;
>  
>  
> -    ngx_log_debug7(NGX_LOG_DEBUG_EVENT, cycle->log, 0,
> -                   "child: %i %P e:%d t:%d d:%d r:%d j:%d",
> +    ngx_log_debug8(NGX_LOG_DEBUG_EVENT, cycle->log, 0,
> +                   "child: %i %P e:%d t:%d d:%d r:%d j:%d o:%d",
>                     i,
>                     ngx_processes[i].pid,
>                     ngx_processes[i].exiting,
>                     ngx_processes[i].exited,
>                     ngx_processes[i].detached,
>                     ngx_processes[i].respawn,
> -                   ngx_processes[i].just_spawn);
> +                   ngx_processes[i].just_spawn,
> +                   ngx_processes[i].old);
>  
>      if (ngx_processes[i].detached || ngx_processes[i].pid == -1) {
>          return;
> @@ -563,15 +630,16 @@ ngx_reap_children(ngx_cycle_t *cycle)
>      live = 0;
>      for (i = 0; i < ngx_last_process; i++) {
>  
> -        ngx_log_debug7(NGX_LOG_DEBUG_EVENT, cycle->log, 0,
> -                       "child: %i %P e:%d t:%d d:%d r:%d j:%d",
> +        ngx_log_debug8(NGX_LOG_DEBUG_EVENT, cycle->log, 0,
> +                       "child: %i %P e:%d t:%d d:%d r:%d j:%d o:%d",
>                         i,
>                         ngx_processes[i].pid,
>                         ngx_processes[i].exiting,
>                         ngx_processes[i].exited,
>                         ngx_processes[i].detached,
>                         ngx_processes[i].respawn,
> -                       ngx_processes[i].just_spawn);
> +                       ngx_processes[i].just_spawn,
> +                       ngx_processes[i].old);
>  
>          if (ngx_processes[i].pid == -1) {
>              continue;
> @@ -660,6 +728,16 @@ ngx_reap_children(ngx_cycle_t *cycle)
>                  ngx_processes[i].pid = -1;
>              }
>  

Note: when a process dies, it is restarted if the "respawn" flag 
is set, regardless of the "old" flag.  The "old" flag is preserved 
even if it is no longer relevant after the respawn: the process is 
restarted with the new configuration, and there is no need to 
restart it again.

> +            if (ngx_processes[i].old
> +                && ngx_processes[i].exiting
> +                && !ngx_terminate
> +                && !ngx_quit)
> +            {
> +                if (ngx_restart_worker_process(cycle)) {
> +                    live = 1;
> +                }
> +            }
> +
>          } else if (ngx_processes[i].exiting || !ngx_processes[i].detached) {
>              live = 1;
>          }
> @@ -669,6 +747,53 @@ ngx_reap_children(ngx_cycle_t *cycle)
>  }
>  
>  
> +static ngx_uint_t
> +ngx_restart_worker_process(ngx_cycle_t *cycle)
> +{
> +    ngx_int_t         i, j, m;
> +    ngx_core_conf_t  *ccf;
> +
> +    ccf = (ngx_core_conf_t *) ngx_get_conf(cycle->conf_ctx, ngx_core_module);
> +
> +    j = -1;
> +    m = ccf->max_shutdown_workers;
> +
> +    if (m == 0) {
> +        return 0;
> +    }
> +
> +    for (i = 0; i < ngx_last_process; i++) {
> +
> +        if (ngx_processes[i].pid == -1 || !ngx_processes[i].old) {
> +            continue;
> +        }
> +
> +        if (!ngx_processes[i].exiting && !ngx_processes[i].exited) {

The "exited" check here is not needed to catch the processes we 
were called for, or other processes being shut down - they already 
have "exiting" flag set.

On the other hand, this can catch unexpectedly died processes 
(the ones which are still waiting for restart).  As per the code, 
such processes will be counted against max_shutdown_workers as if 
they are shutting down - this looks wrong and can cause reload 
hang.

> +            j = i;
> +            continue;
> +        }
> +
> +        if (--m == 0) {
> +            return 0;
> +        }
> +    }
> +
> +    if (j == -1) {
> +        return 0;
> +    }
> +
> +    ngx_start_worker_process(cycle, (intptr_t) ngx_processes[j].data,
> +                             NGX_PROCESS_RESPAWN);
> +
> +    ngx_msleep(100);
> +
> +    ngx_signal_worker_process(cycle, j,
> +                              ngx_signal_value(NGX_SHUTDOWN_SIGNAL));

Also note that the "exited" check above implies that there can be 
multiple exited processes at the same time, but we only restart 
just one process here.

It might be a good idea to restart multiple processes at the same 
time: sleeping for 100ms for each process means that if, for 
example, 64 processes will exit at the same time after shutdown 
timeout expiration, master process will be unresponsive for 6 
seconds.

Also, this looks very close to ngx_restart_worker_processes().  It 
might be a good idea to rework things to ensure unified approach 
to restarting processes instead of providing multiple duplicate 
ways.

> +
> +    return 1;
> +}
> +
> +
>  static void
>  ngx_master_process_exit(ngx_cycle_t *cycle)
>  {

Overall, I tend to think that the suggested code is very fragile, 
and I would rather consider re-thinking the approach.

-- 
Maxim Dounin
http://mdounin.ru/
_______________________________________________
nginx-devel mailing list
nginx-devel@nginx.org
https://mailman.nginx.org/mailman/listinfo/nginx-devel

Re: [PATCH 2 of 2] Added "max_shutdown_workers" directive

Reply via email to