On September 21, 2022 2:49 pm, Dominik Csapak wrote: > currently, the 'forced_cleanup' (sending SIGKILL to the qemu process), > is intended to be triggered 5 seconds after sending the initial shutdown > signal (SIGTERM) which is sadly not enough for some setups. > > Accidentally, it could be triggered earlier than 5 seconds, if a > SIGALRM triggers in the timespan directly before setting it again. > > Also, this approach means that depending on when machines are shutdown > their forced cleanup may happen after 5 seconds, or any time after, if > new vms are shut off in the meantime. > > To improve this situation, rework the way we deal with this cleanup, by > saving the time incl. timeout in the CleanupData, and trigger a > forced_cleanup every 10 seconds. If that happends, remove it from > the forced_cleanup list. > > To improve the shutdown behaviour, increase the default timeout to 60 > seconds, which should be enough, but add a commandline toggle where > users can set it to a different value. > > Signed-off-by: Dominik Csapak <d.csa...@proxmox.com> > --- > qmeventd/qmeventd.c | 75 +++++++++++++++++++++++++-------------------- > qmeventd/qmeventd.h | 2 ++ > 2 files changed, 44 insertions(+), 33 deletions(-) > > diff --git a/qmeventd/qmeventd.c b/qmeventd/qmeventd.c > index 8d32827..e9ff5b3 100644 > --- a/qmeventd/qmeventd.c > +++ b/qmeventd/qmeventd.c > @@ -29,6 +29,7 @@ > #include <signal.h> > #include <stdbool.h> > #include <stdio.h> > +#include <stdlib.h> > #include <string.h> > #include <sys/epoll.h> > #include <sys/socket.h> > @@ -36,15 +37,18 @@ > #include <sys/un.h> > #include <sys/wait.h> > #include <unistd.h> > +#include <time.h> > > #include "qmeventd.h" > > +#define DEFAULT_KILL_TIMEOUT 60 > + > static int verbose = 0; > +static int kill_timeout = DEFAULT_KILL_TIMEOUT; > static int epoll_fd = 0; > static const char *progname; > GHashTable *vm_clients; // key=vmid (freed on remove), value=*Client (free > manually) > GSList *forced_cleanups; > -volatile sig_atomic_t alarm_triggered = 0; > > /* > * Helper functions > @@ -54,9 +58,10 @@ static void > usage() > { > fprintf(stderr, "Usage: %s [-f] [-v] PATH\n", progname); > - fprintf(stderr, " -f run in foreground (default: false)\n"); > - fprintf(stderr, " -v verbose (default: false)\n"); > - fprintf(stderr, " PATH use PATH for socket\n"); > + fprintf(stderr, " -f run in foreground (default: false)\n"); > + fprintf(stderr, " -v verbose (default: false)\n"); > + fprintf(stderr, " -t timeout kill timeout (default: %ds)\n", > DEFAULT_KILL_TIMEOUT); > + fprintf(stderr, " PATH use PATH for socket\n"); > } > > static pid_t > @@ -469,16 +474,16 @@ terminate_client(struct Client *client) > int err = kill(client->pid, SIGTERM); > log_neg(err, "kill"); > > + time_t timeout = time(NULL) + kill_timeout; > + > struct CleanupData *data_ptr = malloc(sizeof(struct CleanupData)); > struct CleanupData data = { > .pid = client->pid, > - .pidfd = pidfd > + .pidfd = pidfd, > + .timeout = timeout, > }; > *data_ptr = data; > forced_cleanups = g_slist_prepend(forced_cleanups, (void *)data_ptr); > - > - // resets any other alarms, but will fire eventually and cleanup all > - alarm(5); > } > > void > @@ -551,27 +556,16 @@ handle_client(struct Client *client) > json_tokener_free(tok); > } > > - > -/* > - * SIGALRM and cleanup handling > - * > - * terminate_client will set an alarm for 5 seconds and add its client's PID > to > - * the forced_cleanups list - when the timer expires, we iterate the list and > - * attempt to issue SIGKILL to all processes which haven't yet stopped. > - */ > - > -static void > -alarm_handler(__attribute__((unused)) int signum) > -{ > - alarm_triggered = 1; > -} > -
wasn't this intentionally decoupled like this? alarm_handler just sets the flag actual force cleanup is conditionalized on the alarm having triggered, but the cleanup happens outside of the signal handler.. is there a reason from switching away from these scheme? we don't need to do the cleanup in the signal handler (timing is already plenty fuzzy anyway ;)) > static void > sigkill(void *ptr, __attribute__((unused)) void *unused) > { > struct CleanupData data = *((struct CleanupData *)ptr); > int err; > > + if (data.timeout > time(NULL)) { nit: current time / cutoff could be passed in via the currently unused user_data parameter.. > + return; > + } > + > if (data.pidfd > 0) { > err = pidfd_send_signal(data.pidfd, SIGKILL, NULL, 0); > (void)close(data.pidfd); > @@ -588,21 +582,29 @@ sigkill(void *ptr, __attribute__((unused)) void *unused) > fprintf(stderr, "cleanup failed, terminating pid '%d' with SIGKILL\n", > data.pid); > } > + > + // remove ourselves from the list > + forced_cleanups = g_slist_remove(forced_cleanups, ptr); > + free(ptr); > } > > +/* > + * SIGALRM and cleanup handling > + * > + * handles the cleanup on non terminated qemu processes, will be called every > + * 10 seconds by setting 'alarm(10)' at the end again > + */ > + > static void > -handle_forced_cleanup() > +alarm_handler(__attribute__((unused)) int signum) > { > - if (alarm_triggered) { > + if (g_slist_length(forced_cleanups) > 0) { > VERBOSE_PRINT("clearing forced cleanup backlog\n"); > - alarm_triggered = 0; > g_slist_foreach(forced_cleanups, sigkill, NULL); > - g_slist_free_full(forced_cleanups, free); > - forced_cleanups = NULL; > } > + alarm(10); > } > > - > int > main(int argc, char *argv[]) > { > @@ -611,7 +613,7 @@ main(int argc, char *argv[]) > char *socket_path = NULL; > progname = argv[0]; > > - while ((opt = getopt(argc, argv, "hfv")) != -1) { > + while ((opt = getopt(argc, argv, "hfvt:")) != -1) { > switch (opt) { > case 'f': > daemonize = 0; > @@ -619,6 +621,15 @@ main(int argc, char *argv[]) > case 'v': > verbose = 1; > break; > + case 't': > + errno = 0; > + char *endptr = NULL; > + kill_timeout = strtoul(optarg, &endptr, 10); > + if (errno != 0 || *endptr != '\0' || kill_timeout == 0) { > + usage(); > + exit(EXIT_FAILURE); > + } > + break; > case 'h': > usage(); > exit(EXIT_SUCCESS); > @@ -668,10 +679,10 @@ main(int argc, char *argv[]) > > int nevents; > > + alarm(10); > for(;;) { > nevents = epoll_wait(epoll_fd, events, 1, -1); > if (nevents < 0 && errno == EINTR) { > - handle_forced_cleanup(); > continue; > } > bail_neg(nevents, "epoll_wait"); > @@ -688,7 +699,5 @@ main(int argc, char *argv[]) > handle_client((struct Client *)events[n].data.ptr); > } > } > - > - handle_forced_cleanup(); > } > } > diff --git a/qmeventd/qmeventd.h b/qmeventd/qmeventd.h > index 2cf1947..1b9cea1 100644 > --- a/qmeventd/qmeventd.h > +++ b/qmeventd/qmeventd.h > @@ -7,6 +7,7 @@ > */ > > #include <sys/syscall.h> > +#include <time.h> > > #ifndef __NR_pidfd_open > #define __NR_pidfd_open 434 > @@ -86,6 +87,7 @@ struct Client { > struct CleanupData { > pid_t pid; > int pidfd; > + time_t timeout; > }; > > void handle_qmp_handshake(struct Client *client); > -- > 2.30.2 > > > > _______________________________________________ > pve-devel mailing list > pve-devel@lists.proxmox.com > https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel > > > _______________________________________________ pve-devel mailing list pve-devel@lists.proxmox.com https://lists.proxmox.com/cgi-bin/mailman/listinfo/pve-devel