On Mon, 31 Mar 2014 23:18:13 +0200 Florian Klink <[email protected]> wrote:
> Am 31.03.2014 21:13, schrieb Dwight Engen: > > On Mon, 31 Mar 2014 20:34:15 +0200 > > Florian Klink <[email protected]> wrote: > > > >> Am 31.03.2014 20:10, schrieb Dwight Engen: > >>> On Sat, 29 Mar 2014 23:39:33 +0100 > >>> Florian Klink <[email protected]> wrote: > >>> > >>>> Hi, > >>>> > >>>> when running multiple lxc actions in row using the command line > >>>> tools, I sometimes observe the following state: > >>>> > >>>> > >>>> - lxc-monitord is not running anymore > >>>> - /run/lxc/var/lib/lxc/monitor-fifo still exists, but is > >>>> "refusing connection" > >>>> > >>>> In the logs, I then see the following: > >>>> > >>>> > >>>> lxc-start 1395671045.703 ERROR lxc_monitor - connect : backing > >>>> off 10 lxc-start 1395671045.713 ERROR lxc_monitor - connect : > >>>> backing off 50 lxc-start 1395671045.763 ERROR lxc_monitor - > >>>> connect : backing off 100 lxc-start 1395671045.864 ERROR > >>>> lxc_monitor - connect : Connection refused > >>>> > >>>> > >>>> ... and the command fails. > >>> > >>> The only time I've seen this happen is if lxc-monitord is hard > >>> killed so it doesn't have a chance to clean up and remove the > >>> socket. > >> > >> Here, it's happening quite frequently. However, the script never > >> kills lxc-monitord on its own, it just tries to detect and fix > >> this state by removing the socket file... > > > > Right, removing the socket file makes it so another lxc-monitord > > will start, but the question is why is the first one exiting without > > cleaning up? Can you reliably reproduce it at will? If so then maybe > > you could attach an strace to lxc-monitord and see why it is > > exiting. > > I was so far not successful in reproducing the bug while having an > strace running. :-( But I'll continue to try! > > > >>> > >>>> > >>>> A possible workaround would be checking for non-running > >>>> lxc-monitord process but existing monitor-fifo file then removing > >>>> the fifo if it exists before running the next lxc command, but > >>>> thats ugly ;-) > >>> > >>> Is there a good non-racy way to do this? I guess monitord could > >>> write its pid in $LXCPATH and we could kill(pid, 0) it. > > I also think that lxc should be able to recover from this problem > automatically. I agree, though I would like to understand the root cause. Can you try out the attached patch? I think it will cure your issues. > >>> > >>>> Is this behaviour known? Is there some missing "cleanup code" in > >>>> lxc(_monitord) or why is it failing like this? > >>> > >>> Currently it catches SIGILL, SIGSEGV, SIGBUS, and SIGTERM and > >>> cleans up. Other than hard kill I'm not sure what else might > >>> cause it to exit without cleaning up. > >> > >> I shutdown containers with `lxc-stop -n container-name` > >> (lxc.stopsignal=30 (SIGPWR)), however this signal should never go > >> to lxc_monitord, right? > > > > Right, that goes to the init process of the container. > > > >>> > >>>> Florian > >>>> > >>>> _______________________________________________ > >>>> lxc-users mailing list > >>>> [email protected] > >>>> http://lists.linuxcontainers.org/listinfo/lxc-users > >>> > >> > >> > >> _______________________________________________ > >> lxc-users mailing list > >> [email protected] > >> http://lists.linuxcontainers.org/listinfo/lxc-users > > > > > _______________________________________________ > lxc-users mailing list > [email protected] > http://lists.linuxcontainers.org/listinfo/lxc-users
>From 39ba7e8c293d74de6588906775167bd97ecf3fbb Mon Sep 17 00:00:00 2001 From: Dwight Engen <[email protected]> Date: Mon, 31 Mar 2014 19:45:45 -0400 Subject: [PATCH] make monitor/monitord more resilient to unexpected termination Signed-off-by: Dwight Engen <[email protected]> --- src/lxc/lxc_monitord.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++-- src/lxc/monitor.c | 53 +++++++++++++----------- src/lxc/monitor.h | 5 ++- 3 files changed, 138 insertions(+), 28 deletions(-) diff --git a/src/lxc/lxc_monitord.c b/src/lxc/lxc_monitord.c index f6d99d5..54cc3f0 100644 --- a/src/lxc/lxc_monitord.c +++ b/src/lxc/lxc_monitord.c @@ -78,13 +78,17 @@ static int lxc_monitord_fifo_create(struct lxc_monitor *mon) char fifo_path[PATH_MAX]; int ret; - ret = lxc_monitor_fifo_name(mon->lxcpath, fifo_path, sizeof(fifo_path), 1); + ret = lxc_monitor_path_name(mon->lxcpath, "monitor-fifo", + fifo_path, sizeof(fifo_path), 1); if (ret < 0) return ret; ret = mknod(fifo_path, S_IFIFO|S_IRUSR|S_IWUSR, 0); if (ret < 0) { - INFO("monitor fifo %s exists, already running?", fifo_path); + /* this shouldn't happen as the pid file exclusivity should + * ensure only one monitord per lxcpath at a time. + */ + NOTICE("failed to mknod monitor fifo %s %s", fifo_path, strerror(errno)); return -1; } @@ -102,7 +106,8 @@ static int lxc_monitord_fifo_delete(struct lxc_monitor *mon) char fifo_path[PATH_MAX]; int ret; - ret = lxc_monitor_fifo_name(mon->lxcpath, fifo_path, sizeof(fifo_path), 0); + ret = lxc_monitor_path_name(mon->lxcpath, "monitor-fifo", + fifo_path, sizeof(fifo_path), 0); if (ret < 0) return ret; @@ -110,6 +115,97 @@ static int lxc_monitord_fifo_delete(struct lxc_monitor *mon) return 0; } +static pid_t lxc_monitord_pid_read(struct lxc_monitor *mon) +{ + FILE *f; + char pid_path[PATH_MAX]; + int ret; + + ret = lxc_monitor_path_name(mon->lxcpath, "monitor-pid", + pid_path, sizeof(pid_path), 0); + if (ret < 0) + return ret; + + f = fopen(pid_path, "r"); + if (f == NULL) { + SYSERROR("lxc-monitord failed to open pidfile '%s'", pid_path); + ret = -errno; + goto err1; + } + + if (fscanf(f, "%d", &ret) != 1) { + SYSERROR("lxc-monitord failed to read '%s'", pid_path); + ret = -errno; + } + + fclose(f); +err1: + return ret; +} + +static void lxc_monitord_pid_delete(struct lxc_monitor *mon) +{ + char pid_path[PATH_MAX]; + int ret; + + ret = lxc_monitor_path_name(mon->lxcpath, "monitor-pid", + pid_path, sizeof(pid_path), 0); + if (ret < 0) + return; + unlink(pid_path); +} + +static int lxc_monitord_pid_create(struct lxc_monitor *mon) +{ + FILE *f; + char pid_path[PATH_MAX]; + int ret; + + ret = lxc_monitor_path_name(mon->lxcpath, "monitor-pid", + pid_path, sizeof(pid_path), 1); + if (ret < 0) + return ret; + + for(;;) { + f = fopen(pid_path, "wx"); + if (f) + break; + + if (errno == EEXIST) { + pid_t pid; + + pid = lxc_monitord_pid_read(mon); + if (pid < 0) { + NOTICE("lxc-monitord couldn't get pid"); + return -1; + } + if (kill(pid, 0) == 0) { + return -1; + } + /* The previous lxc-monitord is dead, clean up and try + * again (may race with other monitord start up, but the + * exclusive open above makes it okay). + */ + NOTICE("old pid:%d discovered dead, starting new instance", pid); + lxc_monitord_fifo_delete(mon); + unlink(pid_path); + continue; + } + + SYSERROR("lxc-monitord failed to create pidfile '%s'", pid_path); + return -errno; + } + + if (fprintf(f, "%d\n", getpid()) < 0) { + SYSERROR("lxc-monitord failed to write '%s'", pid_path); + ret = -errno; + unlink(pid_path); + } + + fclose(f); + return ret; +} + static void lxc_monitord_sockfd_remove(struct lxc_monitor *mon, int fd) { int i; @@ -247,6 +343,10 @@ static int lxc_monitord_create(struct lxc_monitor *mon) { int ret; + ret = lxc_monitord_pid_create(mon); + if (ret < 0) + return ret; + ret = lxc_monitord_fifo_create(mon); if (ret < 0) return ret; @@ -267,6 +367,8 @@ static void lxc_monitord_delete(struct lxc_monitor *mon) close(mon->fifofd); lxc_monitord_fifo_delete(mon); + lxc_monitord_pid_delete(mon); + for (i = 0; i < mon->clientfds_cnt; i++) { lxc_mainloop_del_handler(&mon->descr, mon->clientfds[i]); close(mon->clientfds[i]); diff --git a/src/lxc/monitor.c b/src/lxc/monitor.c index e45b5cf..849faa0 100644 --- a/src/lxc/monitor.c +++ b/src/lxc/monitor.c @@ -50,8 +50,8 @@ lxc_log_define(lxc_monitor, lxc); /* routines used by monitor publishers (containers) */ -int lxc_monitor_fifo_name(const char *lxcpath, char *fifo_path, size_t fifo_path_sz, - int do_mkdirp) +int lxc_monitor_path_name(const char *lxcpath, const char *item, + char *path, size_t path_sz, int do_mkdirp) { int ret; char *rundir; @@ -61,22 +61,22 @@ int lxc_monitor_fifo_name(const char *lxcpath, char *fifo_path, size_t fifo_path return -1; if (do_mkdirp) { - ret = snprintf(fifo_path, fifo_path_sz, "%s/lxc/%s", rundir, lxcpath); - if (ret < 0 || ret >= fifo_path_sz) { - ERROR("rundir/lxcpath (%s/%s) too long for monitor fifo", rundir, lxcpath); + ret = snprintf(path, path_sz, "%s/lxc/%s", rundir, lxcpath); + if (ret < 0 || ret >= path_sz) { + ERROR("rundir/lxcpath (%s/%s) too long for %s", rundir, lxcpath, item); free(rundir); return -1; } - ret = mkdir_p(fifo_path, 0755); + ret = mkdir_p(path, 0755); if (ret < 0) { - ERROR("unable to create monitor fifo dir %s", fifo_path); + ERROR("unable to create %s", path); free(rundir); return ret; } } - ret = snprintf(fifo_path, fifo_path_sz, "%s/lxc/%s/monitor-fifo", rundir, lxcpath); - if (ret < 0 || ret >= fifo_path_sz) { - ERROR("rundir/lxcpath (%s/%s) too long for monitor fifo", rundir, lxcpath); + ret = snprintf(path, path_sz, "%s/lxc/%s/%s", rundir, lxcpath, item); + if (ret < 0 || ret >= path_sz) { + ERROR("rundir/lxcpath (%s/%s) too long for %s", rundir, lxcpath, item); free(rundir); return -1; } @@ -91,18 +91,25 @@ static void lxc_monitor_fifo_send(struct lxc_msg *msg, const char *lxcpath) BUILD_BUG_ON(sizeof(*msg) > PIPE_BUF); /* write not guaranteed atomic */ - ret = lxc_monitor_fifo_name(lxcpath, fifo_path, sizeof(fifo_path), 0); + ret = lxc_monitor_path_name(lxcpath, "monitor-fifo", + fifo_path, sizeof(fifo_path), 0); if (ret < 0) return; - fd = open(fifo_path, O_WRONLY); + /* open the fifo nonblock in case the monitor is dead, we don't want + * the open to wait for a reader since it may never come. + */ + fd = open(fifo_path, O_WRONLY|O_NONBLOCK); if (fd < 0) { - /* it is normal for this open to fail when there is no monitor - * running, so we don't log it + /* it is normal for this open to fail ENXIO when there is no + * monitor running, so we don't log it */ return; } + if (fcntl(fd, F_SETFL, O_WRONLY) < 0) + return; + ret = write(fd, msg, sizeof(*msg)); if (ret != sizeof(*msg)) { close(fd); @@ -168,24 +175,24 @@ int lxc_monitor_open(const char *lxcpath) if (lxc_monitor_sock_name(lxcpath, &addr) < 0) return -1; - fd = socket(PF_UNIX, SOCK_STREAM, 0); - if (fd < 0) { - ERROR("socket : %s", strerror(errno)); - return -1; - } - len = strlen(&addr.sun_path[1]) + 1; if (len >= sizeof(addr.sun_path) - 1) { - ret = -1; errno = ENAMETOOLONG; - goto err1; + return -1; } for (retry = 0; retry < sizeof(backoff_ms)/sizeof(backoff_ms[0]); retry++) { + fd = socket(PF_UNIX, SOCK_STREAM, 0); + if (fd < 0) { + ERROR("socket : %s", strerror(errno)); + return -1; + } + ret = connect(fd, (struct sockaddr *)&addr, offsetof(struct sockaddr_un, sun_path) + len); if (ret == 0 || errno != ECONNREFUSED) break; ERROR("connect : backing off %d", backoff_ms[retry]); + close(fd); usleep(backoff_ms[retry] * 1000); } @@ -194,8 +201,8 @@ int lxc_monitor_open(const char *lxcpath) goto err1; } return fd; + err1: - close(fd); return ret; } diff --git a/src/lxc/monitor.h b/src/lxc/monitor.h index a7eb110..72375e5 100644 --- a/src/lxc/monitor.h +++ b/src/lxc/monitor.h @@ -42,8 +42,9 @@ struct lxc_msg { extern int lxc_monitor_open(const char *lxcpath); extern int lxc_monitor_sock_name(const char *lxcpath, struct sockaddr_un *addr); -extern int lxc_monitor_fifo_name(const char *lxcpath, char *fifo_path, - size_t fifo_path_sz, int do_mkdirp); +extern int lxc_monitor_path_name(const char *lxcpath, const char *item, + char *fifo_path, size_t fifo_path_sz, + int do_mkdirp); extern void lxc_monitor_send_state(const char *name, lxc_state_t state, const char *lxcpath); extern int lxc_monitord_spawn(const char *lxcpath); -- 1.8.5.3
_______________________________________________ lxc-users mailing list [email protected] http://lists.linuxcontainers.org/listinfo/lxc-users
