On Mon, 31 Mar 2014 23:18:13 +0200
Florian Klink <[email protected]> wrote:

> Am 31.03.2014 21:13, schrieb Dwight Engen:
> > On Mon, 31 Mar 2014 20:34:15 +0200
> > Florian Klink <[email protected]> wrote:
> > 
> >> Am 31.03.2014 20:10, schrieb Dwight Engen:
> >>> On Sat, 29 Mar 2014 23:39:33 +0100
> >>> Florian Klink <[email protected]> wrote:
> >>>
> >>>> Hi,
> >>>>
> >>>> when running multiple lxc actions in row using the command line
> >>>> tools, I sometimes observe the following state:
> >>>>
> >>>>
> >>>> - lxc-monitord is not running anymore
> >>>> - /run/lxc/var/lib/lxc/monitor-fifo still exists, but is
> >>>> "refusing connection"
> >>>>
> >>>> In the logs, I then see the following:
> >>>>
> >>>>
> >>>> lxc-start 1395671045.703 ERROR    lxc_monitor - connect : backing
> >>>> off 10 lxc-start 1395671045.713 ERROR    lxc_monitor - connect :
> >>>> backing off 50 lxc-start 1395671045.763 ERROR    lxc_monitor -
> >>>> connect : backing off 100 lxc-start 1395671045.864 ERROR
> >>>> lxc_monitor - connect : Connection refused
> >>>>
> >>>>
> >>>> ... and the command fails.
> >>>  
> >>> The only time I've seen this happen is if lxc-monitord is hard
> >>> killed so it doesn't have a chance to clean up and remove the
> >>> socket.
> >>
> >> Here, it's happening quite frequently. However, the script never
> >> kills lxc-monitord on its own, it just tries to detect and fix
> >> this state by removing the socket file...
> > 
> > Right, removing the socket file makes it so another lxc-monitord
> > will start, but the question is why is the first one exiting without
> > cleaning up? Can you reliably reproduce it at will? If so then maybe
> > you could attach an strace to lxc-monitord and see why it is
> > exiting.
> 
> I was so far not successful in reproducing the bug while having an
> strace running. :-( But I'll continue to try!
> > 
> >>>
> >>>>
> >>>> A possible workaround would be checking for non-running
> >>>> lxc-monitord process but existing monitor-fifo file then removing
> >>>> the fifo if it exists before running the next lxc command, but
> >>>> thats ugly ;-)
> >>>
> >>> Is there a good non-racy way to do this? I guess monitord could
> >>> write its pid in $LXCPATH and we could kill(pid, 0) it. 
> 
> I also think that lxc should be able to recover from this problem
> automatically.

I agree, though I would like to understand the root cause. Can you try
out the attached patch? I think it will cure your issues.

> >>>  
> >>>> Is this behaviour known? Is there some missing "cleanup code" in
> >>>> lxc(_monitord) or why is it failing like this?
> >>>  
> >>> Currently it catches SIGILL, SIGSEGV, SIGBUS, and SIGTERM and
> >>> cleans up. Other than hard kill I'm not sure what else might
> >>> cause it to exit without cleaning up.
> >>
> >> I shutdown containers with `lxc-stop -n container-name`
> >> (lxc.stopsignal=30 (SIGPWR)), however this signal should never go
> >> to lxc_monitord, right?
> > 
> > Right, that goes to the init process of the container. 
> >  
> >>>
> >>>> Florian
> >>>>
> >>>> _______________________________________________
> >>>> lxc-users mailing list
> >>>> [email protected]
> >>>> http://lists.linuxcontainers.org/listinfo/lxc-users
> >>>
> >>
> >>
> >> _______________________________________________
> >> lxc-users mailing list
> >> [email protected]
> >> http://lists.linuxcontainers.org/listinfo/lxc-users
> > 
> 
> 
> _______________________________________________
> lxc-users mailing list
> [email protected]
> http://lists.linuxcontainers.org/listinfo/lxc-users

>From 39ba7e8c293d74de6588906775167bd97ecf3fbb Mon Sep 17 00:00:00 2001
From: Dwight Engen <[email protected]>
Date: Mon, 31 Mar 2014 19:45:45 -0400
Subject: [PATCH] make monitor/monitord more resilient to unexpected
 termination

Signed-off-by: Dwight Engen <[email protected]>
---
 src/lxc/lxc_monitord.c | 108 +++++++++++++++++++++++++++++++++++++++++++++++--
 src/lxc/monitor.c      |  53 +++++++++++++-----------
 src/lxc/monitor.h      |   5 ++-
 3 files changed, 138 insertions(+), 28 deletions(-)

diff --git a/src/lxc/lxc_monitord.c b/src/lxc/lxc_monitord.c
index f6d99d5..54cc3f0 100644
--- a/src/lxc/lxc_monitord.c
+++ b/src/lxc/lxc_monitord.c
@@ -78,13 +78,17 @@ static int lxc_monitord_fifo_create(struct lxc_monitor *mon)
 	char fifo_path[PATH_MAX];
 	int ret;
 
-	ret = lxc_monitor_fifo_name(mon->lxcpath, fifo_path, sizeof(fifo_path), 1);
+	ret = lxc_monitor_path_name(mon->lxcpath, "monitor-fifo",
+				    fifo_path, sizeof(fifo_path), 1);
 	if (ret < 0)
 		return ret;
 
 	ret = mknod(fifo_path, S_IFIFO|S_IRUSR|S_IWUSR, 0);
 	if (ret < 0) {
-		INFO("monitor fifo %s exists, already running?", fifo_path);
+		/* this shouldn't happen as the pid file exclusivity should
+		 * ensure only one monitord per lxcpath at a time.
+		 */
+		NOTICE("failed to mknod monitor fifo %s %s", fifo_path, strerror(errno));
 		return -1;
 	}
 
@@ -102,7 +106,8 @@ static int lxc_monitord_fifo_delete(struct lxc_monitor *mon)
 	char fifo_path[PATH_MAX];
 	int ret;
 
-	ret = lxc_monitor_fifo_name(mon->lxcpath, fifo_path, sizeof(fifo_path), 0);
+	ret = lxc_monitor_path_name(mon->lxcpath, "monitor-fifo",
+				    fifo_path, sizeof(fifo_path), 0);
 	if (ret < 0)
 		return ret;
 
@@ -110,6 +115,97 @@ static int lxc_monitord_fifo_delete(struct lxc_monitor *mon)
 	return 0;
 }
 
+static pid_t lxc_monitord_pid_read(struct lxc_monitor *mon)
+{
+	FILE *f;
+	char pid_path[PATH_MAX];
+	int ret;
+
+	ret = lxc_monitor_path_name(mon->lxcpath, "monitor-pid",
+				    pid_path, sizeof(pid_path), 0);
+	if (ret < 0)
+		return ret;
+
+	f = fopen(pid_path, "r");
+	if (f == NULL) {
+		SYSERROR("lxc-monitord failed to open pidfile '%s'", pid_path);
+		ret = -errno;
+		goto err1;
+	}
+
+	if (fscanf(f, "%d", &ret) != 1) {
+		SYSERROR("lxc-monitord failed to read '%s'", pid_path);
+		ret = -errno;
+	}
+
+	fclose(f);
+err1:
+	return ret;
+}
+
+static void lxc_monitord_pid_delete(struct lxc_monitor *mon)
+{
+	char pid_path[PATH_MAX];
+	int ret;
+
+	ret = lxc_monitor_path_name(mon->lxcpath, "monitor-pid",
+				    pid_path, sizeof(pid_path), 0);
+	if (ret < 0)
+		return;
+	unlink(pid_path);
+}
+
+static int lxc_monitord_pid_create(struct lxc_monitor *mon)
+{
+	FILE *f;
+	char pid_path[PATH_MAX];
+	int ret;
+
+	ret = lxc_monitor_path_name(mon->lxcpath, "monitor-pid",
+				    pid_path, sizeof(pid_path), 1);
+	if (ret < 0)
+		return ret;
+
+	for(;;) {
+		f = fopen(pid_path, "wx");
+		if (f)
+			break;
+
+		if (errno == EEXIST) {
+			pid_t pid;
+
+			pid = lxc_monitord_pid_read(mon);
+			if (pid < 0) {
+				NOTICE("lxc-monitord couldn't get pid");
+				return -1;
+			}
+			if (kill(pid, 0) == 0) {
+				return -1;
+			}
+			/* The previous lxc-monitord is dead, clean up and try
+			 * again (may race with other monitord start up, but the
+			 * exclusive open above makes it okay).
+			 */
+			NOTICE("old pid:%d discovered dead, starting new instance", pid);
+			lxc_monitord_fifo_delete(mon);
+			unlink(pid_path);
+			continue;
+		}
+
+		SYSERROR("lxc-monitord failed to create pidfile '%s'", pid_path);
+		return -errno;
+	}
+
+	if (fprintf(f, "%d\n", getpid()) < 0) {
+		SYSERROR("lxc-monitord failed to write '%s'", pid_path);
+		ret = -errno;
+		unlink(pid_path);
+	}
+
+	fclose(f);
+	return ret;
+}
+
 static void lxc_monitord_sockfd_remove(struct lxc_monitor *mon, int fd) {
 	int i;
 
@@ -247,6 +343,10 @@ static int lxc_monitord_create(struct lxc_monitor *mon)
 {
 	int ret;
 
+	ret = lxc_monitord_pid_create(mon);
+	if (ret < 0)
+		return ret;
+
 	ret = lxc_monitord_fifo_create(mon);
 	if (ret < 0)
 		return ret;
@@ -267,6 +367,8 @@ static void lxc_monitord_delete(struct lxc_monitor *mon)
 	close(mon->fifofd);
 	lxc_monitord_fifo_delete(mon);
 
+	lxc_monitord_pid_delete(mon);
+
 	for (i = 0; i < mon->clientfds_cnt; i++) {
 		lxc_mainloop_del_handler(&mon->descr, mon->clientfds[i]);
 		close(mon->clientfds[i]);
diff --git a/src/lxc/monitor.c b/src/lxc/monitor.c
index e45b5cf..849faa0 100644
--- a/src/lxc/monitor.c
+++ b/src/lxc/monitor.c
@@ -50,8 +50,8 @@
 lxc_log_define(lxc_monitor, lxc);
 
 /* routines used by monitor publishers (containers) */
-int lxc_monitor_fifo_name(const char *lxcpath, char *fifo_path, size_t fifo_path_sz,
-			  int do_mkdirp)
+int lxc_monitor_path_name(const char *lxcpath, const char *item,
+			  char *path, size_t path_sz, int do_mkdirp)
 {
 	int ret;
 	char *rundir;
@@ -61,22 +61,22 @@ int lxc_monitor_fifo_name(const char *lxcpath, char *fifo_path, size_t fifo_path
 		return -1;
 
 	if (do_mkdirp) {
-		ret = snprintf(fifo_path, fifo_path_sz, "%s/lxc/%s", rundir, lxcpath);
-		if (ret < 0 || ret >= fifo_path_sz) {
-			ERROR("rundir/lxcpath (%s/%s) too long for monitor fifo", rundir, lxcpath);
+		ret = snprintf(path, path_sz, "%s/lxc/%s", rundir, lxcpath);
+		if (ret < 0 || ret >= path_sz) {
+			ERROR("rundir/lxcpath (%s/%s) too long for %s", rundir, lxcpath, item);
 			free(rundir);
 			return -1;
 		}
-		ret = mkdir_p(fifo_path, 0755);
+		ret = mkdir_p(path, 0755);
 		if (ret < 0) {
-			ERROR("unable to create monitor fifo dir %s", fifo_path);
+			ERROR("unable to create %s", path);
 			free(rundir);
 			return ret;
 		}
 	}
-	ret = snprintf(fifo_path, fifo_path_sz, "%s/lxc/%s/monitor-fifo", rundir, lxcpath);
-	if (ret < 0 || ret >= fifo_path_sz) {
-		ERROR("rundir/lxcpath (%s/%s) too long for monitor fifo", rundir, lxcpath);
+	ret = snprintf(path, path_sz, "%s/lxc/%s/%s", rundir, lxcpath, item);
+	if (ret < 0 || ret >= path_sz) {
+		ERROR("rundir/lxcpath (%s/%s) too long for %s", rundir, lxcpath, item);
 		free(rundir);
 		return -1;
 	}
@@ -91,18 +91,25 @@ static void lxc_monitor_fifo_send(struct lxc_msg *msg, const char *lxcpath)
 
 	BUILD_BUG_ON(sizeof(*msg) > PIPE_BUF); /* write not guaranteed atomic */
 
-	ret = lxc_monitor_fifo_name(lxcpath, fifo_path, sizeof(fifo_path), 0);
+	ret = lxc_monitor_path_name(lxcpath, "monitor-fifo",
+				    fifo_path, sizeof(fifo_path), 0);
 	if (ret < 0)
 		return;
 
-	fd = open(fifo_path, O_WRONLY);
+	/* open the fifo nonblock in case the monitor is dead, we don't want
+	 * the open to wait for a reader since it may never come.
+	 */
+	fd = open(fifo_path, O_WRONLY|O_NONBLOCK);
 	if (fd < 0) {
-		/* it is normal for this open to fail when there is no monitor
-		 * running, so we don't log it
+		/* it is normal for this open to fail ENXIO when there is no
+		 * monitor running, so we don't log it
 		 */
 		return;
 	}
 
+	if (fcntl(fd, F_SETFL, O_WRONLY) < 0)
+		return;
+
 	ret = write(fd, msg, sizeof(*msg));
 	if (ret != sizeof(*msg)) {
 		close(fd);
@@ -168,24 +175,24 @@ int lxc_monitor_open(const char *lxcpath)
 	if (lxc_monitor_sock_name(lxcpath, &addr) < 0)
 		return -1;
 
-	fd = socket(PF_UNIX, SOCK_STREAM, 0);
-	if (fd < 0) {
-		ERROR("socket : %s", strerror(errno));
-		return -1;
-	}
-
 	len = strlen(&addr.sun_path[1]) + 1;
 	if (len >= sizeof(addr.sun_path) - 1) {
-		ret = -1;
 		errno = ENAMETOOLONG;
-		goto err1;
+		return -1;
 	}
 
 	for (retry = 0; retry < sizeof(backoff_ms)/sizeof(backoff_ms[0]); retry++) {
+		fd = socket(PF_UNIX, SOCK_STREAM, 0);
+		if (fd < 0) {
+			ERROR("socket : %s", strerror(errno));
+			return -1;
+		}
+
 		ret = connect(fd, (struct sockaddr *)&addr, offsetof(struct sockaddr_un, sun_path) + len);
 		if (ret == 0 || errno != ECONNREFUSED)
 			break;
 		ERROR("connect : backing off %d", backoff_ms[retry]);
+		close(fd);
 		usleep(backoff_ms[retry] * 1000);
 	}
 
@@ -194,8 +201,8 @@ int lxc_monitor_open(const char *lxcpath)
 		goto err1;
 	}
 	return fd;
+
 err1:
-	close(fd);
 	return ret;
 }
 
diff --git a/src/lxc/monitor.h b/src/lxc/monitor.h
index a7eb110..72375e5 100644
--- a/src/lxc/monitor.h
+++ b/src/lxc/monitor.h
@@ -42,8 +42,9 @@ struct lxc_msg {
 
 extern int lxc_monitor_open(const char *lxcpath);
 extern int lxc_monitor_sock_name(const char *lxcpath, struct sockaddr_un *addr);
-extern int lxc_monitor_fifo_name(const char *lxcpath, char *fifo_path,
-				 size_t fifo_path_sz, int do_mkdirp);
+extern int lxc_monitor_path_name(const char *lxcpath, const char *item,
+				 char *fifo_path, size_t fifo_path_sz,
+				 int do_mkdirp);
 extern void lxc_monitor_send_state(const char *name, lxc_state_t state,
 			    const char *lxcpath);
 extern int lxc_monitord_spawn(const char *lxcpath);
-- 
1.8.5.3

_______________________________________________
lxc-users mailing list
[email protected]
http://lists.linuxcontainers.org/listinfo/lxc-users

Reply via email to