Module Name:    src
Committed By:   dholland
Date:           Tue Jun 10 17:19:22 UTC 2014

Modified Files:
        src/usr.sbin/ypbind: ypbind.c

Log Message:
Instead of using magic numbers in what looks like a boolean
(dom_alive), create a state enumeration (domainstates) and use it
instead.

Instead of three states (new, alive, and, effectively, 'troubled') go
to five: new, alive, pinging, lost, and dead.

Domains start in the NEW state. When we get a reply from a server, the
state goes to ALIVE. The state is set to PINGING when we ping the
server (once a minute normally) and if the ping times out, it goes to
LOST. If we stay lost for a minute, go to DEAD, and in DEAD, do
exponential backoff of nag_servers calls.

Getting rid of the broken logic attached to the 'troubled' state fixes
PR 15355 (ypbind defeats disk idle spindown) -- it will now only
rewrite the binding file when the binding changes.

Also, fix the HEURISTIC code so it doesn't trigger except in ALIVE
state. I think this was the source of a lot of the spamming behavior
seen in PR 32519, which is now fixed.

Might also fix PR 23135 (broadcast ypbind sometimes fails to find
servers).


To generate a diff of this commit:
cvs rdiff -u -r1.95 -r1.96 src/usr.sbin/ypbind/ypbind.c

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/usr.sbin/ypbind/ypbind.c
diff -u src/usr.sbin/ypbind/ypbind.c:1.95 src/usr.sbin/ypbind/ypbind.c:1.96
--- src/usr.sbin/ypbind/ypbind.c:1.95	Tue Jun 10 17:19:12 2014
+++ src/usr.sbin/ypbind/ypbind.c	Tue Jun 10 17:19:22 2014
@@ -1,4 +1,4 @@
-/*	$NetBSD: ypbind.c,v 1.95 2014/06/10 17:19:12 dholland Exp $	*/
+/*	$NetBSD: ypbind.c,v 1.96 2014/06/10 17:19:22 dholland Exp $	*/
 
 /*
  * Copyright (c) 1992, 1993 Theo de Raadt <dera...@fsa.ca>
@@ -28,7 +28,7 @@
 
 #include <sys/cdefs.h>
 #ifndef LINT
-__RCSID("$NetBSD: ypbind.c,v 1.95 2014/06/10 17:19:12 dholland Exp $");
+__RCSID("$NetBSD: ypbind.c,v 1.96 2014/06/10 17:19:22 dholland Exp $");
 #endif
 
 #include <sys/types.h>
@@ -84,16 +84,26 @@ typedef enum {
 	YPBIND_DIRECT, YPBIND_BROADCAST,
 } ypbind_mode_t;
 
+enum domainstates {
+	DOM_NEW,		/* not yet bound */
+	DOM_ALIVE,		/* bound and healthy */
+	DOM_PINGING,		/* ping outstanding */
+	DOM_LOST,		/* binding timed out, looking for a new one */
+	DOM_DEAD,		/* long-term lost, in exponential backoff */
+};
+
 struct domain {
 	struct domain *dom_next;
 
 	char dom_name[YPMAXDOMAIN + 1];
 	struct sockaddr_in dom_server_addr;
 	long dom_vers;
-	time_t dom_checktime;
-	time_t dom_asktime;
+	time_t dom_checktime;		/* time of next check/contact */
+	time_t dom_asktime;		/* time we were last DOMAIN'd */
+	time_t dom_losttime;		/* time the binding was lost, or 0 */
+	unsigned dom_backofftime;	/* current backoff period, when DEAD */
 	int dom_lockfd;
-	int dom_alive;
+	enum domainstates dom_state;
 	uint32_t dom_xid;
 	FILE *dom_serversfile;		/* /var/yp/binding/foo.ypservers */
 	int dom_been_ypset;		/* ypset been done on this domain? */
@@ -145,6 +155,39 @@ open_locked(const char *path, int flags,
 	return fd;
 }
 
+/*
+ * Exponential backoff for pinging servers for a dead domain.
+ *
+ * We go 10 -> 20 -> 40 -> 60 seconds, then 2 -> 4 -> 8 -> 15 -> 30 ->
+ * 60 minutes, and stay at 60 minutes. This is overengineered.
+ *
+ * With a 60 minute max backoff the response time for when things come
+ * back is not awful, but we only try (and log) about 60 times even if
+ * things are down for a whole long weekend. This is an acceptable log
+ * load, I think.
+ */
+static void
+backoff(unsigned *psecs)
+{
+	unsigned secs;
+
+	secs = *psecs;
+	if (secs < 60) {
+		secs *= 2;
+		if (secs > 60) {
+			secs = 60;
+		}
+	} else if (secs < 60 * 15) {
+		secs *= 2;
+		if (secs > 60 * 15) {
+			secs = 60 * 15;
+		}
+	} else if (secs < 60 * 60) {
+		secs *= 2;
+	}
+	*psecs = secs;
+}
+
 ////////////////////////////////////////////////////////////
 // logging
 
@@ -198,14 +241,28 @@ ypservers_filename(const char *domain)
 // struct domain
 
 /*
- * State transition is done like this: 
+ * The state transitions of a domain work as follows:
  *
- * STATE	EVENT		ACTION			NEWSTATE	TIMEOUT
- * no binding	timeout		broadcast 		no binding	5 sec
- * no binding	answer		--			binding		60 sec
- * binding	timeout		ping server		checking	5 sec
- * checking	timeout		ping server + broadcast	checking	5 sec
- * checking	answer		--			binding		60 sec
+ * in state NEW:
+ *    nag_servers every 5 seconds
+ *    upon answer, state is ALIVE
+ *
+ * in state ALIVE:
+ *    every 60 seconds, send ping and switch to state PINGING
+ *
+ * in state PINGING:
+ *    upon answer, go to state ALIVE
+ *    if no answer in 5 seconds, go to state LOST and do nag_servers
+ *
+ * in state LOST:
+ *    do nag_servers every 5 seconds
+ *    upon answer, go to state ALIVE
+ *    if no answer in 60 seconds, go to state DEAD
+ *
+ * in state DEAD
+ *    do nag_servers every backofftime seconds (starts at 10)
+ *    upon answer go to state ALIVE
+ *    backofftime doubles (approximately) each try, with a cap of 1 hour
  */
 
 /*
@@ -263,8 +320,10 @@ domain_create(const char *name)
 	dom->dom_vers = YPVERS;
 	dom->dom_checktime = 0;
 	dom->dom_asktime = 0;
+	dom->dom_losttime = 0;
+	dom->dom_backofftime = 10;
 	dom->dom_lockfd = -1;
-	dom->dom_alive = 0;
+	dom->dom_state = DOM_NEW;
 	dom->dom_xid = unique_xid(dom);
 	dom->dom_been_ypset = 0;
 	dom->dom_serversfile = NULL;
@@ -456,36 +515,71 @@ rpc_received(char *dom_name, struct sock
 	}
 
 	/*
-	 * If the domain is alive and we aren't ypset, we don't need
-	 * to do anything.
-	 *
-	 * This code is unreachable (AFAIK) unless we receive an
-	 * unsolicited ping reply from the ypserver: because dom_alive
-	 * is 0 until we have a binding, but set from 1 to 2 when we
-	 * ping, it will never normally be 1 when a reply comes in,
-	 * even a reply to a ping. In the case where we lost the
-	 * binding and are getting a reply arising from nag_servers,
-	 * we lost the binding because we never got a ping response so
-	 * in that case dom_alive will also be 2.
-	 *
-	 * This logic is clearly wrong. XXX.
+	 * If the domain is alive and we aren't being called by ypset,
+	 * we shouldn't be getting a response at all. Log it, as it
+	 * might be hostile.
+	 */
+	if (dom->dom_state == DOM_ALIVE && force == 0) {
+		if (!memcmp(&dom->dom_server_addr, raddrp,
+			    sizeof(dom->dom_server_addr))) {
+			yp_log(LOG_WARNING,
+			       "Unexpected reply from server %s for domain %s",
+			       inet_ntoa(dom->dom_server_addr.sin_addr),
+			       dom->dom_name);
+		} else {
+			yp_log(LOG_WARNING,
+			       "Falsified reply from %s for domain %s",
+			       inet_ntoa(dom->dom_server_addr.sin_addr),
+			       dom->dom_name);
+		}
+		return;
+	}
+
+	/*
+	 * If we're expected a ping response, and we've got it
+	 * (meaning we aren't being called by ypset), we don't need to
+	 * do anything.
 	 */
-	if (dom->dom_alive == 1 && force == 0) {
+	if (dom->dom_state == DOM_PINGING && force == 0) {
 		/*
 		 * If the reply came from the server we expect, set
-		 * dom_alive back to 1 and ping again in 60 seconds.
+		 * dom_state back to ALIVE and ping again in 60
+		 * seconds.
 		 *
-		 * If it came from somewhere else, ignore it.
+		 * If it came from somewhere else, log it.
 		 */
 		if (!memcmp(&dom->dom_server_addr, raddrp,
 			    sizeof(dom->dom_server_addr))) {
-			dom->dom_alive = 1;
+			dom->dom_state = DOM_ALIVE;
 			/* recheck binding in 60 sec */
 			dom->dom_checktime = time(NULL) + 60;
+		} else {
+			yp_log(LOG_WARNING,
+			       "Falsified reply from %s for domain %s",
+			       inet_ntoa(dom->dom_server_addr.sin_addr),
+			       dom->dom_name);
 		}
 		return;
 	}
 
+#ifdef HEURISTIC
+	/*
+	 * If transitioning to the alive state from a non-alive state,
+	 * clear dom_asktime. This will help prevent any requests that
+	 * are still coming in from triggering unnecessary pings via
+	 * the HEURISTIC code.
+	 *
+	 * XXX: this may not be an adequate measure; we may need to
+	 * keep more state so we can disable the HEURISTIC code for
+	 * the first few seconds after rebinding.
+	 */
+	if (dom->dom_state == DOM_NEW ||
+	    dom->dom_state == DOM_LOST ||
+	    dom->dom_state == DOM_DEAD) {
+		dom->dom_asktime = 0;
+	}
+#endif
+
 	/*
 	 * Take the address we got the message from (or in the case of
 	 * ypset, the explicit address we were given) as the server
@@ -514,8 +608,7 @@ rpc_received(char *dom_name, struct sock
 	 *
 	 * 3. Either way we should not accept a response from an
 	 * arbitrary host unless we don't currently have a binding.
-	 * (The logic in the previous if statement is probably
-	 * supposed to handle this, but it doesn't currently work.)
+	 * (This is now fixed above.)
 	 *
 	 * Note that for a random unsolicited reply to work it has to
 	 * carry the XID of one of the domains we know about; but
@@ -525,7 +618,11 @@ rpc_received(char *dom_name, struct sock
 	    sizeof(dom->dom_server_addr));
 	/* recheck binding in 60 seconds */
 	dom->dom_checktime = time(NULL) + 60;
-	dom->dom_alive = 1;
+	dom->dom_state = DOM_ALIVE;
+
+	/* Clear the dead/backoff state. */
+	dom->dom_losttime = 0;
+	dom->dom_backofftime = 10;
 
 	/*
 	 * Generate a new binding file. If this fails, forget about it.
@@ -641,8 +738,8 @@ ypbindproc_domain_2(SVCXPRT *transp, voi
 		return NULL;
 	}
 
-	if (dom->dom_alive == 0) {
-		DPRINTF("dead domain %s\n", arg);
+	if (dom->dom_state == DOM_NEW) {
+		DPRINTF("new domain %s\n", arg);
 		return NULL;
 	}
 
@@ -655,9 +752,16 @@ ypbindproc_domain_2(SVCXPRT *transp, voi
 	 * elsewhere uses the binding file.
 	 *
 	 * Note: HEURISTIC is enabled by default.
+	 *
+	 * dholland 20140609: I think this is part of the mechanism
+	 * that causes ypbind to spam. I'm changing this logic so it
+	 * only triggers when the state is DOM_ALIVE: if the domain
+	 * is new, lost, or dead we shouldn't send more requests than
+	 * the ones already scheduled, and if we're already in the
+	 * middle of pinging there's no point doing it again.
 	 */
 	(void)time(&now);
-	if (now < dom->dom_asktime + 5) {
+	if (dom->dom_state == DOM_ALIVE && now < dom->dom_asktime + 5) {
 		/*
 		 * Hmm. More than 2 requests in 5 seconds have indicated
 		 * that my binding is possibly incorrect.
@@ -1263,7 +1367,7 @@ nag_servers(struct domain *dom)
 		removelock(dom);
 	}
 
-	if (dom->dom_alive == 2) {
+	if (dom->dom_state == DOM_PINGING || dom->dom_state == DOM_LOST) {
 		/*
 		 * This resolves the following situation:
 		 * ypserver on other subnet was once bound,
@@ -1300,8 +1404,6 @@ nag_servers(struct domain *dom)
 
 /*
  * Send a ping message to a domain's current ypserver.
- *
- * Note that dom_alive is 2 while waiting for the response.
  */
 static int
 ping(struct domain *dom)
@@ -1352,7 +1454,6 @@ ping(struct domain *dom)
 	}
 	AUTH_DESTROY(rpcua);
 
-	dom->dom_alive = 2;
 	DPRINTF("ping %x\n", dom->dom_server_addr.sin_addr.s_addr);
 
 	if (sendto(pingsock, buf, outlen, 0, 
@@ -1371,6 +1472,13 @@ ping(struct domain *dom)
  * server via nag_servers.
  *
  * Try again in five seconds.
+ *
+ * If we get back here and the state is still DOM_PINGING, it means
+ * we didn't receive a ping response within five seconds. Declare the
+ * binding lost. If the binding is already lost, and it's been lost
+ * for 60 seconds, switch to DOM_DEAD and begin exponential backoff.
+ * The exponential backoff starts at 10 seconds and tops out at one
+ * hour; see above.
  */
 static void
 checkwork(void)
@@ -1382,14 +1490,46 @@ checkwork(void)
 
 	(void)time(&t);
 	for (dom = domains; dom != NULL; dom = dom->dom_next) {
-		if (dom->dom_checktime < t) {
-			if (dom->dom_alive == 1)
-				(void)ping(dom);
-			else
-				(void)nag_servers(dom);
-			(void)time(&t);
+		if (dom->dom_checktime >= t) {
+			continue;
+		}
+		switch (dom->dom_state) {
+		    case DOM_NEW:
+			/* XXX should be a timeout for this state */
+			dom->dom_checktime = t + 5;
+			(void)nag_servers(dom);
+			break;
+
+		    case DOM_ALIVE:
+			dom->dom_state = DOM_PINGING;
+			dom->dom_checktime = t + 5;
+			(void)ping(dom);
+			break;
+
+		    case DOM_PINGING:
+			dom->dom_state = DOM_LOST;
+			dom->dom_losttime = t;
 			dom->dom_checktime = t + 5;
+			(void)nag_servers(dom);
+			break;
+
+		    case DOM_LOST:
+			if (t > dom->dom_losttime + 60) {
+				dom->dom_state = DOM_DEAD;
+				dom->dom_backofftime = 10;
+			}
+			dom->dom_checktime = t + 5;
+			(void)nag_servers(dom);
+			break;
+
+		    case DOM_DEAD:
+			dom->dom_checktime = t + dom->dom_backofftime;
+			backoff(&dom->dom_backofftime);
+			(void)nag_servers(dom);
+			break;
 		}
+		/* re-fetch the time in case we hung sending packets */
+		(void)time(&t);
 	}
 }
 
@@ -1566,7 +1706,7 @@ main(int argc, char *argv[])
 		 * on this, which means that if the default domain
 		 * is dead upstream boot will hang indefinitely.
 		 */
-		if (!started && domains->dom_alive) {
+		if (!started && domains->dom_state == DOM_ALIVE) {
 			started = 1;
 #ifdef DEBUG
 			if (!debug)

Reply via email to