Hey all,

I've seeing a notable increase in upstream traffic with the current
dnsmasq release candidate. Some investigations have revealed that the
reason for this is the modified forwarding philosophy that *always*
triggers a retry whenever a query is received before the upstream was
able to answer (which may take long on slow networks).

This patch adds a timeout to stop such forward destination flooding.
Before the timeout is reached, identical queries are just put on the
list form where they will get replied to when the response to the first
forwarded query arrives. The difference added by the patch is that such
queries do not trigger another forwarding within the configured
interval.
If we still received nothing, the next query *after* the timeout is
again forwarded to avoid hanging because the original query got lost.

Th default for this interval is 3 seconds, it can be changed using a
setting and even be disabled (by setting to zero) which restores the
behavior we have right now. The default of 3 seconds has been chosen
such that we will retry when other software considers this a good idea
(retry timeout is 5 seconds in Linux, see RES_TIMEOUT in <resolv.h>).

I confirmed the intended effect in my local tests: Reduced unnecessary
forwarding traffic without the danger of failing when the first query
is lost (or whatever).

Let me know if you need something more/else. It should be easy to
review this one.

Best,
Dominik
From d7b9fbb4eb81b02326918d29bec5ee3f97e02121 Mon Sep 17 00:00:00 2001
From: DL6ER <dl...@dl6er.de>
Date: Mon, 5 Apr 2021 12:00:23 +0200
Subject: [PATCH] Retry queries only after giving the upstream server some time
 to respond

---
 CHANGELOG     | 13 +++++++------
 man/dnsmasq.8 | 10 ++++++++++
 src/config.h  |  1 +
 src/dnsmasq.h |  1 +
 src/forward.c |  9 +++++++--
 src/option.c  |  9 +++++++++
 6 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/CHANGELOG b/CHANGELOG
index 1af593f..1328d0a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,13 +1,14 @@
 version 2.85
-        Fix problem with DNS retries in 2.83/2.84.
-        The new logic in 2.83/2.84 which merges distinct requests
+	Fix problem with DNS retries in 2.83/2.84.
+	The new logic in 2.83/2.84 which merges distinct requests
 	for the same domain causes problems with clients which do
 	retries as distinct requests (differing IDs and/or source ports.)
 	The retries just get piggy-backed on the first, failed, request.
-        The logic is now changed so that distinct requests for repeated
-        queries still get merged into a single ID/source port, but
-	they now always trigger a re-try upstream.
-        Thanks to Nicholas Mu for his analysis.
+	The logic is now changed so that distinct requests for repeated
+	queries still get merged into a single ID/source port, but
+	they now trigger a re-try upstream only after a certain timeout
+	is reached. --retry-timeout defaults to 3 seconds.
+	Thanks to Nicholas Mu for his analysis.
 
 	Tweak sort order of tags in get-version. v2.84 sorts
 	before v2.83, but v2.83 sorts before v2.83rc1 and 2.83rc1
diff --git a/man/dnsmasq.8 b/man/dnsmasq.8
index 6546110..a3f7652 100644
--- a/man/dnsmasq.8
+++ b/man/dnsmasq.8
@@ -194,6 +194,16 @@ Dnsmasq picks random ports as source for outbound queries:
 when this option is given, the ports used will always be lower
 than that specified. Useful for systems behind firewalls.
 .TP
+.B --retry-timeout=<timeout>
+Identical queries received within <timeout> seconds don't trigger retries
+upstream. This gives upstream servers some time to respond to the query.
+Otherwise, it may happen, e.g., in very active networks, that we needlessly
+forward one and the same query multiple times before the forward destination
+even had a chance to reply. Setting this to zero results in always retrying
+when identical queries are received. The default of 3 seconds has been chosen
+such that we will retry when clients consider this appropriate
+(retry timeout is 5 seconds in Linux, see RES_TIMEOUT in <resolv.h>).
+.TP
 .B \-i, --interface=<interface name>
 Listen only on the specified interface(s). Dnsmasq automatically adds
 the loopback (local) interface to the list of interfaces to use when
diff --git a/src/config.h b/src/config.h
index 8c41943..ae86909 100644
--- a/src/config.h
+++ b/src/config.h
@@ -58,6 +58,7 @@
 #define SOA_EXPIRY 1209600 /* SOA expiry default */
 #define LOOP_TEST_DOMAIN "test" /* domain for loop testing, "test" is reserved by RFC 2606 and won't therefore clash */
 #define LOOP_TEST_TYPE T_TXT
+#define FORWARD_RETRY_TIMEOUT 3 /* seconds before identical queries are considered failed and get retried when new queries arrive */
  
 /* compile-time options: uncomment below to enable or do eg.
    make COPTS=-DHAVE_BROKEN_RTC
diff --git a/src/dnsmasq.h b/src/dnsmasq.h
index 1e21005..47e3f81 100644
--- a/src/dnsmasq.h
+++ b/src/dnsmasq.h
@@ -1183,6 +1183,7 @@ extern struct daemon {
   /* file for packet dumps. */
   int dumpfd;
 #endif
+  int retry_timeout;
 } *daemon;
 
 /* cache.c */
diff --git a/src/forward.c b/src/forward.c
index 2c0ef7f..9c82733 100644
--- a/src/forward.c
+++ b/src/forward.c
@@ -332,8 +332,8 @@ static int forward_query(int udpfd, union mysockaddr *udpaddr,
 	    sockaddr_isequal(&src->source, udpaddr))
 	  break;
 
-      /* Existing query, but from new source, just add this 
-	 client to the list that will get the reply.*/
+      /* Existing query (either from new source or with new query ID)
+	 Just add this query to the list that will get the reply.*/
       if (!src)
 	{
 	  /* Note whine_malloc() zeros memory. */
@@ -360,6 +360,11 @@ static int forward_query(int udpfd, union mysockaddr *udpaddr,
 	  src->iface = dst_iface;
 	  src->fd = udpfd;
 	}
+
+      /* If we don't want to retry just now, drop this query right after we
+         added it to the list above */
+      if (difftime(now, forward->time) < daemon->retry_timeout)
+	return 0;
     }
 
   /* retry existing query */
diff --git a/src/option.c b/src/option.c
index 6de5914..d600afa 100644
--- a/src/option.c
+++ b/src/option.c
@@ -170,6 +170,7 @@ struct myoption {
 #define LOPT_PXE_VENDOR    361
 #define LOPT_DYNHOST       362
 #define LOPT_LOG_DEBUG     363
+#define LOPT_RETRY_TIMEOUT 364
  
 #ifdef HAVE_GETOPT_LONG
 static const struct option opts[] =  
@@ -345,6 +346,7 @@ static const struct myoption opts[] =
     { "dhcp-ignore-clid", 0, 0,  LOPT_IGNORE_CLID },
     { "dynamic-host", 1, 0, LOPT_DYNHOST },
     { "log-debug", 0, 0, LOPT_LOG_DEBUG },
+    { "retry-timeout", 1, 0, LOPT_RETRY_TIMEOUT },
     { NULL, 0, 0, 0 }
   };
 
@@ -527,6 +529,7 @@ static struct {
   { LOPT_DUMPFILE, ARG_ONE, "<path>", gettext_noop("Path to debug packet dump file"), NULL },
   { LOPT_DUMPMASK, ARG_ONE, "<hex>", gettext_noop("Mask which packets to dump"), NULL },
   { LOPT_SCRIPT_TIME, OPT_LEASE_RENEW, NULL, gettext_noop("Call dhcp-script when lease expiry changes."), NULL },
+  { LOPT_RETRY_TIMEOUT, ARG_ONE, "<integer>", gettext_noop("Specify timeout after which identical queries are retried (defaults to 3 seconds)."), NULL },
   { 0, 0, NULL, NULL, NULL }
 }; 
 
@@ -4514,6 +4517,11 @@ err:
 	daemon->host_records_tail = new;
 	break;
       }
+      
+    case LOPT_RETRY_TIMEOUT:  /* --retry-timeout */
+      if (!atoi_check(arg, &daemon->retry_timeout))
+	ret_err(gen_err);
+      break;
 
 #ifdef HAVE_DNSSEC
     case LOPT_DNSSEC_STAMP: /* --dnssec-timestamp */
@@ -5076,6 +5084,7 @@ void read_opts(int argc, char **argv, char *compile_opts)
   daemon->soa_refresh = SOA_REFRESH;
   daemon->soa_retry = SOA_RETRY;
   daemon->soa_expiry = SOA_EXPIRY;
+  daemon->retry_timeout = FORWARD_RETRY_TIMEOUT;
   
 #ifndef NO_ID
   add_txt("version.bind", "dnsmasq-" VERSION, 0 );
-- 
2.30.2

_______________________________________________
Dnsmasq-discuss mailing list
Dnsmasq-discuss@lists.thekelleys.org.uk
https://lists.thekelleys.org.uk/cgi-bin/mailman/listinfo/dnsmasq-discuss

Reply via email to