This is an automated email from the ASF dual-hosted git repository.
zwoop pushed a commit to branch 9.2.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/9.2.x by this push:
new 9b7ff60bb Restore down nameservers after they come back online (#8847)
9b7ff60bb is described below
commit 9b7ff60bb3dfc030894633ab862683bfbe8b90ec
Author: Brian Neradt <[email protected]>
AuthorDate: Mon May 16 19:38:57 2022 -0500
Restore down nameservers after they come back online (#8847)
This patch ensures that a previously down DNS will be used again by ATS
after connectivity to it is restored. Before this patch, we would try to
reconnect to it after one attempt at DNS_PRIMARY_RETRY_PERIOD, but not
again after.
(cherry picked from commit 417e1576ce120cbddbd93b678a097b7f56b28960)
---
iocore/dns/DNS.cc | 14 +-
iocore/dns/P_DNSProcessor.h | 11 +-
tests/gold_tests/dns/dns_down_nameserver.test.py | 193 +++++++++++++++++++++
.../dns/replay/multiple_host_requests.replay.yaml | 93 ++++++++++
4 files changed, 296 insertions(+), 15 deletions(-)
diff --git a/iocore/dns/DNS.cc b/iocore/dns/DNS.cc
index 9ad2ea322..c8da7474d 100644
--- a/iocore/dns/DNS.cc
+++ b/iocore/dns/DNS.cc
@@ -609,6 +609,11 @@ DNSHandler::startEvent(int /* event ATS_UNUSED */, Event
*e)
n_con = 1;
}
+ // Retrying the name servers is something done periodically over the
+ // lifetime of the handler. This ensures that we don't miss retrying if it
+ // is necessary.
+ this->_dns_retry_event = this_ethread()->schedule_every(this,
DNS_PRIMARY_RETRY_PERIOD);
+
return EVENT_CONT;
} else {
ink_assert(false); // I.e. this should never really happen
@@ -775,8 +780,6 @@ DNSHandler::failover()
ip_text_buffer buff;
Warning("failover: connection to DNS server %s lost, retrying",
ats_ip_ntop(&ip.sa, buff, sizeof(buff)));
}
- // Make sure retries are done even if no more requests.
- this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD);
}
/** Mark one of the nameservers as down. */
@@ -790,8 +793,6 @@ DNSHandler::rr_failure(int ndx)
Debug("dns", "rr_failure: Marking nameserver %d as down", ndx);
ns_down[ndx] = 1;
Warning("connection to DNS server %s lost, marking as down",
ats_ip_ntop(&m_res->nsaddr_list[ndx].sa, buff, sizeof(buff)));
- // Make sure retries are done even if no more requests.
- this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD);
}
int nscount = m_res->nscount;
@@ -984,11 +985,6 @@ DNSHandler::check_and_reset_tcp_conn()
int
DNSHandler::mainEvent(int event, Event *e)
{
- // If this was a scheduled retry event, clear the associated flag.
- if (e && e->cookie == RETRY_COOKIE) {
- this->nameserver_retry_in_flight_p = false;
- }
-
recv_dns(event, e);
if (dns_ns_rr) {
if (DNS_CONN_MODE::TCP_RETRY == dns_conn_mode) {
diff --git a/iocore/dns/P_DNSProcessor.h b/iocore/dns/P_DNSProcessor.h
index 7d0bba98d..c0776f2ff 100644
--- a/iocore/dns/P_DNSProcessor.h
+++ b/iocore/dns/P_DNSProcessor.h
@@ -24,6 +24,7 @@
#pragma once
#include "I_EventSystem.h"
+#include "tscore/PendingAction.h"
#define MAX_NAMED 32
#define DEFAULT_DNS_RETRIES 5
@@ -172,12 +173,6 @@ struct DNSHandler : public Continuation {
int in_flight = 0;
int name_server = 0;
int in_write_dns = 0;
- /// Rate limiter for down nameserver retries.
- /// Don't schedule another if there is already one in flight.
- std::atomic<bool> nameserver_retry_in_flight_p{false};
- /// Marker for event cookie to indicate it's a nameserver retry event.
- /// @note Can't be @c constexpr because of the cast.
- static inline void *const RETRY_COOKIE{reinterpret_cast<void *>(0x2)};
HostEnt *hostent_cache = nullptr;
@@ -272,6 +267,10 @@ private:
// Check tcp connection for TCP_RETRY mode
void check_and_reset_tcp_conn();
bool reset_tcp_conn(int ndx);
+
+ /** The event used for the periodic retry of connectivity to any down name
+ * servers. */
+ PendingAction _dns_retry_event;
};
/* --------------------------------------------------------------
diff --git a/tests/gold_tests/dns/dns_down_nameserver.test.py
b/tests/gold_tests/dns/dns_down_nameserver.test.py
new file mode 100644
index 000000000..87dcadaf5
--- /dev/null
+++ b/tests/gold_tests/dns/dns_down_nameserver.test.py
@@ -0,0 +1,193 @@
+'''
+Verify ATS handles down name servers correctly.
+'''
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ports import get_port
+
+
+# This value tracks DNS_PRIMARY_RETRY_PERIOD in P_DNSProcessor.h.
+DNS_PRIMARY_RETRY_PERIOD = 5
+
+Test.Summary = '''
+Verify ATS handles down name servers correctly.
+'''
+
+
+class DownDNSNameserverTest:
+ """Encapsulate logic for testing down name server functionality.
+
+ Note that this test verifies that ATS behaves correctly with respect to
+ down *name* servers. This is different than testing down origin servers,
+ which is an entirely different feature and implementation.
+ """
+
+ _replay_file = "replay/multiple_host_requests.replay.yaml"
+
+ def __init__(self):
+ """Initialize the Test processes for the test runs."""
+ self._dns_port = None
+ self._configure_origin_server()
+ self._configure_traffic_server()
+ self._servers_are_running = False
+
+ self._client_counter = 0
+ self._dns_counter = 0
+
+ def _configure_origin_server(self):
+ """Configure the origin server and the DNS port."""
+ self._server = Test.MakeVerifierServerProcess("server",
self._replay_file)
+
+ # Associate the DNS port with the long-running server. This way
+ # ATS can be configured to use it across the multiple test runs
+ # and DNS servers.
+ self._dns_port = get_port(self._server, "DNSPort")
+
+ def _configure_traffic_server(self):
+ """Configure Traffic Server."""
+ if not self._dns_port:
+ raise RuntimeError("The DNS port was not configured.")
+
+ # Since we are testing DNS name resolution and whether these
+ # transactions reach the origin, disable the caching to ensure none of
+ # the responses come out of the cache without going to the origin.
+ self._ts = Test.MakeATSProcess("ts", enable_cache=False)
+
+ self._ts.Disk.records_config.update({
+ 'proxy.config.diags.debug.enabled': 1,
+ 'proxy.config.diags.debug.tags': 'hostdb|dns',
+ 'proxy.config.dns.nameservers': f'127.0.0.1:{self._dns_port}',
+ 'proxy.config.dns.resolv_conf': 'NULL'
+ })
+
+ # Cause a name resolution for each, unique path.
+ self._ts.Disk.remap_config.AddLines([
+ f'map /first/host
http://first.host.com:{self._server.Variables.http_port}/',
+ f'map /second/host
http://second.host.com:{self._server.Variables.http_port}/',
+ f'map /third/host
http://third.host.com:{self._server.Variables.http_port}/',
+ ])
+
+ def _run_transaction(self, start_dns: bool, keyname: str):
+ """Run a transaction with the name server reachable.
+
+ :param start_dns: Whether the TestRun should configure a name server to
+ be running for ATS to talk to.
+
+ :param keyname: The identifier for the transaction to run.
+ """
+ tr = Test.AddTestRun()
+
+ tr.AddVerifierClientProcess(
+ f'client{self._client_counter}',
+ self._replay_file,
+ http_ports=[self._ts.Variables.port],
+ other_args=f'--keys {keyname}')
+ self._client_counter += 1
+
+ if start_dns:
+ dns = tr.MakeDNServer(
+ f'dns{self._dns_counter}',
+ default='127.0.0.1',
+ port=self._dns_port)
+ self._dns_counter += 1
+ tr.Processes.Default.StartBefore(dns)
+
+ if not self._servers_are_running:
+ tr.Processes.Default.StartBefore(self._server)
+ tr.Processes.Default.StartBefore(self._ts)
+ self._servers_are_running = True
+
+ # Verify that the client tried to send the transaction.
+ tr.Processes.Default.Streams.All += Testers.ContainsExpression(
+ f'uuid: {keyname}',
+ f'The client should have sent a transaction with uuid {keyname}')
+
+ # The client will report an error if ATS could not complete the
+ # transaction due to DNS resolution issues.
+ if not start_dns:
+ tr.Processes.Default.ReturnCode = 1
+
+ def _delay_for_dns_retry_period(self, start_dns: bool):
+ """Wait for the failed DNS retry period."""
+ tr = Test.AddTestRun()
+
+ if start_dns:
+ dns = tr.MakeDNServer(
+ f'dns{self._dns_counter}',
+ default='127.0.0.1',
+ port=self._dns_port)
+ self._dns_counter += 1
+ tr.Processes.Default.StartBefore(dns)
+
+ # Exceed the retry period.
+ tr.Processes.Default.Command = f'sleep {DNS_PRIMARY_RETRY_PERIOD + 1}'
+
+ def _test_dns_reachable(self):
+ """This is the base case: DNS is reachable.
+
+ This verifies that ATS can use the name server to resolve a name when
+ the name server is reachable.
+ """
+ self._run_transaction(start_dns=True, keyname='first_host')
+
+ def _test_dns_reachable_within_the_retry_period(self):
+ """Test nameserver connectivity resolution within the retry period.
+
+ Have the name server be inaccessible, but bring it back online within
+ the retry period.
+ """
+ # Trigger a name resolution with the name server down.
+ self._run_transaction(start_dns=False, keyname='second_host')
+
+ # Wait long enough that ATS to retry the name server, with the server
+ # up.
+ self._delay_for_dns_retry_period(start_dns=True)
+
+ # ATS should now use the working name server again to resolve the host
+ # name.
+ self._run_transaction(start_dns=True, keyname='second_host')
+
+ def _test_dns_reachable_outside_the_retry_period(self):
+ """Test nameserver connectivity resolution outside the retry period.
+
+ Have the name server be inaccessible, but bring it back online outside
+ the retry period. This verifies that we handle it coming back online
+ after the first attempt to re-connect to it.
+ """
+ # Trigger a name resolution with the name server down.
+ self._run_transaction(start_dns=False, keyname='third_host')
+
+ # Wait long enough that ATS to retry connectivity to the name server,
+ # with the server down. This will naturally fail.
+ self._delay_for_dns_retry_period(start_dns=False)
+
+ # Wait long enough that ATS to retry connectivity a second time to name
+ # server, this time with the server up.
+ self._delay_for_dns_retry_period(start_dns=True)
+
+ # ATS should now try the name server again, detect that it is up, and
+ # complete the transaction.
+ self._run_transaction(start_dns=True, keyname='third_host')
+
+ def run(self):
+ """Exercise ATS down name server functionality."""
+ self._test_dns_reachable()
+ self._test_dns_reachable_within_the_retry_period()
+ self._test_dns_reachable_outside_the_retry_period()
+
+
+DownDNSNameserverTest().run()
diff --git a/tests/gold_tests/dns/replay/multiple_host_requests.replay.yaml
b/tests/gold_tests/dns/replay/multiple_host_requests.replay.yaml
new file mode 100644
index 000000000..69d79364a
--- /dev/null
+++ b/tests/gold_tests/dns/replay/multiple_host_requests.replay.yaml
@@ -0,0 +1,93 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+meta:
+ version: "1.0"
+
+#
+# Generate two simple transactions for two hosts.
+#
+
+sessions:
+- transactions:
+ - client-request:
+ method: "GET"
+ version: "1.1"
+ url: /first/host
+ headers:
+ fields:
+ - [ Host, first.host.com ]
+ - [ X-Request, request ]
+ - [ uuid, first_host ]
+
+ proxy-request:
+ headers:
+ fields:
+ - [ Host, { value: first.host.com, as: contains } ]
+
+ server-response:
+ status: 200
+ reason: OK
+ headers:
+ fields:
+ - [ Content-Length, 16 ]
+ - [ X-Response, response ]
+
+ - client-request:
+ method: "GET"
+ version: "1.1"
+ url: /second/host
+ headers:
+ fields:
+ - [ Host, second.host.com ]
+ - [ X-Request, request ]
+ - [ uuid, second_host ]
+
+ proxy-request:
+ headers:
+ fields:
+ - [ Host, { value: second.host.com, as: contains } ]
+
+ server-response:
+ status: 200
+ reason: OK
+ headers:
+ fields:
+ - [ Content-Length, 16 ]
+ - [ X-Response, response ]
+
+ - client-request:
+ method: "GET"
+ version: "1.1"
+ url: /third/host
+ headers:
+ fields:
+ - [ Host, third.host.com ]
+ - [ X-Request, request ]
+ - [ uuid, third_host ]
+
+ proxy-request:
+ headers:
+ fields:
+ - [ Host, { value: third.host.com, as: contains } ]
+
+ server-response:
+ status: 200
+ reason: OK
+ headers:
+ fields:
+ - [ Content-Length, 16 ]
+ - [ X-Response, response ]