This is an automated email from the ASF dual-hosted git repository.

zwoop pushed a commit to branch 9.2.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/9.2.x by this push:
     new 9b7ff60bb Restore down nameservers after they come back online (#8847)
9b7ff60bb is described below

commit 9b7ff60bb3dfc030894633ab862683bfbe8b90ec
Author: Brian Neradt <[email protected]>
AuthorDate: Mon May 16 19:38:57 2022 -0500

    Restore down nameservers after they come back online (#8847)
    
    This patch ensures that a previously down DNS will be used again by ATS
    after connectivity to it is restored. Before this patch, we would try to
    reconnect to it after one attempt at DNS_PRIMARY_RETRY_PERIOD, but not
    again after.
    
    (cherry picked from commit 417e1576ce120cbddbd93b678a097b7f56b28960)
---
 iocore/dns/DNS.cc                                  |  14 +-
 iocore/dns/P_DNSProcessor.h                        |  11 +-
 tests/gold_tests/dns/dns_down_nameserver.test.py   | 193 +++++++++++++++++++++
 .../dns/replay/multiple_host_requests.replay.yaml  |  93 ++++++++++
 4 files changed, 296 insertions(+), 15 deletions(-)

diff --git a/iocore/dns/DNS.cc b/iocore/dns/DNS.cc
index 9ad2ea322..c8da7474d 100644
--- a/iocore/dns/DNS.cc
+++ b/iocore/dns/DNS.cc
@@ -609,6 +609,11 @@ DNSHandler::startEvent(int /* event ATS_UNUSED */, Event 
*e)
       n_con = 1;
     }
 
+    // Retrying the name servers is something done periodically over the
+    // lifetime of the handler. This ensures that we don't miss retrying if it
+    // is necessary.
+    this->_dns_retry_event = this_ethread()->schedule_every(this, 
DNS_PRIMARY_RETRY_PERIOD);
+
     return EVENT_CONT;
   } else {
     ink_assert(false); // I.e. this should never really happen
@@ -775,8 +780,6 @@ DNSHandler::failover()
     ip_text_buffer buff;
     Warning("failover: connection to DNS server %s lost, retrying", 
ats_ip_ntop(&ip.sa, buff, sizeof(buff)));
   }
-  // Make sure retries are done even if no more requests.
-  this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD);
 }
 
 /** Mark one of the nameservers as down. */
@@ -790,8 +793,6 @@ DNSHandler::rr_failure(int ndx)
     Debug("dns", "rr_failure: Marking nameserver %d as down", ndx);
     ns_down[ndx] = 1;
     Warning("connection to DNS server %s lost, marking as down", 
ats_ip_ntop(&m_res->nsaddr_list[ndx].sa, buff, sizeof(buff)));
-    // Make sure retries are done even if no more requests.
-    this_ethread()->schedule_in(this, DNS_PRIMARY_RETRY_PERIOD);
   }
 
   int nscount = m_res->nscount;
@@ -984,11 +985,6 @@ DNSHandler::check_and_reset_tcp_conn()
 int
 DNSHandler::mainEvent(int event, Event *e)
 {
-  // If this was a scheduled retry event, clear the associated flag.
-  if (e && e->cookie == RETRY_COOKIE) {
-    this->nameserver_retry_in_flight_p = false;
-  }
-
   recv_dns(event, e);
   if (dns_ns_rr) {
     if (DNS_CONN_MODE::TCP_RETRY == dns_conn_mode) {
diff --git a/iocore/dns/P_DNSProcessor.h b/iocore/dns/P_DNSProcessor.h
index 7d0bba98d..c0776f2ff 100644
--- a/iocore/dns/P_DNSProcessor.h
+++ b/iocore/dns/P_DNSProcessor.h
@@ -24,6 +24,7 @@
 #pragma once
 
 #include "I_EventSystem.h"
+#include "tscore/PendingAction.h"
 
 #define MAX_NAMED 32
 #define DEFAULT_DNS_RETRIES 5
@@ -172,12 +173,6 @@ struct DNSHandler : public Continuation {
   int in_flight    = 0;
   int name_server  = 0;
   int in_write_dns = 0;
-  /// Rate limiter for down nameserver retries.
-  /// Don't schedule another if there is already one in flight.
-  std::atomic<bool> nameserver_retry_in_flight_p{false};
-  /// Marker for event cookie to indicate it's a nameserver retry event.
-  /// @note Can't be @c constexpr because of the cast.
-  static inline void *const RETRY_COOKIE{reinterpret_cast<void *>(0x2)};
 
   HostEnt *hostent_cache = nullptr;
 
@@ -272,6 +267,10 @@ private:
   // Check tcp connection for TCP_RETRY mode
   void check_and_reset_tcp_conn();
   bool reset_tcp_conn(int ndx);
+
+  /** The event used for the periodic retry of connectivity to any down name
+   * servers. */
+  PendingAction _dns_retry_event;
 };
 
 /* --------------------------------------------------------------
diff --git a/tests/gold_tests/dns/dns_down_nameserver.test.py 
b/tests/gold_tests/dns/dns_down_nameserver.test.py
new file mode 100644
index 000000000..87dcadaf5
--- /dev/null
+++ b/tests/gold_tests/dns/dns_down_nameserver.test.py
@@ -0,0 +1,193 @@
+'''
+Verify ATS handles down name servers correctly.
+'''
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+from ports import get_port
+
+
+# This value tracks DNS_PRIMARY_RETRY_PERIOD in P_DNSProcessor.h.
+DNS_PRIMARY_RETRY_PERIOD = 5
+
+Test.Summary = '''
+Verify ATS handles down name servers correctly.
+'''
+
+
+class DownDNSNameserverTest:
+    """Encapsulate logic for testing down name server functionality.
+
+    Note that this test verifies that ATS behaves correctly with respect to
+    down *name* servers. This is different than testing down origin servers,
+    which is an entirely different feature and implementation.
+    """
+
+    _replay_file = "replay/multiple_host_requests.replay.yaml"
+
+    def __init__(self):
+        """Initialize the Test processes for the test runs."""
+        self._dns_port = None
+        self._configure_origin_server()
+        self._configure_traffic_server()
+        self._servers_are_running = False
+
+        self._client_counter = 0
+        self._dns_counter = 0
+
+    def _configure_origin_server(self):
+        """Configure the origin server and the DNS port."""
+        self._server = Test.MakeVerifierServerProcess("server", 
self._replay_file)
+
+        # Associate the DNS port with the long-running server. This way
+        # ATS can be configured to use it across the multiple test runs
+        # and DNS servers.
+        self._dns_port = get_port(self._server, "DNSPort")
+
+    def _configure_traffic_server(self):
+        """Configure Traffic Server."""
+        if not self._dns_port:
+            raise RuntimeError("The DNS port was not configured.")
+
+        # Since we are testing DNS name resolution and whether these
+        # transactions reach the origin, disable the caching to ensure none of
+        # the responses come out of the cache without going to the origin.
+        self._ts = Test.MakeATSProcess("ts", enable_cache=False)
+
+        self._ts.Disk.records_config.update({
+            'proxy.config.diags.debug.enabled': 1,
+            'proxy.config.diags.debug.tags': 'hostdb|dns',
+            'proxy.config.dns.nameservers': f'127.0.0.1:{self._dns_port}',
+            'proxy.config.dns.resolv_conf': 'NULL'
+        })
+
+        # Cause a name resolution for each, unique path.
+        self._ts.Disk.remap_config.AddLines([
+            f'map /first/host 
http://first.host.com:{self._server.Variables.http_port}/',
+            f'map /second/host 
http://second.host.com:{self._server.Variables.http_port}/',
+            f'map /third/host 
http://third.host.com:{self._server.Variables.http_port}/',
+        ])
+
+    def _run_transaction(self, start_dns: bool, keyname: str):
+        """Run a transaction with the name server reachable.
+
+        :param start_dns: Whether the TestRun should configure a name server to
+        be running for ATS to talk to.
+
+        :param keyname: The identifier for the transaction to run.
+        """
+        tr = Test.AddTestRun()
+
+        tr.AddVerifierClientProcess(
+            f'client{self._client_counter}',
+            self._replay_file,
+            http_ports=[self._ts.Variables.port],
+            other_args=f'--keys {keyname}')
+        self._client_counter += 1
+
+        if start_dns:
+            dns = tr.MakeDNServer(
+                f'dns{self._dns_counter}',
+                default='127.0.0.1',
+                port=self._dns_port)
+            self._dns_counter += 1
+            tr.Processes.Default.StartBefore(dns)
+
+        if not self._servers_are_running:
+            tr.Processes.Default.StartBefore(self._server)
+            tr.Processes.Default.StartBefore(self._ts)
+            self._servers_are_running = True
+
+        # Verify that the client tried to send the transaction.
+        tr.Processes.Default.Streams.All += Testers.ContainsExpression(
+            f'uuid: {keyname}',
+            f'The client should have sent a transaction with uuid {keyname}')
+
+        # The client will report an error if ATS could not complete the
+        # transaction due to DNS resolution issues.
+        if not start_dns:
+            tr.Processes.Default.ReturnCode = 1
+
+    def _delay_for_dns_retry_period(self, start_dns: bool):
+        """Wait for the failed DNS retry period."""
+        tr = Test.AddTestRun()
+
+        if start_dns:
+            dns = tr.MakeDNServer(
+                f'dns{self._dns_counter}',
+                default='127.0.0.1',
+                port=self._dns_port)
+            self._dns_counter += 1
+            tr.Processes.Default.StartBefore(dns)
+
+        # Exceed the retry period.
+        tr.Processes.Default.Command = f'sleep {DNS_PRIMARY_RETRY_PERIOD + 1}'
+
+    def _test_dns_reachable(self):
+        """This is the base case: DNS is reachable.
+
+        This verifies that ATS can use the name server to resolve a name when
+        the name server is reachable.
+        """
+        self._run_transaction(start_dns=True, keyname='first_host')
+
+    def _test_dns_reachable_within_the_retry_period(self):
+        """Test nameserver connectivity resolution within the retry period.
+
+        Have the name server be inaccessible, but bring it back online within
+        the retry period.
+        """
+        # Trigger a name resolution with the name server down.
+        self._run_transaction(start_dns=False, keyname='second_host')
+
+        # Wait long enough that ATS to retry the name server, with the server
+        # up.
+        self._delay_for_dns_retry_period(start_dns=True)
+
+        # ATS should now use the working name server again to resolve the host
+        # name.
+        self._run_transaction(start_dns=True, keyname='second_host')
+
+    def _test_dns_reachable_outside_the_retry_period(self):
+        """Test nameserver connectivity resolution outside the retry period.
+
+        Have the name server be inaccessible, but bring it back online outside
+        the retry period. This verifies that we handle it coming back online
+        after the first attempt to re-connect to it.
+        """
+        # Trigger a name resolution with the name server down.
+        self._run_transaction(start_dns=False, keyname='third_host')
+
+        # Wait long enough that ATS to retry connectivity to the name server,
+        # with the server down. This will naturally fail.
+        self._delay_for_dns_retry_period(start_dns=False)
+
+        # Wait long enough that ATS to retry connectivity a second time to name
+        # server, this time with the server up.
+        self._delay_for_dns_retry_period(start_dns=True)
+
+        # ATS should now try the name server again, detect that it is up, and
+        # complete the transaction.
+        self._run_transaction(start_dns=True, keyname='third_host')
+
+    def run(self):
+        """Exercise ATS down name server functionality."""
+        self._test_dns_reachable()
+        self._test_dns_reachable_within_the_retry_period()
+        self._test_dns_reachable_outside_the_retry_period()
+
+
+DownDNSNameserverTest().run()
diff --git a/tests/gold_tests/dns/replay/multiple_host_requests.replay.yaml 
b/tests/gold_tests/dns/replay/multiple_host_requests.replay.yaml
new file mode 100644
index 000000000..69d79364a
--- /dev/null
+++ b/tests/gold_tests/dns/replay/multiple_host_requests.replay.yaml
@@ -0,0 +1,93 @@
+#  Licensed to the Apache Software Foundation (ASF) under one
+#  or more contributor license agreements.  See the NOTICE file
+#  distributed with this work for additional information
+#  regarding copyright ownership.  The ASF licenses this file
+#  to you under the Apache License, Version 2.0 (the
+#  "License"); you may not use this file except in compliance
+#  with the License.  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+meta:
+  version: "1.0"
+
+#
+# Generate two simple transactions for two hosts.
+#
+
+sessions:
+- transactions:
+  - client-request:
+      method: "GET"
+      version: "1.1"
+      url: /first/host
+      headers:
+        fields:
+        - [ Host, first.host.com ]
+        - [ X-Request, request ]
+        - [ uuid, first_host ]
+
+    proxy-request:
+      headers:
+        fields:
+        - [ Host, { value: first.host.com, as: contains } ]
+
+    server-response:
+      status: 200
+      reason: OK
+      headers:
+        fields:
+        - [ Content-Length, 16 ]
+        - [ X-Response, response ]
+
+  - client-request:
+      method: "GET"
+      version: "1.1"
+      url: /second/host
+      headers:
+        fields:
+        - [ Host, second.host.com ]
+        - [ X-Request, request ]
+        - [ uuid, second_host ]
+
+    proxy-request:
+      headers:
+        fields:
+        - [ Host, { value: second.host.com, as: contains } ]
+
+    server-response:
+      status: 200
+      reason: OK
+      headers:
+        fields:
+        - [ Content-Length, 16 ]
+        - [ X-Response, response ]
+
+  - client-request:
+      method: "GET"
+      version: "1.1"
+      url: /third/host
+      headers:
+        fields:
+        - [ Host, third.host.com ]
+        - [ X-Request, request ]
+        - [ uuid, third_host ]
+
+    proxy-request:
+      headers:
+        fields:
+        - [ Host, { value: third.host.com, as: contains } ]
+
+    server-response:
+      status: 200
+      reason: OK
+      headers:
+        fields:
+        - [ Content-Length, 16 ]
+        - [ X-Response, response ]

Reply via email to