This is an automated email from the ASF dual-hosted git repository.

cmcfarlen pushed a commit to branch 10.0.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/10.0.x by this push:
     new 388565d009 Fixes to HostDB marking IPs up/down (#11814)
388565d009 is described below

commit 388565d009fbea88108a57715f4bfaedbcf0ae1a
Author: Chris McFarlen <[email protected]>
AuthorDate: Thu Oct 10 19:19:18 2024 -0500

    Fixes to HostDB marking IPs up/down (#11814)
    
    * Fixes to hostdb marking IPs up/down
    
    * Mark HostDBInfo::select const
    
    * Update dns_host_down test to reflect fixed behavior for returning 500
    
    ---------
    
    Co-authored-by: Chris McFarlen <[email protected]>
    (cherry picked from commit ffa0b26e037aa39a7b4c759955b7f9bf017ae928)
---
 include/iocore/hostdb/HostDBProcessor.h            | 31 +++++++--
 src/iocore/hostdb/HostDB.cc                        | 73 +++++++++++-----------
 src/proxy/http/HttpSM.cc                           | 49 +++++++--------
 src/proxy/http/HttpTransact.cc                     |  4 +-
 tests/gold_tests/dns/dns_host_down.test.py         |  3 +-
 .../gold_tests/dns/replay/server_down.replay.yaml  |  4 +-
 6 files changed, 91 insertions(+), 73 deletions(-)

diff --git a/include/iocore/hostdb/HostDBProcessor.h 
b/include/iocore/hostdb/HostDBProcessor.h
index 30d572a497..a532e23e49 100644
--- a/include/iocore/hostdb/HostDBProcessor.h
+++ b/include/iocore/hostdb/HostDBProcessor.h
@@ -25,6 +25,7 @@
 
 #include <chrono>
 #include <atomic>
+#include <utility>
 
 #include "tscore/HashFNV.h"
 #include "tscore/ink_time.h"
@@ -150,7 +151,7 @@ struct HostDBInfo {
    * If a zombie is selected the failure time is updated to make it appear 
down to other threads in a thread safe
    * manner. The caller should check @c last_fail_time to see if a zombie was 
selected.
    */
-  bool select(ts_time now, ts_seconds fail_window);
+  bool select(ts_time now, ts_seconds fail_window) const;
 
   /// Check if this info is valid.
   bool is_valid() const;
@@ -167,6 +168,8 @@ struct HostDBInfo {
    */
   bool mark_down(ts_time now);
 
+  std::pair<bool, uint8_t> increment_fail_count(ts_time now, uint8_t 
max_retries);
+
   /** Mark the target as up / alive.
    *
    * @return Previous alive state of the target.
@@ -243,8 +246,12 @@ HostDBInfo::is_down(ts_time now, ts_seconds fail_window)
 inline bool
 HostDBInfo::mark_up()
 {
-  auto t = last_failure.exchange(TS_TIME_ZERO);
-  return t != TS_TIME_ZERO;
+  auto t        = last_failure.exchange(TS_TIME_ZERO);
+  bool was_down = t != TS_TIME_ZERO;
+  if (was_down) {
+    fail_count.store(0);
+  }
+  return was_down;
 }
 
 inline bool
@@ -254,21 +261,33 @@ HostDBInfo::mark_down(ts_time now)
   return last_failure.compare_exchange_strong(t0, now);
 }
 
+inline std::pair<bool, uint8_t>
+HostDBInfo::increment_fail_count(ts_time now, uint8_t max_retries)
+{
+  auto fcount      = ++fail_count;
+  bool marked_down = false;
+  if (fcount >= max_retries) {
+    marked_down = mark_down(now);
+  }
+  return std::make_pair(marked_down, fcount);
+}
+
 inline bool
-HostDBInfo::select(ts_time now, ts_seconds fail_window)
+HostDBInfo::select(ts_time now, ts_seconds fail_window) const
 {
   auto t0 = this->last_fail_time();
   if (t0 == TS_TIME_ZERO) {
     return true; // it's alive and so is valid for selection.
   }
-  // Success means this is a zombie and this thread updated the failure time.
-  return (t0 + fail_window < now) && last_failure.compare_exchange_strong(t0, 
now);
+  // Return true and give it a try if enough time is elapsed since the last 
failure
+  return (t0 + fail_window < now);
 }
 
 inline void
 HostDBInfo::migrate_from(HostDBInfo::self_type const &that)
 {
   this->last_failure = that.last_failure.load();
+  this->fail_count   = that.fail_count.load();
   this->http_version = that.http_version;
 }
 
diff --git a/src/iocore/hostdb/HostDB.cc b/src/iocore/hostdb/HostDB.cc
index 361f79e83d..d1b597d626 100644
--- a/src/iocore/hostdb/HostDB.cc
+++ b/src/iocore/hostdb/HostDB.cc
@@ -1418,46 +1418,49 @@ HostDBRecord::select_best_http(ts_time now, ts_seconds 
fail_window, sockaddr con
   HostDBInfo *best_alive = nullptr;
 
   auto info{this->rr_info()};
-
-  if (HostDBProcessor::hostdb_strict_round_robin) {
-    // Always select the next viable target - select failure means no valid 
targets at all.
-    best_alive = best_any = this->select_next_rr(now, fail_window);
-    Dbg(dbg_ctl_hostdb, "Using strict round robin - index %d", 
this->index_of(best_alive));
-  } else if (HostDBProcessor::hostdb_timed_round_robin > 0) {
-    auto ctime = rr_ctime.load(); // cache for atomic update.
-    auto ntime = ctime + ts_seconds(HostDBProcessor::hostdb_timed_round_robin);
-    // Check and update RR if it's time - this always yields a valid target if 
there is one.
-    if (now > ntime && rr_ctime.compare_exchange_strong(ctime, ntime)) {
+  if (info.count() > 1) {
+    if (HostDBProcessor::hostdb_strict_round_robin) {
+      // Always select the next viable target - select failure means no valid 
targets at all.
       best_alive = best_any = this->select_next_rr(now, fail_window);
-      Dbg(dbg_ctl_hostdb, "Round robin timed interval expired - index %d", 
this->index_of(best_alive));
-    } else { // pick the current index, which may be down.
-      best_any = &info[this->rr_idx()];
-    }
-    Dbg(dbg_ctl_hostdb, "Using timed round robin - index %d", 
this->index_of(best_any));
-  } else {
-    // Walk the entries and find the best (largest) hash.
-    unsigned int best_hash = 0; // any hash is better than this.
-    for (auto &target : info) {
-      unsigned int h = HOSTDB_CLIENT_IP_HASH(hash_addr, target.data.ip);
-      if (best_hash <= h) {
-        best_any  = &target;
-        best_hash = h;
+      Dbg(dbg_ctl_hostdb, "Using strict round robin - index %d", 
this->index_of(best_alive));
+    } else if (HostDBProcessor::hostdb_timed_round_robin > 0) {
+      auto ctime = rr_ctime.load(); // cache for atomic update.
+      auto ntime = ctime + 
ts_seconds(HostDBProcessor::hostdb_timed_round_robin);
+      // Check and update RR if it's time - this always yields a valid target 
if there is one.
+      if (now > ntime && rr_ctime.compare_exchange_strong(ctime, ntime)) {
+        best_alive = best_any = this->select_next_rr(now, fail_window);
+        Dbg(dbg_ctl_hostdb, "Round robin timed interval expired - index %d", 
this->index_of(best_alive));
+      } else { // pick the current index, which may be down.
+        best_any = &info[this->rr_idx()];
       }
+      Dbg(dbg_ctl_hostdb, "Using timed round robin - index %d", 
this->index_of(best_any));
+    } else {
+      // Walk the entries and find the best (largest) hash.
+      unsigned int best_hash = 0; // any hash is better than this.
+      for (auto &target : info) {
+        unsigned int h = HOSTDB_CLIENT_IP_HASH(hash_addr, target.data.ip);
+        if (best_hash <= h) {
+          best_any  = &target;
+          best_hash = h;
+        }
+      }
+      Dbg(dbg_ctl_hostdb, "Using client affinity - index %d", 
this->index_of(best_any));
     }
-    Dbg(dbg_ctl_hostdb, "Using client affinity - index %d", 
this->index_of(best_any));
-  }
-
-  // If there is a base choice, search for valid target starting there.
-  // Otherwise there is no valid target in the record.
-  if (best_any && !best_alive) {
-    // Starting at the current target, search for a valid one.
-    for (unsigned short i = 0; i < rr_count; i++) {
-      auto target = &info[this->rr_idx(i)];
-      if (target->select(now, fail_window)) {
-        best_alive = target;
-        break;
+
+    // If there is a base choice, search for valid target starting there.
+    // Otherwise there is no valid target in the record.
+    if (best_any && !best_alive) {
+      // Starting at the current target, search for a valid one.
+      for (unsigned short i = 0; i < rr_count; i++) {
+        auto target = &info[this->rr_idx(i)];
+        if (target->select(now, fail_window)) {
+          best_alive = target;
+          break;
+        }
       }
     }
+  } else {
+    best_alive = &info[0];
   }
 
   return best_alive;
diff --git a/src/proxy/http/HttpSM.cc b/src/proxy/http/HttpSM.cc
index 225296bf48..89ce9647b6 100644
--- a/src/proxy/http/HttpSM.cc
+++ b/src/proxy/http/HttpSM.cc
@@ -4555,12 +4555,14 @@ HttpSM::do_hostdb_update_if_necessary()
     this->mark_host_failure(&t_state.dns_info, 
ts_clock::from_time_t(t_state.client_request_time));
   } else {
     if (t_state.dns_info.mark_active_server_alive()) {
+      char addrbuf[INET6_ADDRPORTSTRLEN];
+      ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, 
sizeof(addrbuf));
+      ATS_PROBE2(mark_active_server_alive, sm_id, addrbuf);
       if (t_state.dns_info.record->is_srv()) {
-        SMDbg(dbg_ctl_http, "[%" PRId64 "] hostdb update marking SRV: %s as 
up", sm_id, t_state.dns_info.record->name());
+        SMDbg(dbg_ctl_http, "[%" PRId64 "] hostdb update marking SRV: %s(%s) 
as up", sm_id, t_state.dns_info.record->name(),
+              addrbuf);
       } else {
-        char addrbuf[INET6_ADDRPORTSTRLEN];
-        SMDbg(dbg_ctl_http, "[%" PRId64 "] hostdb update marking IP: %s as 
up", sm_id,
-              ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, 
sizeof(addrbuf)));
+        SMDbg(dbg_ctl_http, "[%" PRId64 "] hostdb update marking IP: %s as 
up", sm_id, addrbuf);
       }
     }
   }
@@ -5768,30 +5770,26 @@ HttpSM::mark_host_failure(ResolveInfo *info, ts_time 
time_down)
 
   if (info->active) {
     if (time_down != TS_TIME_ZERO) {
+      ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, 
sizeof(addrbuf));
       // Increment the fail_count
-      if (++info->active->fail_count >= 
t_state.txn_conf->connect_attempts_rr_retries) {
-        if (info->active) {
-          if (info->active->last_failure.load() == TS_TIME_ZERO) {
-            char            *url_str = 
t_state.hdr_info.client_request.url_string_get_ref(nullptr);
-            int              host_len;
-            const char      *host_name_ptr = 
t_state.unmapped_url.host_get(&host_len);
-            std::string_view host_name{host_name_ptr, size_t(host_len)};
-            swoc::bwprint(error_bw_buffer, "CONNECT : {::s} connecting to {} 
for host='{}' url='{}' marking down",
-                          
swoc::bwf::Errno(t_state.current.server->connect_result), 
t_state.current.server->dst_addr, host_name,
-                          swoc::bwf::FirstOf(url_str, "<none>"));
-            Log::error("%s", error_bw_buffer.c_str());
-          }
-          info->active->last_failure = time_down;
-          SMDbg(dbg_ctl_http, "hostdb update marking IP: %s as down",
-                ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, 
sizeof(addrbuf)));
-        } else {
-          SMDbg(dbg_ctl_http, "hostdb increment IP failcount %s to %d",
-                ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, 
sizeof(addrbuf)), info->active->fail_count.load());
-        }
+      if (auto [down, fail_count] = 
info->active->increment_fail_count(time_down, 
t_state.txn_conf->connect_attempts_rr_retries);
+          down) {
+        char            *url_str = 
t_state.hdr_info.client_request.url_string_get_ref(nullptr);
+        int              host_len;
+        const char      *host_name_ptr = 
t_state.unmapped_url.host_get(&host_len);
+        std::string_view host_name{host_name_ptr, 
static_cast<size_t>(host_len)};
+        swoc::bwprint(error_bw_buffer, "CONNECT : {::s} connecting to {} for 
host='{}' url='{}' fail_count='{}' marking down",
+                      
swoc::bwf::Errno(t_state.current.server->connect_result), 
t_state.current.server->dst_addr, host_name,
+                      swoc::bwf::FirstOf(url_str, "<none>"), fail_count);
+        Log::error("%s", error_bw_buffer.c_str());
+        SMDbg(dbg_ctl_http, "hostdb update marking IP: %s as down", addrbuf);
+        ATS_PROBE2(hostdb_mark_ip_as_down, sm_id, addrbuf);
+      } else {
+        ATS_PROBE3(hostdb_inc_ip_failcount, sm_id, addrbuf, fail_count);
+        SMDbg(dbg_ctl_http, "hostdb increment IP failcount %s to %d", addrbuf, 
fail_count);
       }
     } else { // Clear the failure
-      info->active->fail_count   = 0;
-      info->active->last_failure = time_down;
+      info->active->mark_up();
     }
   }
 #ifdef DEBUG
@@ -8118,6 +8116,7 @@ HttpSM::set_next_state()
 
   case HttpTransact::SM_ACTION_ORIGIN_SERVER_RR_MARK_DOWN: {
     HTTP_SM_SET_DEFAULT_HANDLER(&HttpSM::state_mark_os_down);
+    ATS_PROBE(next_state_SM_ACTION_ORIGIN_SERVER_RR_MARK_DOWN);
 
     ink_assert(t_state.dns_info.looking_up == ResolveInfo::ORIGIN_SERVER);
 
diff --git a/src/proxy/http/HttpTransact.cc b/src/proxy/http/HttpTransact.cc
index 8c1fb59570..cb49e0d481 100644
--- a/src/proxy/http/HttpTransact.cc
+++ b/src/proxy/http/HttpTransact.cc
@@ -47,6 +47,7 @@
 #include "proxy/http/HttpBodyFactory.h"
 #include "proxy/IPAllow.h"
 #include "iocore/utils/Machine.h"
+#include "ts/ats_probe.h"
 
 DbgCtl HttpTransact::State::_dbg_ctl{"http"};
 
@@ -521,8 +522,7 @@ HttpTransact::is_server_negative_cached(State *s)
     //   down to 2*down_server_timeout
     if (s->dns_info.active &&
         ts_clock::from_time_t(s->client_request_time) + 
s->txn_conf->down_server_timeout < s->dns_info.active->last_fail_time()) {
-      s->dns_info.active->last_failure = TS_TIME_ZERO;
-      s->dns_info.active->fail_count   = 0;
+      s->dns_info.active->mark_up();
       ink_assert(!"extreme clock skew");
       return true;
     }
diff --git a/tests/gold_tests/dns/dns_host_down.test.py 
b/tests/gold_tests/dns/dns_host_down.test.py
index 1bb5dc2b8d..74930c7544 100644
--- a/tests/gold_tests/dns/dns_host_down.test.py
+++ b/tests/gold_tests/dns/dns_host_down.test.py
@@ -81,9 +81,8 @@ class DownCachedOriginServerTest:
             os.path.join(Test.Variables.AtsTestToolsDir, 'condwait') + ' 60 1 
-f ' +
             os.path.join(self._ts.Variables.LOGDIR, 'error.log'))
 
-        self._ts.Disk.error_log.Content = 
Testers.ContainsExpression("/dns/mark/down' marking down", "host should be 
marked down")
         self._ts.Disk.error_log.Content = Testers.ContainsExpression(
-            "DNS Error: no valid server http://resolve.this.com";, "DNS lookup 
should fail")
+            "/dns/mark/down' fail_count='1' marking down", "host should be 
marked down")
 
     def run(self):
         self._test_host_mark_down()
diff --git a/tests/gold_tests/dns/replay/server_down.replay.yaml 
b/tests/gold_tests/dns/replay/server_down.replay.yaml
index d1dd1e2022..d5b33bc784 100644
--- a/tests/gold_tests/dns/replay/server_down.replay.yaml
+++ b/tests/gold_tests/dns/replay/server_down.replay.yaml
@@ -55,7 +55,5 @@ sessions:
     server-response:
       status: 200
 
-    # Previous request marked host down so DNS lookup returns unsuccessful
-    # 500 response indicates no valid server
     proxy-response:
-      status: 500
+      status: 502

Reply via email to