Re: [PR] Fix connect attempt retries [trafficserver]

via GitHub Mon, 08 Jun 2026 17:39:50 -0700


Copilot commented on code in PR #13102:
URL: https://github.com/apache/trafficserver/pull/13102#discussion_r3377151428



##########
src/proxy/http/HttpSM.cc:
##########
@@ -6053,34 +6049,36 @@ HttpSM::do_transform_open()
 void
 HttpSM::mark_host_failure(ResolveInfo *info, ts_time time_down)
 {
-  char addrbuf[INET6_ADDRPORTSTRLEN];
+  ink_assert(time_down != TS_TIME_ZERO);
 
-  if (info->active) {
-    if (time_down != TS_TIME_ZERO) {
-      ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, 
sizeof(addrbuf));
-      // Increment the fail_count
-      if (auto [down, fail_count] = 
info->active->increment_fail_count(time_down, 
t_state.txn_conf->connect_attempts_rr_retries,
-                                                                       
t_state.txn_conf->down_server_timeout);
-          down) {
-        char            *url_str = 
t_state.hdr_info.client_request.url_string_get_ref(nullptr);
-        std::string_view host_name{t_state.unmapped_url.host_get()};
-        swoc::bwprint(error_bw_buffer, "CONNECT : {::s} connecting to {} for 
host='{}' url='{}' fail_count='{}' marking down",
-                      
swoc::bwf::Errno(t_state.current.server->connect_result), 
t_state.current.server->dst_addr, host_name,
-                      swoc::bwf::FirstOf(url_str, "<none>"), fail_count);
-        Log::error("%s", error_bw_buffer.c_str());
-        SMDbg(dbg_ctl_http, "hostdb update marking IP: %s as down", addrbuf);
-        ATS_PROBE2(hostdb_mark_ip_as_down, sm_id, addrbuf);
-      } else {
-        ATS_PROBE3(hostdb_inc_ip_failcount, sm_id, addrbuf, fail_count);
-        SMDbg(dbg_ctl_http, "hostdb increment IP failcount %s to %d", addrbuf, 
fail_count);
-      }
-    } else { // Clear the failure
-      info->active->mark_up();
-    }
+  if (info->active == nullptr) {
+    return;
+  }
+
+  char addrbuf[INET6_ADDRPORTSTRLEN];
+  ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, sizeof(addrbuf));
+
+  const uint8_t    max_connect_retries = 
HttpTransact::origin_server_connect_attempts_max_retries(&t_state);
+  const ts_seconds fail_window         = t_state.txn_conf->down_server_timeout;
+
+  // Mark the host DOWN only after every attempt has failed. 
`max_connect_retries` counts only "retries", so the total attempt
+  // budget is `max_connect_retries + 1` (the initial connect plus each retry).
+  auto [down, fail_count] = info->active->increment_fail_count(time_down, 
max_connect_retries + 1, fail_window);
+
+  if (down) {
+    Metrics::Counter::increment(http_rsb.down_server_no_requests);
+    char            *url_str = 
t_state.hdr_info.client_request.url_string_get_ref(nullptr);

Review Comment:
   proxy.process.http.down_server.no_requests is documented as counting 
transactions in which no request was sent because the origin was *already* 
marked down. Incrementing it when a host transitions to DOWN (i.e., on the 
transaction that marks it down) counts connection-failure transactions rather 
than “skipped because the origin is down” transactions, which can make this 
metric misleading for operators.



##########
src/proxy/http/HttpTransact.cc:
##########
@@ -562,6 +562,41 @@ HttpTransact::is_server_negative_cached(State *s)
   }
 }
 
+/**
+  ATS has two configuration options controlling how many times it retries a 
connection attempt against origin servers.
+
+  - proxy.config.http.connect_attempts_max_retries
+  - proxy.config.http.connect_attempts_max_retries_suspect_server
+
+  The choice is based on the state of the active HostDBInfo.
+
+  - HostDBInfo::State::UP: use proxy.config.http.connect_attempts_max_retries
+  - HostDBInfo::State::DOWN: no retry
+  - HostDBInfo::State::SUSPECT: use 
proxy.config.http.connect_attempts_max_retries_suspect_server
+
+*/
+uint8_t
+HttpTransact::origin_server_connect_attempts_max_retries(State *s)
+{
+  HostDBInfo *active = s->dns_info.active;
+  if (active == nullptr) {
+    return 0;
+  }
+
+  switch (active->state(ts_clock::now(), s->txn_conf->down_server_timeout)) {
+  case HostDBInfo::State::UP:
+    return s->txn_conf->connect_attempts_max_retries;
+  case HostDBInfo::State::DOWN:
+    return 0;
+  case HostDBInfo::State::SUSPECT:
+    return s->txn_conf->connect_attempts_max_retries_suspect_server;
+  default:
+    break;
+  }
+
+  return 0;
+}

Review Comment:
   origin_server_connect_attempts_max_retries() returns uint8_t but directly 
returns MgmtInt config values, which truncates/wraps if a plugin overrides 
these configs outside the intended [0-254] range (or sets negative values). 
That can produce incorrect retry limits and can also make the later `+ 1` 
attempt-budget computation overflow back to 0 in mark_host_failure. Clamp the 
per-txn config values to the same bounds enforced for records.yaml (e.g. 
0..254) before converting to uint8_t.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Fix connect attempt retries [trafficserver]

Reply via email to