Copilot commented on code in PR #13102:
URL: https://github.com/apache/trafficserver/pull/13102#discussion_r3377151428
##########
src/proxy/http/HttpSM.cc:
##########
@@ -6053,34 +6049,36 @@ HttpSM::do_transform_open()
void
HttpSM::mark_host_failure(ResolveInfo *info, ts_time time_down)
{
- char addrbuf[INET6_ADDRPORTSTRLEN];
+ ink_assert(time_down != TS_TIME_ZERO);
- if (info->active) {
- if (time_down != TS_TIME_ZERO) {
- ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf,
sizeof(addrbuf));
- // Increment the fail_count
- if (auto [down, fail_count] =
info->active->increment_fail_count(time_down,
t_state.txn_conf->connect_attempts_rr_retries,
-
t_state.txn_conf->down_server_timeout);
- down) {
- char *url_str =
t_state.hdr_info.client_request.url_string_get_ref(nullptr);
- std::string_view host_name{t_state.unmapped_url.host_get()};
- swoc::bwprint(error_bw_buffer, "CONNECT : {::s} connecting to {} for
host='{}' url='{}' fail_count='{}' marking down",
-
swoc::bwf::Errno(t_state.current.server->connect_result),
t_state.current.server->dst_addr, host_name,
- swoc::bwf::FirstOf(url_str, "<none>"), fail_count);
- Log::error("%s", error_bw_buffer.c_str());
- SMDbg(dbg_ctl_http, "hostdb update marking IP: %s as down", addrbuf);
- ATS_PROBE2(hostdb_mark_ip_as_down, sm_id, addrbuf);
- } else {
- ATS_PROBE3(hostdb_inc_ip_failcount, sm_id, addrbuf, fail_count);
- SMDbg(dbg_ctl_http, "hostdb increment IP failcount %s to %d", addrbuf,
fail_count);
- }
- } else { // Clear the failure
- info->active->mark_up();
- }
+ if (info->active == nullptr) {
+ return;
+ }
+
+ char addrbuf[INET6_ADDRPORTSTRLEN];
+ ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, sizeof(addrbuf));
+
+ const uint8_t max_connect_retries =
HttpTransact::origin_server_connect_attempts_max_retries(&t_state);
+ const ts_seconds fail_window = t_state.txn_conf->down_server_timeout;
+
+ // Mark the host DOWN only after every attempt has failed.
`max_connect_retries` counts only "retries", so the total attempt
+ // budget is `max_connect_retries + 1` (the initial connect plus each retry).
+ auto [down, fail_count] = info->active->increment_fail_count(time_down,
max_connect_retries + 1, fail_window);
+
+ if (down) {
+ Metrics::Counter::increment(http_rsb.down_server_no_requests);
+ char *url_str =
t_state.hdr_info.client_request.url_string_get_ref(nullptr);
Review Comment:
proxy.process.http.down_server.no_requests is documented as counting
transactions for which no request was sent because the origin was *already*
marked down. Incrementing it when a host transitions to DOWN (i.e., on the
transaction that marks it down) counts connection-failure transactions rather
than “skipped due to down” transactions, which can make this metric misleading
for operators.
##########
src/proxy/http/HttpTransact.cc:
##########
@@ -562,6 +562,41 @@ HttpTransact::is_server_negative_cached(State *s)
}
}
+/**
+ ATS has two configuration options controlling how many times it retries a
connection attempt against origin servers.
+
+ - proxy.config.http.connect_attempts_max_retries
+ - proxy.config.http.connect_attempts_max_retries_suspect_server
+
+ The choice is based on the state of the active HostDBInfo.
+
+ - HostDBInfo::State::UP: use proxy.config.http.connect_attempts_max_retries
+ - HostDBInfo::State::DOWN: no retry
+ - HostDBInfo::State::SUSPECT: use
proxy.config.http.connect_attempts_max_retries_suspect_server
+
+*/
+uint8_t
+HttpTransact::origin_server_connect_attempts_max_retries(State *s)
+{
+ HostDBInfo *active = s->dns_info.active;
+ if (active == nullptr) {
+ return 0;
+ }
+
+ switch (active->state(ts_clock::now(), s->txn_conf->down_server_timeout)) {
+ case HostDBInfo::State::UP:
+ return s->txn_conf->connect_attempts_max_retries;
+ case HostDBInfo::State::DOWN:
+ return 0;
+ case HostDBInfo::State::SUSPECT:
+ return s->txn_conf->connect_attempts_max_retries_suspect_server;
+ default:
+ break;
+ }
+
+ return 0;
+}
Review Comment:
origin_server_connect_attempts_max_retries() returns uint8_t but directly
returns MgmtInt config values, which truncates/wraps if a plugin overrides
these configs outside the intended [0-254] range (or sets negative values).
That can produce incorrect retry limits and can also make the later `+ 1`
attempt-budget computation overflow back to 0 in mark_host_failure. Clamp the
per-txn config values to the same bounds enforced for records.yaml (e.g.
0..254) before converting to uint8_t.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]