bneradt commented on code in PR #13102:
URL: https://github.com/apache/trafficserver/pull/13102#discussion_r3182410605
##########
src/iocore/hostdb/HostDB.cc:
##########
@@ -1706,13 +1706,19 @@ ResolveInfo::set_active(HostDBInfo *info)
}
bool
-ResolveInfo::select_next_rr()
+ResolveInfo::select_next_rr(ts_time now, ts_seconds fail_window)
{
if (active) {
if (auto rr_info{this->record->rr_info()}; rr_info.count() > 1) {
- unsigned limit = active - rr_info.data(), idx = (limit + 1) %
rr_info.count();
- while ((idx = (idx + 1) % rr_info.count()) != limit &&
!rr_info[idx].is_up()) {}
- active = &rr_info[idx];
+ unsigned limit = active - rr_info.data();
+ size_t idx = (limit + 1) % rr_info.count();
+ for (; idx != limit; idx = (idx + 1) % rr_info.count()) {
+ if (!rr_info[idx].is_down(now, fail_window)) {
+ active = &rr_info[idx];
+ break;
+ }
+ }
Review Comment:
Good loop refactor. Looks like `limit` can be `const`.
##########
src/proxy/http/HttpSM.cc:
##########
@@ -5972,34 +5968,35 @@ HttpSM::do_transform_open()
void
HttpSM::mark_host_failure(ResolveInfo *info, ts_time time_down)
{
- char addrbuf[INET6_ADDRPORTSTRLEN];
+ ink_assert(time_down != TS_TIME_ZERO);
- if (info->active) {
- if (time_down != TS_TIME_ZERO) {
- ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf,
sizeof(addrbuf));
- // Increment the fail_count
- if (auto [down, fail_count] =
info->active->increment_fail_count(time_down,
t_state.txn_conf->connect_attempts_rr_retries,
-
t_state.txn_conf->down_server_timeout);
- down) {
- char *url_str =
t_state.hdr_info.client_request.url_string_get_ref(nullptr);
- std::string_view host_name{t_state.unmapped_url.host_get()};
- swoc::bwprint(error_bw_buffer, "CONNECT : {::s} connecting to {} for
host='{}' url='{}' fail_count='{}' marking down",
-
swoc::bwf::Errno(t_state.current.server->connect_result),
t_state.current.server->dst_addr, host_name,
- swoc::bwf::FirstOf(url_str, "<none>"), fail_count);
- Log::error("%s", error_bw_buffer.c_str());
- SMDbg(dbg_ctl_http, "hostdb update marking IP: %s as down", addrbuf);
- ATS_PROBE2(hostdb_mark_ip_as_down, sm_id, addrbuf);
- } else {
- ATS_PROBE3(hostdb_inc_ip_failcount, sm_id, addrbuf, fail_count);
- SMDbg(dbg_ctl_http, "hostdb increment IP failcount %s to %d", addrbuf,
fail_count);
- }
- } else { // Clear the failure
- info->active->mark_up();
- }
+ if (info->active == nullptr) {
+ return;
+ }
+
+ char addrbuf[INET6_ADDRPORTSTRLEN];
+ ats_ip_nptop(&t_state.current.server->dst_addr.sa, addrbuf, sizeof(addrbuf));
+
+ uint8_t max_connect_retries =
HttpTransact::origin_server_connect_attempts_max_retries(&t_state);
+ ts_seconds fail_window = t_state.txn_conf->down_server_timeout;
+
+ // Mark the host DOWN only after every attempt has failed.
`max_connect_retries` counts only "retries", so the total attempt
+ // budget is `max_connect_retries + 1` (the initial connect plus each retry).
+ auto [down, fail_count] = info->active->increment_fail_count(time_down,
max_connect_retries + 1, fail_window);
+
Review Comment:
I believe we addressed this by clamping the configuration to 0-255:
https://github.com/apache/trafficserver/pull/13102/changes/a1d98bec93fd392c112fc581ae9cdd5d2b9db23f
But maybe we need to use 0-254? If the user puts in 255 and we increment
here, won't that overflow the uint8_t? (Or use a wider type.)
##########
src/proxy/http/HttpTransact.cc:
##########
@@ -562,6 +562,41 @@ HttpTransact::is_server_negative_cached(State *s)
}
}
+/**
+ ATS has two configuration options controlling how many times it retries a
connection attempt against origin servers.
+
+ - proxy.config.http.connect_attempts_max_retries
+ - proxy.config.http.connect_attempts_max_retries_down_server
+
+ The choice is based on the state of the active HostDBInfo.
+
+ - HostDBInfo::State::UP: use proxy.config.http.connect_attempts_max_retries
+ - HostDBInfo::State::DOWN: no retry
+ - HostDBInfo::State::SUSPECT: use
proxy.config.http.connect_attempts_max_retries_down_server
+
+*/
+uint8_t
+HttpTransact::origin_server_connect_attempts_max_retries(State *s)
+{
+ HostDBInfo *active = s->dns_info.active;
+ if (active == nullptr) {
+ return 0;
+ }
+
Review Comment:
Codex says:
> This re-evaluates the retry budget through
origin_server_connect_attempts_max_retries(), which returns 0 when
dns_info.active is null. That is a normal supported path for USE_CLIENT after
transparent CTA fallback, and can also happen with plugin-supplied addresses
after the one OS_DNS retry. In those cases the next failed connect immediately
trips max_connect_retries <= retry_attempts and skips the configured
connect_attempts_max_retries. I think the call site should keep the
existing/global retry budget when there is no active HostDBInfo.
> src/proxy/http/HttpTransact.cc:4074-4075
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]