Make HostDBRoundRobin::select_best_http take last_failure time into consideration for all RR types
In the current setup it only checks the status of the reals if you use "default RR" (which is actually consistent hashing... but we'll let that slide). This patch consolidates the alive() check into the HostDBInfo struct, and then calls it from all 3 LB mechanisms. Since you can control if/when a host is marked as down in ATS there is no reason to not check. Issue: TS-3724 Project: http://git-wip-us.apache.org/repos/asf/trafficserver/repo Commit: http://git-wip-us.apache.org/repos/asf/trafficserver/commit/be68bd8f Tree: http://git-wip-us.apache.org/repos/asf/trafficserver/tree/be68bd8f Diff: http://git-wip-us.apache.org/repos/asf/trafficserver/diff/be68bd8f Branch: refs/heads/master Commit: be68bd8f47f7ecde5403d9a63dbf81604d9bdf56 Parents: 6a56fd2 Author: Thomas Jackson <[email protected]> Authored: Thu Jun 25 18:50:28 2015 -0700 Committer: Thomas Jackson <[email protected]> Committed: Mon Jun 29 18:59:01 2015 -0700 ---------------------------------------------------------------------- iocore/hostdb/I_HostDBProcessor.h | 35 +++++++++++++++++++++++++++++ iocore/hostdb/P_HostDBProcessor.h | 41 ++++++++++++++-------------------- 2 files changed, 52 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/trafficserver/blob/be68bd8f/iocore/hostdb/I_HostDBProcessor.h ---------------------------------------------------------------------- diff --git a/iocore/hostdb/I_HostDBProcessor.h b/iocore/hostdb/I_HostDBProcessor.h index bcdc3a0..e4ef5f0 100644 --- a/iocore/hostdb/I_HostDBProcessor.h +++ b/iocore/hostdb/I_HostDBProcessor.h @@ -254,6 +254,41 @@ struct HostDBInfo { uint64_t md5_high; + /* + * Given the current time `now` and the fail_window, determine if this real is alive + */ + bool + alive(ink_time_t now, int32_t fail_window) + { + unsigned int last_failure = app.http_data.last_failure; + + if (last_failure == 0 || (unsigned int)(now - fail_window) > last_failure) { + return true; 
} else { + // Entry is marked down. Make sure some nasty clock skew + // did not occur. Use the retry time to set an upper bound + // as to how far in the future we should tolerate bogus last + // failure times. This sets the upper bound that we would ever + // consider a server down to 2*down_server_timeout + if (now + fail_window < last_failure) { +#ifdef DEBUG + // because this region is mmaped, I cann't get anything + // useful from the structure in core files, therefore + // copy the revelvant info to the stack so it will + // be readble in the core + HostDBInfo current_info; + HostDBRoundRobin current_rr; + memcpy(&current_info, &info[i], sizeof(HostDBInfo)); + memcpy(&current_rr, this, sizeof(HostDBRoundRobin)); +#endif + ink_assert(!"extreme clock skew"); + app.http_data.last_failure = 0; + return false; + } + return false; + } + + } bool failed() { http://git-wip-us.apache.org/repos/asf/trafficserver/blob/be68bd8f/iocore/hostdb/P_HostDBProcessor.h ---------------------------------------------------------------------- diff --git a/iocore/hostdb/P_HostDBProcessor.h b/iocore/hostdb/P_HostDBProcessor.h index d80bc6e..d276112 100644 --- a/iocore/hostdb/P_HostDBProcessor.h +++ b/iocore/hostdb/P_HostDBProcessor.h @@ -284,9 +284,17 @@ HostDBRoundRobin::select_best_http(sockaddr const *client_ip, ink_time_t now, in int best_any = 0; int best_up = -1; + // Basic round robin, increment current and mod with how many we have if (HostDBProcessor::hostdb_strict_round_robin) { Debug("hostdb", "Using strict round robin"); - best_up = current++ % good; + // Check that the host we selected is alive + for (int i=0; i < good; i++){ + best_any = current++ % good; + if (info[best_any].alive(now, fail_window)){ + best_up = best_any; + break; + } + } } else if (HostDBProcessor::hostdb_timed_round_robin > 0) { Debug("hostdb", "Using timed round-robin for HTTP"); if ((now - timed_rr_ctime) > HostDBProcessor::hostdb_timed_round_robin) { @@ -294,7 +302,13 @@ 
HostDBRoundRobin::select_best_http(sockaddr const *client_ip, ink_time_t now, in ++current; timed_rr_ctime = now; } - best_up = current % good; + for (int i=0; i < good; i++){ + best_any = current++ % good; + if (info[best_any].alive(now, fail_window)){ + best_up = best_any; + break; + } + } Debug("hostdb", "Using %d for best_up", best_up); } else { Debug("hostdb", "Using default round robin"); @@ -308,32 +322,11 @@ HostDBRoundRobin::select_best_http(sockaddr const *client_ip, ink_time_t now, in best_any = i; best_hash_any = h; } - if (info[i].app.http_data.last_failure == 0 || (unsigned int)(now - fail_window) > info[i].app.http_data.last_failure) { - // Entry is marked up + if (info[i].alive(now, fail_window)){ if (best_hash_up <= h) { best_up = i; best_hash_up = h; } - } else { - // Entry is marked down. Make sure some nasty clock skew - // did not occur. Use the retry time to set an upper bound - // as to how far in the future we should tolerate bogus last - // failure times. This sets the upper bound that we would ever - // consider a server down to 2*down_server_timeout - if (now + fail_window < (int32_t)(info[i].app.http_data.last_failure)) { -#ifdef DEBUG - // because this region is mmaped, I cann't get anything - // useful from the structure in core files, therefore - // copy the revelvant info to the stack so it will - // be readble in the core - HostDBInfo current_info; - HostDBRoundRobin current_rr; - memcpy(&current_info, &info[i], sizeof(HostDBInfo)); - memcpy(&current_rr, this, sizeof(HostDBRoundRobin)); -#endif - ink_assert(!"extreme clock skew"); - info[i].app.http_data.last_failure = 0; - } } } }
