This is an automated email from the ASF dual-hosted git repository.
bcall pushed a commit to branch 8.0.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/8.0.x by this push:
new 514cab5 This fixes issue #5642 Origin_max_connections can create
state machine loops when using parent selection to select a parent for
redundancy and/or load balancing.
514cab5 is described below
commit 514cab5e0e55b952aac6555a81faa3017e3c4b34
Author: John Rushford <[email protected]>
AuthorDate: Tue Jun 18 17:30:39 2019 +0000
This fixes issue #5642 Origin_max_connections can create state
machine loops when using parent selection to select a parent
for redundancy and/or load balancing.
The parent selection handling in the state machine did not
handle the case when origin max connections is reached or
exceeded when using throttling. This led to state machine
looping and a crash when the stack is overflowed due to the
looping. Now this is handled by either trying a new uncongested
origin or sending a negative 503 response if all available
origins are congested.
---
proxy/http/HttpDebugNames.cc | 2 +
proxy/http/HttpSM.cc | 8 +-
proxy/http/HttpTransact.cc | 236 ++++++++++++++++++++++++-------------------
proxy/http/HttpTransact.h | 3 +-
4 files changed, 143 insertions(+), 106 deletions(-)
diff --git a/proxy/http/HttpDebugNames.cc b/proxy/http/HttpDebugNames.cc
index 8f270a8..31c8f9f 100644
--- a/proxy/http/HttpDebugNames.cc
+++ b/proxy/http/HttpDebugNames.cc
@@ -56,6 +56,8 @@
HttpDebugNames::get_server_state_name(HttpTransact::ServerState_t state)
return "TRANSACTION_COMPLETE";
case HttpTransact::PARENT_RETRY:
return "PARENT_RETRY";
+ case HttpTransact::OUTBOUND_CONGESTION:
+ return "OUTBOUND_CONGESTION";
}
return ("unknown state name");
diff --git a/proxy/http/HttpSM.cc b/proxy/http/HttpSM.cc
index b6d03b5..1008fd8 100644
--- a/proxy/http/HttpSM.cc
+++ b/proxy/http/HttpSM.cc
@@ -4620,8 +4620,12 @@ HttpSM::do_cache_prepare_action(HttpCacheSM *c_sm,
CacheHTTPInfo *object_read_in
void
HttpSM::send_origin_throttled_response()
{
- t_state.current.attempts = t_state.txn_conf->connect_attempts_max_retries;
- t_state.current.state = HttpTransact::CONNECTION_ERROR;
+ // if the request is to a parent proxy, do not reset
+ // t_state.current.attempts so that another parent may be tried.
+ if (t_state.current.request_to != HttpTransact::PARENT_PROXY) {
+ t_state.current.attempts = t_state.txn_conf->connect_attempts_max_retries;
+ }
+ t_state.current.state = HttpTransact::OUTBOUND_CONGESTION;
call_transact_and_set_next_state(HttpTransact::HandleResponse);
}
diff --git a/proxy/http/HttpTransact.cc b/proxy/http/HttpTransact.cc
index 10b5548..de03df6 100644
--- a/proxy/http/HttpTransact.cc
+++ b/proxy/http/HttpTransact.cc
@@ -263,8 +263,14 @@ find_server_and_update_current_info(HttpTransact::State *s)
TxnDebug("http_trans", "request is from localhost, so bypass parent");
s->parent_result.result = PARENT_DIRECT;
} else if (s->method == HTTP_WKSIDX_CONNECT &&
s->http_config_param->disable_ssl_parenting) {
- s->parent_params->findParent(&s->request_data, &s->parent_result,
s->txn_conf->parent_fail_threshold,
- s->txn_conf->parent_retry_time);
+ if (s->parent_result.result == PARENT_SPECIFIED) {
+ s->parent_params->nextParent(&s->request_data, &s->parent_result,
s->txn_conf->parent_fail_threshold,
+ s->txn_conf->parent_retry_time);
+ } else {
+ s->parent_params->findParent(&s->request_data, &s->parent_result,
s->txn_conf->parent_fail_threshold,
+ s->txn_conf->parent_retry_time);
+ }
+
if (!s->parent_result.is_some() || s->parent_result.is_api_result() ||
s->parent_result.parent_is_proxy()) {
TxnDebug("http_trans", "request not cacheable, so bypass parent");
s->parent_result.result = PARENT_DIRECT;
@@ -277,8 +283,13 @@ find_server_and_update_current_info(HttpTransact::State *s)
// we are assuming both child and parent have similar configuration
// with respect to whether a request is cacheable or not.
// For example, the cache_urls_that_look_dynamic variable.
- s->parent_params->findParent(&s->request_data, &s->parent_result,
s->txn_conf->parent_fail_threshold,
- s->txn_conf->parent_retry_time);
+ if (s->parent_result.result == PARENT_SPECIFIED) {
+ s->parent_params->nextParent(&s->request_data, &s->parent_result,
s->txn_conf->parent_fail_threshold,
+ s->txn_conf->parent_retry_time);
+ } else {
+ s->parent_params->findParent(&s->request_data, &s->parent_result,
s->txn_conf->parent_fail_threshold,
+ s->txn_conf->parent_retry_time);
+ }
if (!s->parent_result.is_some() || s->parent_result.is_api_result() ||
s->parent_result.parent_is_proxy()) {
TxnDebug("http_trans", "request not cacheable, so bypass parent");
s->parent_result.result = PARENT_DIRECT;
@@ -3072,6 +3083,8 @@ HttpTransact::OriginServerRawOpen(State *s)
/* fall through */
case CONNECTION_CLOSED:
/* fall through */
+ case OUTBOUND_CONGESTION:
+ /* fall through */
handle_server_died(s);
ink_assert(s->cache_info.action == CACHE_DO_NO_ACTION);
@@ -3272,6 +3285,7 @@ HttpTransact::HandleStatPage(State *s)
void
HttpTransact::handle_response_from_parent(State *s)
{
+ LookingUp_t next_lookup = UNDEFINED_LOOKUP;
TxnDebug("http_trans", "[handle_response_from_parent] (hrfp)");
HTTP_RELEASE_ASSERT(s->current.server == &s->parent_info);
@@ -3294,122 +3308,120 @@ HttpTransact::handle_response_from_parent(State *s)
}
handle_forward_server_connection_open(s);
break;
- default: {
- LookingUp_t next_lookup = UNDEFINED_LOOKUP;
-
- if (s->current.state == PARENT_RETRY) {
- if (s->current.retry_type == PARENT_RETRY_SIMPLE) {
- if (s->current.simple_retry_attempts >=
s->parent_result.max_retries(PARENT_RETRY_SIMPLE)) {
- TxnDebug("http_trans", "PARENT_RETRY_SIMPLE: retried all parents,
send error to client.");
- s->current.retry_type = PARENT_RETRY_NONE;
- } else {
- s->current.simple_retry_attempts++;
- TxnDebug("http_trans", "PARENT_RETRY_SIMPLE: try another parent.");
- s->current.retry_type = PARENT_RETRY_NONE;
- next_lookup = find_server_and_update_current_info(s);
- }
- } else if (s->current.retry_type == PARENT_RETRY_UNAVAILABLE_SERVER) {
- if (s->current.unavailable_server_retry_attempts >=
s->parent_result.max_retries(PARENT_RETRY_UNAVAILABLE_SERVER)) {
- TxnDebug("http_trans", "PARENT_RETRY_UNAVAILABLE_SERVER: retried all
parents, send error to client.");
- s->current.retry_type = PARENT_RETRY_NONE;
- } else {
- s->current.unavailable_server_retry_attempts++;
- TxnDebug("http_trans", "PARENT_RETRY_UNAVAILABLE_SERVER: marking
parent down and trying another.");
- s->current.retry_type = PARENT_RETRY_NONE;
- HTTP_INCREMENT_DYN_STAT(http_total_parent_marked_down_count);
- s->parent_params->markParentDown(&s->parent_result,
s->txn_conf->parent_fail_threshold, s->txn_conf->parent_retry_time);
- next_lookup = find_server_and_update_current_info(s);
- }
+ case PARENT_RETRY:
+ if (s->current.retry_type == PARENT_RETRY_SIMPLE) {
+ if (s->current.simple_retry_attempts >=
s->parent_result.max_retries(PARENT_RETRY_SIMPLE)) {
+ TxnDebug("http_trans", "PARENT_RETRY_SIMPLE: retried all parents, send
error to client.");
+ s->current.retry_type = PARENT_RETRY_NONE;
+ } else {
+ s->current.simple_retry_attempts++;
+ TxnDebug("http_trans", "PARENT_RETRY_SIMPLE: try another parent.");
+ s->current.retry_type = PARENT_RETRY_NONE;
+ next_lookup = find_server_and_update_current_info(s);
}
- } else {
- TxnDebug("http_trans", "[hrfp] connection not alive");
- SET_VIA_STRING(VIA_DETAIL_PP_CONNECT, VIA_DETAIL_PP_FAILURE);
+ } else if (s->current.retry_type == PARENT_RETRY_UNAVAILABLE_SERVER) {
+ if (s->current.unavailable_server_retry_attempts >=
s->parent_result.max_retries(PARENT_RETRY_UNAVAILABLE_SERVER)) {
+ TxnDebug("http_trans", "PARENT_RETRY_UNAVAILABLE_SERVER: retried all
parents, send error to client.");
+ s->current.retry_type = PARENT_RETRY_NONE;
+ } else {
+ s->current.unavailable_server_retry_attempts++;
+ TxnDebug("http_trans", "PARENT_RETRY_UNAVAILABLE_SERVER: marking
parent down and trying another.");
+ s->current.retry_type = PARENT_RETRY_NONE;
+ HTTP_INCREMENT_DYN_STAT(http_total_parent_marked_down_count);
+ s->parent_params->markParentDown(&s->parent_result,
s->txn_conf->parent_fail_threshold, s->txn_conf->parent_retry_time);
+ next_lookup = find_server_and_update_current_info(s);
+ }
+ }
+ break;
+ default:
+ TxnDebug("http_trans", "[hrfp] connection not alive");
+ SET_VIA_STRING(VIA_DETAIL_PP_CONNECT, VIA_DETAIL_PP_FAILURE);
- ink_assert(s->hdr_info.server_request.valid());
+ ink_assert(s->hdr_info.server_request.valid());
- s->current.server->connect_result = ENOTCONN;
- // only mark the parent down in hostdb if the configuration allows it,
- // see proxy.config.http.parent_proxy.mark_down_hostdb in records.config.
- if (s->txn_conf->parent_failures_update_hostdb) {
- s->state_machine->do_hostdb_update_if_necessary();
- }
+ s->current.server->connect_result = ENOTCONN;
+ // only mark the parent down in hostdb if the configuration allows it and
the parent
+ // is not congested, see proxy.config.http.parent_proxy.mark_down_hostdb
in records.config.
+ if (s->txn_conf->parent_failures_update_hostdb && s->current.state !=
OUTBOUND_CONGESTION) {
+ s->state_machine->do_hostdb_update_if_necessary();
+ }
- char addrbuf[INET6_ADDRSTRLEN];
- TxnDebug("http_trans", "[%d] failed to connect to parent %s",
s->current.attempts,
- ats_ip_ntop(&s->current.server->dst_addr.sa, addrbuf,
sizeof(addrbuf)));
+ char addrbuf[INET6_ADDRSTRLEN];
+ TxnDebug("http_trans", "[%d] failed to connect to parent %s",
s->current.attempts,
+ ats_ip_ntop(&s->current.server->dst_addr.sa, addrbuf,
sizeof(addrbuf)));
- // If the request is not retryable, just give up!
- if (!is_request_retryable(s)) {
+ // If the request is not retryable, just give up!
+ if (!is_request_retryable(s)) {
+ // do not mark the parent down if it is only congested.
+ if (s->current.state != OUTBOUND_CONGESTION) {
HTTP_INCREMENT_DYN_STAT(http_total_parent_marked_down_count);
s->parent_params->markParentDown(&s->parent_result,
s->txn_conf->parent_fail_threshold, s->txn_conf->parent_retry_time);
- s->parent_result.result = PARENT_FAIL;
- handle_parent_died(s);
- return;
}
+ s->parent_result.result = PARENT_FAIL;
+ handle_parent_died(s);
+ return;
+ }
- if (s->current.attempts < s->txn_conf->parent_connect_attempts) {
- HTTP_INCREMENT_DYN_STAT(http_total_parent_retries_stat);
- s->current.attempts++;
-
- // Are we done with this particular parent?
- if ((s->current.attempts - 1) %
s->txn_conf->per_parent_connect_attempts != 0) {
- // No we are not done with this parent so retry
- HTTP_INCREMENT_DYN_STAT(http_total_parent_switches_stat);
- s->next_action = how_to_open_connection(s);
- TxnDebug("http_trans", "%s Retrying parent for attempt %d, max %"
PRId64, "[handle_response_from_parent]",
- s->current.attempts,
s->txn_conf->per_parent_connect_attempts);
- return;
- } else {
- TxnDebug("http_trans", "%s %d per parent attempts exhausted",
"[handle_response_from_parent]", s->current.attempts);
- HTTP_INCREMENT_DYN_STAT(http_total_parent_retries_exhausted_stat);
-
- // Only mark the parent down if we failed to connect
- // to the parent otherwise slow origin servers cause
- // us to mark the parent down
- if (s->current.state == CONNECTION_ERROR) {
- HTTP_INCREMENT_DYN_STAT(http_total_parent_marked_down_count);
- s->parent_params->markParentDown(&s->parent_result,
s->txn_conf->parent_fail_threshold, s->txn_conf->parent_retry_time);
- }
- // We are done so look for another parent if any
- next_lookup = find_server_and_update_current_info(s);
- }
+ if (s->current.attempts < s->txn_conf->parent_connect_attempts) {
+ HTTP_INCREMENT_DYN_STAT(http_total_parent_retries_stat);
+ s->current.attempts++;
+
+ // Are we done with this particular parent?
+ if ((s->current.attempts - 1) % s->txn_conf->per_parent_connect_attempts
!= 0) {
+ // No we are not done with this parent so retry
+ HTTP_INCREMENT_DYN_STAT(http_total_parent_switches_stat);
+ s->next_action = how_to_open_connection(s);
+ TxnDebug("http_trans", "%s Retrying parent for attempt %d, max %"
PRId64, "[handle_response_from_parent]",
+ s->current.attempts,
s->txn_conf->per_parent_connect_attempts);
+ return;
} else {
- // Done trying parents... fail over to origin server if that is
- // appropriate
+ TxnDebug("http_trans", "%s %d per parent attempts exhausted",
"[handle_response_from_parent]", s->current.attempts);
HTTP_INCREMENT_DYN_STAT(http_total_parent_retries_exhausted_stat);
- TxnDebug("http_trans", "[handle_response_from_parent] Error. No more
retries.");
+
+ // Only mark the parent down if we failed to connect
+ // to the parent otherwise slow origin servers cause
+ // us to mark the parent down
if (s->current.state == CONNECTION_ERROR) {
HTTP_INCREMENT_DYN_STAT(http_total_parent_marked_down_count);
s->parent_params->markParentDown(&s->parent_result,
s->txn_conf->parent_fail_threshold, s->txn_conf->parent_retry_time);
}
- s->parent_result.result = PARENT_FAIL;
- next_lookup = find_server_and_update_current_info(s);
+ // We are done so look for another parent if any
+ next_lookup = find_server_and_update_current_info(s);
}
+ } else {
+ // Done trying parents... fail over to origin server if that is
+ // appropriate
+ HTTP_INCREMENT_DYN_STAT(http_total_parent_retries_exhausted_stat);
+ TxnDebug("http_trans", "[handle_response_from_parent] Error. No more
retries.");
+ if (s->current.state == CONNECTION_ERROR) {
+ HTTP_INCREMENT_DYN_STAT(http_total_parent_marked_down_count);
+ s->parent_params->markParentDown(&s->parent_result,
s->txn_conf->parent_fail_threshold, s->txn_conf->parent_retry_time);
+ }
+ s->parent_result.result = PARENT_FAIL;
+ next_lookup = HOST_NONE;
}
-
- // We have either tried to find a new parent or failed over to the
- // origin server
- switch (next_lookup) {
- case PARENT_PROXY:
- ink_assert(s->current.request_to == PARENT_PROXY);
- TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
- break;
- case ORIGIN_SERVER:
- // Next lookup is Origin Server, try DNS for Origin Server
- TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, OSDNSLookup);
- break;
- case HOST_NONE:
- handle_parent_died(s);
- break;
- default:
- // This handles:
- // UNDEFINED_LOOKUP
- // INCOMING_ROUTER
- break;
- }
-
break;
}
+
+ // We have either tried to find a new parent or failed over to the
+ // origin server
+ switch (next_lookup) {
+ case PARENT_PROXY:
+ ink_assert(s->current.request_to == PARENT_PROXY);
+ TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
+ break;
+ case ORIGIN_SERVER:
+ // Next lookup is Origin Server, try DNS for Origin Server
+ TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, OSDNSLookup);
+ break;
+ case HOST_NONE:
+ handle_parent_died(s);
+ break;
+ default:
+ // This handles:
+ // UNDEFINED_LOOKUP
+ // INCOMING_ROUTER
+ break;
}
}
@@ -3450,6 +3462,12 @@ HttpTransact::handle_response_from_server(State *s)
s->current.server->clear_connect_fail();
handle_forward_server_connection_open(s);
break;
+ case OUTBOUND_CONGESTION:
+ TxnDebug("http_trans", "[handle_response_from_server] Error. congestion
control -- congested.");
+ SET_VIA_STRING(VIA_DETAIL_SERVER_CONNECT, VIA_DETAIL_SERVER_FAILURE);
+ s->current.server->set_connect_fail(EUSERS); // too many users
+ handle_server_connection_not_open(s);
+ break;
case OPEN_RAW_ERROR:
/* fall through */
case CONNECTION_ERROR:
@@ -6306,7 +6324,8 @@ HttpTransact::is_response_valid(State *s, HTTPHdr
*incoming_response)
if (s->current.state != CONNECTION_ALIVE) {
ink_assert((s->current.state == CONNECTION_ERROR) || (s->current.state ==
OPEN_RAW_ERROR) ||
(s->current.state == PARSE_ERROR) || (s->current.state ==
CONNECTION_CLOSED) ||
- (s->current.state == INACTIVE_TIMEOUT) || (s->current.state ==
ACTIVE_TIMEOUT));
+ (s->current.state == INACTIVE_TIMEOUT) || (s->current.state ==
ACTIVE_TIMEOUT) ||
+ s->current.state == OUTBOUND_CONGESTION);
s->hdr_info.response_error = CONNECTION_OPEN_FAILED;
return false;
@@ -7350,7 +7369,12 @@ HttpTransact::handle_parent_died(State *s)
{
ink_assert(s->parent_result.result == PARENT_FAIL);
- build_error_response(s, HTTP_STATUS_BAD_GATEWAY, "Next Hop Connection
Failed", "connect#failed_connect");
+ // congestion to a parent multi-site origin server
+ if (s->current.state == OUTBOUND_CONGESTION) {
+ build_error_response(s, HTTP_STATUS_SERVICE_UNAVAILABLE, "Next Hop
Congested", "congestion#retryAfter");
+ } else {
+ build_error_response(s, HTTP_STATUS_BAD_GATEWAY, "Next Hop Connection
Failed", "connect#failed_connect");
+ }
TRANSACT_RETURN(SM_ACTION_SEND_ERROR_CACHE_NOOP, nullptr);
}
@@ -7409,6 +7433,12 @@ HttpTransact::handle_server_died(State *s)
reason = "Invalid HTTP Response";
body_type = "response#bad_response";
break;
+ case OUTBOUND_CONGESTION:
+ status = HTTP_STATUS_SERVICE_UNAVAILABLE;
+ reason = "Origin server congested";
+ body_type = "congestion#retryAfter";
+ s->hdr_info.response_error = TOTAL_RESPONSE_ERROR_TYPES;
+ break;
case STATE_UNDEFINED:
case TRANSACTION_COMPLETE:
default: /* unknown death */
diff --git a/proxy/http/HttpTransact.h b/proxy/http/HttpTransact.h
index e3ee61b..1178f41 100644
--- a/proxy/http/HttpTransact.h
+++ b/proxy/http/HttpTransact.h
@@ -349,7 +349,8 @@ public:
OPEN_RAW_ERROR,
PARSE_ERROR,
TRANSACTION_COMPLETE,
- PARENT_RETRY
+ PARENT_RETRY,
+ OUTBOUND_CONGESTION
};
enum CacheWriteStatus_t {