This is an automated email from the ASF dual-hosted git repository.
zwoop pushed a commit to branch 9.1.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/9.1.x by this push:
new 2ca54b5 fix DNS spike issue for TCP_RETRY mode (#7307)
2ca54b5 is described below
commit 2ca54b5e689aae4e0e917d886e18542cf1ddeb0d
Author: Xin Li <[email protected]>
AuthorDate: Mon May 10 11:04:06 2021 -0700
fix DNS spike issue for TCP_RETRY mode (#7307)
fix compiling Error
rename metrics
update the docs for the new config and metric
Co-authored-by: xinli1 <[email protected]>
(cherry picked from commit c1ed4a4a50ba4b9d32933c58c2c0eb53f3636076)
---
doc/admin-guide/files/records.config.en.rst | 7 +++
.../monitoring/statistics/core/dns.en.rst | 12 +++++
iocore/dns/DNS.cc | 63 +++++++++++++++++++++-
iocore/dns/P_DNSProcessor.h | 11 +++-
mgmt/RecordsConfig.cc | 2 +
5 files changed, 92 insertions(+), 3 deletions(-)
diff --git a/doc/admin-guide/files/records.config.en.rst
b/doc/admin-guide/files/records.config.en.rst
index 400f8c2..021b721 100644
--- a/doc/admin-guide/files/records.config.en.rst
+++ b/doc/admin-guide/files/records.config.en.rst
@@ -2629,6 +2629,13 @@ DNS
``2`` TCP_ONLY: |TS| always talks to nameservers over TCP.
===== ======================================================================
+.. ts:cv:: CONFIG proxy.config.dns.max_tcp_continuous_failures INT 10
+
+ If DNS connection mode is TCP_RETRY, set the threshold of the continuous TCP
+ query failures count for the TCP connection, reset the TCP connection
immediately
+ if the continuous TCP query failures conut over the threshold. If the
threshold
+ is 0 (or less than 0) we close this feature.
+
.. ts:cv:: CONFIG proxy.config.dns.max_dns_in_flight INT 2048
Maximum inflight DNS queries made by |TS| at any given instant
diff --git a/doc/admin-guide/monitoring/statistics/core/dns.en.rst
b/doc/admin-guide/monitoring/statistics/core/dns.en.rst
index 0e0f93e..fed97d0 100644
--- a/doc/admin-guide/monitoring/statistics/core/dns.en.rst
+++ b/doc/admin-guide/monitoring/statistics/core/dns.en.rst
@@ -35,6 +35,18 @@ DNS
The number of DNS lookups currently in progress.
+.. ts:stat:: global proxy.process.dns.tcp_retries integer
+ :type: gauge
+ :ungathered:
+
+ The number of DNS query over TCP in TCP_RETRY connection mode.
+
+.. ts:stat:: global proxy.process.dns.tcp_reset integer
+ :type: gauge
+ :ungathered:
+
+ The number of resetting TCP connection in TCP_RETRY connection mode.
+
.. ts:stat:: global proxy.process.dns.lookup_avg_time integer
:type: derivative
:units: milliseconds
diff --git a/iocore/dns/DNS.cc b/iocore/dns/DNS.cc
index 95a913b..eb4a83e 100644
--- a/iocore/dns/DNS.cc
+++ b/iocore/dns/DNS.cc
@@ -44,6 +44,7 @@ int dns_failover_number =
DEFAULT_FAILOVER_NUMBER;
int dns_failover_period = DEFAULT_FAILOVER_PERIOD;
int dns_failover_try_period = DEFAULT_FAILOVER_TRY_PERIOD;
int dns_max_dns_in_flight = MAX_DNS_IN_FLIGHT;
+int dns_max_tcp_continuous_failures = MAX_DNS_TCP_CONTINUOUS_FAILURES;
int dns_validate_qname = 0;
unsigned int dns_handler_initialized = 0;
int dns_ns_rr = 0;
@@ -213,6 +214,7 @@ DNSProcessor::start(int, size_t stacksize)
REC_EstablishStaticConfigInt32(dns_max_dns_in_flight,
"proxy.config.dns.max_dns_in_flight");
REC_EstablishStaticConfigInt32(dns_validate_qname,
"proxy.config.dns.validate_query_name");
REC_EstablishStaticConfigInt32(dns_ns_rr,
"proxy.config.dns.round_robin_nameservers");
+ REC_EstablishStaticConfigInt32(dns_max_tcp_continuous_failures,
"proxy.config.dns.max_tcp_continuous_failures");
REC_ReadConfigStringAlloc(dns_ns_list, "proxy.config.dns.nameservers");
REC_ReadConfigStringAlloc(dns_local_ipv4, "proxy.config.dns.local_ipv4");
REC_ReadConfigStringAlloc(dns_local_ipv6, "proxy.config.dns.local_ipv6");
@@ -460,6 +462,17 @@ DNSHandler::open_cons(sockaddr const *target, bool failed,
int icon)
}
/**
+ Close the old TCP connection and open a new one
+ */
+bool
+DNSHandler::reset_tcp_conn(int ndx)
+{
+ DNS_INCREMENT_DYN_STAT(dns_tcp_reset_stat);
+ tcpcon[ndx].close();
+ return open_con(&m_res->nsaddr_list[ndx].sa, true, ndx, true);
+}
+
+/**
Open (and close) connections as necessary and also assures that the
epoll fd struct is properly updated.
@@ -473,11 +486,12 @@ DNSHandler::open_cons(sockaddr const *target, bool
failed, int icon)
target != nullptr and icon > 0 :
open connection to target.
*/
-void
+bool
DNSHandler::open_con(sockaddr const *target, bool failed, int icon, bool
over_tcp)
{
ip_port_text_buffer ip_text;
PollDescriptor *pd = get_PollDescriptor(dnsProcessor.thread);
+ bool ret = false;
ink_assert(target != &ip.sa);
@@ -509,7 +523,7 @@ DNSHandler::open_con(sockaddr const *target, bool failed,
int icon, bool over_tc
failover();
}
}
- return;
+ return false;
} else {
ns_down[icon] = 0;
if (cur_con.eio.start(pd, &cur_con, EVENTIO_READ) < 0) {
@@ -518,7 +532,10 @@ DNSHandler::open_con(sockaddr const *target, bool failed,
int icon, bool over_tc
cur_con.num = icon;
Debug("dns", "opening connection %s SUCCEEDED for %d", ip_text, icon);
}
+ ret = true;
}
+
+ return ret;
}
void
@@ -934,12 +951,32 @@ DNSHandler::recv_dns(int /* event ATS_UNUSED */, Event *
/* e ATS_UNUSED */)
}
}
+void
+DNSHandler::check_and_reset_tcp_conn()
+{
+ for (int i = 0; i < n_con; i++) {
+ if (dns_max_tcp_continuous_failures > 0 && tcp_continuous_failures[i] >=
dns_max_tcp_continuous_failures) {
+ // The times of continuous failures are more than the threshold, have to
reset the connection
+ if (reset_tcp_conn(i)) {
+ // Reset the counter after the new TCP connection
+ // don't reset the counter if tcp conn reset failed, and we need
continue to try to reset tcp conn next time
+ Warning("Reset tcp connection: nameserver = %d, failures = %d,
threshold = %d", i, tcp_continuous_failures[i],
+ dns_max_tcp_continuous_failures);
+ tcp_continuous_failures[i] = 0;
+ }
+ }
+ }
+}
+
/** Main event for the DNSHandler. Attempt to read from and write to named. */
int
DNSHandler::mainEvent(int event, Event *e)
{
recv_dns(event, e);
if (dns_ns_rr) {
+ if (DNS_CONN_MODE::TCP_RETRY == dns_conn_mode) {
+ check_and_reset_tcp_conn();
+ }
ink_hrtime t = Thread::get_hrtime();
if (t - last_primary_retry > DNS_PRIMARY_RETRY_PERIOD) {
for (int i = 0; i < n_con; i++) {
@@ -1140,6 +1177,14 @@ write_dns_event(DNSHandler *h, DNSEntry *e, bool
over_tcp)
int s = socketManager.send(con_fd, buffer, r, 0);
if (s != r) {
Debug("dns", "send() failed: qname = %s, %d != %d, nameserver= %d",
e->qname, s, r, h->name_server);
+
+ if (over_tcp) {
+ // add the counter for tcp connection failed
+ Debug("dns", "tcp query failed: name_server = %d,
tcp_continuous_failures = %d", h->name_server,
+ h->tcp_continuous_failures[h->name_server]);
+ ++h->tcp_continuous_failures[h->name_server];
+ }
+
// changed if condition from 'r < 0' to 's < 0' - 8/2001 pas
if (s < 0) {
if (dns_ns_rr) {
@@ -1151,6 +1196,13 @@ write_dns_event(DNSHandler *h, DNSEntry *e, bool
over_tcp)
return false;
}
+ if (over_tcp && h->tcp_continuous_failures[h->name_server] > 0) {
+ // reset the counter for any tcp connection succeed
+ Debug("dns", "reset tcp_continuous_failures: name_server = %d,
tcp_continuous_failures = %d", h->name_server,
+ h->tcp_continuous_failures[h->name_server]);
+ h->tcp_continuous_failures[h->name_server] = 0;
+ }
+
e->written_flag = true;
e->which_ns = h->name_server;
e->once_written_flag = true;
@@ -1483,6 +1535,7 @@ dns_process(DNSHandler *handler, HostEnt *buf, int len)
if (dns_conn_mode == DNS_CONN_MODE::TCP_RETRY && h->tc == 1) {
Debug("dns", "Retrying DNS query over TCP for [%s]", e->qname);
tcp_retry = true;
+ DNS_INCREMENT_DYN_STAT(dns_tcp_retries_stat);
goto Lerror;
}
@@ -1844,6 +1897,12 @@ ink_dns_init(ts::ModuleVersion v)
RecRegisterRawStat(dns_rsb, RECT_PROCESS, "proxy.process.dns.in_flight",
RECD_INT, RECP_NON_PERSISTENT, (int)dns_in_flight_stat,
RecRawStatSyncSum);
+
+ RecRegisterRawStat(dns_rsb, RECT_PROCESS, "proxy.process.dns.tcp_retries",
RECD_INT, RECP_PERSISTENT, (int)dns_tcp_retries_stat,
+ RecRawStatSyncSum);
+
+ RecRegisterRawStat(dns_rsb, RECT_PROCESS, "proxy.process.dns.tcp_reset",
RECD_INT, RECP_PERSISTENT, (int)dns_tcp_reset_stat,
+ RecRawStatSyncSum);
}
#if TS_HAS_TESTS
diff --git a/iocore/dns/P_DNSProcessor.h b/iocore/dns/P_DNSProcessor.h
index 6284328..7b04b92 100644
--- a/iocore/dns/P_DNSProcessor.h
+++ b/iocore/dns/P_DNSProcessor.h
@@ -30,6 +30,7 @@
#define MAX_DNS_RETRIES 9
#define DEFAULT_DNS_TIMEOUT 30
#define MAX_DNS_IN_FLIGHT 2048
+#define MAX_DNS_TCP_CONTINUOUS_FAILURES 10
#define DEFAULT_FAILOVER_NUMBER (DEFAULT_DNS_RETRIES + 1)
#define DEFAULT_FAILOVER_PERIOD (DEFAULT_DNS_TIMEOUT + 30)
// how many seconds before FAILOVER_PERIOD to try the primary with
@@ -49,6 +50,7 @@ extern int dns_failover_number;
extern int dns_failover_period;
extern int dns_failover_try_period;
extern int dns_max_dns_in_flight;
+extern int dns_max_tcp_continuous_failures;
extern unsigned int dns_sequence_number;
//
@@ -85,6 +87,8 @@ enum DNS_Stats {
dns_retries_stat,
dns_max_retries_exceeded_stat,
dns_in_flight_stat,
+ dns_tcp_retries_stat,
+ dns_tcp_reset_stat,
DNS_Stat_Count
};
@@ -173,6 +177,7 @@ struct DNSHandler : public Continuation {
int ns_down[MAX_NAMED];
int failover_number[MAX_NAMED];
int failover_soon_number[MAX_NAMED];
+ int tcp_continuous_failures[MAX_NAMED];
ink_hrtime crossed_failover_number[MAX_NAMED];
ink_hrtime last_primary_retry = 0;
ink_hrtime last_primary_reopen = 0;
@@ -225,7 +230,7 @@ struct DNSHandler : public Continuation {
int mainEvent(int event, Event *e);
void open_cons(sockaddr const *addr, bool failed = false, int icon = 0);
- void open_con(sockaddr const *addr, bool failed = false, int icon = 0, bool
over_tcp = false);
+ bool open_con(sockaddr const *addr, bool failed = false, int icon = 0, bool
over_tcp = false);
void failover();
void rr_failure(int ndx);
void recover();
@@ -257,6 +262,9 @@ struct DNSHandler : public Continuation {
private:
// Check the IP address and switch to default if needed.
void validate_ip();
+ // Check tcp connection for TCP_RETRY mode
+ void check_and_reset_tcp_conn();
+ bool reset_tcp_conn(int ndx);
};
/* --------------------------------------------------------------
@@ -295,6 +303,7 @@ DNSHandler::DNSHandler()
failover_number[i] = 0;
failover_soon_number[i] = 0;
crossed_failover_number[i] = 0;
+ tcp_continuous_failures[i] = 0;
ns_down[i] = 1;
tcpcon[i].handler = this;
udpcon[i].handler = this;
diff --git a/mgmt/RecordsConfig.cc b/mgmt/RecordsConfig.cc
index 1b50d3b..a68f185 100644
--- a/mgmt/RecordsConfig.cc
+++ b/mgmt/RecordsConfig.cc
@@ -886,6 +886,8 @@ static const RecordElement RecordsConfig[] =
,
{RECT_CONFIG, "proxy.config.dns.max_dns_in_flight", RECD_INT, "2048",
RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL}
,
+ {RECT_CONFIG, "proxy.config.dns.max_tcp_continuous_failures", RECD_INT,
"10", RECU_DYNAMIC, RR_NULL, RECC_NULL, nullptr, RECA_NULL}
+ ,
{RECT_CONFIG, "proxy.config.dns.validate_query_name", RECD_INT, "0",
RECU_DYNAMIC, RR_NULL, RECC_NULL, "[0-1]", RECA_NULL}
,
{RECT_CONFIG, "proxy.config.dns.splitDNS.enabled", RECD_INT, "0",
RECU_DYNAMIC, RR_NULL, RECC_INT, "[0-1]", RECA_NULL}