Repository: trafficserver
Updated Branches:
  refs/heads/master 2cdd1016f -> 9bf5beb36
TS-4228 Adds better error handling in the synthetic checks In traffic_manager, the thread that handles the request from traffic_cop (via traffic_server) does not deal well with various (obscure) error conditions. Project: http://git-wip-us.apache.org/repos/asf/trafficserver/repo Commit: http://git-wip-us.apache.org/repos/asf/trafficserver/commit/9bf5beb3 Tree: http://git-wip-us.apache.org/repos/asf/trafficserver/tree/9bf5beb3 Diff: http://git-wip-us.apache.org/repos/asf/trafficserver/diff/9bf5beb3 Branch: refs/heads/master Commit: 9bf5beb3625038ada8de89850d35dfc561220b77 Parents: 2cdd101 Author: Leif Hedstrom <[email protected]> Authored: Wed Feb 24 19:44:14 2016 -0700 Committer: Leif Hedstrom <[email protected]> Committed: Fri Feb 26 09:07:25 2016 -0700 ---------------------------------------------------------------------- cmd/traffic_cop/traffic_cop.cc | 31 ++++++++++++------------------- cmd/traffic_manager/MgmtHandlers.cc | 16 +++++++++++++--- lib/ts/ink_sock.cc | 8 ++++---- lib/ts/ink_sock.h | 4 ++-- mgmt/Cop.h | 27 +++++++++++++++++++++++++++ 5 files changed, 58 insertions(+), 28 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/cmd/traffic_cop/traffic_cop.cc ---------------------------------------------------------------------- diff --git a/cmd/traffic_cop/traffic_cop.cc b/cmd/traffic_cop/traffic_cop.cc index 94cfcd3..8758514 100644 --- a/cmd/traffic_cop/traffic_cop.cc +++ b/cmd/traffic_cop/traffic_cop.cc @@ -35,6 +35,7 @@ #include "RecordsConfig.h" #include "ClusterCom.h" #include "ts/ink_cap.h" +#include "Cop.h" #include <string> #include <map> @@ -110,11 +111,7 @@ static int source_port = 0; static int manager_failures = 0; static int server_failures = 0; static int server_not_found = 0; - -static const int sleep_time = 10; // 10 sec -static int init_sleep_time = sleep_time; // 10 sec -static const int manager_timeout = 3 * 60; // 3 min -static const int 
server_timeout = 3 * 60; // 3 min +static int init_sleep_time = cop_sleep_time; // 10 sec // traffic_manager flap detection #define MANAGER_FLAP_DETECTION 1 @@ -131,8 +128,6 @@ static ink_hrtime manager_flap_retry_start_time = 0; // first time we attempt // transient syscall error timeout #define TRANSIENT_ERROR_WAIT_MS 500 -static const int kill_timeout = 1 * 60; // 1 min - static int child_pid = 0; static int child_status = 0; @@ -316,12 +311,10 @@ sig_alarm_warn(int signum) #endif { cop_log_trace("Entering sig_alarm_warn(%d)\n", signum); - cop_log(COP_WARNING, "unable to kill traffic_server for the last" - " %d seconds\n", - kill_timeout); + cop_log(COP_WARNING, "unable to kill traffic_server for the last %d seconds\n", cop_kill_timeout); // Set us up for another alarm - alarm(kill_timeout); + alarm(cop_kill_timeout); cop_log_trace("Leaving sig_alarm_warn(%d)\n", signum); } @@ -402,7 +395,7 @@ safe_kill(const char *lockfile_name, const char *pname, bool group) cop_log_trace("Entering safe_kill(%s, %s, %d)\n", lockfile_name, pname, group); set_alarm_warn(); - alarm(kill_timeout); + alarm(cop_kill_timeout); if (group == true) { lockfile.KillGroup(killsig, coresig, pname); @@ -1017,7 +1010,7 @@ read_manager_string(const char *variable, char *value, size_t val_len) snprintf(request, sizeof(request), "read %s\n", variable); - err = test_port(rs_port, request, buffer, 4095, manager_timeout * 1000); + err = test_port(rs_port, request, buffer, 4095, cop_manager_timeout * 1000); if (err < 0) { return err; } @@ -1071,7 +1064,7 @@ read_manager_int(const char *variable, int *value) snprintf(request, sizeof(request), "read %s\n", variable); - err = test_port(rs_port, request, buffer, 4095, manager_timeout * 1000); + err = test_port(rs_port, request, buffer, 4095, cop_manager_timeout * 1000); if (err < 0) { return err; } @@ -1236,7 +1229,7 @@ test_server_http_port() // servers up on the autoconf port. 
snprintf(request, sizeof(request), "GET http://127.0.0.1:%d/synthetic.txt HTTP/1.0\r\n\r\n", synthetic_port); - return test_http_port(http_backdoor_port, request, server_timeout * 1000, localhost, localhost); + return test_http_port(http_backdoor_port, request, cop_server_timeout * 1000, localhost, localhost); } static int @@ -1444,7 +1437,7 @@ check_programs() // is up, we make sure there is actually a server process // running. If there is we test it. - alarm(2 * manager_timeout); + alarm(2 * cop_manager_timeout); err = heartbeat_manager(); alarm(0); @@ -1471,7 +1464,7 @@ check_programs() safe_kill(manager_lockfile, manager_binary, true); } } else { - alarm(2 * server_timeout); + alarm(2 * cop_server_timeout); heartbeat_server(); alarm(0); } @@ -1566,7 +1559,7 @@ check(void *arg) chown_file_to_admin_user(manager_lockfile); chown_file_to_admin_user(server_lockfile); - alarm(2 * (sleep_time + manager_timeout * 2 + server_timeout)); + alarm(2 * (cop_sleep_time + cop_manager_timeout * 2 + cop_server_timeout)); if (check_no_run() < 0) { break; @@ -1601,7 +1594,7 @@ check(void *arg) // Pause to catch our breath. (10 seconds). // Use 'millisleep()' because normal 'sleep()' interferes with // the SIGALRM signal which we use to heartbeat the cop. 
- millisleep(sleep_time * 1000); + millisleep(cop_sleep_time * 1000); // We do this after the first round of checks, since the first "check" will spawn traffic_manager if (!mgmt_init) { http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/cmd/traffic_manager/MgmtHandlers.cc ---------------------------------------------------------------------- diff --git a/cmd/traffic_manager/MgmtHandlers.cc b/cmd/traffic_manager/MgmtHandlers.cc index 913037d..3f667e5 100644 --- a/cmd/traffic_manager/MgmtHandlers.cc +++ b/cmd/traffic_manager/MgmtHandlers.cc @@ -36,6 +36,7 @@ #include "MgmtSocket.h" #include "NetworkUtilsRemote.h" #include "MIME.h" +#include "Cop.h" // INKqa09866 #include "TSControlMain.h" @@ -157,14 +158,20 @@ synthetic_thread(void *info) // Read the request bufp = buffer; while (len < strlen(RequestStr)) { + if (read_ready(clientFD, cop_server_timeout * 1000) <= 0) { + mgmt_log(stderr, "[SyntheticHealthServer] poll() failed, no request to read()"); + goto error; + } bytes = read(clientFD, buffer, sizeof(buffer)); - if (bytes < 0) { + if (0 == bytes) { + mgmt_log(stderr, "[SyntheticHealthServer] EOF on the socket, likely prematurely closed"); + goto error; + } else if (bytes < 0) { if (errno == EINTR || errno == EAGAIN) { continue; } else { mgmt_log(stderr, "[SyntheticHealthServer] Failed to read the request"); goto error; - break; } } else { len += bytes; @@ -186,6 +193,10 @@ synthetic_thread(void *info) // Write it bufp = buffer; while (len) { + if (write_ready(clientFD, cop_server_timeout * 1000) <= 0) { + mgmt_log(stderr, "[SyntheticHealthServer] poll() failed, no response to write()"); + goto error; + } bytes = write(clientFD, buffer, len); if (bytes < 0) { if (errno == EINTR || errno == EAGAIN) { @@ -193,7 +204,6 @@ synthetic_thread(void *info) } else { mgmt_log(stderr, "[SyntheticHealthServer] Failed to write the response"); goto error; - break; } } else { len -= bytes; 
http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/lib/ts/ink_sock.cc ---------------------------------------------------------------------- diff --git a/lib/ts/ink_sock.cc b/lib/ts/ink_sock.cc index 3c447a2..2b98ed1 100644 --- a/lib/ts/ink_sock.cc +++ b/lib/ts/ink_sock.cc @@ -123,12 +123,12 @@ safe_blocking(int fd) } int -write_ready(int fd) +write_ready(int fd, int timeout_msec) { struct pollfd p; p.events = POLLOUT; p.fd = fd; - int r = poll(&p, 1, 0); + int r = poll(&p, 1, timeout_msec); if (r <= 0) return r; if (p.revents & (POLLERR | POLLNVAL)) @@ -139,12 +139,12 @@ write_ready(int fd) } int -read_ready(int fd) +read_ready(int fd, int timeout_msec) { struct pollfd p; p.events = POLLIN; p.fd = fd; - int r = poll(&p, 1, 0); + int r = poll(&p, 1, timeout_msec); if (r <= 0) return r; if (p.revents & (POLLERR | POLLNVAL)) http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/lib/ts/ink_sock.h ---------------------------------------------------------------------- diff --git a/lib/ts/ink_sock.h b/lib/ts/ink_sock.h index 6e73faa..7c7d66e 100644 --- a/lib/ts/ink_sock.h +++ b/lib/ts/ink_sock.h @@ -51,8 +51,8 @@ int safe_clr_fl(int fd, int arg); int safe_blocking(int fd); int safe_nonblocking(int fd); -int write_ready(int fd); -int read_ready(int fd); +int write_ready(int fd, int timeout_msec = 0); +int read_ready(int fd, int timeout_msec = 0); char fd_read_char(int fd); int fd_read_line(int fd, char *s, int len); http://git-wip-us.apache.org/repos/asf/trafficserver/blob/9bf5beb3/mgmt/Cop.h ---------------------------------------------------------------------- diff --git a/mgmt/Cop.h b/mgmt/Cop.h new file mode 100644 index 0000000..d1fab21 --- /dev/null +++ b/mgmt/Cop.h @@ -0,0 +1,27 @@ +/** @file + + Main entry point for the traffic_cop application. + + @section license License + + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. 
See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +static const int cop_sleep_time = 10; // 10 sec +static const int cop_manager_timeout = 3 * 60; // 3 min +static const int cop_server_timeout = 3 * 60; // 3 min +static const int cop_kill_timeout = 1 * 60; // 1 min
