client: add timeout duration and scan attempts to scanner errors Previously, scanner errors could be confusing because certain types of errors, like ServiceUnavailable, are retriable, and the RPC timeouts mentioned in some error messages tend to be confusingly short because after several retries the client-side deadline is looming. This patch adds additional details to scanner timeout error messages, including information about number of retries and the client-set timeout value.
An example of a new "enhanced" error message looks something like this (from the new test that injects an error on scan): Timed out: exceeded configured scan timeout of 1.000s: after 7 scan attempts: unable to retry before timeout: Remote error: Service unavailable: Injecting service unavailable status on Scan due to --scanner_inject_service_unavailable_on_continue_scan Change-Id: I3a8f731f029132d0894355098d5804840f09e7c2 Reviewed-on: http://gerrit.cloudera.org:8080/11646 Reviewed-by: Adar Dembo <a...@cloudera.com> Tested-by: Mike Percy <mpe...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/8386ef9e Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/8386ef9e Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/8386ef9e Branch: refs/heads/master Commit: 8386ef9e2962ee5692c0e646fdd63f9826b3b47c Parents: e61de49 Author: Mike Percy <mpe...@apache.org> Authored: Tue Oct 9 17:15:46 2018 -0700 Committer: Mike Percy <mpe...@apache.org> Committed: Thu Oct 18 02:08:50 2018 +0000 ---------------------------------------------------------------------- src/kudu/client/client-test.cc | 20 ++++++++++++++++++++ src/kudu/client/scanner-internal.cc | 27 +++++++++++++++++++-------- src/kudu/client/scanner-internal.h | 4 ++++ src/kudu/tserver/tablet_service.cc | 10 ++++++++++ 4 files changed, 53 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/8386ef9e/src/kudu/client/client-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc index dc7e7b8..cc4c9ab 100644 --- a/src/kudu/client/client-test.cc +++ b/src/kudu/client/client-test.cc @@ -114,6 +114,7 @@ DECLARE_bool(fail_dns_resolution); DECLARE_bool(log_inject_latency); DECLARE_bool(master_support_connect_to_master_rpc); DECLARE_bool(rpc_trace_negotiation); +DECLARE_bool(scanner_inject_service_unavailable_on_continue_scan); DECLARE_int32(flush_threshold_mb); DECLARE_int32(flush_threshold_secs); DECLARE_int32(heartbeat_interval_ms); @@ -5161,6 +5162,25 @@ TEST_F(ClientTest, TestLastErrorEmbeddedInScanTimeoutStatus) { ASSERT_STR_CONTAINS(s.ToString(), "Illegal state: Tablet not RUNNING"); } +TEST_F(ClientTest, TestRetriesEmbeddedInScanTimeoutStatus) { + // For the random() calls that take place during scan retries. + SeedRandom(); + + NO_FATALS(InsertTestRows(client_table_.get(), FLAGS_test_scan_num_rows)); + + // Allow creating a scanner but fail all of the read calls. + FLAGS_scanner_inject_service_unavailable_on_continue_scan = true; + + // The scanner will return a retriable error on read, and we will eventually + // observe a timeout. + KuduScanner scanner(client_table_.get()); + ASSERT_OK(scanner.SetTimeoutMillis(1000)); + Status s = scanner.Open(); + ASSERT_TRUE(s.IsTimedOut()) << s.ToString(); + ASSERT_STR_CONTAINS(s.ToString(), "exceeded configured scan timeout of"); + ASSERT_STR_CONTAINS(s.ToString(), "scan attempts"); +} + TEST_F(ClientTest, TestNoDefaultPartitioning) { unique_ptr<KuduTableCreator> table_creator(client_->NewTableCreator()); Status s = table_creator->table_name("TestNoDefaultPartitioning").schema(&schema_).Create(); http://git-wip-us.apache.org/repos/asf/kudu/blob/8386ef9e/src/kudu/client/scanner-internal.cc ---------------------------------------------------------------------- diff --git a/src/kudu/client/scanner-internal.cc b/src/kudu/client/scanner-internal.cc index 51cdca6..9ac1a86 100644 --- a/src/kudu/client/scanner-internal.cc +++ b/src/kudu/client/scanner-internal.cc @@ -81,6 +81,20 @@ KuduScanner::Data::Data(KuduTable* table) KuduScanner::Data::~Data() { } +Status KuduScanner::Data::EnrichStatusMessage(Status s) const { + if (scan_attempts_ > 1) { + s = s.CloneAndPrepend(Substitute("after $0 scan attempts", scan_attempts_)); + } + if (s.IsTimedOut()) { + s = s.CloneAndPrepend(Substitute("exceeded configured scan timeout of $0", + configuration_.timeout().ToString())); + } + if (!last_error_.ok()) { + s = s.CloneAndAppend(last_error_.ToString()); + } + return s; +} + Status KuduScanner::Data::HandleError(const ScanRpcStatus& err, const MonoTime& deadline, set<string>* blacklist, @@ -92,10 +106,10 @@ Status KuduScanner::Data::HandleError(const ScanRpcStatus& err, // If we timed out because of the overall deadline, we're done. // We didn't wait a full RPC timeout, though, so don't mark the tserver as failed. if (err.result == ScanRpcStatus::OVERALL_DEADLINE_EXCEEDED) { - LOG(INFO) << "Scan of tablet " << remote_->tablet_id() << " at " - << ts_->ToString() << " deadline expired."; - return last_error_.ok() - ? err.status : err.status.CloneAndAppend(last_error_.ToString()); + LOG(INFO) << "Scan of tablet " << remote_->tablet_id() << " at " + << ts_->ToString() << " deadline exceeded after " + << configuration_.timeout().ToString(); + return EnrichStatusMessage(err.status); } UpdateLastError(err.status); @@ -172,10 +186,7 @@ Status KuduScanner::Data::HandleError(const ScanRpcStatus& err, KuduClient::Data::ComputeExponentialBackoff(scan_attempts_); MonoTime now = MonoTime::Now() + sleep; if (deadline < now) { - Status ret = Status::TimedOut("unable to retry before timeout", - err.status.ToString()); - return last_error_.ok() ? - ret : ret.CloneAndAppend(last_error_.ToString()); + return EnrichStatusMessage(Status::TimedOut("unable to retry before timeout")); } VLOG(1) << "Error scanning on server " << ts_->ToString() << ": " << err.status.ToString() << ". Will retry after " http://git-wip-us.apache.org/repos/asf/kudu/blob/8386ef9e/src/kudu/client/scanner-internal.h ---------------------------------------------------------------------- diff --git a/src/kudu/client/scanner-internal.h b/src/kudu/client/scanner-internal.h index 5db5bd9..0dddb40 100644 --- a/src/kudu/client/scanner-internal.h +++ b/src/kudu/client/scanner-internal.h @@ -278,6 +278,10 @@ class KuduScanner::Data { const MonoTime& overall_deadline, const MonoTime& rpc_deadline); + // Add additional details to the status message, such as number of retries, + // original cause of the error, etc. Returns a cloned object. + Status EnrichStatusMessage(Status s) const; + void UpdateResourceMetrics(); DISALLOW_COPY_AND_ASSIGN(Data); http://git-wip-us.apache.org/repos/asf/kudu/blob/8386ef9e/src/kudu/tserver/tablet_service.cc ---------------------------------------------------------------------- diff --git a/src/kudu/tserver/tablet_service.cc b/src/kudu/tserver/tablet_service.cc index f8b309e..da4002c 100644 --- a/src/kudu/tserver/tablet_service.cc +++ b/src/kudu/tserver/tablet_service.cc @@ -132,6 +132,11 @@ DEFINE_int32(scanner_inject_latency_on_each_batch_ms, 0, "Used for tests."); TAG_FLAG(scanner_inject_latency_on_each_batch_ms, unsafe); +DEFINE_bool(scanner_inject_service_unavailable_on_continue_scan, false, + "If set, the scanner will return a ServiceUnavailable Status on " + "any Scan continuation RPC call. Used for tests."); +TAG_FLAG(scanner_inject_service_unavailable_on_continue_scan, unsafe); + DECLARE_bool(raft_prepare_replacement_before_eviction); DECLARE_int32(memory_limit_warn_threshold_percentage); DECLARE_int32(tablet_history_max_age_sec); @@ -2052,6 +2057,11 @@ Status TabletServiceImpl::HandleContinueScanRequest(const ScanRequestPB* req, return s; } + if (PREDICT_FALSE(FLAGS_scanner_inject_service_unavailable_on_continue_scan)) { + return Status::ServiceUnavailable("Injecting service unavailable status on Scan due to " + "--scanner_inject_service_unavailable_on_continue_scan"); + } + // Set the row format flags on the ScanResultCollector. result_collector->set_row_format_flags(scanner->row_format_flags());