[client] retry operation in case of ServiceUnavailable

KUDU-1871 Kudu C++ client does not retry on ServiceUnavailable errors

Follow the common retry path scenario for Kudu C++ client in case
of ServiceUnavailable error as well, for single- and multi-master
scenarios.

Besides, cleaned up invocation sites of SyncLeaderMasterRpc: moved
handling of ResponsePB::has_error() and converting the PB error into
Status into the code of the SyncLeaderMasterRpc method template.

This commit re-enables ClientTest::TestRetriesOnServiceUnavailable test
which passes with this fix.

Change-Id: Iae169ce1efe3950dfcb769594a1944d05e2ad8e7
Reviewed-on: http://gerrit.cloudera.org:8080/5964
Reviewed-by: Alexey Serbin <[email protected]>
Tested-by: Kudu Jenkins


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/07c6efb5
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/07c6efb5
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/07c6efb5

Branch: refs/heads/master
Commit: 07c6efb58eace6564e28280536441a8df05f7757
Parents: 0898a5b
Author: Alexey Serbin <[email protected]>
Authored: Thu Feb 9 20:17:35 2017 -0800
Committer: Alexey Serbin <[email protected]>
Committed: Tue Feb 14 23:51:40 2017 +0000

----------------------------------------------------------------------
 src/kudu/client/client-internal.cc              | 106 ++++++++-----------
 src/kudu/client/client-test.cc                  |   2 +-
 .../create-table-stress-test.cc                 |  28 +++--
 src/kudu/master/catalog_manager.h               |   4 +-
 4 files changed, 68 insertions(+), 72 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/07c6efb5/src/kudu/client/client-internal.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client-internal.cc 
b/src/kudu/client/client-internal.cc
index b328bf5..f761a9b 100644
--- a/src/kudu/client/client-internal.cc
+++ b/src/kudu/client/client-internal.cc
@@ -189,11 +189,15 @@ Status KuduClient::Data::SyncLeaderMasterRpc(
       }
     }
 
+    // A network error is a special case for retries: in most cases a network
+    // error means there is some misconfiguration, a typo in the command line,
+    // or the whole Kudu cluster is offline. It's better to report on such
+    // errors right away to allow faster troubleshooting.
     if (s.IsNetworkError()) {
       KLOG_EVERY_N_SECS(WARNING, 1)
           << "Unable to send the request (" << SecureShortDebugString(req)
-          << ") to leader Master (" << leader_master_hostport().ToString() << 
"): "
-          << s.ToString();
+          << ") to leader Master (" << leader_master_hostport().ToString()
+          << "): " << s.ToString();
       if (client->IsMultiMaster()) {
         LOG(INFO) << "Determining the new leader Master and retrying...";
         WARN_NOT_OK(ConnectToCluster(client, deadline),
@@ -202,6 +206,19 @@ Status KuduClient::Data::SyncLeaderMasterRpc(
       }
     }
 
+    if (s.IsServiceUnavailable()) {
+      KLOG_EVERY_N_SECS(WARNING, 1)
+          << "Unable to send the request (" << SecureShortDebugString(req)
+          << ") to leader Master (" << leader_master_hostport().ToString()
+          << "): " << s.ToString();
+      if (client->IsMultiMaster()) {
+        LOG(INFO) << "Determining the new leader Master and retrying...";
+        WARN_NOT_OK(ConnectToCluster(client, deadline),
+                    "Unable to determine the new leader Master");
+      }
+      continue;
+    }
+
     if (s.IsTimedOut()) {
       if (MonoTime::Now() < deadline) {
         KLOG_EVERY_N_SECS(WARNING, 1)
@@ -228,8 +245,10 @@ Status KuduClient::Data::SyncLeaderMasterRpc(
           KLOG_EVERY_N_SECS(INFO, 1) << "Determining the new leader Master and 
retrying...";
           WARN_NOT_OK(ConnectToCluster(client, deadline),
                       "Unable to determine the new leader Master");
-          continue;
         }
+        continue;
+      } else {
+        return StatusFromPB(resp->error().status());
       }
     }
     return s;
@@ -385,14 +404,9 @@ Status KuduClient::Data::CreateTable(KuduClient* client,
   if (has_range_partition_bounds) {
     features.push_back(MasterFeatures::RANGE_PARTITION_BOUNDS);
   }
-  Status s = SyncLeaderMasterRpc<CreateTableRequestPB, CreateTableResponsePB>(
-      deadline, client, req, &resp, "CreateTable", 
&MasterServiceProxy::CreateTable,
-      features);
-  RETURN_NOT_OK(s);
-  if (resp.has_error()) {
-    return StatusFromPB(resp.error().status());
-  }
-  return Status::OK();
+  return SyncLeaderMasterRpc<CreateTableRequestPB, CreateTableResponsePB>(
+      deadline, client, req, &resp, "CreateTable",
+      &MasterServiceProxy::CreateTable, features);
 }
 
 Status KuduClient::Data::IsCreateTableInProgress(KuduClient* client,
@@ -403,9 +417,10 @@ Status 
KuduClient::Data::IsCreateTableInProgress(KuduClient* client,
   IsCreateTableDoneResponsePB resp;
   req.mutable_table()->set_table_name(table_name);
 
-  // TODO: Add client rpc timeout and use 'default_admin_operation_timeout_' as
-  // the default timeout for all admin operations.
-  Status s =
+  // TODO(aserbin): Add client rpc timeout and use
+  // 'default_admin_operation_timeout_' as the default timeout for all
+  // admin operations.
+  RETURN_NOT_OK((
       SyncLeaderMasterRpc<IsCreateTableDoneRequestPB, 
IsCreateTableDoneResponsePB>(
           deadline,
           client,
@@ -413,15 +428,7 @@ Status 
KuduClient::Data::IsCreateTableInProgress(KuduClient* client,
           &resp,
           "IsCreateTableDone",
           &MasterServiceProxy::IsCreateTableDone,
-          {});
-  // RETURN_NOT_OK macro can't take templated function call as param,
-  // and SyncLeaderMasterRpc must be explicitly instantiated, else the
-  // compiler complains.
-  RETURN_NOT_OK(s);
-  if (resp.has_error()) {
-    return StatusFromPB(resp.error().status());
-  }
-
+          {})));
   *create_in_progress = !resp.done();
   return Status::OK();
 }
@@ -443,14 +450,9 @@ Status KuduClient::Data::DeleteTable(KuduClient* client,
   DeleteTableResponsePB resp;
 
   req.mutable_table()->set_table_name(table_name);
-  Status s = SyncLeaderMasterRpc<DeleteTableRequestPB, DeleteTableResponsePB>(
+  return SyncLeaderMasterRpc<DeleteTableRequestPB, DeleteTableResponsePB>(
       deadline, client, req, &resp,
       "DeleteTable", &MasterServiceProxy::DeleteTable, {});
-  RETURN_NOT_OK(s);
-  if (resp.has_error()) {
-    return StatusFromPB(resp.error().status());
-  }
-  return Status::OK();
 }
 
 Status KuduClient::Data::AlterTable(KuduClient* client,
@@ -462,20 +464,14 @@ Status KuduClient::Data::AlterTable(KuduClient* client,
     
required_feature_flags.push_back(MasterFeatures::ADD_DROP_RANGE_PARTITIONS);
   }
   AlterTableResponsePB resp;
-  Status s =
-      SyncLeaderMasterRpc<AlterTableRequestPB, AlterTableResponsePB>(
-          deadline,
-          client,
-          req,
-          &resp,
-          "AlterTable",
-          &MasterServiceProxy::AlterTable,
-          std::move(required_feature_flags));
-  RETURN_NOT_OK(s);
-  if (resp.has_error()) {
-    return StatusFromPB(resp.error().status());
-  }
-  return Status::OK();
+  return SyncLeaderMasterRpc<AlterTableRequestPB, AlterTableResponsePB>(
+      deadline,
+      client,
+      req,
+      &resp,
+      "AlterTable",
+      &MasterServiceProxy::AlterTable,
+      std::move(required_feature_flags));
 }
 
 Status KuduClient::Data::IsAlterTableInProgress(KuduClient* client,
@@ -486,7 +482,7 @@ Status KuduClient::Data::IsAlterTableInProgress(KuduClient* 
client,
   IsAlterTableDoneResponsePB resp;
 
   req.mutable_table()->set_table_name(table_name);
-  Status s =
+  RETURN_NOT_OK((
       SyncLeaderMasterRpc<IsAlterTableDoneRequestPB, 
IsAlterTableDoneResponsePB>(
           deadline,
           client,
@@ -494,12 +490,7 @@ Status 
KuduClient::Data::IsAlterTableInProgress(KuduClient* client,
           &resp,
           "IsAlterTableDone",
           &MasterServiceProxy::IsAlterTableDone,
-          {});
-  RETURN_NOT_OK(s);
-  if (resp.has_error()) {
-    return StatusFromPB(resp.error().status());
-  }
-
+          {})));
   *alter_in_progress = !resp.done();
   return Status::OK();
 }
@@ -569,14 +560,10 @@ Status KuduClient::Data::GetTableSchema(KuduClient* 
client,
   GetTableSchemaResponsePB resp;
 
   req.mutable_table()->set_table_name(table_name);
-  Status s = SyncLeaderMasterRpc<GetTableSchemaRequestPB, 
GetTableSchemaResponsePB>(
-      deadline, client, req, &resp,
-      "GetTableSchema", &MasterServiceProxy::GetTableSchema, {});
-  RETURN_NOT_OK(s);
-  if (resp.has_error()) {
-    return StatusFromPB(resp.error().status());
-  }
-
+  RETURN_NOT_OK((
+      SyncLeaderMasterRpc<GetTableSchemaRequestPB, GetTableSchemaResponsePB>(
+          deadline, client, req, &resp,
+          "GetTableSchema", &MasterServiceProxy::GetTableSchema, {})));
   // Parse the server schema out of the response.
   unique_ptr<Schema> new_schema(new Schema());
   RETURN_NOT_OK(SchemaFromPB(resp.schema(), new_schema.get()));
@@ -584,9 +571,8 @@ Status KuduClient::Data::GetTableSchema(KuduClient* client,
   // Parse the server partition schema out of the response.
   PartitionSchema new_partition_schema;
   RETURN_NOT_OK(PartitionSchema::FromPB(resp.partition_schema(),
-                                        *new_schema.get(),
+                                        *new_schema,
                                         &new_partition_schema));
-
   if (schema) {
     delete schema->schema_;
     schema->schema_ = new_schema.release();

http://git-wip-us.apache.org/repos/asf/kudu/blob/07c6efb5/src/kudu/client/client-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/client/client-test.cc b/src/kudu/client/client-test.cc
index 8442da6..26d2f6b 100644
--- a/src/kudu/client/client-test.cc
+++ b/src/kudu/client/client-test.cc
@@ -4672,7 +4672,7 @@ class ServiceUnavailableRetryClientTest :
 
 // Test the client retries on ServiceUnavailable errors. This scenario uses
 // 'CreateTable' RPC to verify the retry behavior.
-TEST_P(ServiceUnavailableRetryClientTest, DISABLED_CreateTable) {
+TEST_P(ServiceUnavailableRetryClientTest, CreateTable) {
   // This scenario is designed to run on a single-master cluster.
   ASSERT_EQ(1, cluster_->num_masters());
 

http://git-wip-us.apache.org/repos/asf/kudu/blob/07c6efb5/src/kudu/integration-tests/create-table-stress-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/integration-tests/create-table-stress-test.cc 
b/src/kudu/integration-tests/create-table-stress-test.cc
index c301152..9e1abed 100644
--- a/src/kudu/integration-tests/create-table-stress-test.cc
+++ b/src/kudu/integration-tests/create-table-stress-test.cc
@@ -330,12 +330,14 @@ TEST_F(CreateTableStressTest, 
TestConcurrentCreateTableAndReloadMetadata) {
 
   thread reload_metadata_thread([&]() {
     while (!stop.Load()) {
-      
CHECK_OK(cluster_->mini_master()->master()->catalog_manager()->VisitTablesAndTablets());
+      CHECK_OK(cluster_->mini_master()->master()->catalog_manager()->
+          VisitTablesAndTablets());
 
       // Give table creation a chance to run.
-      SleepFor(MonoDelta::FromMilliseconds(1));
+      SleepFor(MonoDelta::FromMilliseconds(1 + rand() % 10));
     }
   });
+
   for (int num_tables_created = 0; num_tables_created < 20;) {
     string table_name = Substitute("test-$0", num_tables_created);
     LOG(INFO) << "Creating table " << table_name;
@@ -346,14 +348,22 @@ TEST_F(CreateTableStressTest, 
TestConcurrentCreateTableAndReloadMetadata) {
                   .num_replicas(3)
                   .wait(false)
                   .Create();
-    if (s.IsServiceUnavailable()) {
-      // The master was busy reloading its metadata. Try again.
+    if (s.IsTimedOut()) {
+      // The master was busy reloading its metadata, replying with
+      // ServiceUnavailable on CreateTable() requests. The client transparently
+      // retried (randomized exponential back-off) until the timeout elapsed.
+      //
+      // It's hard to find some universal constant for timeout which would work
+      // in any testing environment instead of simply retrying here. That's
+      // because the client uses exponential-with-random back-off strategy
+      // while the metadata is being reloaded very often. So, from one side
+      // we want to have more or less random interaction between the metadata
+      // reloading and the simultaneous table creations, but from the other 
side
+      // it's hard do deduce the universal timeout constant and we prefer to
+      // not introduce test flakiness.
       //
-      // This is a purely synthetic case. In real life, it only manifests at
-      // startup (single master) or during leader failover (multiple masters).
-      // In the latter case, the client will transparently retry to another
-      // master. That won't happen here as we've only got one master, so we
-      // must handle retrying ourselves.
+      // TODO(aserbin): update the test keeping its racy essence but making it
+      //                cleaner regarding this timeout&retry workaround.
       continue;
     }
     ASSERT_OK(s);

http://git-wip-us.apache.org/repos/asf/kudu/blob/07c6efb5/src/kudu/master/catalog_manager.h
----------------------------------------------------------------------
diff --git a/src/kudu/master/catalog_manager.h 
b/src/kudu/master/catalog_manager.h
index d7c3800..65614d6 100644
--- a/src/kudu/master/catalog_manager.h
+++ b/src/kudu/master/catalog_manager.h
@@ -50,7 +50,7 @@ class ThreadPool;
 
 // Working around FRIEND_TEST() ugliness.
 namespace client {
-class ServiceUnavailableRetryClientTest_DISABLED_CreateTable_Test;
+class ServiceUnavailableRetryClientTest_CreateTable_Test;
 } // namespace client
 
 class CreateTableStressTest_TestConcurrentCreateTableAndReloadMetadata_Test;
@@ -525,7 +525,7 @@ class CatalogManager : public tserver::TabletPeerLookupIf {
   FRIEND_TEST(kudu::CreateTableStressTest, 
TestConcurrentCreateTableAndReloadMetadata);
 
   // This test exclusively acquires the leader_lock_ directly.
-  FRIEND_TEST(kudu::client::ServiceUnavailableRetryClientTest, 
DISABLED_CreateTable);
+  FRIEND_TEST(kudu::client::ServiceUnavailableRetryClientTest, CreateTable);
 
   friend class TableLoader;
   friend class TabletLoader;

Reply via email to