This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new d7fa4f9e6 [Tool] Return not OK status when copying tablets failed
d7fa4f9e6 is described below

commit d7fa4f9e62c4e9d87246656d1aea1b450cbf90fd
Author: xinghuayu007 <1450306...@qq.com>
AuthorDate: Thu Feb 29 15:12:14 2024 +0800

    [Tool] Return not OK status when copying tablets failed
    
    Currently, 'local_replica copy_from_remote' command does not
    return a non-OK status when some tablets copying failed. Therefore
    it is not possible to use the return status to know whether it is
    failed or succeeded. Only the log can show some useful message.
    
    This patch fixes this problem so now the CLI tool exits with non-OK
    status code when some tablets failed to copy.
    
    Change-Id: Ic957cbc379645e0607c1c2a3bc568e20afc126b2
    Reviewed-on: http://gerrit.cloudera.org:8080/21089
    Tested-by: Alexey Serbin <ale...@apache.org>
    Reviewed-by: Alexey Serbin <ale...@apache.org>
---
 src/kudu/tools/kudu-tool-test.cc            | 27 +++++++++++++++++++++++++++
 src/kudu/tools/tool_action_local_replica.cc | 10 +++++++++-
 2 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index cb15e927c..f5fdc984c 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -9539,6 +9539,33 @@ TEST_F(ToolTest, 
TestLocalReplicaCopyRemoteWithSpeedLimit) {
   }
 }
 
+TEST_F(ToolTest, TestLocalReplicaCopyRemoteCanReturnError) {
+  SKIP_IF_SLOW_NOT_ALLOWED();
+  InternalMiniClusterOptions opts;
+  opts.num_tablet_servers = 2;
+  NO_FATALS(StartMiniCluster(std::move(opts)));
+  NO_FATALS(CreateTableWithFlushedData("table1", mini_cluster_.get(), 3, 1));
+  const auto source_tserver_rpc_addr = mini_cluster_->mini_tablet_server(0)
+                                                    
->bound_rpc_addr().ToString();
+  const auto wal_dir = 
mini_cluster_->mini_tablet_server(1)->options()->fs_opts.wal_root;
+  const auto data_dirs = JoinStrings(mini_cluster_->mini_tablet_server(1)
+                                                  
->options()->fs_opts.data_roots, ",");
+  NO_FATALS(mini_cluster_->mini_tablet_server(1)->Shutdown());
+
+  // An attempt to copy a non-existent tablet fails, and the return value is 
not OK.
+  string stderr;
+  Status s = RunActionStderrString(
+      Substitute("local_replica copy_from_remote $0 $1 "
+                 "-fs_data_dirs=$2 -fs_wal_dir=$3 ",
+                 "non-existent-tablet-ids-str",
+                 source_tserver_rpc_addr,
+                 data_dirs,
+                 wal_dir), &stderr);
+  ASSERT_TRUE(s.IsRuntimeError());
+  ASSERT_STR_CONTAINS(stderr,
+      "some tablets failed to copy: check error messages for details");
+}
+
 
 class DownloadSuperblockInBatchTest :
     public ToolTest,
diff --git a/src/kudu/tools/tool_action_local_replica.cc 
b/src/kudu/tools/tool_action_local_replica.cc
index 3ca93755d..012f72850 100644
--- a/src/kudu/tools/tool_action_local_replica.cc
+++ b/src/kudu/tools/tool_action_local_replica.cc
@@ -16,6 +16,7 @@
 // under the License.
 
 #include <algorithm>
+#include <atomic>
 #include <cstddef>
 #include <cstdint>
 #include <functional>
@@ -339,7 +340,7 @@ class TabletCopier {
                                               
FLAGS_tablet_copy_throttler_bytes_per_sec,
                                               
FLAGS_tablet_copy_throttler_burst_factor);
     }
-
+    std::atomic<bool> has_failed_tablets(false);
     // Start to copy tablets.
     for (const auto& tablet_id : tablet_ids_to_copy_) {
       RETURN_NOT_OK(copy_pool->Submit([&]() {
@@ -381,6 +382,10 @@ class TabletCopier {
           }
           copying_replicas_by_tablet_id.erase(tablet_id);
 
+          if (!failed_tablet_ids.empty()) {
+            has_failed_tablets.store(true);
+          }
+
           LOG(INFO) << Substitute("$0/$1 tablets, $2 bytes copied, include $3 
failed tablets.",
                                   succeed_tablet_count + 
failed_tablet_ids.size(),
                                   total_tablet_count,
@@ -396,6 +401,9 @@ class TabletCopier {
     latch.CountDown();
     check_thread->Join();
 
+    if (has_failed_tablets.load()) {
+      return Status::RuntimeError("some tablets failed to copy: check error 
messages for details");
+    }
     return Status::OK();
   }
 

Reply via email to