This is an automated email from the ASF dual-hosted git repository. alexey pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push: new d7fa4f9e6 [Tool] Return not OK status when copying tablets failed d7fa4f9e6 is described below commit d7fa4f9e62c4e9d87246656d1aea1b450cbf90fd Author: xinghuayu007 <1450306...@qq.com> AuthorDate: Thu Feb 29 15:12:14 2024 +0800 [Tool] Return not OK status when copying tablets failed Currently, 'local_replica copy_from_remote' command does not return a non-OK status when some tablets copying failed. Therefore it is not possible to use the return status to know whether it is failed or succeeded. Only the log can show some useful message. This patch fixes this problem so now the CLI tool exits with non-OK status code when some tablets failed to copy. Change-Id: Ic957cbc379645e0607c1c2a3bc568e20afc126b2 Reviewed-on: http://gerrit.cloudera.org:8080/21089 Tested-by: Alexey Serbin <ale...@apache.org> Reviewed-by: Alexey Serbin <ale...@apache.org> --- src/kudu/tools/kudu-tool-test.cc | 27 +++++++++++++++++++++++++++ src/kudu/tools/tool_action_local_replica.cc | 10 +++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc index cb15e927c..f5fdc984c 100644 --- a/src/kudu/tools/kudu-tool-test.cc +++ b/src/kudu/tools/kudu-tool-test.cc @@ -9539,6 +9539,33 @@ TEST_F(ToolTest, TestLocalReplicaCopyRemoteWithSpeedLimit) { } } +TEST_F(ToolTest, TestLocalReplicaCopyRemoteCanReturnError) { + SKIP_IF_SLOW_NOT_ALLOWED(); + InternalMiniClusterOptions opts; + opts.num_tablet_servers = 2; + NO_FATALS(StartMiniCluster(std::move(opts))); + NO_FATALS(CreateTableWithFlushedData("table1", mini_cluster_.get(), 3, 1)); + const auto source_tserver_rpc_addr = mini_cluster_->mini_tablet_server(0) + ->bound_rpc_addr().ToString(); + const auto wal_dir = mini_cluster_->mini_tablet_server(1)->options()->fs_opts.wal_root; + const auto data_dirs = JoinStrings(mini_cluster_->mini_tablet_server(1) + ->options()->fs_opts.data_roots, ","); + NO_FATALS(mini_cluster_->mini_tablet_server(1)->Shutdown()); + + // An attempt to copy a non-existent tablet fails, and the return value is not OK. + string stderr; + Status s = RunActionStderrString( + Substitute("local_replica copy_from_remote $0 $1 " + "-fs_data_dirs=$2 -fs_wal_dir=$3 ", + "non-existent-tablet-ids-str", + source_tserver_rpc_addr, + data_dirs, + wal_dir), &stderr); + ASSERT_TRUE(s.IsRuntimeError()); + ASSERT_STR_CONTAINS(stderr, + "some tablets failed to copy: check error messages for details"); +} + class DownloadSuperblockInBatchTest : public ToolTest, diff --git a/src/kudu/tools/tool_action_local_replica.cc b/src/kudu/tools/tool_action_local_replica.cc index 3ca93755d..012f72850 100644 --- a/src/kudu/tools/tool_action_local_replica.cc +++ b/src/kudu/tools/tool_action_local_replica.cc @@ -16,6 +16,7 @@ // under the License. #include <algorithm> +#include <atomic> #include <cstddef> #include <cstdint> #include <functional> @@ -339,7 +340,7 @@ class TabletCopier { FLAGS_tablet_copy_throttler_bytes_per_sec, FLAGS_tablet_copy_throttler_burst_factor); } - + std::atomic<bool> has_failed_tablets(false); // Start to copy tablets. for (const auto& tablet_id : tablet_ids_to_copy_) { RETURN_NOT_OK(copy_pool->Submit([&]() { @@ -381,6 +382,10 @@ class TabletCopier { } copying_replicas_by_tablet_id.erase(tablet_id); + if (!failed_tablet_ids.empty()) { + has_failed_tablets.store(true); + } + LOG(INFO) << Substitute("$0/$1 tablets, $2 bytes copied, include $3 failed tablets.", succeed_tablet_count + failed_tablet_ids.size(), total_tablet_count, @@ -396,6 +401,9 @@ class TabletCopier { latch.CountDown(); check_thread->Join(); + if (has_failed_tablets.load()) { + return Status::RuntimeError("some tablets failed to copy: check error messages for details"); + } return Status::OK(); }