Repository: kudu
Updated Branches:
  refs/heads/master 388dd2955 -> 6be01642b


[tools] Tombstone the tablet with "local_replica delete"

This change makes the default action of 'local_replica delete'
tool to tombstone the tablet. Release 1.1 shipped the tool
with "--clean_unsafe" flag which let you delete the replica
from the disks. Deleting removes consensus metadata for the given
tablet and hence voilates Raft vote durability requirements.
Tombstoning on the other hand will preserve Raft votes for the
specfied tablet on the local node, hence is a safer choice
to be the default action for this tool.

This adds the support for tombstone action for this tool
which was unsupported in previous release.

Change-Id: Ia773de475431eb85fb0dbe724d524e8dd59b1b12
Reviewed-on: http://gerrit.cloudera.org:8080/5191
Tested-by: Kudu Jenkins
Reviewed-by: Mike Percy <mpe...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/6be01642
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/6be01642
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/6be01642

Branch: refs/heads/master
Commit: 6be01642b25ecf7c407238537b51014a66a4f61f
Parents: 388dd29
Author: Dinesh Bhat <din...@cloudera.com>
Authored: Tue Nov 22 16:39:08 2016 -0800
Committer: Mike Percy <mpe...@apache.org>
Committed: Fri Dec 2 13:15:00 2016 +0000

----------------------------------------------------------------------
 src/kudu/tools/kudu-tool-test.cc            | 126 ++++++++++++++++-------
 src/kudu/tools/tool_action_local_replica.cc |  71 ++++++++++---
 2 files changed, 144 insertions(+), 53 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/6be01642/src/kudu/tools/kudu-tool-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/kudu-tool-test.cc b/src/kudu/tools/kudu-tool-test.cc
index 7d2e62a..ea01c66 100644
--- a/src/kudu/tools/kudu-tool-test.cc
+++ b/src/kudu/tools/kudu-tool-test.cc
@@ -299,7 +299,7 @@ TEST_F(ToolTest, TestModeHelp) {
         "cmeta.*Operate on a local tablet replica's consensus",
         "dump.*Dump a Kudu filesystem",
         "copy_from_remote.*Copy a tablet replica",
-        "delete.*Delete tablet replica from the local filesystem",
+        "delete.*Delete a tablet replica from the local filesystem",
         "list.*Show list of tablet replicas"
     };
     NO_FATALS(RunTestHelp("local_replica", kLocalReplicaModeRegexes));
@@ -1052,9 +1052,9 @@ TEST_F(ToolTest, TestRemoteReplicaCopy) {
                                    tablet::RUNNING, kTimeout));
 }
 
-// Test 'kudu local_replica delete' tool when the tablet server is offline.
+// Test 'kudu local_replica delete' tool with --clean_unsafe flag for
+// deleting the tablet from the tablet server.
 TEST_F(ToolTest, TestLocalReplicaDelete) {
-  MonoDelta kTimeout = MonoDelta::FromSeconds(30);
   NO_FATALS(StartMiniCluster());
 
   // TestWorkLoad.Setup() internally generates a table.
@@ -1067,21 +1067,6 @@ TEST_F(ToolTest, TestLocalReplicaDelete) {
   ASSERT_OK(mini_cluster_->WaitForTabletServerCount(1));
   MiniTabletServer* ts = mini_cluster_->mini_tablet_server(0);
 
-  // Grab the tablet_id to delete
-  ListTabletsRequestPB req;
-  ListTabletsResponsePB resp;
-  RpcController rpc;
-  rpc.set_timeout(kTimeout);
-  {
-    unique_ptr<TabletServerServiceProxy> ts_proxy;
-    ASSERT_OK(BuildProxy(ts->bound_rpc_addr().ToString(),
-                         tserver::TabletServer::kDefaultPort, &ts_proxy));
-    ASSERT_OK(ts_proxy->ListTablets(req, &resp, &rpc));
-  }
-  ASSERT_FALSE(resp.has_error());
-  ASSERT_EQ(resp.status_and_schema_size(), 1);
-  const string& tablet_id = 
resp.status_and_schema(0).tablet_status().tablet_id();
-
   // Generate some workload followed by a flush to grow the size of the tablet 
on disk.
   while (workload.rows_inserted() < 10000) {
     SleepFor(MonoDelta::FromMilliseconds(10));
@@ -1090,15 +1075,18 @@ TEST_F(ToolTest, TestLocalReplicaDelete) {
 
   // Make sure the tablet data is flushed to disk. This is needed
   // so that we can compare the size of the data on disk before and
-  // after the deletion of local_replica to check if the size-on-disk is 
reduced at all.
+  // after the deletion of local_replica to check if the size-on-disk
+  // is reduced at all.
+  string tablet_id;
   {
-    scoped_refptr<TabletPeer> tablet_peer;
-    ASSERT_TRUE(ts->server()->tablet_manager()->LookupTablet(tablet_id, 
&tablet_peer));
-    Tablet* tablet = tablet_peer->tablet();
+    vector<scoped_refptr<TabletPeer>> tablet_peers;
+    ts->server()->tablet_manager()->GetTabletPeers(&tablet_peers);
+    ASSERT_EQ(1, tablet_peers.size());
+    Tablet* tablet = tablet_peers[0]->tablet();
     ASSERT_OK(tablet->Flush());
+    tablet_id = tablet_peers[0]->tablet_id();
   }
   const string& tserver_dir = ts->options()->fs_opts.wal_path;
-
   // Using the delete tool with tablet server running fails.
   string stderr;
   Status s = RunTool(
@@ -1109,17 +1097,9 @@ TEST_F(ToolTest, TestLocalReplicaDelete) {
   SCOPED_TRACE(stderr);
   ASSERT_STR_CONTAINS(stderr, "Resource temporarily unavailable");
 
-  // Shut down tablet server and use the delete tool again.
+  // Shut down tablet server and use the delete tool with --clean_unsafe.
   ts->Shutdown();
 
-  // Run the tool without --clean_unsafe flag first.
-  s = RunTool(Substitute("local_replica delete $0 --fs_wal_dir=$1 
--fs_data_dirs=$1",
-                         tablet_id, tserver_dir),
-              nullptr, &stderr, nullptr, nullptr);
-  ASSERT_TRUE(s.IsRuntimeError());
-  SCOPED_TRACE(stderr);
-  ASSERT_STR_CONTAINS(stderr, "currently not supported without --clean_unsafe 
flag");
-
   const string& data_dir = JoinPathSegments(tserver_dir, "data");
   uint64_t size_before_delete;
   ASSERT_OK(env_->GetFileSizeOnDiskRecursively(data_dir, &size_before_delete));
@@ -1146,16 +1126,82 @@ TEST_F(ToolTest, TestLocalReplicaDelete) {
   // we can expect the tablet server to have nothing after it comes up.
   ASSERT_OK(ts->Start());
   ASSERT_OK(ts->WaitStarted());
-  rpc.Reset();
-  rpc.set_timeout(kTimeout);
+  vector<scoped_refptr<TabletPeer>> tablet_peers;
+  ts->server()->tablet_manager()->GetTabletPeers(&tablet_peers);
+  ASSERT_EQ(0, tablet_peers.size());
+}
+
+// Test 'kudu local_replica delete' tool for tombstoning the tablet.
+TEST_F(ToolTest, TestLocalReplicaTombstoneDelete) {
+  NO_FATALS(StartMiniCluster());
+
+  // TestWorkLoad.Setup() internally generates a table.
+  TestWorkload workload(mini_cluster_.get());
+  workload.set_num_replicas(1);
+  workload.Setup();
+  workload.Start();
+
+  // There is only one tserver in the minicluster.
+  ASSERT_OK(mini_cluster_->WaitForTabletServerCount(1));
+  MiniTabletServer* ts = mini_cluster_->mini_tablet_server(0);
+
+  // Generate some workload followed by a flush to grow the size of the tablet 
on disk.
+  while (workload.rows_inserted() < 10000) {
+    SleepFor(MonoDelta::FromMilliseconds(10));
+  }
+  workload.StopAndJoin();
+
+  // Make sure the tablet data is flushed to disk. This is needed
+  // so that we can compare the size of the data on disk before and
+  // after the deletion of local_replica to verify that the size-on-disk
+  // is reduced after the tool operation.
+  OpId last_logged_opid;
+  last_logged_opid.Clear();
+  string tablet_id;
   {
-    unique_ptr<TabletServerServiceProxy> ts_proxy;
-    ASSERT_OK(BuildProxy(ts->bound_rpc_addr().ToString(),
-                         tserver::TabletServer::kDefaultPort, &ts_proxy));
-    ASSERT_OK(ts_proxy->ListTablets(req, &resp, &rpc));
+    vector<scoped_refptr<TabletPeer>> tablet_peers;
+    ts->server()->tablet_manager()->GetTabletPeers(&tablet_peers);
+    ASSERT_EQ(1, tablet_peers.size());
+    tablet_id = tablet_peers[0]->tablet_id();
+    tablet_peers[0]->log()->GetLatestEntryOpId(&last_logged_opid);
+    Tablet* tablet = tablet_peers[0]->tablet();
+    ASSERT_OK(tablet->Flush());
+  }
+  const string& tserver_dir = ts->options()->fs_opts.wal_path;
+
+  // Shut down tablet server and use the delete tool.
+  ts->Shutdown();
+  const string& data_dir = JoinPathSegments(tserver_dir, "data");
+  uint64_t size_before_delete;
+  ASSERT_OK(env_->GetFileSizeOnDiskRecursively(data_dir, &size_before_delete));
+  NO_FATALS(RunActionStdoutNone(Substitute("local_replica delete $0 
--fs_wal_dir=$1 "
+                                           "--fs_data_dirs=$1",
+                                           tablet_id, tserver_dir)));
+  // Verify WAL segments for the tablet_id are gone and
+  // the data_dir size on tserver is reduced.
+  const string& wal_dir = JoinPathSegments(tserver_dir,
+                                           Substitute("wals/$0", tablet_id));
+  ASSERT_FALSE(env_->FileExists(wal_dir));
+  uint64_t size_after_delete;
+  ASSERT_OK(env_->GetFileSizeOnDiskRecursively(data_dir, &size_after_delete));
+  ASSERT_LT(size_after_delete, size_before_delete);
+
+  // Bring up the tablet server and verify that tablet is tombstoned and
+  // tombstone_last_logged_opid matches with the one test had cached earlier.
+  ASSERT_OK(ts->Start());
+  ASSERT_OK(ts->WaitStarted());
+  {
+    vector<scoped_refptr<TabletPeer>> tablet_peers;
+    ts->server()->tablet_manager()->GetTabletPeers(&tablet_peers);
+    ASSERT_EQ(1, tablet_peers.size());
+    ASSERT_EQ(tablet_id, tablet_peers[0]->tablet_id());
+    ASSERT_EQ(TabletDataState::TABLET_DATA_TOMBSTONED,
+              tablet_peers[0]->tablet_metadata()->tablet_data_state());
+    OpId tombstoned_opid = 
tablet_peers[0]->tablet_metadata()->tombstone_last_logged_opid();
+    ASSERT_TRUE(tombstoned_opid.IsInitialized());
+    ASSERT_EQ(last_logged_opid.term(), tombstoned_opid.term());
+    ASSERT_EQ(last_logged_opid.index(), tombstoned_opid.index());
   }
-  ASSERT_FALSE(resp.has_error());
-  ASSERT_EQ(resp.status_and_schema_size(), 0);
 }
 
 } // namespace tools

http://git-wip-us.apache.org/repos/asf/kudu/blob/6be01642/src/kudu/tools/tool_action_local_replica.cc
----------------------------------------------------------------------
diff --git a/src/kudu/tools/tool_action_local_replica.cc 
b/src/kudu/tools/tool_action_local_replica.cc
index 47e60aa..b788748 100644
--- a/src/kudu/tools/tool_action_local_replica.cc
+++ b/src/kudu/tools/tool_action_local_replica.cc
@@ -91,10 +91,12 @@ using cfile::CFileReader;
 using cfile::DumpIterator;
 using cfile::ReaderOptions;
 using consensus::ConsensusMetadata;
+using consensus::OpId;
 using consensus::RaftConfigPB;
 using consensus::RaftPeerPB;
-using consensus::ReplicateMsg;
 using fs::ReadableBlock;
+using log::LogEntryPB;
+using log::LogEntryReader;
 using log::LogIndex;
 using log::LogReader;
 using log::ReadableLogSegment;
@@ -160,6 +162,36 @@ Status ParseHostPortString(const string& hostport_str, 
HostPort* hostport) {
   return Status::OK();
 }
 
+// Find the last committed OpId for the tablet_id from the WAL.
+Status FindLastCommittedOpId(FsManager* fs, const string& tablet_id,
+                             OpId* last_committed_opid) {
+  shared_ptr<LogReader> reader;
+  RETURN_NOT_OK(LogReader::Open(fs, scoped_refptr<log::LogIndex>(), tablet_id,
+                                scoped_refptr<MetricEntity>(), &reader));
+  SegmentSequence segs;
+  RETURN_NOT_OK(reader->GetSegmentsSnapshot(&segs));
+  // Reverse iterate segments to find the first 'last committed' entry.
+  // Note that we still read the entries within a segment in sequential
+  // fashion, so the last entry within the 'found' segment will
+  // give us the last_committed_opid.
+  vector<scoped_refptr<ReadableLogSegment>>::reverse_iterator seg;
+  bool found = false;
+  for (seg = segs.rbegin(); seg != segs.rend(); ++seg) {
+    LogEntryReader reader(seg->get());
+    while (true) {
+      LogEntryPB entry;
+      Status s = reader.ReadNextEntry(&entry);
+      if (s.IsEndOfFile()) break;
+      RETURN_NOT_OK_PREPEND(s, "Error in log segment");
+      if (entry.type() != log::COMMIT) continue;
+      *last_committed_opid = entry.commit().commited_op_id();
+      found = true;
+    }
+    if (found) return Status::OK();
+  }
+  return Status::NotFound("Committed OpId not found in the log");
+}
+
 // Parses a colon-delimited string containing a uuid, hostname or IP address,
 // and port into its respective parts. For example,
 // "1c7f19e7ecad4f918c0d3d23180fdb18:localhost:12345" parses into
@@ -266,22 +298,34 @@ Status CopyFromRemote(const RunnerContext& context) {
 
 Status DeleteLocalReplica(const RunnerContext& context) {
   const string& tablet_id = FindOrDie(context.required_args, kTabletIdArg);
-  if (!FLAGS_clean_unsafe) {
-    return Status::NotSupported(Substitute("Deletion of local replica $0 is "
-        "currently not supported without --clean_unsafe flag", tablet_id));
-  }
   FsManager fs_manager(Env::Default(), FsManagerOpts());
   RETURN_NOT_OK(fs_manager.Open());
+  boost::optional<OpId> last_committed_opid = boost::none;
+  TabletDataState state = TabletDataState::TABLET_DATA_DELETED;
+  if (!FLAGS_clean_unsafe) {
+    state = TabletDataState::TABLET_DATA_TOMBSTONED;
+    // Tombstone the tablet. If we couldn't find the last committed OpId from
+    // the log, it's not an error. But if we receive any other error,
+    // indicate the user to delete with --clean_unsafe flag.
+    OpId opid;
+    Status s = FindLastCommittedOpId(&fs_manager, tablet_id, &opid);
+    if (s.ok()) {
+      last_committed_opid = opid;
+    } else if (s.IsNotFound()) {
+      LOG(INFO) << "Could not find last committed OpId from WAL, "
+                << "but proceeding with tablet tombstone: " << s.ToString();
+    } else {
+      LOG(ERROR) << "Error attempting to find last committed OpId in WAL: " << 
s.ToString();
+      LOG(ERROR) << "Cannot delete (tombstone) the tablet, use --clean_unsafe 
to delete"
+                 << " the tablet permanently from this server.";
+      return s;
+    }
+  }
 
-  // This action cleans up metadata/data/WAL for a tablet on this node when
-  // the tablet server is offline. i.e, this tool is an alternative to
-  // actions like 'rm -rf <tablet-meta>' which could orphan the
-  // tablet data blocks with no means to clean them up later.
+  // Force the specified tablet on this node to be in 'state'.
   scoped_refptr<TabletMetadata> meta;
   RETURN_NOT_OK(TabletMetadata::Load(&fs_manager, tablet_id, &meta));
-
-  RETURN_NOT_OK(TSTabletManager::DeleteTabletData(
-      meta, TabletDataState::TABLET_DATA_DELETED, boost::none));
+  RETURN_NOT_OK(TSTabletManager::DeleteTabletData(meta, state, 
last_committed_opid));
   return Status::OK();
 }
 
@@ -742,7 +786,8 @@ unique_ptr<Mode> BuildLocalReplicaMode() {
 
   unique_ptr<Action> delete_local_replica =
       ActionBuilder("delete", &DeleteLocalReplica)
-      .Description("Delete tablet replica from the local filesystem")
+      .Description("Delete a tablet replica from the local filesystem. "
+          "By default, leaves a tombstone record.")
       .AddRequiredParameter({ kTabletIdArg, kTabletIdArgDesc })
       .AddOptionalParameter("fs_wal_dir")
       .AddOptionalParameter("fs_data_dirs")

Reply via email to