KUDU-1764: truncate preallocated space off of full lbm containers The LBM maintains a preallocation window at the end of each container, but it erroneously leaves the window intact even as a container becomes full.
This patch addresses that in two ways: 1. The window is truncated when a container becomes full, and 2. It is truncated at startup after a full container is loaded. The first trims the window in real time, while the second handles both existing deployments and the edge case where a tserver has crashed mid-trim. The implementation uses total_bytes_written_ as the truncation offset, which means it still leaves last_block_size % filesystem_block_size bytes behind. This is equivalent to the internal fragmentation caused by aligning block offsets to the nearest filesystem block size, so I don't think it matters. I snuck in a change to remove the "filesystem does not support preallocation" behavior from block-manager-test; it's not possible to create an LBM on a filesystem sans hole punching; such a filesystem should support preallocation by definition. Change-Id: Ib9173f955e53d096bfd9d3ddacf4294846cff11a Reviewed-on: http://gerrit.cloudera.org:8080/5254 Tested-by: Kudu Jenkins Reviewed-by: Todd Lipcon <[email protected]> Tested-by: Todd Lipcon <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/kudu/repo Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/4aacaf6f Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/4aacaf6f Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/4aacaf6f Branch: refs/heads/master Commit: 4aacaf6fc57b26195634c09c0433a24f5d4a778e Parents: 8524516 Author: Adar Dembo <[email protected]> Authored: Mon Nov 28 17:31:29 2016 -0800 Committer: Todd Lipcon <[email protected]> Committed: Wed Nov 30 21:24:01 2016 +0000 ---------------------------------------------------------------------- src/kudu/fs/block_manager-test.cc | 65 +++++++++++++++++++++++++++------- src/kudu/fs/log_block_manager.cc | 39 ++++++++++++++++++++ 2 files changed, 92 insertions(+), 12 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/kudu/blob/4aacaf6f/src/kudu/fs/block_manager-test.cc ---------------------------------------------------------------------- diff --git a/src/kudu/fs/block_manager-test.cc b/src/kudu/fs/block_manager-test.cc index b484a4b..a854717 100644 --- a/src/kudu/fs/block_manager-test.cc +++ b/src/kudu/fs/block_manager-test.cc @@ -326,26 +326,22 @@ void BlockManagerTest<LogBlockManager>::RunLogContainerPreallocationTest() { ASSERT_OK(written_block->Append(kTestData)); ASSERT_OK(written_block->Close()); - // We expect the container size to either be equal to the test data size (if - // preallocation isn't supported) or equal to the preallocation amount, which - // we know is greater than the test data size. + // We expect the container size to be equal to the preallocation amount, + // which we know is greater than the test data size. string container_data_filename; NO_FATALS(GetOnlyContainerDataFile(&container_data_filename)); uint64_t size; ASSERT_OK(env_->GetFileSizeOnDisk(container_data_filename, &size)); - ASSERT_TRUE(size == kTestData.size() || - size == FLAGS_log_container_preallocate_bytes); + ASSERT_EQ(FLAGS_log_container_preallocate_bytes, size); - // Upon writing a second block, we'd expect the container to either double in - // size (without preallocation) or remain the same size (with preallocation). + // Upon writing a second block, we'd expect the container to remain the same + // size. ASSERT_OK(bm_->CreateBlock(&written_block)); ASSERT_OK(written_block->Append(kTestData)); ASSERT_OK(written_block->Close()); NO_FATALS(GetOnlyContainerDataFile(&container_data_filename)); ASSERT_OK(env_->GetFileSizeOnDisk(container_data_filename, &size)); - ASSERT_TRUE(size == kTestData.size() * 2 || - size == FLAGS_log_container_preallocate_bytes); - + ASSERT_EQ(FLAGS_log_container_preallocate_bytes, size); // Now reopen the block manager and create another block. The block manager // should be smart enough to reuse the previously preallocated amount. @@ -358,8 +354,7 @@ void BlockManagerTest<LogBlockManager>::RunLogContainerPreallocationTest() { ASSERT_OK(written_block->Close()); NO_FATALS(GetOnlyContainerDataFile(&container_data_filename)); ASSERT_OK(env_->GetFileSizeOnDisk(container_data_filename, &size)); - ASSERT_TRUE(size == kTestData.size() * 3 || - size == FLAGS_log_container_preallocate_bytes); + ASSERT_EQ(FLAGS_log_container_preallocate_bytes, size); } template <> @@ -1027,6 +1022,52 @@ TEST_F(LogBlockManagerTest, TestAppendExceedsPreallocation) { ASSERT_OK(writer->Append("hello world")); } +TEST_F(LogBlockManagerTest, TestPreallocationAndTruncation) { + RETURN_NOT_LOG_BLOCK_MANAGER(); + + // Ensure preallocation window is greater than the container size itself. + FLAGS_log_container_max_size = 1024 * 1024; + FLAGS_log_container_preallocate_bytes = 32 * 1024 * 1024; + + // Fill up one container. + gscoped_ptr<WritableBlock> writer; + ASSERT_OK(bm_->CreateBlock(&writer)); + unique_ptr<uint8_t[]> data(new uint8_t[FLAGS_log_container_max_size]); + memset(data.get(), 0, FLAGS_log_container_max_size); + ASSERT_OK(writer->Append({ data.get(), FLAGS_log_container_max_size } )); + string fname; + NO_FATALS(GetOnlyContainerDataFile(&fname)); + uint64_t size_after_append; + ASSERT_OK(env_->GetFileSize(fname, &size_after_append)); + ASSERT_EQ(FLAGS_log_container_preallocate_bytes, size_after_append); + + // Close it. The extra preallocated space should be truncated off the file. + ASSERT_OK(writer->Close()); + uint64_t size_after_close; + ASSERT_OK(env_->GetFileSize(fname, &size_after_close)); + ASSERT_EQ(FLAGS_log_container_max_size, size_after_close); + + // For the sake of testing, artificially double the file's size. + unique_ptr<RWFile> data_file; + RWFileOptions opts; + opts.mode = Env::OPEN_EXISTING; + ASSERT_OK(env_->NewRWFile(opts, fname, &data_file)); + ASSERT_OK(data_file->PreAllocate(size_after_close, size_after_close)); + uint64_t size_after_preallocate; + ASSERT_OK(env_->GetFileSize(fname, &size_after_preallocate)); + ASSERT_EQ(size_after_close * 2, size_after_preallocate); + + // Now reopen the block manager. It should notice that the container grew + // and truncate the extra preallocated space off again. + ASSERT_OK(ReopenBlockManager(scoped_refptr<MetricEntity>(), + shared_ptr<MemTracker>(), + { this->test_dir_ }, + false)); + uint64_t size_after_reopen; + ASSERT_OK(env_->GetFileSize(fname, &size_after_reopen)); + ASSERT_EQ(FLAGS_log_container_max_size, size_after_reopen); +} + TYPED_TEST(BlockManagerTest, TestDiskSpaceCheck) { // Reopen the block manager with metrics enabled. MetricRegistry registry; http://git-wip-us.apache.org/repos/asf/kudu/blob/4aacaf6f/src/kudu/fs/log_block_manager.cc ---------------------------------------------------------------------- diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc index 874a282..cce6d84 100644 --- a/src/kudu/fs/log_block_manager.cc +++ b/src/kudu/fs/log_block_manager.cc @@ -234,6 +234,16 @@ class LogBlockContainer { // TODO(unknown): Add support to synchronize just a range. Status SyncMetadata(); + // Truncates this container's data file to 'total_bytes_written_' if it is + // full. This effectively removes any preallocated but unused space. + // + // Should be called only when 'total_bytes_written_' is up-to-date with + // respect to the data on disk (i.e. after the container's records have + // been loaded), otherwise data may be lost! + // + // This function is thread unsafe. + Status TruncateDataToTotalBytesWritten(); + // Reads the container's metadata from disk, sanity checking and // returning the records. Status ReadContainerRecords(deque<BlockRecordPB>* records) const; @@ -438,6 +448,15 @@ Status LogBlockContainer::Open(LogBlockManager* block_manager, return Status::OK(); } +Status LogBlockContainer::TruncateDataToTotalBytesWritten() { + if (full() && preallocated_offset_ > total_bytes_written_) { + VLOG(2) << Substitute("Truncating container $0 to offset $1", + ToString(), total_bytes_written_); + RETURN_NOT_OK(data_file_->Truncate(total_bytes_written_)); + } + return Status::OK(); +} + Status LogBlockContainer::ReadContainerRecords(deque<BlockRecordPB>* records) const { string metadata_path = metadata_file_->filename(); unique_ptr<RandomAccessFile> metadata_reader; @@ -539,6 +558,14 @@ Status LogBlockContainer::FinishBlock(const Status& s, WritableBlock* block) { block->BytesAppended())); UpdateBytesWritten(0); + // Truncate the container if it's now full; any left over preallocated space + // is no longer needed. + // + // Note that this take places _after_ the container has been synced to disk. + // That's OK; truncation isn't needed for correctness, and in the event of a + // crash, it will be retried at startup. + RETURN_NOT_OK(TruncateDataToTotalBytesWritten()); + if (full() && block_manager()->metrics()) { block_manager()->metrics()->full_containers->Increment(); } @@ -1479,6 +1506,18 @@ void LogBlockManager::OpenDataDir(DataDir* dir, ProcessBlockRecord(r, container.get(), &blocks_in_container); max_block_id = std::max(max_block_id, r.block_id().id()); } + + // Having processed the block records, it is now safe to truncate the + // preallocated space off of the end of the container. This is a no-op for + // non-full containers, where excess preallocated space is expected to be + // (eventually) used. + s = container->TruncateDataToTotalBytesWritten(); + if (!s.ok()) { + *result_status = s.CloneAndPrepend(Substitute( + "Could not truncate container $0", container->ToString())); + return; + } + next_block_id_.StoreMax(max_block_id + 1); // Under the lock, merge this map into the main block map and add
