Repository: kudu
Updated Branches:
  refs/heads/master 0a397b2f7 -> abea8c69f


block_manager: better preallocation in log block manager

As has been discussed to death, the LBM's preallocation strategy isn't great
because it yields one filesystem extent per created block. While multiple
Append() calls will write to the same extent, small blocks (i.e. those with
just a single Append()) suffer. We can do much better, and in this patch, we
do: a rolling preallocation window is maintained for each container.

Implementation-wise, I did what I could to restrict this knowledge to the
containers. But it did mean a small semantic change to UpdateBytesWritten()
to accomodate for the fact that Append() now updates total_bytes_written_.

As a quick test, I ran block_manager_stress-test before and after, using
filefrag to count the number of extents. The number of containers was the
same in both cases (64). Before, I counted 446 extents totaling 2.1Gb and
after I counted 72 extents totaling 1.4Gb.

Change-Id: Ie7dcba01aff125a68817ca91b3c306977c7104e6
Reviewed-on: http://gerrit.cloudera.org:8080/4848
Tested-by: Adar Dembo <[email protected]>
Reviewed-by: Jean-Daniel Cryans <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/kudu/commit/abea8c69
Tree: http://git-wip-us.apache.org/repos/asf/kudu/tree/abea8c69
Diff: http://git-wip-us.apache.org/repos/asf/kudu/diff/abea8c69

Branch: refs/heads/master
Commit: abea8c69f786cc884c50be810a7eee7200caec03
Parents: 0a397b2
Author: Adar Dembo <[email protected]>
Authored: Tue Oct 25 16:13:12 2016 -0700
Committer: Jean-Daniel Cryans <[email protected]>
Committed: Fri Nov 4 19:46:35 2016 +0000

----------------------------------------------------------------------
 src/kudu/fs/block_manager-test.cc |  80 ++++++++++++++++--------
 src/kudu/fs/log_block_manager.cc  | 107 +++++++++++++++++++++------------
 2 files changed, 124 insertions(+), 63 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/kudu/blob/abea8c69/src/kudu/fs/block_manager-test.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/block_manager-test.cc 
b/src/kudu/fs/block_manager-test.cc
index cdf3ebf..9aeb29e 100644
--- a/src/kudu/fs/block_manager-test.cc
+++ b/src/kudu/fs/block_manager-test.cc
@@ -121,6 +121,24 @@ class BlockManagerTest : public KuduTest {
     return bm_->Open();
   }
 
+  void GetOnlyContainerDataFile(string* data_file) {
+    // The expected directory contents are dot, dotdot, test metadata, instance
+    // file, and one container file pair.
+    string container_data_filename;
+    vector<string> children;
+    ASSERT_OK(env_->GetChildren(GetTestDataDirectory(), &children));
+    ASSERT_EQ(6, children.size());
+    for (const string& child : children) {
+      if (HasSuffixString(child, ".data")) {
+        ASSERT_TRUE(container_data_filename.empty());
+        container_data_filename = JoinPathSegments(GetTestDataDirectory(), 
child);
+        break;
+      }
+    }
+    ASSERT_FALSE(container_data_filename.empty());
+    *data_file = container_data_filename;
+  }
+
   void RunMultipathTest(const vector<string>& paths);
 
   void RunLogMetricsTest();
@@ -294,44 +312,53 @@ void 
BlockManagerTest<FileBlockManager>::RunLogContainerPreallocationTest() {
 
 template <>
 void BlockManagerTest<LogBlockManager>::RunLogContainerPreallocationTest() {
+  string kTestData = "test data";
+
+  // For this test to work properly, the preallocation window has to be at
+  // least three times the size of the test data.
+  ASSERT_GE(FLAGS_log_container_preallocate_bytes, kTestData.size() * 3);
+
   // Create a block with some test data. This should also trigger
   // preallocation of the container, provided it's supported by the kernel.
   gscoped_ptr<WritableBlock> written_block;
   ASSERT_OK(this->bm_->CreateBlock(&written_block));
+  ASSERT_OK(written_block->Append(kTestData));
   ASSERT_OK(written_block->Close());
 
-  // Now reopen the block manager and create another block. More
-  // preallocation, but it should be from the end of the previous block,
-  // not from the end of the file.
+  // We expect the container size to either be equal to the test data size (if
+  // preallocation isn't supported) or equal to the preallocation amount, which
+  // we know is greater than the test data size.
+  string container_data_filename;
+  NO_FATALS(GetOnlyContainerDataFile(&container_data_filename));
+  uint64_t size;
+  ASSERT_OK(this->env_->GetFileSizeOnDisk(container_data_filename, &size));
+  ASSERT_TRUE(size == kTestData.size() ||
+              size == FLAGS_log_container_preallocate_bytes);
+
+  // Upon writing a second block, we'd expect the container to either double in
+  // size (without preallocation) or remain the same size (with preallocation).
+  ASSERT_OK(this->bm_->CreateBlock(&written_block));
+  ASSERT_OK(written_block->Append(kTestData));
+  ASSERT_OK(written_block->Close());
+  NO_FATALS(GetOnlyContainerDataFile(&container_data_filename));
+  ASSERT_OK(this->env_->GetFileSizeOnDisk(container_data_filename, &size));
+  ASSERT_TRUE(size == kTestData.size() * 2 ||
+              size == FLAGS_log_container_preallocate_bytes);
+
+
+  // Now reopen the block manager and create another block. The block manager
+  // should be smart enough to reuse the previously preallocated amount.
   ASSERT_OK(this->ReopenBlockManager(scoped_refptr<MetricEntity>(),
                                      shared_ptr<MemTracker>(),
                                      { GetTestDataDirectory() },
                                      false));
   ASSERT_OK(this->bm_->CreateBlock(&written_block));
+  ASSERT_OK(written_block->Append(kTestData));
   ASSERT_OK(written_block->Close());
-
-  // dot, dotdot, test metadata, instance file, and one container file pair.
-  vector<string> children;
-  ASSERT_OK(this->env_->GetChildren(GetTestDataDirectory(), &children));
-  ASSERT_EQ(6, children.size());
-
-  // If preallocation was done from the end of the file (rather than the
-  // end of the previous block), the file's size would be twice the
-  // preallocation amount. That would be wrong.
-  //
-  // Instead, we expect the size to either be 0 (preallocation isn't
-  // supported) or equal to the preallocation amount.
-  bool found = false;
-  for (const string& child : children) {
-    if (HasSuffixString(child, ".data")) {
-      found = true;
-      uint64_t size;
-      ASSERT_OK(this->env_->GetFileSizeOnDisk(
-          JoinPathSegments(GetTestDataDirectory(), child), &size));
-      ASSERT_TRUE(size == 0 || size == FLAGS_log_container_preallocate_bytes);
-    }
-  }
-  ASSERT_TRUE(found);
+  NO_FATALS(GetOnlyContainerDataFile(&container_data_filename));
+  ASSERT_OK(this->env_->GetFileSizeOnDisk(container_data_filename, &size));
+  ASSERT_TRUE(size == kTestData.size() * 3 ||
+              size == FLAGS_log_container_preallocate_bytes);
 }
 
 template <>
@@ -991,6 +1018,7 @@ TYPED_TEST(BlockManagerTest, TestDiskSpaceCheck) {
 
   FLAGS_fs_data_dirs_full_disk_cache_seconds = 0; // Don't cache device 
fullness.
   FLAGS_fs_data_dirs_reserved_bytes = 1; // Keep at least 1 byte reserved in 
the FS.
+  FLAGS_log_container_preallocate_bytes = 0; // Disable preallocation.
 
   // Normally, a data dir is checked for fullness only after a block is closed;
   // if it's now full, the next attempt at block creation will fail. Only when

http://git-wip-us.apache.org/repos/asf/kudu/blob/abea8c69/src/kudu/fs/log_block_manager.cc
----------------------------------------------------------------------
diff --git a/src/kudu/fs/log_block_manager.cc b/src/kudu/fs/log_block_manager.cc
index 2688fa2..129766b 100644
--- a/src/kudu/fs/log_block_manager.cc
+++ b/src/kudu/fs/log_block_manager.cc
@@ -52,7 +52,7 @@
 DECLARE_bool(enable_data_block_fsync);
 DECLARE_bool(block_manager_lock_dirs);
 
-// TODO: How should this be configured? Should provide some guidance.
+// TODO(unknown): How should this be configured? Should provide some guidance.
 DEFINE_uint64(log_container_max_size, 10LU * 1024 * 1024 * 1024,
               "Maximum size (soft) of a log container");
 TAG_FLAG(log_container_max_size, advanced);
@@ -191,10 +191,10 @@ class LogBlockContainer {
   // The on-disk effects of this call are made durable only after SyncData().
   Status DeleteBlock(int64_t offset, int64_t length);
 
-  // Writes 'data' to this container's data file at offset 'offset'.
+  // Appends 'data' to this container's data file.
   //
   // The on-disk effects of this call are made durable only after SyncData().
-  Status WriteData(int64_t offset, const Slice& data);
+  Status AppendData(const Slice& data);
 
   // See RWFile::Read().
   Status ReadData(int64_t offset, size_t length,
@@ -215,25 +215,21 @@ class LogBlockContainer {
   //
   // Does not guarantee metadata durability; use SyncMetadata() for that.
   //
-  // TODO: Add support to just flush a range.
+  // TODO(unknown): Add support to just flush a range.
   Status FlushMetadata();
 
   // Synchronize this container's data file with the disk. On success,
   // guarantees that the data is made durable.
   //
-  // TODO: Add support to synchronize just a range.
+  // TODO(unknown): Add support to synchronize just a range.
   Status SyncData();
 
   // Synchronize this container's metadata file with the disk. On success,
   // guarantees that the metadata is made durable.
   //
-  // TODO: Add support to synchronize just a range.
+  // TODO(unknown): Add support to synchronize just a range.
   Status SyncMetadata();
 
-  // Ensure that 'length' bytes are preallocated in this container,
-  // beginning from the position where the last written block ended.
-  Status Preallocate(size_t length);
-
   // Reads the container's metadata from disk, sanity checking and
   // returning the records.
   Status ReadContainerRecords(deque<BlockRecordPB>* records) const;
@@ -274,6 +270,12 @@ class LogBlockContainer {
   void CheckBlockRecord(const BlockRecordPB& record,
                         uint64_t data_file_size) const;
 
+  // Preallocate enough space to ensure that an append of 'next_append_bytes'
+  // can be satisfied by this container.
+  //
+  // Does nothing if preallocation is disabled.
+  Status EnsurePreallocated(size_t next_append_bytes);
+
   // The owning block manager. Must outlive the container itself.
   LogBlockManager* const block_manager_;
 
@@ -424,12 +426,15 @@ Status LogBlockContainer::Open(LogBlockManager* 
block_manager,
   RETURN_NOT_OK(env->NewRWFile(rw_opts,
                                data_path,
                                &data_file));
+  uint64_t data_file_size;
+  RETURN_NOT_OK(data_file->Size(&data_file_size));
 
   // Create the in-memory container and populate it.
   gscoped_ptr<LogBlockContainer> open_container(new 
LogBlockContainer(block_manager,
                                                                       dir,
                                                                       
std::move(metadata_pb_writer),
                                                                       
std::move(data_file)));
+  open_container->preallocated_offset_ = data_file_size;
   VLOG(1) << "Opened log block container " << open_container->ToString();
   container->swap(open_container);
   return Status::OK();
@@ -512,8 +517,9 @@ void LogBlockContainer::CheckBlockRecord(const 
BlockRecordPB& record,
 
 Status LogBlockContainer::FinishBlock(const Status& s, WritableBlock* block) {
   auto cleanup = MakeScopedCleanup([&]() {
-      block_manager_->MakeContainerAvailable(this);
-    });
+    block_manager_->MakeContainerAvailable(this);
+  });
+
   if (!s.ok()) {
     // Early return; 'cleanup' makes the container available again.
     return s;
@@ -527,9 +533,14 @@ Status LogBlockContainer::FinishBlock(const Status& s, 
WritableBlock* block) {
   // will have written some garbage that can be expunged during a GC.
   RETURN_NOT_OK(block_manager()->SyncContainer(*this));
 
+  // Each call to AppendData() updated 'total_bytes_written_' to reflect the
+  // new block. Nevertheless, we must call UpdateBytesWritten() whenever a
+  // block is finished in order to prepare for the next block.
   CHECK(block_manager()->AddLogBlock(this, block->id(),
-                                     total_bytes_written(), 
block->BytesAppended()));
-  UpdateBytesWritten(block->BytesAppended());
+                                     total_bytes_written_ - 
block->BytesAppended(),
+                                     block->BytesAppended()));
+  UpdateBytesWritten(0);
+
   if (full() && block_manager()->metrics()) {
     block_manager()->metrics()->full_containers->Increment();
   }
@@ -557,12 +568,13 @@ Status LogBlockContainer::DeleteBlock(int64_t offset, 
int64_t length) {
   return Status::OK();
 }
 
-Status LogBlockContainer::WriteData(int64_t offset, const Slice& data) {
-  DCHECK_GE(offset, 0);
+Status LogBlockContainer::AppendData(const Slice& data) {
+  RETURN_NOT_OK(EnsurePreallocated(data.size()));
   {
     std::lock_guard<Mutex> l(data_file_lock_);
-    return data_file_->Write(offset, data);
+    RETURN_NOT_OK(data_file_->Write(total_bytes_written_, data));
   }
+  total_bytes_written_ += data.size();
 
   if (PREDICT_FALSE(!FLAGS_log_container_preallocate_bytes)) {
     // Without preallocation, every call grows the container.
@@ -614,13 +626,27 @@ Status LogBlockContainer::SyncMetadata() {
   return Status::OK();
 }
 
-Status LogBlockContainer::Preallocate(size_t length) {
-  RETURN_NOT_OK(data_file_->PreAllocate(total_bytes_written(), length));
-  preallocated_offset_ = total_bytes_written() + length;
+Status LogBlockContainer::EnsurePreallocated(size_t next_append_bytes) {
+  if (!FLAGS_log_container_preallocate_bytes) {
+    return Status::OK();
+  }
+  DCHECK_GE(preallocated_offset_, total_bytes_written_);
 
-  // Preallocation succeeded and the container has grown; recheck its data
-  // directory fullness.
-  RETURN_NOT_OK(data_dir_->RefreshIsFull(DataDir::RefreshMode::ALWAYS));
+  // If the next write exceeds the remaining preallocated window, we need to
+  // preallocate another chunk.
+  if (next_append_bytes > preallocated_offset_ - total_bytes_written_) {
+    int64_t off = preallocated_offset_;
+    int64_t len = FLAGS_log_container_preallocate_bytes;
+    {
+      std::lock_guard<Mutex> l(data_file_lock_);
+      RETURN_NOT_OK(data_file_->PreAllocate(off, len));
+    }
+    RETURN_NOT_OK(data_dir_->RefreshIsFull(DataDir::RefreshMode::ALWAYS));
+    VLOG(2) << Substitute("Preallocated $0 bytes at offset $1 in container $2",
+                          len, off, ToString());
+
+    preallocated_offset_ += len;
+  }
 
   return Status::OK();
 }
@@ -628,16 +654,18 @@ Status LogBlockContainer::Preallocate(size_t length) {
 void LogBlockContainer::UpdateBytesWritten(int64_t more_bytes) {
   DCHECK_GE(more_bytes, 0);
 
+  total_bytes_written_ += more_bytes;
+
   // The number of bytes is rounded up to the nearest filesystem block so
   // that each Kudu block is guaranteed to be on a filesystem block
   // boundary. This guarantees that the disk space can be reclaimed when
   // the block is deleted.
-  total_bytes_written_ += KUDU_ALIGN_UP(more_bytes,
-                                        
instance()->filesystem_block_size_bytes());
+  total_bytes_written_ = KUDU_ALIGN_UP(total_bytes_written_,
+                                       
instance()->filesystem_block_size_bytes());
   if (full()) {
-    VLOG(1) << "Container " << ToString() << " with size "
-            << total_bytes_written_ << " is now full, max size is "
-            << FLAGS_log_container_max_size;
+    VLOG(1) << Substitute(
+        "Container $0 with size $1 is now full, max size is $2",
+        ToString(), total_bytes_written_, FLAGS_log_container_max_size);
   }
 }
 
@@ -854,7 +882,7 @@ Status LogWritableBlock::Append(const Slice& data) {
   // length is still in flux.
 
   MicrosecondsInt64 start_time = GetMonoTimeMicros();
-  RETURN_NOT_OK(container_->WriteData(block_offset_ + block_length_, data));
+  RETURN_NOT_OK(container_->AppendData(data));
   MicrosecondsInt64 end_time = GetMonoTimeMicros();
 
   int64_t dur = end_time - start_time;
@@ -876,7 +904,7 @@ Status LogWritableBlock::FlushDataAsync() {
 
     RETURN_NOT_OK_PREPEND(AppendMetadata(), "Unable to append block metadata");
 
-    // TODO: Flush just the range we care about.
+    // TODO(unknown): Flush just the range we care about.
     RETURN_NOT_OK(container_->FlushMetadata());
   }
 
@@ -927,11 +955,11 @@ Status LogWritableBlock::DoClose(SyncMode mode) {
         (state_ == CLEAN || state_ == DIRTY || state_ == FLUSHING)) {
       VLOG(3) << "Syncing block " << id();
 
-      // TODO: Sync just this block's dirty data.
+      // TODO(unknown): Sync just this block's dirty data.
       s = container_->SyncData();
       RETURN_NOT_OK(s);
 
-      // TODO: Sync just this block's dirty metadata.
+      // TODO(unknown): Sync just this block's dirty metadata.
       s = container_->SyncMetadata();
       RETURN_NOT_OK(s);
     }
@@ -1176,9 +1204,6 @@ Status LogBlockManager::CreateBlock(const 
CreateBlockOptions& opts,
   // force callers to block if we've reached it?
   LogBlockContainer* container;
   RETURN_NOT_OK(GetOrCreateContainer(&container));
-  if (FLAGS_log_container_preallocate_bytes) {
-    
RETURN_NOT_OK(container->Preallocate(FLAGS_log_container_preallocate_bytes));
-  }
 
   // Generate a free block ID.
   // We have to loop here because earlier versions used non-sequential block 
IDs,
@@ -1230,7 +1255,7 @@ Status LogBlockManager::DeleteBlock(const BlockId& 
block_id) {
 
   // Record the on-disk deletion.
   //
-  // TODO: what if this fails? Should we restore the in-memory block?
+  // TODO(unknown): what if this fails? Should we restore the in-memory block?
   BlockRecordPB record;
   block_id.CopyToPB(record.mutable_block_id());
   record.set_op_type(DELETE);
@@ -1241,7 +1266,9 @@ Status LogBlockManager::DeleteBlock(const BlockId& 
block_id) {
   // We don't bother fsyncing the metadata append for deletes in order to avoid
   // the disk overhead. Even if we did fsync it, we'd still need to account for
   // garbage at startup time (in the event that we crashed just before the
-  // fsync). TODO: Implement GC of orphaned blocks. See KUDU-829.
+  // fsync).
+  //
+  // TODO(KUDU-829): Implement GC of orphaned blocks.
 
   return Status::OK();
 }
@@ -1378,6 +1405,9 @@ bool LogBlockManager::AddLogBlockUnlocked(const 
scoped_refptr<LogBlock>& lb) {
     return false;
   }
 
+  VLOG(2) << Substitute("Added block: offset $0, length $1",
+                        lb->offset(), lb->length());
+
   // There may already be an entry in open_block_ids_ (e.g. we just finished
   // writing out a block).
   open_block_ids_.erase(lb->block_id());
@@ -1393,6 +1423,9 @@ scoped_refptr<LogBlock> 
LogBlockManager::RemoveLogBlock(const BlockId& block_id)
   scoped_refptr<LogBlock> result =
       EraseKeyReturnValuePtr(&blocks_by_block_id_, block_id);
   if (result) {
+    VLOG(2) << Substitute("Removed block: offset $0, length $1",
+                          result->offset(), result->length());
+
     mem_tracker_->Release(kudu_malloc_usable_size(result.get()));
 
     if (metrics()) {

Reply via email to