This is an automated email from the ASF dual-hosted git repository.

alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git


The following commit(s) were added to refs/heads/master by this push:
     new 39b70ee1c KUDU-3371 Parameterize some options of RocksDB
39b70ee1c is described below

commit 39b70ee1ce0ee976b4c2a956ca3f82e4a0d47dca
Author: Yingchun Lai <[email protected]>
AuthorDate: Sun Jul 7 23:53:04 2024 +0800

    KUDU-3371 Parameterize some options of RocksDB
    
    There are many options in RocksDB for performance
    and capacity tuning. This patch exposes some of
    them as gflags so they can be tuned for Kudu's
    use case.
    
    Change-Id: I6a6da6868511fe69af528ca122fcaa98cfc9dac4
    Reviewed-on: http://gerrit.cloudera.org:8080/21570
    Tested-by: Yingchun Lai <[email protected]>
    Reviewed-by: Alexey Serbin <[email protected]>
---
 src/kudu/cfile/block_cache.cc |  16 ++++++
 src/kudu/fs/dir_manager.cc    | 113 ++++++++++++++++++++++++++++++++++++------
 2 files changed, 113 insertions(+), 16 deletions(-)

diff --git a/src/kudu/cfile/block_cache.cc b/src/kudu/cfile/block_cache.cc
index 9da714ca2..29b2c21b2 100644
--- a/src/kudu/cfile/block_cache.cc
+++ b/src/kudu/cfile/block_cache.cc
@@ -57,6 +57,10 @@ DEFINE_string(block_cache_type, "DRAM",
               "libmemkind 1.8.0 or newer must be available on the system; "
               "otherwise Kudu will crash.");
 
+#if !defined(NO_ROCKSDB)
+DECLARE_uint32(log_container_rdb_block_cache_capacity_mb);
+#endif
+
 using strings::Substitute;
 
 template <class T> class scoped_refptr;
@@ -112,6 +116,18 @@ bool ValidateBlockCacheCapacity() {
                                "--memory_limit_hard_bytes.",
                                capacity, mpt);
   }
+#if !defined(NO_ROCKSDB)
+  if (FLAGS_log_container_rdb_block_cache_capacity_mb >= 
FLAGS_block_cache_capacity_mb) {
+    LOG(WARNING) << Substitute("Block cache capacity for RocksDB which is used 
only for metadata "
+                               "is larger than that for data ($0 MB vs. $1 
MB). This may cause "
+                               "performance problems. Consider lowering "
+                               "--log_container_rdb_block_cache_capacity_mb or 
raising "
+                               "--block_cache_capacity_mb.",
+                               FLAGS_log_container_rdb_block_cache_capacity_mb,
+                               FLAGS_block_cache_capacity_mb);
+    return true;
+  }
+#endif
   return true;
 }
 
diff --git a/src/kudu/fs/dir_manager.cc b/src/kudu/fs/dir_manager.cc
index 520d1d302..c4474d34a 100644
--- a/src/kudu/fs/dir_manager.cc
+++ b/src/kudu/fs/dir_manager.cc
@@ -30,7 +30,11 @@
 #include <utility>
 #include <vector>
 
+#if !defined(NO_ROCKSDB)
+#include <gflags/gflags.h>
+#else
 #include <gflags/gflags_declare.h>
+#endif
 #include <glog/logging.h>
 #if !defined(NO_ROCKSDB)
 #include <rocksdb/cache.h>
@@ -50,6 +54,9 @@
 #include "kudu/gutil/strings/util.h"
 #include "kudu/util/env.h"
 #include "kudu/util/env_util.h"
+#if !defined(NO_ROCKSDB)
+#include "kudu/util/flag_tags.h"
+#endif
 #include "kudu/util/oid_generator.h"
 #include "kudu/util/path_util.h"
 #include "kudu/util/pb_util.h"
@@ -75,6 +82,84 @@ DECLARE_int32(fs_data_dirs_available_space_cache_seconds);
 DECLARE_int64(fs_data_dirs_reserved_bytes);
 DECLARE_string(block_manager);
 
+#if !defined(NO_ROCKSDB)
+DEFINE_double(log_container_rdb_bits_per_key, 9.9,
+              "Average number of bits allocated per key in RocksDB bloom 
filter, for details see "
+              "https://github.com/facebook/rocksdb/wiki/RocksDB-Bloom-Filter. 
It is only effective "
+              "when --block_manager='logr'");
+TAG_FLAG(log_container_rdb_bits_per_key, advanced);
+TAG_FLAG(log_container_rdb_bits_per_key, experimental);
+
+DEFINE_uint32(log_container_rdb_block_cache_capacity_mb, 10,
+              "The block cache capacity of RocksDB in MiB, it is shared by all 
RocksDB instances "
+              "in the process. It is only effective when 
--block_manager='logr'");
+TAG_FLAG(log_container_rdb_block_cache_capacity_mb, advanced);
+TAG_FLAG(log_container_rdb_block_cache_capacity_mb, experimental);
+
+DEFINE_uint32(log_container_rdb_max_background_jobs, 8,
+              "The maximum number of concurrent background jobs (compactions 
and flushes) shared "
+              "between RocksDB instances. It is only effective when 
--block_manager='logr'");
+TAG_FLAG(log_container_rdb_max_background_jobs, advanced);
+TAG_FLAG(log_container_rdb_max_background_jobs, experimental);
+
+DEFINE_uint32(log_container_rdb_max_write_buffer_number, 2,
+              "The maximum number of write buffers that are built up in memory 
of each RocksDB "
+              "instance. It is only effective when --block_manager='logr'");
+TAG_FLAG(log_container_rdb_max_write_buffer_number, advanced);
+TAG_FLAG(log_container_rdb_max_write_buffer_number, experimental);
+
+DEFINE_double(log_container_rdb_memtable_prefix_bloom_size_ratio, 0.1,
+              "Enables a dynamic bloom filter of RocksDB memtable to optimize 
many queries that "
+              "must go beyond the memtable if it is larger than 0. The size in 
bytes of the filter "
+              "is --log_container_rdb_write_buffer_size * "
+              "--log_container_rdb_memtable_prefix_bloom_size_ratio. It is 
only effective when "
+              "--block_manager='logr'");
+TAG_FLAG(log_container_rdb_memtable_prefix_bloom_size_ratio, advanced);
+TAG_FLAG(log_container_rdb_memtable_prefix_bloom_size_ratio, experimental);
+
+DEFINE_uint64(log_container_rdb_write_buffer_size, 64 << 20,
+              "The amount of data in RocksDB to build up in memory (backed by 
an unsorted log on "
+              "disk) before converting to a sorted on-disk file. It is only 
effective when "
+              "--block_manager='logr'");
+TAG_FLAG(log_container_rdb_write_buffer_size, advanced);
+TAG_FLAG(log_container_rdb_write_buffer_size, experimental);
+
+DEFINE_string(log_container_rdb_db_log_dir, "",
+              "This specifies the info log dir of RocksDB. If it is empty, the 
log files are in "
+              "the same dir as data (i.e. each dir in --fs_data_dirs). If it 
is not empty, the log "
+              "files will be in the specified dir, and the --fs_data_dirs 
absolute path will "
+              "be used as the log file name's prefixes. It is only effective 
when "
+              "--block_manager='logr'");
+TAG_FLAG(log_container_rdb_db_log_dir, advanced);
+TAG_FLAG(log_container_rdb_db_log_dir, experimental);
+
+DEFINE_uint64(log_container_rdb_max_log_file_size, 8 << 20,
+              "Maximum byte size of the RocksDB info log file. If the log file 
is larger "
+              "than specified, a new info log file will be created. If it is 
0, all logs will be "
+              "written to one log file. It is only effective when 
--block_manager='logr'");
+TAG_FLAG(log_container_rdb_max_log_file_size, advanced);
+TAG_FLAG(log_container_rdb_max_log_file_size, experimental);
+
+DEFINE_uint64(log_container_rdb_keep_log_file_num, 10,
+              "Maximum number of RocksDB info log files to keep. It is only 
effective when "
+              "--block_manager='logr'");
+TAG_FLAG(log_container_rdb_keep_log_file_num, advanced);
+TAG_FLAG(log_container_rdb_keep_log_file_num, experimental);
+
+DEFINE_uint64(log_container_rdb_max_manifest_file_size, 64 << 20,
+              "The RocksDB manifest file is rolled over on reaching this byte 
limit. It is only "
+              "effective when --block_manager='logr'");
+TAG_FLAG(log_container_rdb_max_manifest_file_size, advanced);
+TAG_FLAG(log_container_rdb_max_manifest_file_size, experimental);
+
+DEFINE_int32(log_container_rdb_level0_file_num_compaction_trigger, 4,
+             "Number of files to trigger level-0 compaction in RocksDB. A 
value <0 means that "
+             "level-0 compaction will not be triggered by the number of files 
at all. It is only "
+             "effective when --block_manager='logr'");
+TAG_FLAG(log_container_rdb_level0_file_num_compaction_trigger, advanced);
+TAG_FLAG(log_container_rdb_level0_file_num_compaction_trigger, experimental);
+#endif
+
 namespace kudu {
 namespace {
 // Wrapper for env_util::DeleteTmpFilesRecursively that is suitable for 
parallel
@@ -258,32 +343,28 @@ Status RdbDir::InitRocksDBInstance(bool newly_created) {
       opts.create_if_missing = false;
       opts.error_if_exists = false;
   }
-  // TODO(yingchun): parameterize more rocksDB options, including:
-  //  opts.use_fsync
-  //  opts.db_log_dir
-  //  opts.wal_dir
-  //  opts.max_log_file_size
-  //  opts.keep_log_file_num
-  //  opts.max_manifest_file_size
-  //  opts.max_background_jobs
-  //  opts.write_buffer_size
-  //  opts.level0_file_num_compaction_trigger
-  //  opts.max_write_buffer_number
+  opts.db_log_dir = FLAGS_log_container_rdb_db_log_dir;
+  opts.max_log_file_size = FLAGS_log_container_rdb_max_log_file_size;
+  opts.keep_log_file_num = FLAGS_log_container_rdb_keep_log_file_num;
+  opts.write_buffer_size = FLAGS_log_container_rdb_write_buffer_size;
+  opts.max_write_buffer_number = 
FLAGS_log_container_rdb_max_write_buffer_number;
+  opts.max_background_jobs = FLAGS_log_container_rdb_max_background_jobs;
+  opts.max_manifest_file_size = FLAGS_log_container_rdb_max_manifest_file_size;
+  opts.level0_file_num_compaction_trigger =
+      FLAGS_log_container_rdb_level0_file_num_compaction_trigger;
 
   static std::once_flag flag;
   std::call_once(flag, [&]() {
-    // TODO(yingchun): parameterize the rocksdb block cache size.
-    s_block_cache_ = rocksdb::NewLRUCache(10 << 20);
+    s_block_cache_ = 
rocksdb::NewLRUCache(FLAGS_log_container_rdb_block_cache_capacity_mb << 20);
   });
   rocksdb::BlockBasedTableOptions tbl_opts;
   tbl_opts.block_cache = s_block_cache_;
   tbl_opts.whole_key_filtering = false;
-  // TODO(yingchun): parameterize these options.
-  tbl_opts.filter_policy.reset(rocksdb::NewBloomFilterPolicy(9.9));
+  
tbl_opts.filter_policy.reset(rocksdb::NewBloomFilterPolicy(FLAGS_log_container_rdb_bits_per_key));
   opts.table_factory.reset(NewBlockBasedTableFactory(tbl_opts));
   // Take advantage of Prefix-Seek, see 
https://github.com/facebook/rocksdb/wiki/Prefix-Seek.
   
opts.prefix_extractor.reset(rocksdb::NewFixedPrefixTransform(ObjectIdGenerator::IdLength()));
-  opts.memtable_prefix_bloom_size_ratio = 0.1;
+  opts.memtable_prefix_bloom_size_ratio = 
FLAGS_log_container_rdb_memtable_prefix_bloom_size_ratio;
 
   rdb_dir_ = JoinPathSegments(dir_, kRocksDBDirName);
   rocksdb::DB* db_temp = nullptr;

Reply via email to