This is an automated email from the ASF dual-hosted git repository.
alexey pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/kudu.git
The following commit(s) were added to refs/heads/master by this push:
new 63d9c3926 KUDU-3734 include undo size while picking rowsets
63d9c3926 is described below
commit 63d9c39267bc7f35dc3a2e7535148b28088b229a
Author: Ashwani <[email protected]>
AuthorDate: Tue Aug 26 18:35:07 2025 +0530
KUDU-3734 include undo size while picking rowsets
As part of compaction improvement effort, this change is focussing on
taking into consideration the size of undo deltas while picking rowsets
during rowset compaction. I could not find any historical reason to why
it was not done before. Maybe there was some analysis done that ended
in a conclusion that considering undo deltas may not be right approach
when estimating upper bound fractional solution in the knapsack. The
size of undo deltas are taken into consideration while deciding whether
a new rowset (a potential compaction 'candidate') fits into the budget
and is at least denser than least dense candidate in knapsack.
The test data used here has significant undo deltas that fits perfectly
into the OOM scenario. Total size of uncompacted data is 29GB.
With this patch, rowset compaction never hit OOM and the resident
memory kept well within limits (~1GB).
Without this patch, compaction hit OOM on a node with limited memory
where rowset compaction peak memory touched ~30GB.
Testing with different scenarios:
1. Compaction that takes into consideration:
- Base data
- Redo deltas
- Undo delats
Budget for compaction (tablet_compaction_budget_mb) is 1024 MB
Result: Rowset 211 was skipped due to 1024 MB size constraint.
Budgeted compaction selection:
[ ] RowSet(211)( 1945M) [0.0000, 1.0000]
[ ] RowSet(217)( 1M) [0.0027, 0.1953]
[x] RowSet(216)( 1M) [0.5341, 0.5341]
[x] RowSet(218)( 1M) [0.5341, 0.5341]
[x] RowSet(219)( 1M) [0.5341, 0.5341]
2. Compaction that takes into consideration:
- Base data
- Redo deltas
- Undo delats
Budget for compaction (tablet_compaction_budget_mb) is 2048 MB
Result: Rowset 211 was NOT skipped with 2048 MB size limit.
Budgeted compaction selection:
[x] RowSet(211)( 1945M) [0.0000, 1.0000]
[x] RowSet(217)( 1M) [0.0027, 0.1953]
[x] RowSet(216)( 1M) [0.5341, 0.5341]
[x] RowSet(218)( 1M) [0.5341, 0.5341]
[x] RowSet(219)( 1M) [0.5341, 0.5341]
3. Compaction that takes into consideration:
- Base data
- Redo deltas
Compaction budget (tablet_compaction_budget_mb) is default 128 MB
Result: Rowset 211 with size 1M (ignoring UNDO deltas) included.
Budgeted compaction selection:
[x] RowSet(211)( 1M) [0.0000, 1.0000]
[x] RowSet(217)( 1M) [0.0026, 0.2015]
[x] RowSet(216)( 1M) [0.5358, 0.5385]
[x] RowSet(218)( 1M) [0.5385, 0.5404]
[x] RowSet(219)( 1M) [0.5404, 0.5404]
Note: This is different from rowset compaction batching effort.
Change-Id: I351c0ba96a02e6ded5153adf9d981083a8c40592
Reviewed-on: http://gerrit.cloudera.org:8080/23348
Tested-by: Alexey Serbin <[email protected]>
Reviewed-by: Alexey Serbin <[email protected]>
---
src/kudu/tablet/compaction_policy-test.cc | 3 ++-
src/kudu/tablet/compaction_policy.cc | 8 ++++----
src/kudu/tablet/diskrowset.cc | 13 +++++++++++--
src/kudu/tablet/diskrowset.h | 2 +-
src/kudu/tablet/memrowset.h | 4 ++--
src/kudu/tablet/mock-rowsets.h | 4 ++--
src/kudu/tablet/rowset.cc | 4 ++--
src/kudu/tablet/rowset.h | 10 +++++-----
src/kudu/tablet/rowset_info.cc | 16 ++++++++--------
src/kudu/tablet/rowset_info.h | 19 ++++++++++---------
src/kudu/tablet/tablet.cc | 7 ++++---
11 files changed, 51 insertions(+), 39 deletions(-)
diff --git a/src/kudu/tablet/compaction_policy-test.cc
b/src/kudu/tablet/compaction_policy-test.cc
index 2161e2483..dc650eeb0 100644
--- a/src/kudu/tablet/compaction_policy-test.cc
+++ b/src/kudu/tablet/compaction_policy-test.cc
@@ -285,7 +285,8 @@ TEST_F(TestCompactionPolicy, TestYcsbCompaction) {
LOG(INFO) << "quality=" << quality;
int total_size = 0;
for (const auto* rs : picked) {
- total_size += rs->OnDiskBaseDataSizeWithRedos() / 1024 / 1024;
+ total_size += static_cast<int>(
+ rs->OnDiskBaseDataSizeWithDeltas() / 1024 / 1024);
}
ASSERT_LE(total_size, budget_mb);
qualities.push_back(quality);
diff --git a/src/kudu/tablet/compaction_policy.cc
b/src/kudu/tablet/compaction_policy.cc
index a8451647e..6da9b205b 100644
--- a/src/kudu/tablet/compaction_policy.cc
+++ b/src/kudu/tablet/compaction_policy.cc
@@ -229,7 +229,7 @@ struct KnapsackTraits {
typedef const RowSetInfo* item_type;
typedef double value_type;
static int get_weight(item_type item) {
- return item->base_and_redos_size_mb();
+ return item->base_and_deltas_size_mb();
}
static value_type get_value(item_type item) {
return item->value();
@@ -284,11 +284,11 @@ class BoundCalculator {
std::push_heap(fractional_solution_.begin(),
fractional_solution_.end(),
compareByDescendingDensity);
- current_weight_ += candidate.base_and_redos_size_mb();
+ current_weight_ += candidate.base_and_deltas_size_mb();
current_value_ += candidate.value();
const RowSetInfo* top = fractional_solution_.front();
- while (current_weight_ - top->base_and_redos_size_mb() > max_weight_) {
- current_weight_ -= top->base_and_redos_size_mb();
+ while (current_weight_ - top->base_and_deltas_size_mb() > max_weight_) {
+ current_weight_ -= top->base_and_deltas_size_mb();
current_value_ -= top->value();
std::pop_heap(fractional_solution_.begin(),
fractional_solution_.end(),
diff --git a/src/kudu/tablet/diskrowset.cc b/src/kudu/tablet/diskrowset.cc
index 4fd157cd4..f0460c37f 100644
--- a/src/kudu/tablet/diskrowset.cc
+++ b/src/kudu/tablet/diskrowset.cc
@@ -82,8 +82,14 @@ DEFINE_bool(rowset_metadata_store_keys, false,
"metadata. If false, keys will be read from the data blocks.");
TAG_FLAG(rowset_metadata_store_keys, experimental);
+DEFINE_bool(rowset_deltas_size_include_undo, true,
+ "Whether to consider undo delta size while picking rowsets for "
+ "merge compaction. If false, old behavior holds i.e., only base "
+ "data and redo delta size will be taken into account during "
+ "computation of upper and lower bounds for a set of rowsets");
+TAG_FLAG(rowset_deltas_size_include_undo, advanced);
+
using kudu::cfile::BloomFileWriter;
-using kudu::fs::BlockManager;
using kudu::fs::BlockCreationTransaction;
using kudu::fs::CreateBlockOptions;
using kudu::fs::IOContext;
@@ -813,9 +819,12 @@ uint64_t DiskRowSet::OnDiskBaseDataColumnSize(const
ColumnId& col_id) const {
return 0;
}
-uint64_t DiskRowSet::OnDiskBaseDataSizeWithRedos() const {
+uint64_t DiskRowSet::OnDiskBaseDataSizeWithDeltas() const {
DiskRowSetSpace drss;
GetDiskRowSetSpaceUsage(&drss);
+ if (FLAGS_rowset_deltas_size_include_undo) {
+ return drss.base_data_size + drss.redo_deltas_size + drss.undo_deltas_size;
+ }
return drss.base_data_size + drss.redo_deltas_size;
}
diff --git a/src/kudu/tablet/diskrowset.h b/src/kudu/tablet/diskrowset.h
index 84a4d00df..b587f2cee 100644
--- a/src/kudu/tablet/diskrowset.h
+++ b/src/kudu/tablet/diskrowset.h
@@ -411,7 +411,7 @@ class DiskRowSet :
uint64_t OnDiskBaseDataColumnSize(const ColumnId& col_id) const override;
- uint64_t OnDiskBaseDataSizeWithRedos() const override;
+ uint64_t OnDiskBaseDataSizeWithDeltas() const override;
size_t DeltaMemStoreSize() const override;
diff --git a/src/kudu/tablet/memrowset.h b/src/kudu/tablet/memrowset.h
index 9fcba9fa2..5b3aa3278 100644
--- a/src/kudu/tablet/memrowset.h
+++ b/src/kudu/tablet/memrowset.h
@@ -270,11 +270,11 @@ class MemRowSet : public RowSet,
return 0;
}
- uint64_t OnDiskBaseDataSizeWithRedos() const override {
+ uint64_t OnDiskBaseDataSizeWithDeltas() const override {
return 0;
}
- uint64_t OnDiskBaseDataColumnSize(const ColumnId& col_id) const override {
+ uint64_t OnDiskBaseDataColumnSize(const ColumnId& /*col_id*/) const override
{
return 0;
}
diff --git a/src/kudu/tablet/mock-rowsets.h b/src/kudu/tablet/mock-rowsets.h
index 234efac14..d421ccb81 100644
--- a/src/kudu/tablet/mock-rowsets.h
+++ b/src/kudu/tablet/mock-rowsets.h
@@ -95,7 +95,7 @@ class MockRowSet : public RowSet {
LOG(FATAL) << "Unimplemented";
return 0;
}
- uint64_t OnDiskBaseDataSizeWithRedos() const override {
+ uint64_t OnDiskBaseDataSizeWithDeltas() const override {
LOG(FATAL) << "Unimplemented";
return 0;
}
@@ -213,7 +213,7 @@ class MockDiskRowSet : public MockRowSet {
return column_size_;
}
- uint64_t OnDiskBaseDataSizeWithRedos() const override {
+ uint64_t OnDiskBaseDataSizeWithDeltas() const override {
return size_;
}
diff --git a/src/kudu/tablet/rowset.cc b/src/kudu/tablet/rowset.cc
index 5d0113ce7..ebd1d8688 100644
--- a/src/kudu/tablet/rowset.cc
+++ b/src/kudu/tablet/rowset.cc
@@ -280,12 +280,12 @@ uint64_t
DuplicatingRowSet::OnDiskBaseDataColumnSize(const ColumnId& col_id) con
return size;
}
-uint64_t DuplicatingRowSet::OnDiskBaseDataSizeWithRedos() const {
+uint64_t DuplicatingRowSet::OnDiskBaseDataSizeWithDeltas() const {
// The actual value of this doesn't matter, since it won't be selected
// for compaction.
uint64_t size = 0;
for (const shared_ptr<RowSet> &rs : new_rowsets_) {
- size += rs->OnDiskBaseDataSizeWithRedos();
+ size += rs->OnDiskBaseDataSizeWithDeltas();
}
return size;
}
diff --git a/src/kudu/tablet/rowset.h b/src/kudu/tablet/rowset.h
index 7a31cf0cd..1b4f67881 100644
--- a/src/kudu/tablet/rowset.h
+++ b/src/kudu/tablet/rowset.h
@@ -202,9 +202,9 @@ class RowSet {
// Excludes bloomfiles and the ad hoc index.
virtual uint64_t OnDiskBaseDataSize() const = 0;
- // Return the size, in bytes, of this rowset's base data and REDO deltas.
- // Does not include bloomfiles, the ad hoc index, or UNDO deltas.
- virtual uint64_t OnDiskBaseDataSizeWithRedos() const = 0;
+ // Return the size, in bytes, of this rowset's base data, REDO and UNDO
deltas.
+ // Does not include bloomfiles or the ad hoc index.
+ virtual uint64_t OnDiskBaseDataSizeWithDeltas() const = 0;
// Return the size of this rowset's column in base data on disk, in bytes.
virtual uint64_t OnDiskBaseDataColumnSize(const ColumnId& col_id) const = 0;
@@ -446,8 +446,8 @@ class DuplicatingRowSet : public RowSet {
// Return the total size on-disk of this rowset's column data, in bytes.
uint64_t OnDiskBaseDataColumnSize(const ColumnId& col_id) const override;
- // Return the size, in bytes, of this rowset's data, not including UNDOs.
- uint64_t OnDiskBaseDataSizeWithRedos() const override;
+ // Return the size, in bytes, of this rowset's base data and deltas.
+ uint64_t OnDiskBaseDataSizeWithDeltas() const override;
std::string ToString() const override;
diff --git a/src/kudu/tablet/rowset_info.cc b/src/kudu/tablet/rowset_info.cc
index ecba52be8..61eda58ae 100644
--- a/src/kudu/tablet/rowset_info.cc
+++ b/src/kudu/tablet/rowset_info.cc
@@ -195,7 +195,7 @@ double WidthByDataSize(const Slice& prev, const Slice& next,
for (const auto& rs_rsi : active) {
double fraction = StringFractionInRange(rs_rsi.second, prev, next);
- weight += rs_rsi.second->base_and_redos_size_bytes() * fraction;
+ weight += rs_rsi.second->base_and_deltas_size_bytes() * fraction;
}
return weight;
@@ -469,11 +469,11 @@ RowSetInfo::RowSetInfo(RowSet* rs, double init_cdf)
density_(0.0),
extra_(new ExtraData()) {
extra_->rowset = rs;
- extra_->base_and_redos_size_bytes = rs->OnDiskBaseDataSizeWithRedos();
+ extra_->base_and_deltas_size_bytes = rs->OnDiskBaseDataSizeWithDeltas();
extra_->size_bytes = rs->OnDiskSize();
extra_->has_bounds = rs->GetBounds(&extra_->min_key, &extra_->max_key).ok();
- base_and_redos_size_mb_ =
- std::max(implicit_cast<int>(extra_->base_and_redos_size_bytes / 1024 /
1024),
+ base_and_deltas_size_mb_ =
+ std::max(implicit_cast<int>(extra_->base_and_deltas_size_bytes / 1024 /
1024),
kMinSizeMb);
}
@@ -484,22 +484,22 @@ uint64_t RowSetInfo::size_bytes(const ColumnId& col_id)
const {
void RowSetInfo::FinalizeCDFVector(double quot, vector<RowSetInfo>* vec) {
if (quot == 0) return;
for (RowSetInfo& cdf_rs : *vec) {
- CHECK_GT(cdf_rs.base_and_redos_size_mb_, 0)
+ CHECK_GT(cdf_rs.base_and_deltas_size_mb_, 0)
<< "Expected file size to be at least 1MB "
<< "for RowSet " << cdf_rs.rowset()->ToString()
- << ", was " << cdf_rs.base_and_redos_size_bytes()
+ << ", was " << cdf_rs.base_and_deltas_size_bytes()
<< " bytes.";
cdf_rs.cdf_min_key_ /= quot;
cdf_rs.cdf_max_key_ /= quot;
cdf_rs.value_ = ComputeRowsetValue(cdf_rs.width(),
cdf_rs.extra_->size_bytes);
- cdf_rs.density_ = cdf_rs.value_ / cdf_rs.base_and_redos_size_mb_;
+ cdf_rs.density_ = cdf_rs.value_ / cdf_rs.base_and_deltas_size_mb_;
}
}
string RowSetInfo::ToString() const {
string ret;
ret.append(rowset()->ToString());
- StringAppendF(&ret, "(% 3dM) [%.04f, %.04f]", base_and_redos_size_mb_,
+ StringAppendF(&ret, "(% 3dM) [%.04f, %.04f]", base_and_deltas_size_mb_,
cdf_min_key_, cdf_max_key_);
if (extra_->has_bounds) {
ret.append("
[").append(KUDU_REDACT(Slice(extra_->min_key).ToDebugString()));
diff --git a/src/kudu/tablet/rowset_info.h b/src/kudu/tablet/rowset_info.h
index 2ec8b4d31..3a7e40cd8 100644
--- a/src/kudu/tablet/rowset_info.h
+++ b/src/kudu/tablet/rowset_info.h
@@ -89,11 +89,11 @@ class RowSetInfo {
double mem_to_disk_ratio);
uint64_t size_bytes(const ColumnId& col_id) const;
- uint64_t base_and_redos_size_bytes() const {
- return extra_->base_and_redos_size_bytes;
+ uint64_t base_and_deltas_size_bytes() const {
+ return extra_->base_and_deltas_size_bytes;
}
uint64_t size_bytes() const { return extra_->size_bytes; }
- int base_and_redos_size_mb() const { return base_and_redos_size_mb_; }
+ int base_and_deltas_size_mb() const { return base_and_deltas_size_mb_; }
// Return the value of the CDF at the minimum key of this candidate.
double cdf_min_key() const { return cdf_min_key_; }
@@ -136,10 +136,11 @@ class RowSetInfo {
static void FinalizeCDFVector(double quot, std::vector<RowSetInfo>* vec);
- // The size of the base data and redos in MB, already clamped so that all
- // rowsets have size at least 1MB. This is cached to avoid the branch during
- // the selection hot path.
- int base_and_redos_size_mb_;
+ // The size (in MB) of the base data, redos and undos or just the base data
and
+ // redos (depending on whether --rowset_deltas_size_include_undo is enabled
or not),
+ // already clamped so that all rowsets have size at least 1MB. This is
cached to
+ // avoid the branch during the selection hot path.
+ int base_and_deltas_size_mb_;
double cdf_min_key_, cdf_max_key_;
@@ -162,8 +163,8 @@ class RowSetInfo {
//
// These are ref-counted so that RowSetInfo is copyable.
struct ExtraData : public RefCounted<ExtraData> {
- // Cached version of rowset_->OnDiskBaseDataSizeWithRedos().
- uint64_t base_and_redos_size_bytes;
+ // Cached version of rowset_->OnDiskBaseDataSizeWithDeltas().
+ uint64_t base_and_deltas_size_bytes;
// Cached version of rowset_->OnDiskSize().
uint64_t size_bytes;
diff --git a/src/kudu/tablet/tablet.cc b/src/kudu/tablet/tablet.cc
index 171fc7147..58517984c 100644
--- a/src/kudu/tablet/tablet.cc
+++ b/src/kudu/tablet/tablet.cc
@@ -107,8 +107,9 @@ DEFINE_bool(prevent_kudu_2233_corruption, true,
"Whether or not to prevent KUDU-2233 corruptions. Used for testing
only!");
TAG_FLAG(prevent_kudu_2233_corruption, unsafe);
-DEFINE_int32(tablet_compaction_budget_mb, 128,
- "Budget for a single compaction");
+DEFINE_int32(tablet_compaction_budget_mb, 256,
+ "Budget for a single compaction. Consider setting this back to
128 if "
+ "--rowset_deltas_size_include_undo is set to false.");
TAG_FLAG(tablet_compaction_budget_mb, experimental);
DEFINE_int32(tablet_bloom_block_size, 4096,
@@ -2244,7 +2245,7 @@ Status Tablet::DoMergeCompactionOrFlush(const
RowSetsInCompactionOrFlush &input,
if (input.num_rowsets() > 1) {
MAYBE_FAULT(FLAGS_fault_crash_before_flush_tablet_meta_after_compaction);
} else if (input.num_rowsets() == 1 &&
- input.rowsets()[0]->OnDiskBaseDataSizeWithRedos() == 0) {
+ input.rowsets()[0]->OnDiskBaseDataSizeWithDeltas() == 0) {
MAYBE_FAULT(FLAGS_fault_crash_before_flush_tablet_meta_after_flush_mrs);
}