This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new 24c5767bc2a branch-3.0: [Fix](compaction) Failed compaction tablets
should sleep before being selected again #50672 (#51030)
24c5767bc2a is described below
commit 24c5767bc2aebf590bed599fee4ed73a8486d3f6
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Tue May 20 09:29:09 2025 +0800
branch-3.0: [Fix](compaction) Failed compaction tablets should sleep before
being selected again #50672 (#51030)
Cherry-picked from #50672
Co-authored-by: abmdocrt <[email protected]>
---
be/src/cloud/cloud_cumulative_compaction.cpp | 16 +-
be/src/cloud/cloud_storage_engine.cpp | 15 +-
be/src/cloud/cloud_tablet_mgr.cpp | 9 +-
be/src/cloud/cloud_tablet_mgr.h | 3 +
be/src/common/status.h | 1 +
be/src/olap/cumulative_compaction.cpp | 4 +-
be/src/olap/tablet.cpp | 2 +-
be/test/cloud/cloud_compaction_test.cpp | 195 +++++++++++++++++++++++
regression-test/plugins/plugin_compaction.groovy | 2 +-
9 files changed, 228 insertions(+), 19 deletions(-)
diff --git a/be/src/cloud/cloud_cumulative_compaction.cpp
b/be/src/cloud/cloud_cumulative_compaction.cpp
index 5d6a519aa5a..d04155ab283 100644
--- a/be/src/cloud/cloud_cumulative_compaction.cpp
+++ b/be/src/cloud/cloud_cumulative_compaction.cpp
@@ -86,8 +86,8 @@ Status CloudCumulativeCompaction::prepare_compact() {
// NOTICE: after that, the cumulative point may be larger than max
version of this tablet, but it doesn't matter.
update_cumulative_point();
if (!config::enable_sleep_between_delete_cumu_compaction) {
- st = Status::Error<CUMULATIVE_NO_SUITABLE_VERSION>(
- "_last_delete_version.first not equal to -1");
+ st = Status::Error<CUMULATIVE_MEET_DELETE_VERSION>(
+ "cumulative compaction meet delete version");
}
}
return st;
@@ -154,7 +154,8 @@ Status CloudCumulativeCompaction::request_global_lock() {
LOG_WARNING("failed to prepare cumu compaction")
.tag("job_id", _uuid)
.tag("msg", resp.status().msg());
- return Status::Error<CUMULATIVE_NO_SUITABLE_VERSION>("no suitable
versions");
+ return Status::Error<CUMULATIVE_NO_SUITABLE_VERSION>(
+ "cumu no suitable versions: job tablet busy");
} else if (resp.status().code() == cloud::JOB_CHECK_ALTER_VERSION) {
(static_cast<CloudTablet*>(_tablet.get()))->set_alter_version(resp.alter_version());
std::stringstream ss;
@@ -483,7 +484,8 @@ Status CloudCumulativeCompaction::pick_rowsets_to_compact()
{
});
}
if (candidate_rowsets.empty()) {
- return Status::Error<CUMULATIVE_NO_SUITABLE_VERSION>("no suitable
versions");
+ return Status::Error<CUMULATIVE_NO_SUITABLE_VERSION>(
+ "no suitable versions: candidate rowsets empty");
}
std::sort(candidate_rowsets.begin(), candidate_rowsets.end(),
Rowset::comparator);
if (auto st = check_version_continuity(candidate_rowsets); !st.ok()) {
@@ -511,12 +513,14 @@ Status
CloudCumulativeCompaction::pick_rowsets_to_compact() {
&_last_delete_version, &compaction_score);
if (_input_rowsets.empty()) {
- return Status::Error<CUMULATIVE_NO_SUITABLE_VERSION>("no suitable
versions");
+ return Status::Error<CUMULATIVE_NO_SUITABLE_VERSION>(
+ "no suitable versions: input rowsets empty");
} else if (_input_rowsets.size() == 1 &&
!_input_rowsets.front()->rowset_meta()->is_segments_overlapping()) {
VLOG_DEBUG << "there is only one rowset and not overlapping.
tablet_id="
<< _tablet->tablet_id() << ", version=" <<
_input_rowsets.front()->version();
- return Status::Error<CUMULATIVE_NO_SUITABLE_VERSION>("no suitable
versions");
+ return Status::Error<CUMULATIVE_NO_SUITABLE_VERSION>(
+ "no suitable versions: only one rowset and not overlapping");
}
return Status::OK();
}
diff --git a/be/src/cloud/cloud_storage_engine.cpp
b/be/src/cloud/cloud_storage_engine.cpp
index 668880be6df..f19bf4f4e4d 100644
--- a/be/src/cloud/cloud_storage_engine.cpp
+++ b/be/src/cloud/cloud_storage_engine.cpp
@@ -738,12 +738,14 @@ Status
CloudStorageEngine::_submit_cumulative_compaction_task(const CloudTabletS
long now = duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
- if (st.is<ErrorCode::CUMULATIVE_NO_SUITABLE_VERSION>() &&
- st.msg() != "_last_delete_version.first not equal to -1") {
- // Backoff strategy if no suitable version
- tablet->last_cumu_no_suitable_version_ms = now;
+ if (!st.is<ErrorCode::CUMULATIVE_MEET_DELETE_VERSION>()) {
+ if (st.is<ErrorCode::CUMULATIVE_NO_SUITABLE_VERSION>()) {
+ // Backoff strategy if no suitable version
+ tablet->last_cumu_no_suitable_version_ms = now;
+ } else {
+ tablet->set_last_cumu_compaction_failure_time(now);
+ }
}
- tablet->set_last_cumu_compaction_failure_time(now);
std::lock_guard lock(_compaction_mtx);
_tablet_preparing_cumu_compaction.erase(tablet->tablet_id());
return st;
@@ -831,10 +833,9 @@ Status
CloudStorageEngine::_submit_cumulative_compaction_task(const CloudTabletS
if (_should_delay_large_task()) {
long now =
duration_cast<milliseconds>(system_clock::now().time_since_epoch())
.count();
+ // sleep 5s for this tablet
tablet->set_last_cumu_compaction_failure_time(now);
erase_executing_cumu_compaction();
- // sleep 5s for this tablet
- tablet->last_cumu_no_suitable_version_ms = now;
LOG_WARNING(
"failed to do CloudCumulativeCompaction, cumu
thread pool is "
"intensive, delay large task.")
diff --git a/be/src/cloud/cloud_tablet_mgr.cpp
b/be/src/cloud/cloud_tablet_mgr.cpp
index deab00c7ccf..4fcf5fed2f3 100644
--- a/be/src/cloud/cloud_tablet_mgr.cpp
+++ b/be/src/cloud/cloud_tablet_mgr.cpp
@@ -331,11 +331,13 @@ Status CloudTabletMgr::get_topn_tablets_to_compact(
auto now =
duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
auto skip = [now, compaction_type](CloudTablet* t) {
if (compaction_type == CompactionType::BASE_COMPACTION) {
- return now - t->last_base_compaction_success_time_ms <
config::base_compaction_freeze_interval_s * 1000;
+ return now - t->last_base_compaction_success_time_ms <
config::base_compaction_freeze_interval_s * 1000 ||
+ now - t->last_base_compaction_failure_time() <
config::min_compaction_failure_interval_ms;
}
// If tablet has too many rowsets but not be compacted for a long
time, compaction should be performed
// regardless of whether there is a load job recently.
- return now - t->last_cumu_no_suitable_version_ms <
config::min_compaction_failure_interval_ms ||
+ return now - t->last_cumu_compaction_failure_time() <
config::min_compaction_failure_interval_ms ||
+ now - t->last_cumu_no_suitable_version_ms <
config::min_compaction_failure_interval_ms ||
(now - t->last_load_time_ms >
config::cu_compaction_freeze_interval_s * 1000
&& now - t->last_cumu_compaction_success_time_ms <
config::cumu_compaction_interval_s * 1000
&& t->fetch_add_approximate_num_rowsets(0) <
config::max_tablet_version_num / 2);
@@ -481,4 +483,7 @@ void CloudTabletMgr::get_topn_tablet_delete_bitmap_score(
<< max_base_rowset_delete_bitmap_score_tablet_id << ",tablets=["
<< ss.str() << "]";
}
+// Unit-test-only hook: hands `tablet` straight to the manager's tablet map via
+// `_tablet_map->put`, so tests can register a hand-built CloudTablet.
+void CloudTabletMgr::put_tablet_for_UT(std::shared_ptr<CloudTablet> tablet) {
+ _tablet_map->put(tablet);
+}
} // namespace doris
diff --git a/be/src/cloud/cloud_tablet_mgr.h b/be/src/cloud/cloud_tablet_mgr.h
index 1a6ec72c1f7..ab56586cd88 100644
--- a/be/src/cloud/cloud_tablet_mgr.h
+++ b/be/src/cloud/cloud_tablet_mgr.h
@@ -87,6 +87,9 @@ public:
void get_topn_tablet_delete_bitmap_score(uint64_t* max_delete_bitmap_score,
uint64_t*
max_base_rowset_delete_bitmap_score);
+ // **ATTN: JUST FOR UT**
+ void put_tablet_for_UT(std::shared_ptr<CloudTablet> tablet);
+
private:
CloudStorageEngine& _engine;
diff --git a/be/src/common/status.h b/be/src/common/status.h
index d059f289402..d003645b258 100644
--- a/be/src/common/status.h
+++ b/be/src/common/status.h
@@ -245,6 +245,7 @@ namespace ErrorCode {
E(CUMULATIVE_MISS_VERSION, -2006, true); \
E(FULL_NO_SUITABLE_VERSION, -2008, false); \
E(FULL_MISS_VERSION, -2009, true); \
+ E(CUMULATIVE_MEET_DELETE_VERSION, -2010, false); \
E(META_INVALID_ARGUMENT, -3000, true); \
E(META_OPEN_DB_ERROR, -3001, true); \
E(META_KEY_NOT_FOUND, -3002, false); \
diff --git a/be/src/olap/cumulative_compaction.cpp
b/be/src/olap/cumulative_compaction.cpp
index bc71fdafbf6..18873f55dec 100644
--- a/be/src/olap/cumulative_compaction.cpp
+++ b/be/src/olap/cumulative_compaction.cpp
@@ -194,8 +194,8 @@ Status CumulativeCompaction::pick_rowsets_to_compact() {
.tag("tablet id:", tablet()->tablet_id())
.tag("after cumulative compaction, cumu point:",
tablet()->cumulative_layer_point());
- return Status::Error<CUMULATIVE_NO_SUITABLE_VERSION>(
- "_last_delete_version.first not equal to -1");
+ return Status::Error<CUMULATIVE_MEET_DELETE_VERSION>(
+ "cumulative compaction meet delete version");
}
// we did not meet any delete version. which means compaction_score is
not enough to do cumulative compaction.
diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp
index f3d1917e66a..2a0bb32396f 100644
--- a/be/src/olap/tablet.cpp
+++ b/be/src/olap/tablet.cpp
@@ -1715,7 +1715,7 @@ Status Tablet::prepare_compaction_and_calculate_permits(
permits = 0;
// if we meet a delete version, should increase the cumulative
point to let base compaction handle the delete version.
// no need to wait 5s.
- if (!(res.msg() == "_last_delete_version.first not equal to -1") ||
+ if (!res.is<ErrorCode::CUMULATIVE_MEET_DELETE_VERSION>() ||
config::enable_sleep_between_delete_cumu_compaction) {
tablet->set_last_cumu_compaction_failure_time(UnixMillis());
}
diff --git a/be/test/cloud/cloud_compaction_test.cpp
b/be/test/cloud/cloud_compaction_test.cpp
new file mode 100644
index 00000000000..c8db6739084
--- /dev/null
+++ b/be/test/cloud/cloud_compaction_test.cpp
@@ -0,0 +1,195 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gen_cpp/AgentService_types.h>
+#include <gen_cpp/olap_file.pb.h>
+#include <gtest/gtest-message.h>
+#include <gtest/gtest-test-part.h>
+#include <gtest/gtest.h>
+
+#include <memory>
+
+#include "cloud/cloud_storage_engine.h"
+#include "cloud/cloud_tablet.h"
+#include "cloud/cloud_tablet_mgr.h"
+#include "gtest/gtest_pred_impl.h"
+#include "json2pb/json_to_pb.h"
+#include "olap/olap_common.h"
+#include "olap/rowset/rowset_factory.h"
+#include "olap/rowset/rowset_meta.h"
+#include "olap/tablet_meta.h"
+#include "util/uid_util.h"
+
+namespace doris {
+class TabletMap;
+
+// Test fixture for the cloud compaction tablet-selection tests.
+//
+// Holds a CloudStorageEngine built from an empty initializer, a TabletMeta that
+// SetUp() recreates before every test, and a JSON rowset-meta template from
+// which the helpers below stamp out RowsetMeta objects.
+class CloudCompactionTest : public testing::Test {
+public:
+    // The constructor, SetUp/TearDown and the helpers must not be private:
+    // TEST_F generates a subclass of this fixture that constructs it and calls
+    // the helpers from the test body.
+    CloudCompactionTest() : _engine(CloudStorageEngine({})) {}
+
+    // Pins the compaction size/ratio configs, then rebuilds the shared tablet
+    // meta and the rowset-meta JSON template.
+    void SetUp() override {
+        config::compaction_promotion_size_mbytes = 1024;
+        config::compaction_promotion_ratio = 0.05;
+        config::compaction_promotion_min_size_mbytes = 64;
+        config::compaction_min_size_mbytes = 64;
+
+        _tablet_meta.reset(new TabletMeta(1, 2, 15673, 15674, 4, 5, TTabletSchema(), 6, {{7, 8}},
+                                          UniqueId(9, 10), TTabletType::TABLET_TYPE_DISK,
+                                          TCompressionType::LZ4F));
+
+        _json_rowset_meta = R"({
+            "rowset_id": 540081,
+            "tablet_id": 15673,
+            "txn_id": 4042,
+            "tablet_schema_hash": 567997577,
+            "rowset_type": "BETA_ROWSET",
+            "rowset_state": "VISIBLE",
+            "start_version": 2,
+            "end_version": 2,
+            "num_rows": 3929,
+            "total_disk_size": 41,
+            "data_disk_size": 41,
+            "index_disk_size": 235,
+            "empty": false,
+            "load_id": {
+                "hi": -5350970832824939812,
+                "lo": -6717994719194512122
+            },
+            "creation_time": 1553765670,
+            "num_segments": 3
+        })";
+    }
+    void TearDown() override {}
+
+    // Fills `pb1` from the JSON template, overriding the version range with
+    // [start, end] and the creation time with 10000, then sets its total disk
+    // size to 41 and attaches the fixture tablet's schema.
+    void init_rs_meta(RowsetMetaSharedPtr& pb1, int64_t start, int64_t end) {
+        RowsetMetaPB rowset_meta_pb;
+        json2pb::JsonToProtoMessage(_json_rowset_meta, &rowset_meta_pb);
+        rowset_meta_pb.set_start_version(start);
+        rowset_meta_pb.set_end_version(end);
+        rowset_meta_pb.set_creation_time(10000);
+
+        pb1->init_from_pb(rowset_meta_pb);
+        pb1->set_total_disk_size(41);
+        pb1->set_tablet_schema(_tablet_meta->tablet_schema());
+    }
+
+    // Appends five single-version rowset metas, [0-0] through [4-4], to
+    // `rs_metas` in ascending version order.
+    void init_rs_meta_small_base(std::vector<RowsetMetaSharedPtr>* rs_metas) {
+        for (int64_t version = 0; version <= 4; ++version) {
+            RowsetMetaSharedPtr rs_meta(new RowsetMeta());
+            init_rs_meta(rs_meta, version, version);
+            rs_metas->push_back(rs_meta);
+        }
+    }
+
+protected:
+    // JSON text that init_rs_meta() parses into a RowsetMetaPB.
+    std::string _json_rowset_meta;
+    // Tablet meta shared by the tablets the tests create; reset in SetUp().
+    TabletMetaSharedPtr _tablet_meta;
+
+public:
+    // Engine instance passed to the CloudTabletMgr and CloudTablet objects
+    // created by the tests.
+    CloudStorageEngine _engine;
+};
+
+// A tablet is offered for base compaction while its last base-compaction
+// failure time is 0, and is no longer offered right after a failure is recorded.
+TEST_F(CloudCompactionTest, failure_base_compaction_tablet_sleep_test) {
+    // Wall-clock time in milliseconds since the epoch.
+    auto unix_ms = [] {
+        return std::chrono::duration_cast<std::chrono::milliseconds>(
+                       std::chrono::system_clock::now().time_since_epoch())
+                .count();
+    };
+    // Caller-side filter that never rejects a tablet.
+    auto keep_all = [](CloudTablet* t) { return false; };
+
+    CloudTabletMgr tablet_mgr(_engine);
+
+    std::vector<RowsetMetaSharedPtr> rowset_metas;
+    init_rs_meta_small_base(&rowset_metas);
+
+    auto tablet = std::make_shared<CloudTablet>(_engine, _tablet_meta);
+    for (auto& meta : rowset_metas) {
+        static_cast<void>(_tablet_meta->add_rs_meta(meta));
+    }
+    tablet->tablet_meta()->_tablet_id = 10000;
+    // The failure time is first pushed 100000 ms into the past and then reset
+    // to 0; the value in effect for the first query below is 0.
+    tablet->set_last_base_compaction_failure_time(unix_ms() - 100000);
+    tablet->set_last_base_compaction_failure_time(0);
+    tablet->tablet_meta()->tablet_schema()->set_disable_auto_compaction(false);
+    tablet->_approximate_num_rowsets = 10;
+    tablet_mgr.put_tablet_for_UT(tablet);
+
+    int64_t max_score;
+    std::vector<std::shared_ptr<CloudTablet>> picked {};
+    Status status = tablet_mgr.get_topn_tablets_to_compact(1, CompactionType::BASE_COMPACTION,
+                                                           keep_all, &picked, &max_score);
+    ASSERT_EQ(status, Status::OK());
+    ASSERT_EQ(picked.size(), 1);
+
+    // Record a failure "just now" and query again: nothing should be picked.
+    tablet->set_last_base_compaction_failure_time(unix_ms());
+    status = tablet_mgr.get_topn_tablets_to_compact(1, CompactionType::BASE_COMPACTION, keep_all,
+                                                    &picked, &max_score);
+    ASSERT_EQ(status, Status::OK());
+    ASSERT_EQ(picked.size(), 0);
+}
+
+// A tablet is offered for cumulative compaction while its last cumulative
+// failure time is 0, and is no longer offered right after a cumulative failure
+// is recorded.
+TEST_F(CloudCompactionTest, failure_cumu_compaction_tablet_sleep_test) {
+    // Caller-side filter that never rejects a tablet.
+    auto filter_out = [](CloudTablet* t) { return false; };
+    CloudTabletMgr mgr(_engine);
+
+    std::vector<RowsetMetaSharedPtr> rs_metas;
+    init_rs_meta_small_base(&rs_metas);
+
+    CloudTabletSPtr tablet1 = std::make_shared<CloudTablet>(_engine, _tablet_meta);
+    for (auto& rs_meta : rs_metas) {
+        static_cast<void>(_tablet_meta->add_rs_meta(rs_meta));
+    }
+    tablet1->tablet_meta()->_tablet_id = 10000;
+    // Start with a failure time of 0. The earlier store of "now - 100000" was
+    // overwritten by this one before anything read it, so it has been dropped.
+    tablet1->set_last_cumu_compaction_failure_time(0);
+    tablet1->tablet_meta()->tablet_schema()->set_disable_auto_compaction(false);
+    tablet1->_approximate_cumu_num_deltas = 10;
+    mgr.put_tablet_for_UT(tablet1);
+
+    int64_t max_score = 0;
+    std::vector<std::shared_ptr<CloudTablet>> tablets {};
+    Status st = mgr.get_topn_tablets_to_compact(1, CompactionType::CUMULATIVE_COMPACTION,
+                                                filter_out, &tablets, &max_score);
+    ASSERT_EQ(st, Status::OK());
+    ASSERT_EQ(tablets.size(), 1);
+
+    tablet1->set_last_cumu_compaction_failure_time(
+            std::chrono::duration_cast<std::chrono::milliseconds>(
+                    std::chrono::system_clock::now().time_since_epoch())
+                    .count());
+    // Query CUMULATIVE_COMPACTION again. The back-off under test is keyed on
+    // last_cumu_compaction_failure_time, which the selection filter reads only
+    // on its cumulative path; asking for BASE_COMPACTION here would not
+    // exercise it.
+    st = mgr.get_topn_tablets_to_compact(1, CompactionType::CUMULATIVE_COMPACTION, filter_out,
+                                         &tablets, &max_score);
+    ASSERT_EQ(st, Status::OK());
+    ASSERT_EQ(tablets.size(), 0);
+}
+} // namespace doris
diff --git a/regression-test/plugins/plugin_compaction.groovy
b/regression-test/plugins/plugin_compaction.groovy
index 45dd99a97a3..b187fe4b178 100644
--- a/regression-test/plugins/plugin_compaction.groovy
+++ b/regression-test/plugins/plugin_compaction.groovy
@@ -106,7 +106,7 @@ Suite.metaClass.trigger_and_wait_compaction = { String
table_name, String compac
triggered_tablets.add(tablet) // compaction already in queue,
treat it as successfully triggered
} else if (!auto_compaction_disabled) {
// ignore the error if auto compaction enabled
- } else if (status_lower.contains("e-2000")) {
+ } else if (status_lower.contains("e-2000") ||
status_lower.contains("e-2010")) {
// ignore this tablet compaction.
} else if (ignored_errors.any { error ->
status_lower.contains(error.toLowerCase()) }) {
// ignore this tablet compaction if the error is in the
ignored_errors list
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]