This is an automated email from the ASF dual-hosted git repository.
hulk pushed a commit to branch unstable
in repository https://gitbox.apache.org/repos/asf/kvrocks.git
The following commit(s) were added to refs/heads/unstable by this push:
new 5cdbb195 Try to recover automatically from SST corruption background
error(#1667)
5cdbb195 is described below
commit 5cdbb1954657ba387490adeef2af361b744b21be
Author: Myth <[email protected]>
AuthorDate: Tue Aug 15 14:09:34 2023 +0800
Try to recover automatically from SST corruption background error(#1667)
When an SST file is corrupted, which is an unrecoverable error for
RocksDB, RocksDB goes into read-only mode
(https://github.com/facebook/rocksdb/wiki/Background-Error-Handling). The only
way to try to recover from the error is to restart RocksDB.
When does SST file corruption occur, and what does it look like? The error message looks like one of the following:
```
1. Corruption: Corrupt or unsupported format_version: 1005 in
/tmp/kvrocks/data/db/000038.sst
2. Corruption: Bad table magic number: expected 9863518390377041911, found
9863518390377041912 in /tmp/kvrocks_db/data/db/000038.sst
3. Corruption: block checksum mismatch: stored = 3308200672, computed =
51173877, type = 4 in /tmp/kvrocks_db/data/db/000038.sst offset 0 size 15715
```
The cause of the error is usually a hardware issue or a problem with the
network or cloud disk (when using the cloud disk).
The most common place we see this error is when a file is generated by
`Compaction` or `Flush` and the `Version` applies the result.
In this case, the result of the compaction is not actually applied, so we
can ignore the error and avoid restarting the rocksdb.
TiKV introduced this check for SST file corruption; for details, refer
to:
- https://github.com/tikv/tikv/issues/10578
- https://github.com/tikv/tikv/pull/10961
Let's try it on Kvrocks:
1. Extract the sst file from the background error message
2. Determine if it is a living file
3. If not, we ignore the error and force recovery from the background error
Before RocksDB v7.10.2, the background error message was less detailed, so we
could only recover from a limited number of errors. Thanks to
https://github.com/facebook/rocksdb/pull/11009, the error message has been
enriched and we can recover in more scenarios.
---
src/common/string_util.cc | 14 +++++++++
src/common/string_util.h | 1 +
src/storage/event_listener.cc | 65 ++++++++++++++++++++++++++-------------
tests/cppunit/string_util_test.cc | 15 +++++++++
4 files changed, 74 insertions(+), 21 deletions(-)
diff --git a/src/common/string_util.cc b/src/common/string_util.cc
index 1b3eb39e..b3004a3a 100644
--- a/src/common/string_util.cc
+++ b/src/common/string_util.cc
@@ -22,6 +22,7 @@
#include <fmt/format.h>
+#include <regex>
#include <string>
#include "parse_util.h"
@@ -214,6 +215,19 @@ int StringMatchLen(const char *pattern, size_t
pattern_len, const char *string,
return 0;
}
+std::vector<std::string> RegexMatch(const std::string &str, const std::string
&regex) {
+ std::regex base_regex(regex);
+ std::smatch pieces_match;
+ std::vector<std::string> out;
+
+ if (std::regex_match(str, pieces_match, base_regex)) {
+ for (const auto &piece : pieces_match) {
+ out.emplace_back(piece.str());
+ }
+ }
+ return out;
+}
+
std::string StringToHex(const std::string &input) {
static const char hex_digits[] = "0123456789ABCDEF";
std::string output;
diff --git a/src/common/string_util.h b/src/common/string_util.h
index dfb00706..2ebd7639 100644
--- a/src/common/string_util.h
+++ b/src/common/string_util.h
@@ -34,6 +34,7 @@ std::vector<std::string> Split2KV(const std::string &in,
const std::string &deli
bool HasPrefix(const std::string &str, const std::string &prefix);
int StringMatch(const std::string &pattern, const std::string &in, int nocase);
int StringMatchLen(const char *p, size_t plen, const char *s, size_t slen, int
nocase);
+std::vector<std::string> RegexMatch(const std::string &str, const std::string
&regex);
std::string StringToHex(const std::string &input);
std::vector<std::string> TokenizeRedisProtocol(const std::string &value);
std::string EscapeString(const std::string &s);
diff --git a/src/storage/event_listener.cc b/src/storage/event_listener.cc
index 46f399fc..6e054945 100644
--- a/src/storage/event_listener.cc
+++ b/src/storage/event_listener.cc
@@ -20,10 +20,20 @@
#include "event_listener.h"
-#include <map>
#include <string>
#include <vector>
+#include "fmt/format.h"
+
+std::string BackgroundErrorReason2String(const rocksdb::BackgroundErrorReason
reason) {
+ std::vector<std::string> background_error_reason = {
+ "flush", "compaction", "write_callback", "memtable", "manifest_write",
"flush_no_wal", "manifest_write_no_wal"};
+ if (static_cast<size_t>(reason) < background_error_reason.size()) {
+ return background_error_reason[static_cast<size_t>(reason)];
+ }
+ return "unknown";
+}
+
std::string FileCreatedReason2String(const rocksdb::TableFileCreationReason
reason) {
std::vector<std::string> file_created_reason = {"flush", "compaction",
"recovery", "misc"};
if (static_cast<size_t>(reason) < file_created_reason.size()) {
@@ -49,6 +59,14 @@ std::string CompressType2String(const
rocksdb::CompressionType type) {
return "unknown";
}
+std::string ExtractSSTFileNameFromError(const std::string &error) {
+ auto match_results = util::RegexMatch(error, ".*(/\\w*\\.sst).*");
+ if (match_results.size() == 2) {
+ return match_results[1];
+ }
+ return {};
+}
+
bool IsDiskQuotaExceeded(const rocksdb::Status &bg_error) {
// EDQUOT: Disk quota exceeded (POSIX.1-2001)
std::string exceeded_quota_str = "Disk quota exceeded";
@@ -58,7 +76,7 @@ bool IsDiskQuotaExceeded(const rocksdb::Status &bg_error) {
}
void EventListener::OnCompactionCompleted(rocksdb::DB *db, const
rocksdb::CompactionJobInfo &ci) {
- LOG(INFO) << "[event_listener/compaction_completed] column family: " <<
ci.cf_name
+ LOG(INFO) << "[event_listener/compaction_completed] column family: " <<
ci.cf_name << ", job_id: " << ci.job_id
<< ", compaction reason: " <<
static_cast<int>(ci.compaction_reason)
<< ", output compression type: " <<
CompressType2String(ci.compression)
<< ", base input level(files): " << ci.base_input_level << "(" <<
ci.input_files.size() << ")"
@@ -87,30 +105,35 @@ void EventListener::OnFlushCompleted(rocksdb::DB *db,
const rocksdb::FlushJobInf
}
void EventListener::OnBackgroundError(rocksdb::BackgroundErrorReason reason,
rocksdb::Status *bg_error) {
- std::string reason_str;
- switch (reason) {
- case rocksdb::BackgroundErrorReason::kCompaction:
- reason_str = "compact";
- break;
- case rocksdb::BackgroundErrorReason::kFlush:
- reason_str = "flush";
- break;
- case rocksdb::BackgroundErrorReason::kMemTable:
- reason_str = "memtable";
- break;
- case rocksdb::BackgroundErrorReason::kWriteCallback:
- reason_str = "writecallback";
- break;
- default:
- // Should not arrive here
- break;
+ auto reason_str = BackgroundErrorReason2String(reason);
+ auto error_str = bg_error->ToString();
+ if (bg_error->IsCorruption() || bg_error->IsIOError()) {
+ // Background error may occur when SST are generated during
flush/compaction. If those files are not applied
+ // to Version, we consider them non-fatal background error. We can
override bg_error to recover from
+ // background error.
+ // Note that we cannot call Resume() manually because the error severity
is unrecoverable.
+ auto corrupt_sst = ExtractSSTFileNameFromError(error_str);
+ if (!corrupt_sst.empty()) {
+ std::vector<std::string> live_files;
+ uint64_t manifest_size = 0;
+ auto s = storage_->GetDB()->GetLiveFiles(live_files, &manifest_size,
false /* flush_memtable */);
+ if (s.ok() && std::find(live_files.begin(), live_files.end(),
corrupt_sst) == live_files.end()) {
+ *bg_error = rocksdb::Status::OK();
+ LOG(WARNING) << fmt::format(
+ "[event_listener/background_error] ignore no-fatal background
error about sst file, reason: {}, bg_error: "
+ "{}",
+ reason_str, error_str);
+ return;
+ }
+ }
}
+
if ((bg_error->IsNoSpace() || IsDiskQuotaExceeded(*bg_error)) &&
bg_error->severity() < rocksdb::Status::kFatalError) {
storage_->SetDBInRetryableIOError(true);
}
- LOG(ERROR) << "[event_listener/background_error] reason: " << reason_str <<
", bg_error: " << bg_error->ToString();
+ LOG(ERROR) << fmt::format("[event_listener/background_error] reason: {},
bg_error: {}", reason_str, error_str);
}
void EventListener::OnTableFileDeleted(const rocksdb::TableFileDeletionInfo
&info) {
@@ -126,6 +149,6 @@ void EventListener::OnStallConditionsChanged(const
rocksdb::WriteStallInfo &info
void EventListener::OnTableFileCreated(const rocksdb::TableFileCreationInfo
&info) {
LOG(INFO) << "[event_listener/table_file_created] column family: " <<
info.cf_name
- << ", file path: " << info.file_path << ", file size: " <<
info.file_size << ", job id: " << info.job_id
+ << ", file path: " << info.file_path << ", file size: " <<
info.file_size << ", job_id: " << info.job_id
<< ", reason: " << FileCreatedReason2String(info.reason) << ",
status: " << info.status.ToString();
}
diff --git a/tests/cppunit/string_util_test.cc
b/tests/cppunit/string_util_test.cc
index a3092fed..f95ccbff 100644
--- a/tests/cppunit/string_util_test.cc
+++ b/tests/cppunit/string_util_test.cc
@@ -97,3 +97,18 @@ TEST(StringUtil, EscapeString) {
ASSERT_TRUE(util::EscapeString(origin) == escaped);
}
}
+
+TEST(StringUtil, RegexMatchExtractSSTFile) {
+ // Test for ExtractSSTFileNameFromError() in event_listener.cc
+ auto bg_error_str = {"Corruption: Corrupt or unsupported format_version:
1005 in /tmp/kvrocks/data/db/000038.sst",
+ "Corruption: Bad table magic number: expected
9863518390377041911, found 9863518390377041912 in "
+ "/tmp/kvrocks_db/data/db/000038.sst",
+ "Corruption: block checksum mismatch: stored =
3308200672, computed = 51173877, type = 4 in "
+ "/tmp/kvrocks_db/data/db/000038.sst offset 0 size
15715"};
+
+ for (const auto &str : bg_error_str) {
+ auto match_results = util::RegexMatch(str, ".*(/\\w*\\.sst).*");
+ ASSERT_TRUE(match_results.size() == 2);
+ ASSERT_TRUE(match_results[1] == "/000038.sst");
+ }
+}