This is an automated email from the ASF dual-hosted git repository.
gavinchou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new e5d4e989210 [fix](cloud) fix segment footer CORRUPTION not triggering
file cache retry (#61386)
e5d4e989210 is described below
commit e5d4e9892102a0c20994b9bf428d31d47adf804a
Author: Siyang Tang <[email protected]>
AuthorDate: Tue Mar 17 15:13:01 2026 +0800
[fix](cloud) fix segment footer CORRUPTION not triggering file cache retry
(#61386)
## Proposed changes
The three-tier retry logic in `Segment::_open()` (static method) was
structured as `if-else-if`, so when `open_file()` succeeded but
`_parse_footer()` returned `CORRUPTION`, the retry branch was
unreachable.
### Root cause
```cpp
// Before: if-else-if structure
auto st = fs->open_file(path, &file_reader, &reader_options);
if (st) { // open_file succeeded (almost
always)
segment->_file_reader = ...;
st = segment->_open(stats); // _parse_footer() → CORRUPTION
stored in st
} else if (st.is<CORRUPTION>() && ...) { // UNREACHABLE: already entered
`if` branch
// Tier 1: clear cache, retry
// Tier 2: bypass cache, read remote directly
}
RETURN_IF_ERROR(st); // CORRUPTION returned without any retry
```
`open_file()` only opens a file handle and rarely returns `CORRUPTION`.
The actual footer checksum validation happens inside `_parse_footer()`
(called via `segment->_open()`). Because the retry was in an `else if`
guarded by the same `st` from `open_file()`, it was never reachable for
the common `_parse_footer()` corruption case.
### Fix
Change `else if` to a separate `if` block, so CORRUPTION from either
`open_file()` or `_parse_footer()` triggers the three-tier retry:
```cpp
// After: independent if blocks
if (st) {
segment->_file_reader = ...;
st = segment->_open(stats); // _parse_footer() → CORRUPTION stored in
st
}
// NOW reachable regardless of where CORRUPTION came from
if (st.is<CORRUPTION>() && reader_options.cache_type == FILE_BLOCK_CACHE) {
// Tier 1: clear file cache, re-download from remote
// Tier 2: bypass cache entirely, read remote directly
// Tier 3: remote source itself corrupt (logs warning)
}
```
### Issue
Observed in cloud (S3) deployments: schema change fails with
`CORRUPTION: Bad segment file footer checksum not match`. Log analysis
confirmed that no retry log messages were ever emitted, consistent with
this code bug.
## Tests
- Added `TestFooterCorruptionTriggersRetry` to
`segment_corruption_test.cpp`
- Uses the existing `Segment::parse_footer:magic_number_corruption` sync
point to corrupt the footer magic number on the first `_parse_footer()`
call only (simulating file cache corruption)
- Verifies the segment opens successfully via the retry path
---
be/src/storage/segment/segment.cpp | 11 +++-
.../storage/segment/segment_corruption_test.cpp | 71 ++++++++++++++++++++++
2 files changed, 80 insertions(+), 2 deletions(-)
diff --git a/be/src/storage/segment/segment.cpp
b/be/src/storage/segment/segment.cpp
index 00acc2fc036..fdeeb3e752c 100644
--- a/be/src/storage/segment/segment.cpp
+++ b/be/src/storage/segment/segment.cpp
@@ -116,8 +116,13 @@ Status Segment::_open(io::FileSystemSPtr fs, const
std::string& path, uint32_t s
segment->_fs = fs;
segment->_file_reader = std::move(file_reader);
st = segment->_open(stats);
- } else if (st.is<ErrorCode::CORRUPTION>() &&
- reader_options.cache_type ==
io::FileCachePolicy::FILE_BLOCK_CACHE) {
+ }
+
+ // Three-tier retry for CORRUPTION errors when file cache is enabled.
+ // This handles CORRUPTION from both open_file() and _parse_footer() (via
_open()).
+ if (st.is<ErrorCode::CORRUPTION>() &&
+ reader_options.cache_type == io::FileCachePolicy::FILE_BLOCK_CACHE) {
+ // Tier 1: Clear file cache and retry with cache support (re-downloads
from remote).
LOG(WARNING) << "bad segment file may be read from file cache, try to
read remote source "
"file directly, file path: "
<< path << " cache_key: " << file_cache_key_str(path);
@@ -133,6 +138,7 @@ Status Segment::_open(io::FileSystemSPtr fs, const
std::string& path, uint32_t s
}
TEST_INJECTION_POINT_CALLBACK("Segment::open:corruption1", &st);
if (st.is<ErrorCode::CORRUPTION>()) { // corrupt again
+ // Tier 2: Bypass cache entirely and read directly from remote
storage.
LOG(WARNING) << "failed to try to read remote source file again
with cache support,"
<< " try to read from remote directly, "
<< " file path: " << path << " cache_key: " <<
file_cache_key_str(path);
@@ -146,6 +152,7 @@ Status Segment::_open(io::FileSystemSPtr fs, const
std::string& path, uint32_t s
segment->_file_reader = std::move(file_reader);
st = segment->_open(stats);
if (!st.ok()) {
+ // Tier 3: Remote source itself is corrupt.
LOG(WARNING) << "failed to try to read remote source file
directly,"
<< " file path: " << path
<< " cache_key: " << file_cache_key_str(path);
diff --git a/be/test/storage/segment/segment_corruption_test.cpp
b/be/test/storage/segment/segment_corruption_test.cpp
index 6f979ac531e..0bc03d2c36a 100644
--- a/be/test/storage/segment/segment_corruption_test.cpp
+++ b/be/test/storage/segment/segment_corruption_test.cpp
@@ -336,6 +336,77 @@ TEST_F(SegmentCorruptionTest,
TestFsSetInCorruptionRetryPath) {
// If we reach here, _fs was correctly set
}
+// Test that CORRUPTION from _parse_footer() (after successful open_file())
triggers the
+// three-tier retry logic. Before the fix, the retry was in an else-if branch
that was
+// only reachable when open_file() itself returned CORRUPTION, not when
_parse_footer()
+// did. This test verifies the fix by:
+// 1. Creating a valid segment
+// 2. Corrupting the footer magic number on the first parse attempt via sync
point
+// 3. Verifying the segment opens successfully after cache-bypass retry
+TEST_F(SegmentCorruptionTest, TestFooterCorruptionTriggersRetry) {
+ auto schema = create_schema_with_inverted_index();
+ RowsetId rowset_id;
+ rowset_id.init(3);
+
+ auto path = create_segment_with_inverted_index(schema, 0, rowset_id);
+ auto fs = io::global_local_filesystem();
+
+ // Use sync point to corrupt the magic number on the first _parse_footer()
call.
+ // This simulates reading corrupt data from file cache while the remote
file is fine.
+ auto* sp = SyncPoint::get_instance();
+ sp->enable_processing();
+
+ int parse_footer_count = 0;
+ SyncPoint::CallbackGuard guard;
+ sp->set_call_back(
+ "Segment::parse_footer:magic_number_corruption",
+ [&parse_footer_count](auto&& args) {
+ // Corrupt magic number only on the first attempt to simulate
cache corruption.
+ // Subsequent retries (which bypass or clear cache) will read
correct data.
+ if (parse_footer_count == 0) {
+ auto* buf = try_any_cast<uint8_t*>(args[0]);
+ // Corrupt the magic number (last 4 bytes of the 12-byte
trailer)
+ buf[8] = 0xFF;
+ buf[9] = 0xFF;
+ buf[10] = 0xFF;
+ buf[11] = 0xFF;
+ parse_footer_count++;
+ }
+ },
+ &guard);
+
+ std::shared_ptr<Segment> segment;
+ // Use FILE_BLOCK_CACHE to enable the corruption retry path
+ io::FileReaderOptions reader_options;
+ reader_options.cache_type = io::FileCachePolicy::FILE_BLOCK_CACHE;
+
+ auto st = Segment::open(fs, path, /*tablet_id=*/100, /*segment_id=*/0,
rowset_id, schema,
+ reader_options, &segment);
+
+ sp->disable_processing();
+
+ // Verify that the magic number was corrupted on the first attempt
+ ASSERT_EQ(parse_footer_count, 1) << "Footer corruption should have been
injected once";
+
+ // The segment should open successfully after retry (tier 1: clear cache +
retry)
+ ASSERT_TRUE(st.ok()) << st.to_string();
+ ASSERT_NE(segment, nullptr);
+
+ // Verify that _fs was correctly set through the retry path
+ OlapReaderStatistics stats;
+ StorageReadOptions read_options;
+ read_options.stats = &stats;
+
+ auto indexes = schema->inverted_indexs(schema->column(1));
+ ASSERT_FALSE(indexes.empty());
+ const TabletIndex* idx_meta = indexes[0];
+ std::unique_ptr<IndexIterator> iter;
+ st = segment->new_index_iterator(schema->column(1), idx_meta,
read_options, &iter);
+ st =
segment->_index_file_reader->init(config::inverted_index_read_buffer_size,
+ &read_options.io_ctx);
+ ASSERT_TRUE(st.ok()) << st.to_string();
+}
+
// Test normal segment open path
TEST_F(SegmentCorruptionTest, TestFsSetInNormalPath) {
auto schema = create_schema_with_inverted_index();
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]