This is an automated email from the ASF dual-hosted git repository.

gavinchou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new e5d4e989210 [fix](cloud) fix segment footer CORRUPTION not triggering 
file cache retry (#61386)
e5d4e989210 is described below

commit e5d4e9892102a0c20994b9bf428d31d47adf804a
Author: Siyang Tang <[email protected]>
AuthorDate: Tue Mar 17 15:13:01 2026 +0800

    [fix](cloud) fix segment footer CORRUPTION not triggering file cache retry 
(#61386)
    
    ## Proposed changes
    
    The three-tier retry logic in `Segment::_open()` (static method) was
    structured as `if-else-if`, so when `open_file()` succeeded but
    `_parse_footer()` returned `CORRUPTION`, the retry branch was
    unreachable.
    
    ### Root cause
    
    ```cpp
    // Before: if-else-if structure
    auto st = fs->open_file(path, &file_reader, &reader_options);
    if (st) {                                  // open_file succeeded (almost 
always)
        segment->_file_reader = ...;
        st = segment->_open(stats);            // _parse_footer() → CORRUPTION 
stored in st
    } else if (st.is<CORRUPTION>() && ...) {   // UNREACHABLE: already entered 
`if` branch
        // Tier 1: clear cache, retry
        // Tier 2: bypass cache, read remote directly
    }
    RETURN_IF_ERROR(st);  // CORRUPTION returned without any retry
    ```
    
    `open_file()` only opens a file handle and rarely returns `CORRUPTION`.
    The actual footer checksum validation happens inside `_parse_footer()`
    (called via `segment->_open()`). Because the retry was in an `else if`
    guarded by the same `st` from `open_file()`, it was never reachable for
    the common `_parse_footer()` corruption case.
    
    ### Fix
    
    Change `else if` to a separate `if` block, so CORRUPTION from either
    `open_file()` or `_parse_footer()` triggers the three-tier retry:
    
    ```cpp
    // After: independent if blocks
    if (st) {
        segment->_file_reader = ...;
        st = segment->_open(stats);   // _parse_footer() → CORRUPTION stored in 
st
    }
    // NOW reachable regardless of where CORRUPTION came from
    if (st.is<CORRUPTION>() && reader_options.cache_type == FILE_BLOCK_CACHE) {
        // Tier 1: clear file cache, re-download from remote
        // Tier 2: bypass cache entirely, read remote directly
        // Tier 3: remote source itself corrupt (logs warning)
    }
    ```
    
    ### Issue
    
    Observed in cloud (S3) deployments: schema change fails with
    `CORRUPTION: Bad segment file footer checksum not match`. Log analysis
    confirmed that no retry log messages were ever emitted, consistent with
    this code bug.
    
    ## Tests
    
    - Added `TestFooterCorruptionTriggersRetry` to
    `segment_corruption_test.cpp`
    - Uses the existing `Segment::parse_footer:magic_number_corruption` sync
    point to corrupt the footer magic number on the first `_parse_footer()`
    call only (simulating file cache corruption)
    - Verifies the segment opens successfully via the retry path
---
 be/src/storage/segment/segment.cpp                 | 11 +++-
 .../storage/segment/segment_corruption_test.cpp    | 71 ++++++++++++++++++++++
 2 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/be/src/storage/segment/segment.cpp 
b/be/src/storage/segment/segment.cpp
index 00acc2fc036..fdeeb3e752c 100644
--- a/be/src/storage/segment/segment.cpp
+++ b/be/src/storage/segment/segment.cpp
@@ -116,8 +116,13 @@ Status Segment::_open(io::FileSystemSPtr fs, const 
std::string& path, uint32_t s
         segment->_fs = fs;
         segment->_file_reader = std::move(file_reader);
         st = segment->_open(stats);
-    } else if (st.is<ErrorCode::CORRUPTION>() &&
-               reader_options.cache_type == 
io::FileCachePolicy::FILE_BLOCK_CACHE) {
+    }
+
+    // Three-tier retry for CORRUPTION errors when file cache is enabled.
+    // This handles CORRUPTION from both open_file() and _parse_footer() (via 
_open()).
+    if (st.is<ErrorCode::CORRUPTION>() &&
+        reader_options.cache_type == io::FileCachePolicy::FILE_BLOCK_CACHE) {
+        // Tier 1: Clear file cache and retry with cache support (re-downloads 
from remote).
         LOG(WARNING) << "bad segment file may be read from file cache, try to 
read remote source "
                         "file directly, file path: "
                      << path << " cache_key: " << file_cache_key_str(path);
@@ -133,6 +138,7 @@ Status Segment::_open(io::FileSystemSPtr fs, const 
std::string& path, uint32_t s
         }
         TEST_INJECTION_POINT_CALLBACK("Segment::open:corruption1", &st);
         if (st.is<ErrorCode::CORRUPTION>()) { // corrupt again
+            // Tier 2: Bypass cache entirely and read directly from remote 
storage.
             LOG(WARNING) << "failed to try to read remote source file again 
with cache support,"
                          << " try to read from remote directly, "
                          << " file path: " << path << " cache_key: " << 
file_cache_key_str(path);
@@ -146,6 +152,7 @@ Status Segment::_open(io::FileSystemSPtr fs, const 
std::string& path, uint32_t s
             segment->_file_reader = std::move(file_reader);
             st = segment->_open(stats);
             if (!st.ok()) {
+                // Tier 3: Remote source itself is corrupt.
                 LOG(WARNING) << "failed to try to read remote source file 
directly,"
                              << " file path: " << path
                              << " cache_key: " << file_cache_key_str(path);
diff --git a/be/test/storage/segment/segment_corruption_test.cpp 
b/be/test/storage/segment/segment_corruption_test.cpp
index 6f979ac531e..0bc03d2c36a 100644
--- a/be/test/storage/segment/segment_corruption_test.cpp
+++ b/be/test/storage/segment/segment_corruption_test.cpp
@@ -336,6 +336,77 @@ TEST_F(SegmentCorruptionTest, 
TestFsSetInCorruptionRetryPath) {
     // If we reach here, _fs was correctly set
 }
 
+// Test that CORRUPTION from _parse_footer() (after successful open_file()) 
triggers the
+// three-tier retry logic. Before the fix, the retry was in an else-if branch 
that was
+// only reachable when open_file() itself returned CORRUPTION, not when 
_parse_footer()
+// did. This test verifies the fix by:
+// 1. Creating a valid segment
+// 2. Corrupting the footer magic number on the first parse attempt via sync 
point
+// 3. Verifying the segment opens successfully after cache-bypass retry
+TEST_F(SegmentCorruptionTest, TestFooterCorruptionTriggersRetry) {
+    auto schema = create_schema_with_inverted_index();
+    RowsetId rowset_id;
+    rowset_id.init(3);
+
+    auto path = create_segment_with_inverted_index(schema, 0, rowset_id);
+    auto fs = io::global_local_filesystem();
+
+    // Use sync point to corrupt the magic number on the first _parse_footer() 
call.
+    // This simulates reading corrupt data from file cache while the remote 
file is fine.
+    auto* sp = SyncPoint::get_instance();
+    sp->enable_processing();
+
+    int parse_footer_count = 0;
+    SyncPoint::CallbackGuard guard;
+    sp->set_call_back(
+            "Segment::parse_footer:magic_number_corruption",
+            [&parse_footer_count](auto&& args) {
+                // Corrupt magic number only on the first attempt to simulate 
cache corruption.
+                // Subsequent retries (which bypass or clear cache) will read 
correct data.
+                if (parse_footer_count == 0) {
+                    auto* buf = try_any_cast<uint8_t*>(args[0]);
+                    // Corrupt the magic number (last 4 bytes of the 12-byte 
trailer)
+                    buf[8] = 0xFF;
+                    buf[9] = 0xFF;
+                    buf[10] = 0xFF;
+                    buf[11] = 0xFF;
+                    parse_footer_count++;
+                }
+            },
+            &guard);
+
+    std::shared_ptr<Segment> segment;
+    // Use FILE_BLOCK_CACHE to enable the corruption retry path
+    io::FileReaderOptions reader_options;
+    reader_options.cache_type = io::FileCachePolicy::FILE_BLOCK_CACHE;
+
+    auto st = Segment::open(fs, path, /*tablet_id=*/100, /*segment_id=*/0, 
rowset_id, schema,
+                            reader_options, &segment);
+
+    sp->disable_processing();
+
+    // Verify that the magic number was corrupted on the first attempt
+    ASSERT_EQ(parse_footer_count, 1) << "Footer corruption should have been 
injected once";
+
+    // The segment should open successfully after retry (tier 1: clear cache + 
retry)
+    ASSERT_TRUE(st.ok()) << st.to_string();
+    ASSERT_NE(segment, nullptr);
+
+    // Verify that _fs was correctly set through the retry path
+    OlapReaderStatistics stats;
+    StorageReadOptions read_options;
+    read_options.stats = &stats;
+
+    auto indexes = schema->inverted_indexs(schema->column(1));
+    ASSERT_FALSE(indexes.empty());
+    const TabletIndex* idx_meta = indexes[0];
+    std::unique_ptr<IndexIterator> iter;
+    st = segment->new_index_iterator(schema->column(1), idx_meta, 
read_options, &iter);
+    st = 
segment->_index_file_reader->init(config::inverted_index_read_buffer_size,
+                                           &read_options.io_ctx);
+    ASSERT_TRUE(st.ok()) << st.to_string();
+}
+
 // Test normal segment open path
 TEST_F(SegmentCorruptionTest, TestFsSetInNormalPath) {
     auto schema = create_schema_with_inverted_index();


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to