github-actions[bot] commented on code in PR #63376:
URL: https://github.com/apache/doris/pull/63376#discussion_r3481935795
##########
be/src/format/generic_reader.h:
##########
@@ -292,6 +294,7 @@ class GenericReader : public ProfileCollector {
// Cache to save some common part such as file footer.
// Maybe null if not used
FileMetaCache* _meta_cache = nullptr;
+ bool _enable_file_meta_memory_cache = true;
Review Comment:
This default leaks through the nested delete-file readers. The primary
scanner now computes `enable_file_meta_memory_cache` and sets it on the
top-level Parquet/ORC table readers, and equality-delete readers copy
`this->_enable_file_meta_memory_cache`, but the stack-created Iceberg
position-delete readers (`iceberg_reader.cpp`) and Transactional Hive
delete-delta reader (`transactional_hive_reader.cpp`) are constructed with
`_meta_cache` and call `init_reader()` without setting the new flag. For a
large scan where the parent correctly runs disk-only
(`enable_file_meta_memory_cache == false`), those delete-file readers keep this
default `true` and can still admit delete-file footers into the L1 memory
cache. Please propagate the parent flag to those nested readers before
`init_reader()`, the same way the equality-delete path does, so all parallel
delete-file paths follow the same memory-cache admission decision.
##########
be/test/format/file_reader/file_meta_cache_test.cpp:
##########
@@ -155,4 +339,765 @@ TEST(FileMetaCacheTest, InsertAndLookupWithIntValue) {
EXPECT_EQ(*cached_val2, 12345);
}
-} // namespace doris
\ No newline at end of file
+TEST(FileMetaCacheTest, ExternalFileMetaDiskCacheSwitchIsStartupOnly) {
+ const bool old_enable_external_file_meta_disk_cache =
+ config::enable_external_file_meta_disk_cache;
+ Defer defer {[&] {
+ config::enable_external_file_meta_disk_cache =
old_enable_external_file_meta_disk_cache;
+ }};
+
+ config::enable_external_file_meta_disk_cache = false;
+ Status status = config::set_config("enable_external_file_meta_disk_cache",
"true");
+ EXPECT_FALSE(status.ok());
+ EXPECT_TRUE(status.is<ErrorCode::NOT_IMPLEMENTED_ERROR>() ||
status.is<ErrorCode::NOT_FOUND>())
+ << status.to_string();
+ EXPECT_FALSE(config::enable_external_file_meta_disk_cache);
+}
+
+TEST(FileMetaCacheTest, ExternalFileMetaDiskCacheIsEnabledByDefault) {
+ EXPECT_TRUE(config::enable_external_file_meta_disk_cache);
+}
+
+TEST(FileMetaCacheTest, LookupUpdatesMemoryHitProfile) {
+ FileMetaCache cache(config::max_external_file_meta_cache_num);
+ const std::string meta_key =
FileMetaCache::get_key("s3://bucket/memory.parquet", 123, 456);
+ const FileMetaCacheContext meta_context {.format =
FileMetaCacheFormat::PARQUET,
+ .key = meta_key,
+ .modification_time = 123,
+ .file_size = 456};
+ auto cached_payload = std::make_unique<std::string>("serialized footer
payload");
+ ObjLRUCache::CacheHandle insert_handle;
+ ASSERT_TRUE(cache.insert(meta_key, cached_payload, &insert_handle));
+
+ int64_t hit_cache = 0;
+ int64_t hit_memory_cache = 0;
+ int64_t hit_disk_cache = 0;
+ int64_t miss_disk_cache = 0;
+ int64_t write_disk_cache = 0;
+ int64_t read_disk_cache_time = 0;
+ int64_t write_disk_cache_time = 0;
+ FileMetaCacheProfile profile {.hit_cache = &hit_cache,
+ .hit_memory_cache = &hit_memory_cache,
+ .hit_disk_cache = &hit_disk_cache,
+ .miss_disk_cache = &miss_disk_cache,
+ .write_disk_cache = &write_disk_cache,
+ .read_disk_cache_time =
&read_disk_cache_time,
+ .write_disk_cache_time =
&write_disk_cache_time};
+
+ ObjLRUCache::CacheHandle lookup_handle;
+ std::string output;
+ const auto lookup_result = cache.lookup(meta_context, &lookup_handle,
&output, &profile);
+
+ EXPECT_EQ(lookup_result.state, FileMetaCacheLookupState::MEMORY_HIT);
+ EXPECT_EQ(hit_cache, 1);
+ EXPECT_EQ(hit_memory_cache, 1);
+ EXPECT_EQ(hit_disk_cache, 0);
+ EXPECT_EQ(miss_disk_cache, 0);
+ EXPECT_EQ(write_disk_cache, 0);
+ EXPECT_EQ(read_disk_cache_time, 0);
+ EXPECT_EQ(write_disk_cache_time, 0);
+}
+
+TEST_F(FileMetaCacheDiskTest,
DiskCacheWorksWhenMemoryCacheAdmissionIsDisabled) {
+ std::filesystem::path cache_dir =
+ std::filesystem::current_path() /
"file_meta_disk_cache_without_memory_test";
+ if (std::filesystem::exists(cache_dir)) {
+ std::filesystem::remove_all(cache_dir);
+ }
+ std::filesystem::create_directories(cache_dir);
+ ScopedFileCacheDiskResourceLimitConfig disk_resource_limit_config;
+ const bool old_enable_external_file_meta_disk_cache =
+ config::enable_external_file_meta_disk_cache;
+ Defer defer {[&] {
+ config::enable_external_file_meta_disk_cache =
old_enable_external_file_meta_disk_cache;
+ std::filesystem::remove_all(cache_dir);
+ }};
+ config::enable_external_file_meta_disk_cache = true;
+
+ io::FileCacheSettings settings;
+ settings.capacity = 1024 * 1024;
+ settings.max_file_block_size = 16;
+ settings.index_queue_size = 1024 * 1024;
+ settings.index_queue_elements = 1024;
+ io::BlockFileCache block_cache(cache_dir.string(), settings);
+ ASSERT_TRUE(block_cache.initialize().ok());
+ for (int i = 0; i < 5000; ++i) {
+ if (block_cache.get_async_open_success()) {
+ break;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ }
+ ASSERT_TRUE(block_cache.get_async_open_success());
+
+ FileMetaCache cache(config::max_external_file_meta_cache_num,
&block_cache);
+ const std::string meta_key =
+ FileMetaCache::get_key("s3://bucket/without-memory.parquet", 123,
456);
+ const FileMetaCacheContext meta_context {.format =
FileMetaCacheFormat::PARQUET,
+ .key = meta_key,
+ .modification_time = 123,
+ .file_size = 456,
+ .enable_memory_cache = false};
+ const std::string payload = "serialized footer payload";
+ auto cached_payload = std::make_unique<std::string>(payload);
+ ObjLRUCache::CacheHandle cache_handle;
+
+ const auto insert_result = cache.insert(meta_context, cached_payload,
&cache_handle, payload);
+ EXPECT_TRUE(insert_result.persisted_inserted);
+ EXPECT_FALSE(insert_result.memory_inserted);
+
+ ObjLRUCache::CacheHandle memory_lookup_handle;
+ EXPECT_FALSE(cache.lookup(meta_key, &memory_lookup_handle));
+
+ std::string output;
+ ObjLRUCache::CacheHandle lookup_handle;
+ const auto lookup_result = cache.lookup(meta_context, &lookup_handle,
&output);
+ EXPECT_EQ(lookup_result.state, FileMetaCacheLookupState::PERSISTED_HIT);
+ EXPECT_EQ(output, payload);
+ EXPECT_FALSE(lookup_handle.valid());
+}
+
+TEST_F(FileMetaCacheDiskTest, PersistentValueUsesVersionedProtoEnvelope) {
+ const bool old_enable_external_file_meta_disk_cache =
+ config::enable_external_file_meta_disk_cache;
+ Defer defer {[&] {
+ config::enable_external_file_meta_disk_cache =
old_enable_external_file_meta_disk_cache;
+ }};
+ config::enable_external_file_meta_disk_cache = true;
+
+ io::FileCacheSettings settings;
+ settings.capacity = 1024 * 1024;
+ settings.index_queue_size = 1024 * 1024;
+ settings.index_queue_elements = 1024;
+ settings.max_file_block_size = 1024;
+ settings.max_query_cache_size = 1024 * 1024;
+ settings.storage = "memory";
+ io::BlockFileCache block_cache("file_meta_disk_cache_proto_envelope_test",
settings);
+ ASSERT_TRUE(block_cache.initialize().ok());
+
+ FileMetaCache cache(config::max_external_file_meta_cache_num,
&block_cache);
+ const std::string meta_key =
+ FileMetaCache::get_key("s3://bucket/proto-envelope.parquet", 123,
456);
+ const FileMetaCacheContext meta_context {.format =
FileMetaCacheFormat::PARQUET,
+ .key = meta_key,
+ .modification_time = 123,
+ .file_size = 456};
+ const std::string payload = "serialized footer payload";
+ auto cached_payload = std::make_unique<std::string>(payload);
+ ObjLRUCache::CacheHandle cache_handle;
+
+ const auto insert_result =
+ cache.insert(meta_context, cached_payload, &cache_handle,
std::string_view(payload));
+ ASSERT_TRUE(insert_result.persisted_inserted);
+
+ const auto hash = io::BlockFileCache::hash(
+
FileMetaCache::get_persistent_cache_key(FileMetaCacheFormat::PARQUET,
meta_key));
+ std::string cached_value;
+ Status status = read_block_cache_value_for_test(&block_cache, hash,
&cached_value);
+ ASSERT_TRUE(status.ok()) << status;
+ ASSERT_GE(cached_value.size(), 1 + sizeof(uint64_t) + sizeof(uint32_t));
+
+ const auto* data = reinterpret_cast<const uint8_t*>(cached_value.data());
+ EXPECT_EQ(data[0], 1);
+ const uint64_t proto_size = decode_fixed64_le(data + 1);
+ ASSERT_EQ(proto_size, cached_value.size() - 1 - sizeof(uint64_t) -
sizeof(uint32_t));
+ const auto* proto_data = cached_value.data() + 1 + sizeof(uint64_t);
+ const uint32_t checksum =
+ decode_fixed32_le(reinterpret_cast<const uint8_t*>(proto_data +
proto_size));
+ EXPECT_EQ(checksum, crc32c::Crc32c(proto_data, proto_size));
+
+ io::cache::FileMetaCacheDiskEntryPb entry;
+ ASSERT_TRUE(entry.ParseFromArray(proto_data,
static_cast<int>(proto_size)));
+ EXPECT_EQ(entry.format(),
static_cast<uint32_t>(FileMetaCacheFormat::PARQUET));
+ EXPECT_EQ(entry.modification_time(), meta_context.modification_time);
+ EXPECT_EQ(entry.file_size(), meta_context.file_size);
+ EXPECT_EQ(entry.payload(), payload);
+}
+
+TEST_F(FileMetaCacheDiskTest, ReadReturnsPayloadWrittenThroughIndexQueue) {
+ std::filesystem::path cache_dir = std::filesystem::current_path() /
"file_meta_disk_cache_test";
+ if (std::filesystem::exists(cache_dir)) {
+ std::filesystem::remove_all(cache_dir);
+ }
+ std::filesystem::create_directories(cache_dir);
+ ScopedFileCacheDiskResourceLimitConfig disk_resource_limit_config;
+ const bool old_enable_external_file_meta_disk_cache =
+ config::enable_external_file_meta_disk_cache;
+ Defer defer {[&] {
+ config::enable_external_file_meta_disk_cache =
old_enable_external_file_meta_disk_cache;
+ std::filesystem::remove_all(cache_dir);
+ }};
+ config::enable_external_file_meta_disk_cache = true;
+
+ io::FileCacheSettings settings;
+ settings.capacity = 1024 * 1024;
+ settings.max_file_block_size = 16;
+ settings.index_queue_size = 1024 * 1024;
+ settings.index_queue_elements = 1024;
+ io::BlockFileCache block_cache(cache_dir.string(), settings);
+ ASSERT_TRUE(block_cache.initialize().ok());
+ for (int i = 0; i < 5000; ++i) {
+ if (block_cache.get_async_open_success()) {
+ break;
+ }
+ std::this_thread::sleep_for(std::chrono::milliseconds(1));
+ }
+ ASSERT_TRUE(block_cache.get_async_open_success());
+
+ FileMetaCache cache(config::max_external_file_meta_cache_num,
&block_cache);
+ const std::string meta_key =
FileMetaCache::get_key("s3://bucket/test.parquet", 123, 456);
+ const FileMetaCacheContext meta_context {.format =
FileMetaCacheFormat::PARQUET,
+ .key = meta_key,
+ .modification_time = 123,
+ .file_size = 456};
+ const std::string payload = "serialized footer payload";
+ auto cached_payload = std::make_unique<std::string>(payload);
+ ObjLRUCache::CacheHandle cache_handle;
+ int64_t write_disk_cache = 0;
+ int64_t write_disk_cache_time = 0;
+ FileMetaCacheProfile insert_profile {.write_disk_cache = &write_disk_cache,
+ .write_disk_cache_time =
&write_disk_cache_time};
+
+ const auto insert_result = cache.insert(meta_context, cached_payload,
&cache_handle,
+ std::string_view(payload),
&insert_profile);
+ EXPECT_TRUE(insert_result.persisted_inserted);
+ EXPECT_EQ(write_disk_cache, 1);
+
+ FileMetaCache
cache_after_l1_miss(config::max_external_file_meta_cache_num, &block_cache);
+ std::string output;
+ ObjLRUCache::CacheHandle lookup_handle;
+ int64_t hit_cache = 0;
+ int64_t hit_memory_cache = 0;
+ int64_t hit_disk_cache = 0;
+ int64_t miss_disk_cache = 0;
+ int64_t read_disk_cache_time = 0;
+ FileMetaCacheProfile lookup_profile {.hit_cache = &hit_cache,
+ .hit_memory_cache = &hit_memory_cache,
+ .hit_disk_cache = &hit_disk_cache,
+ .miss_disk_cache = &miss_disk_cache,
+ .read_disk_cache_time =
&read_disk_cache_time};
+ const auto lookup_result =
+ cache_after_l1_miss.lookup(meta_context, &lookup_handle, &output,
&lookup_profile);
+ EXPECT_EQ(lookup_result.state, FileMetaCacheLookupState::PERSISTED_HIT);
+ EXPECT_EQ(output, payload);
+ EXPECT_EQ(hit_cache, 1);
+ EXPECT_EQ(hit_memory_cache, 0);
+ EXPECT_EQ(hit_disk_cache, 1);
+ EXPECT_EQ(miss_disk_cache, 0);
+
+ std::string stale_output;
+ ObjLRUCache::CacheHandle stale_lookup_handle;
+ const FileMetaCacheContext stale_meta_context {.format =
FileMetaCacheFormat::PARQUET,
+ .key = meta_key,
+ .modification_time = 124,
+ .file_size = 456};
+ int64_t stale_miss_disk_cache = 0;
+ FileMetaCacheProfile stale_lookup_profile {.miss_disk_cache =
&stale_miss_disk_cache};
+ const auto stale_lookup_result = cache_after_l1_miss.lookup(
+ stale_meta_context, &stale_lookup_handle, &stale_output,
&stale_lookup_profile);
+ EXPECT_EQ(stale_lookup_result.state, FileMetaCacheLookupState::MISS);
+ EXPECT_EQ(stale_miss_disk_cache, 1);
+
+ const std::string refreshed_payload = "refreshed serialized footer
payload";
+ auto refreshed_cached_payload =
std::make_unique<std::string>(refreshed_payload);
+ ObjLRUCache::CacheHandle refreshed_cache_handle;
+ const auto refreshed_insert_result = cache_after_l1_miss.insert(
+ stale_meta_context, refreshed_cached_payload,
&refreshed_cache_handle,
+ std::string_view(refreshed_payload), &insert_profile);
+ EXPECT_TRUE(refreshed_insert_result.persisted_inserted);
+
+ FileMetaCache
cache_after_stale_l1_miss(config::max_external_file_meta_cache_num,
&block_cache);
+ ObjLRUCache::CacheHandle refreshed_lookup_handle;
+ const auto refreshed_lookup_result = cache_after_stale_l1_miss.lookup(
+ stale_meta_context, &refreshed_lookup_handle, &stale_output,
&lookup_profile);
+ EXPECT_EQ(refreshed_lookup_result.state,
FileMetaCacheLookupState::PERSISTED_HIT);
+ EXPECT_EQ(stale_output, refreshed_payload);
+}
+
+TEST_F(FileMetaCacheDiskTest, PersistentHitRefreshesIndexQueueLru) {
+ std::filesystem::path cache_dir =
+ std::filesystem::current_path() / "file_meta_disk_cache_lru_test";
+ if (std::filesystem::exists(cache_dir)) {
+ std::filesystem::remove_all(cache_dir);
+ }
+ std::filesystem::create_directories(cache_dir);
+ ScopedFileCacheDiskResourceLimitConfig disk_resource_limit_config;
+ const bool old_enable_external_file_meta_disk_cache =
+ config::enable_external_file_meta_disk_cache;
+ Defer defer {[&] {
+ config::enable_external_file_meta_disk_cache =
old_enable_external_file_meta_disk_cache;
+ std::filesystem::remove_all(cache_dir);
+ }};
+ config::enable_external_file_meta_disk_cache = true;
+
+ constexpr int64_t modification_time = 123;
+ constexpr int64_t file_size = 456;
+ const std::string first_payload = "payload1";
+ const std::string second_payload = "payload2";
+ const std::string third_payload = "payload3";
+ const size_t disk_entry_size =
+ build_disk_cache_value_for_test(FileMetaCacheFormat::PARQUET,
modification_time,
+ file_size, first_payload)
+ .size();
+
+ io::FileCacheSettings settings;
+ settings.capacity = disk_entry_size * 2 + 1;
+ settings.max_file_block_size = settings.capacity;
+ settings.index_queue_size = settings.capacity;
+ settings.index_queue_elements = 1024;
+ io::BlockFileCache block_cache(cache_dir.string(), settings);
+ ASSERT_TRUE(block_cache.initialize().ok());
+ ASSERT_TRUE(wait_for_cache_async_open(&block_cache));
+
+ const std::string first_key =
+ FileMetaCache::get_key("s3://bucket/lru-first.parquet",
modification_time, file_size);
+ const std::string second_key =
+ FileMetaCache::get_key("s3://bucket/lru-second.parquet",
modification_time, file_size);
+ const std::string third_key =
+ FileMetaCache::get_key("s3://bucket/lru-third.parquet",
modification_time, file_size);
+ const FileMetaCacheContext first_context {.format =
FileMetaCacheFormat::PARQUET,
+ .key = first_key,
+ .modification_time =
modification_time,
+ .file_size = file_size,
+ .enable_memory_cache = false};
+ const FileMetaCacheContext second_context {.format =
FileMetaCacheFormat::PARQUET,
+ .key = second_key,
+ .modification_time =
modification_time,
+ .file_size = file_size,
+ .enable_memory_cache = false};
+ const FileMetaCacheContext third_context {.format =
FileMetaCacheFormat::PARQUET,
+ .key = third_key,
+ .modification_time =
modification_time,
+ .file_size = file_size,
+ .enable_memory_cache = false};
+
+ auto insert_payload = [&](FileMetaCache& cache, const
FileMetaCacheContext& context,
+ std::string_view payload) {
+ auto cached_payload = std::make_unique<std::string>(payload);
+ ObjLRUCache::CacheHandle cache_handle;
+ const auto insert_result = cache.insert(context, cached_payload,
&cache_handle, payload);
+ ASSERT_TRUE(insert_result.persisted_inserted);
+ };
+
+ FileMetaCache cache(config::max_external_file_meta_cache_num,
&block_cache);
+ insert_payload(cache, first_context, first_payload);
+ insert_payload(cache, second_context, second_payload);
+
+ FileMetaCache
cache_after_l1_miss(config::max_external_file_meta_cache_num, &block_cache);
+ std::string output;
+ ObjLRUCache::CacheHandle lookup_handle;
+ const auto first_lookup_result =
+ cache_after_l1_miss.lookup(first_context, &lookup_handle, &output);
+ ASSERT_EQ(first_lookup_result.state,
FileMetaCacheLookupState::PERSISTED_HIT);
+ ASSERT_EQ(output, first_payload);
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(100));
Review Comment:
This makes the regression timing-dependent. The persistent hit only queues
an async LRU touch; the move is applied later by
`run_background_block_lru_update()` after that thread wakes and drains
`_need_update_lru_blocks`. If CI delays that background thread past this fixed
sleep, the first entry is still oldest when the third payload is inserted and
this test can evict the very entry it expects to preserve. Please make the test
wait on an observable condition, or drain/apply the pending LRU update directly
under the test lock like the cache LRU unit tests do, before forcing eviction.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]