github-actions[bot] commented on code in PR #63376:
URL: https://github.com/apache/doris/pull/63376#discussion_r3321738172


##########
be/src/io/fs/file_meta_cache.cpp:
##########
@@ -40,4 +189,192 @@ std::string FileMetaCache::get_key(io::FileReaderSPtr 
file_reader,
             _file_description.file_size == -1 ? file_reader->size() : 
_file_description.file_size);
 }
 
+std::string FileMetaCache::get_persistent_cache_key(FileMetaCacheFormat format,
+                                                    std::string_view 
file_meta_cache_key) {
+    std::string key;
+    key.reserve(32 + file_meta_cache_key.size());
+    key.append("file_meta_cache:v1:");
+    key.append(format_name(format));
+    key.push_back(':');
+    key.append(file_meta_cache_key.data(), file_meta_cache_key.size());
+    return key;
+}
+
+FileMetaCacheLookupResult FileMetaCache::lookup(const FileMetaCacheContext& 
context,
+                                                ObjLRUCache::CacheHandle* 
handle,
+                                                std::string* serialized_meta,
+                                                FileMetaCacheProfile* profile) 
{
+    DCHECK(handle != nullptr);
+    DCHECK(serialized_meta != nullptr);
+    if (lookup(context.key, handle)) {
+        serialized_meta->clear();
+        if (profile != nullptr) {
+            update_profile_counter(profile->hit_cache);
+            update_profile_counter(profile->hit_memory_cache);
+        }
+        return {.state = FileMetaCacheLookupState::MEMORY_HIT};
+    }
+
+    FileMetaCacheLookupResult result;
+    int64_t persisted_read_time = 0;
+    if (lookup_persistent_cache(context, serialized_meta, 
&persisted_read_time)) {
+        result.state = FileMetaCacheLookupState::PERSISTED_HIT;
+        if (profile != nullptr) {
+            update_profile_counter(profile->hit_cache);
+            update_profile_counter(profile->hit_disk_cache);
+            update_profile_counter(profile->read_disk_cache_time, 
persisted_read_time);
+        }
+    } else if (config::enable_external_file_meta_disk_cache && profile != 
nullptr) {
+        update_profile_counter(profile->miss_disk_cache);
+    }
+    return result;
+}
+
+bool FileMetaCache::lookup_persistent_cache(const FileMetaCacheContext& 
context,
+                                            std::string* payload, int64_t* 
read_time) {
+    DCHECK(payload != nullptr);
+    DCHECK(read_time != nullptr);
+    payload->clear();
+    *read_time = 0;
+    if (!config::enable_external_file_meta_disk_cache) {
+        return false;
+    }
+
+    MonotonicStopWatch watch;
+    watch.start();
+    auto stop_watch = [&]() { *read_time = watch.elapsed_time(); };
+
+    const std::string disk_cache_key = 
get_persistent_cache_key(context.format, context.key);
+    const auto hash = io::BlockFileCache::hash(disk_cache_key);
+    io::BlockFileCache* cache = get_block_file_cache(hash);
+    if (cache == nullptr) {
+        stop_watch();
+        return false;
+    }
+
+    auto invalidate_entry = [&](const Status& status) {
+        payload->clear();
+        cache->remove_if_cached(hash);
+        VLOG_DEBUG << "lookup file meta disk cache failed: " << status;
+        stop_watch();
+        return false;
+    };
+
+    std::string header(FILE_META_CACHE_DISK_HEADER_SIZE, '\0');
+    Status status = read_cached_file_cache(cache, hash, 0, 
Slice(header.data(), header.size()));
+    if (!status.ok()) {
+        VLOG_DEBUG << "lookup file meta disk cache failed: " << status;
+        stop_watch();
+        return false;
+    }
+
+    FileMetaCacheDiskHeader parsed;
+    status = parse_disk_cache_header(header, &parsed);
+    if (!status.ok()) {
+        return invalidate_entry(status);
+    }
+    const auto max_entry_bytes =
+            
static_cast<uint64_t>(config::external_file_meta_disk_cache_max_entry_bytes);
+    if (parsed.format != context.format || parsed.modification_time != 
context.modification_time ||

Review Comment:
   `external_file_meta_disk_cache_max_entry_bytes` is mutable and signed, but 
both lookup and insert cast it to unsigned types. If an operator sets it to a 
negative value, the limit becomes effectively huge, so oversized entries are 
accepted and lookup can call `payload->resize(parsed.payload_size)` for a 
corrupt/large header instead of rejecting it. Please validate this config as 
non-negative before casting, or treat non-positive values as disabling the disk 
metadata cache.



##########
be/src/format/orc/vorc_reader.cpp:
##########
@@ -403,16 +427,43 @@ Status OrcReader::_create_file_reader() {
 
         // Local variables can be required because setSerializedFileTail is an 
assignment operation, not a reference.
         ObjLRUCache::CacheHandle _meta_cache_handle;
-        if (_meta_cache->lookup(file_meta_cache_key, &_meta_cache_handle)) {
+        const int64_t file_size = _file_description.file_size == -1 ? 
inner_file_reader->size()
+                                                                    : 
_file_description.file_size;
+        const FileMetaCacheContext file_meta_cache_context {
+                .format = FileMetaCacheFormat::ORC,
+                .key = file_meta_cache_key,
+                .modification_time = _file_description.mtime,
+                .file_size = file_size};
+        FileMetaCacheProfile file_meta_cache_profile {
+                .hit_cache = &_statistics.file_footer_hit_cache,
+                .hit_memory_cache = &_statistics.file_footer_hit_memory_cache,
+                .hit_disk_cache = &_statistics.file_footer_hit_disk_cache,
+                .miss_disk_cache = &_statistics.file_footer_miss_disk_cache,
+                .write_disk_cache = &_statistics.file_footer_write_disk_cache,
+                .read_disk_cache_time = 
&_statistics.file_footer_read_disk_cache_time,
+                .write_disk_cache_time = 
&_statistics.file_footer_write_disk_cache_time};
+        std::string footer_payload;
+        const auto lookup_result = 
_meta_cache->lookup(file_meta_cache_context, &_meta_cache_handle,
+                                                       &footer_payload, 
&file_meta_cache_profile);
+        if (lookup_result.state == FileMetaCacheLookupState::MEMORY_HIT) {
             const std::string* footer_ptr = _meta_cache_handle.data<String>();
             options.setSerializedFileTail(*footer_ptr);
             RETURN_IF_ERROR(create_orc_reader());
-            _statistics.file_footer_hit_cache++;
+        } else if (lookup_result.state == 
FileMetaCacheLookupState::PERSISTED_HIT) {
+            auto footer_ptr = 
std::make_unique<std::string>(std::move(footer_payload));
+            options.setSerializedFileTail(*footer_ptr);
+            RETURN_IF_ERROR(create_orc_reader());
+            _meta_cache->insert(file_meta_cache_key, footer_ptr, 
&_meta_cache_handle);
         } else {
             _statistics.file_footer_read_calls++;
             RETURN_IF_ERROR(create_orc_reader());
-            std::string* footer_ptr = new std::string 
{_reader->getSerializedFileTail()};
-            _meta_cache->insert(file_meta_cache_key, footer_ptr, 
&_meta_cache_handle);
+            auto footer_ptr = 
std::make_unique<std::string>(_reader->getSerializedFileTail());
+            const auto insert_result =

Review Comment:
   Same over-limit issue as the Parquet path: `getSerializedFileTail()` copies 
the full ORC tail before the cache insert checks 
`external_file_meta_disk_cache_max_entry_bytes`. Large ORC footers that will 
not be cached still pay the extra allocation/copy cost. Please guard this with 
the configured max-entry size before constructing the string for disk-cache 
insertion.



##########
be/src/format/parquet/vparquet_reader.cpp:
##########
@@ -346,19 +358,62 @@ Status ParquetReader::_open_file() {
         } else {
             const auto& file_meta_cache_key =
                     FileMetaCache::get_key(_tracing_file_reader, 
_file_description);
-            if (!_meta_cache->lookup(file_meta_cache_key, 
&_meta_cache_handle)) {
+            const int64_t file_size = _file_description.file_size == -1
+                                              ? _tracing_file_reader->size()
+                                              : _file_description.file_size;
+            const FileMetaCacheContext file_meta_cache_context {
+                    .format = FileMetaCacheFormat::PARQUET,
+                    .key = file_meta_cache_key,
+                    .modification_time = _file_description.mtime,
+                    .file_size = file_size};
+            FileMetaCacheProfile file_meta_cache_profile {
+                    .hit_cache = &_reader_statistics.file_footer_hit_cache,
+                    .hit_memory_cache = 
&_reader_statistics.file_footer_hit_memory_cache,
+                    .hit_disk_cache = 
&_reader_statistics.file_footer_hit_disk_cache,
+                    .miss_disk_cache = 
&_reader_statistics.file_footer_miss_disk_cache,
+                    .write_disk_cache = 
&_reader_statistics.file_footer_write_disk_cache,
+                    .read_disk_cache_time = 
&_reader_statistics.file_footer_read_disk_cache_time,
+                    .write_disk_cache_time = 
&_reader_statistics.file_footer_write_disk_cache_time};
+            std::string footer_payload;
+            const auto lookup_result =
+                    _meta_cache->lookup(file_meta_cache_context, 
&_meta_cache_handle,
+                                        &footer_payload, 
&file_meta_cache_profile);
+            if (lookup_result.state == FileMetaCacheLookupState::MEMORY_HIT) {
+                _file_metadata = _meta_cache_handle.data<FileMetaData>();
+            } else if (lookup_result.state == 
FileMetaCacheLookupState::PERSISTED_HIT) {
+                auto metadata_size = 
static_cast<uint32_t>(footer_payload.size());
+                tparquet::FileMetaData t_metadata;
+                RETURN_IF_ERROR(deserialize_thrift_msg(
+                        reinterpret_cast<const 
uint8_t*>(footer_payload.data()), &metadata_size,
+                        true, &t_metadata));
+                _file_metadata_ptr = 
std::make_unique<FileMetaData>(t_metadata, metadata_size);
+                
RETURN_IF_ERROR(_file_metadata_ptr->init_schema(enable_mapping_varbinary,
+                                                                
enable_mapping_timestamp_tz));
+                if (_meta_cache->insert(file_meta_cache_key, 
_file_metadata_ptr,
+                                        &_meta_cache_handle)) {
+                    _file_metadata = _meta_cache_handle.data<FileMetaData>();
+                } else {
+                    _file_metadata = _file_metadata_ptr.get();
+                }
+            } else {
                 RETURN_IF_ERROR(parse_thrift_footer(_tracing_file_reader, 
&_file_metadata_ptr,
                                                     &meta_size, _io_ctx, 
enable_mapping_varbinary,
                                                     
enable_mapping_timestamp_tz));
-                // _file_metadata_ptr.release() : move control of 
_file_metadata to _meta_cache_handle
-                _meta_cache->insert(file_meta_cache_key, 
_file_metadata_ptr.release(),
-                                    &_meta_cache_handle);
-                _file_metadata = _meta_cache_handle.data<FileMetaData>();
+                if (config::enable_external_file_meta_disk_cache) {
+                    tparquet::FileMetaData thrift_metadata = 
_file_metadata_ptr->to_thrift();

Review Comment:
   This path serializes the full Parquet footer whenever disk meta cache is 
enabled, but `FileMetaCache::insert_persistent_cache()` only checks 
`external_file_meta_disk_cache_max_entry_bytes` after this work is already 
done. For a file with a footer larger than the configured limit, we still build 
the thrift metadata and allocate/serialize `footer_payload`, then immediately 
discard it. Please check the limit using `meta_size` before 
`to_thrift()`/`serialize()` so over-limit footers skip disk-cache payload 
construction.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to