This is an automated email from the ASF dual-hosted git repository.

gavinchou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 4cbae8a97f7 [enhancement](filecache) fine-grained cache space observation (#57783)
4cbae8a97f7 is described below

commit 4cbae8a97f7d0448e0b79d61fb958165059f27d6
Author: zhengyu <[email protected]>
AuthorDate: Thu Nov 13 13:36:14 2025 +0800

    [enhancement](filecache) fine-grained cache space observation (#57783)
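
    The new information_schema.file_cache_info table exposes one row per
    cached block recorded in the file cache meta store, with columns HASH,
    TABLET_ID, SIZE, TYPE, REMOTE_PATH, CACHE_PATH and BE_ID. A minimal
    usage sketch (the tablet id below is a placeholder; REMOTE_PATH is
    currently left empty, see the TODO in the scanner):

        SELECT HASH, TABLET_ID, SIZE, TYPE, CACHE_PATH, BE_ID
        FROM information_schema.file_cache_info
        WHERE TABLET_ID = 10001; -- placeholder tablet id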
---
 be/src/exec/schema_scanner.cpp                     |   3 +
 .../schema_file_cache_info_scanner.cpp             | 189 +++++++++++++++++++++
 .../schema_file_cache_info_scanner.h               |  47 +++++
 be/src/io/cache/block_file_cache.h                 |   3 +
 be/src/io/cache/block_file_cache_factory.h         |   3 +
 .../org/apache/doris/analysis/SchemaTableType.java |   2 +
 .../java/org/apache/doris/catalog/SchemaTable.java |  11 ++
 gensrc/thrift/Descriptors.thrift                   |   1 +
 .../cloud_p0/cache/test_file_cache_info.groovy     | 129 ++++++++++++++
 9 files changed, 388 insertions(+)

diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp
index a39af09cc51..caadef76377 100644
--- a/be/src/exec/schema_scanner.cpp
+++ b/be/src/exec/schema_scanner.cpp
@@ -39,6 +39,7 @@
 #include "exec/schema_scanner/schema_columns_scanner.h"
 #include "exec/schema_scanner/schema_dummy_scanner.h"
 #include "exec/schema_scanner/schema_encryption_keys_scanner.h"
+#include "exec/schema_scanner/schema_file_cache_info_scanner.h"
 #include "exec/schema_scanner/schema_file_cache_statistics.h"
 #include "exec/schema_scanner/schema_files_scanner.h"
 #include "exec/schema_scanner/schema_load_job_scanner.h"
@@ -257,6 +258,8 @@ std::unique_ptr<SchemaScanner> SchemaScanner::create(TSchemaTableType::type type
         return SchemaClusterSnapshotPropertiesScanner::create_unique();
     case TSchemaTableType::SCH_COLUMN_DATA_SIZES:
         return SchemaColumnDataSizesScanner::create_unique();
+    case TSchemaTableType::SCH_FILE_CACHE_INFO:
+        return SchemaFileCacheInfoScanner::create_unique();
     default:
         return SchemaDummyScanner::create_unique();
         break;
diff --git a/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp
new file mode 100644
index 00000000000..9734dbfe44b
--- /dev/null
+++ b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp
@@ -0,0 +1,189 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "exec/schema_scanner/schema_file_cache_info_scanner.h"
+
+#include "io/cache/file_cache_common.h"
+#include "runtime/exec_env.h"
+#include "runtime/runtime_state.h"
+#include "vec/common/string_ref.h"
+#include "vec/core/block.h"
+#include "vec/data_types/data_type_factory.hpp"
+
+namespace doris {
+#include "common/compile_check_begin.h"
+
+std::vector<SchemaScanner::ColumnDesc> SchemaFileCacheInfoScanner::_s_tbls_columns = {
+        //   name,       type,          size,     is_null
+        {"HASH", TYPE_STRING, sizeof(StringRef), true},
+        {"TABLET_ID", TYPE_BIGINT, sizeof(int64_t), true},
+        {"SIZE", TYPE_BIGINT, sizeof(int64_t), true},
+        {"TYPE", TYPE_STRING, sizeof(StringRef), true},
+        {"REMOTE_PATH", TYPE_STRING, sizeof(StringRef), true},
+        {"CACHE_PATH", TYPE_STRING, sizeof(StringRef), true},
+        {"BE_ID", TYPE_BIGINT, sizeof(int64_t), true}};
+
+SchemaFileCacheInfoScanner::SchemaFileCacheInfoScanner()
+        : SchemaScanner(_s_tbls_columns, TSchemaTableType::SCH_FILE_CACHE_INFO) {}
+
+SchemaFileCacheInfoScanner::~SchemaFileCacheInfoScanner() {}
+
+Status SchemaFileCacheInfoScanner::start(RuntimeState* state) {
+    return Status::OK();
+}
+
+Status SchemaFileCacheInfoScanner::get_next_block_internal(vectorized::Block* block, bool* eos) {
+    if (!_is_init) {
+        return Status::InternalError("Used before initialized.");
+    }
+
+    if (nullptr == block || nullptr == eos) {
+        return Status::InternalError("input pointer is nullptr.");
+    }
+
+    *eos = true;
+    return _fill_block_impl(block);
+}
+
+Status SchemaFileCacheInfoScanner::_fill_block_impl(vectorized::Block* block) {
+    SCOPED_TIMER(_fill_block_timer);
+
+    auto* file_cache_factory = ExecEnv::GetInstance()->file_cache_factory();
+    if (!file_cache_factory) {
+        return Status::OK();
+    }
+
+    // Collect all cache entries from all file cache instances
+    std::vector<std::tuple<std::string, int64_t, int64_t, int, std::string>> cache_entries;
+
+    // Get all cache instances using the public getter
+    const auto& caches = file_cache_factory->get_caches();
+    for (const auto& cache : caches) {
+        const std::string& cache_path = cache->get_base_path();
+
+        // Get the storage from cache using the public getter
+        auto* storage = cache->get_storage();
+        if (!storage) {
+            continue;
+        }
+
+        // Try to get meta_store from FSFileCacheStorage using the public getter
+        auto* fs_storage = dynamic_cast<doris::io::FSFileCacheStorage*>(storage);
+        if (!fs_storage) {
+            continue;
+        }
+
+        auto* meta_store = fs_storage->get_meta_store();
+        if (!meta_store) {
+            continue;
+        }
+
+        // Get iterator for all BlockMeta records
+        auto iterator = meta_store->get_all();
+        if (!iterator) {
+            continue;
+        }
+
+        // Iterate through all cache entries
+        while (iterator->valid()) {
+            const auto& key = iterator->key();
+            const auto& value = iterator->value();
+
+            // Check for deserialization errors
+            if (!iterator->get_last_key_error().ok() || !iterator->get_last_value_error().ok()) {
+                LOG(WARNING) << "Failed to deserialize cache block metadata: "
+                             << "key_error=" << 
iterator->get_last_key_error().to_string()
+                             << ", value_error=" << 
iterator->get_last_value_error().to_string();
+                iterator->next();
+                continue; // Skip invalid records
+            }
+
+            // Convert hash to string
+            std::string hash_str = key.hash.to_string();
+
+            // Add to cache entries
+            cache_entries.emplace_back(hash_str, key.tablet_id, value.size, value.type, cache_path);
+
+            iterator->next();
+        }
+    }
+
+    const size_t row_num = cache_entries.size();
+    if (row_num == 0) {
+        return Status::OK();
+    }
+
+    for (size_t col_idx = 0; col_idx < _s_tbls_columns.size(); ++col_idx) {
+        const auto& col_desc = _s_tbls_columns[col_idx];
+
+        std::vector<StringRef> str_refs(row_num);
+        std::vector<int64_t> int64_vals(row_num);
+        std::vector<void*> datas(row_num);
+        std::vector<std::string> column_values(row_num);
+
+        for (size_t row_idx = 0; row_idx < row_num; ++row_idx) {
+            const auto& entry = cache_entries[row_idx];
+            const auto& [hash, tablet_id, size, type, cache_path] = entry;
+
+            if (col_desc.type == TYPE_STRING) {
+                switch (col_idx) {
+                case 0: // HASH
+                    column_values[row_idx] = hash;
+                    break;
+                case 3: // TYPE
+                    column_values[row_idx] = doris::io::cache_type_to_string(
+                            static_cast<doris::io::FileCacheType>(type));
+                    break;
+                case 4:                          // REMOTE_PATH
+                    column_values[row_idx] = ""; // TODO: Implement remote path retrieval
+                    break;
+                case 5: // CACHE_PATH
+                    column_values[row_idx] = cache_path;
+                    break;
+                default:
+                    column_values[row_idx] = "";
+                    break;
+                }
+                str_refs[row_idx] =
+                        StringRef(column_values[row_idx].data(), column_values[row_idx].size());
+                datas[row_idx] = &str_refs[row_idx];
+            } else if (col_desc.type == TYPE_BIGINT) {
+                switch (col_idx) {
+                case 1: // TABLET_ID
+                    int64_vals[row_idx] = tablet_id;
+                    break;
+                case 2: // SIZE
+                    int64_vals[row_idx] = size;
+                    break;
+                case 6: // BE_ID
+                    int64_vals[row_idx] = ExecEnv::GetInstance()->cluster_info()->backend_id;
+                    break;
+                default:
+                    int64_vals[row_idx] = 0;
+                    break;
+                }
+                datas[row_idx] = &int64_vals[row_idx];
+            }
+        }
+
+        RETURN_IF_ERROR(fill_dest_column_for_range(block, col_idx, datas));
+    }
+
+    return Status::OK();
+}
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/exec/schema_scanner/schema_file_cache_info_scanner.h b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.h
new file mode 100644
index 00000000000..2efe6f76cdf
--- /dev/null
+++ b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.h
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "common/status.h"
+#include "exec/schema_scanner.h"
+
+namespace doris {
+class RuntimeState;
+namespace vectorized {
+class Block;
+} // namespace vectorized
+
+class SchemaFileCacheInfoScanner : public SchemaScanner {
+    ENABLE_FACTORY_CREATOR(SchemaFileCacheInfoScanner);
+
+public:
+    SchemaFileCacheInfoScanner();
+    ~SchemaFileCacheInfoScanner() override;
+
+    Status start(RuntimeState* state) override;
+    Status get_next_block_internal(vectorized::Block* block, bool* eos) override;
+
+    static std::vector<SchemaScanner::ColumnDesc> _s_tbls_columns;
+
+private:
+    Status _fill_block_impl(vectorized::Block* block);
+};
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h
index dcaf81ec2a9..19715fb0445 100644
--- a/be/src/io/cache/block_file_cache.h
+++ b/be/src/io/cache/block_file_cache.h
@@ -171,6 +171,9 @@ public:
 
     [[nodiscard]] const std::string& get_base_path() const { return _cache_base_path; }
 
+    // Get storage for inspection
+    FileCacheStorage* get_storage() const { return _storage.get(); }
+
     /**
          * Given an `offset` and `size` representing [offset, offset + size) bytes interval,
          * return list of cached non-overlapping non-empty
diff --git a/be/src/io/cache/block_file_cache_factory.h b/be/src/io/cache/block_file_cache_factory.h
index 837feac7f68..3031076336c 100644
--- a/be/src/io/cache/block_file_cache_factory.h
+++ b/be/src/io/cache/block_file_cache_factory.h
@@ -102,6 +102,9 @@ public:
 
     void get_cache_stats_block(vectorized::Block* block);
 
+    // Get all cache instances for inspection
+    const std::vector<std::unique_ptr<BlockFileCache>>& get_caches() const { return _caches; }
+
     FileCacheFactory() = default;
     FileCacheFactory& operator=(const FileCacheFactory&) = delete;
     FileCacheFactory(const FileCacheFactory&) = delete;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java
index 5576c6d294b..ca58b311ec4 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java
@@ -98,6 +98,8 @@ public enum SchemaTableType {
             TSchemaTableType.SCH_ROUTINE_LOAD_JOBS),
     SCH_LOAD_JOBS("LOAD_JOBS", "LOAD_JOBS",
             TSchemaTableType.SCH_LOAD_JOBS),
+    SCH_FILE_CACHE_INFO("FILE_CACHE_INFO", "FILE_CACHE_INFO",
+            TSchemaTableType.SCH_FILE_CACHE_INFO),
     SCH_VIEW_DEPENDENCY("VIEW_DEPENDENCY", "VIEW_DEPENDENCY",
                     TSchemaTableType.SCH_VIEW_DEPENDENCY),
     SQL_BLOCK_RULE_STATUS("SQL_BLOCK_RULE_STATUS", "SQL_BLOCK_RULE_STATUS",
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java
index 4a2c2560850..1f2b25d4c16 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java
@@ -702,6 +702,17 @@ public class SchemaTable extends Table {
                                     .column("FIRST_ERROR_MSG", 
ScalarType.createStringType())
                                     .build())
             )
+            .put("file_cache_info",
+                    new SchemaTable(SystemIdGenerator.getNextId(), "file_cache_info", TableType.SCHEMA,
+                            builder().column("HASH", ScalarType.createStringType())
+                                    .column("TABLET_ID", ScalarType.createType(PrimitiveType.BIGINT))
+                                    .column("SIZE", ScalarType.createType(PrimitiveType.BIGINT))
+                                    .column("TYPE", ScalarType.createStringType())
+                                    .column("REMOTE_PATH", ScalarType.createStringType())
+                                    .column("CACHE_PATH", ScalarType.createStringType())
+                                    .column("BE_ID", ScalarType.createType(PrimitiveType.BIGINT))
+                                    .build())
+            )
             .put("backend_tablets", new 
SchemaTable(SystemIdGenerator.getNextId(), "backend_tablets", TableType.SCHEMA,
                     builder().column("BE_ID", 
ScalarType.createType(PrimitiveType.BIGINT))
                             .column("TABLET_ID", 
ScalarType.createType(PrimitiveType.BIGINT))
diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift
index ec93c1a26f4..4011009887c 100644
--- a/gensrc/thrift/Descriptors.thrift
+++ b/gensrc/thrift/Descriptors.thrift
@@ -162,6 +162,7 @@ enum TSchemaTableType {
     SCH_BLACKHOLE = 62;
     SCH_COLUMN_DATA_SIZES = 63;
     SCH_LOAD_JOBS = 64;
+    SCH_FILE_CACHE_INFO = 65;
 }
 
 enum THdfsCompression {
diff --git a/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy b/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy
new file mode 100644
index 00000000000..f018889bb7a
--- /dev/null
+++ b/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy
@@ -0,0 +1,129 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_file_cache_info") {
+    def customBeConfig = [
+        enable_evict_file_cache_in_advance : false,
+        file_cache_enter_disk_resource_limit_mode_percent : 99
+    ]
+
+    setBeConfigTemporary(customBeConfig) {
+
+    String[][] backends = sql """ show backends """
+    def backendSockets = []
+    def backendIdToBackendIP = [:]
+    def backendIdToBackendHttpPort = [:]
+    for (String[] backend in backends) {
+        if (backend[9].equals("true")) {
+            backendIdToBackendIP.put(backend[0], backend[1])
+            backendIdToBackendHttpPort.put(backend[0], backend[4])
+        }
+    }
+    assertTrue(backendIdToBackendIP.size() > 0, "No alive backends found")
+
+    backendIdToBackendIP.each { backendId, ip ->
+        def socket = ip + ":" + backendIdToBackendHttpPort.get(backendId)
+        backendSockets.add(socket)
+    }
+
+    sql "drop table IF EXISTS customer"
+
+    sql """
+        CREATE TABLE IF NOT EXISTS customer (
+            `c_custkey` int NULL,
+            `c_name` string NULL,
+            `c_address` string NULL,
+            `c_city` string NULL,
+            `c_nation` string NULL,
+            `c_region` string NULL,
+            `c_phone` string NULL,
+            `c_mktsegment` string NULL
+        )
+        DUPLICATE KEY(`c_custkey`)
+        DISTRIBUTED BY HASH(`c_custkey`) BUCKETS 1  -- only 1 tablet
+        PROPERTIES (
+            "file_cache_ttl_seconds" = "3600"
+        )
+    """
+
+    sql """
+        insert into customer values
+        (1, 'Customer#000000001', 'address1', 'city1', 'nation1', 'region1', 'phone1', 'segment1'),
+        (2, 'Customer#000000002', 'address2', 'city2', 'nation2', 'region2', 'phone2', 'segment2'),
+        (3, 'Customer#000000003', 'address3', 'city3', 'nation3', 'region3', 'phone3', 'segment3'),
+        (4, 'Customer#000000004', 'address4', 'city4', 'nation4', 'region4', 'phone4', 'segment4'),
+        (5, 'Customer#000000005', 'address5', 'city5', 'nation5', 'region5', 'phone5', 'segment5')
+    """
+    sql "sync"
+
+    sql "select count(*) from customer"
+
+    Thread.sleep(10000)
+
+    def get_tablet_id = { String tbl_name ->
+        def tablets = sql "show tablets from ${tbl_name}"
+        assertEquals(tablets.size(), 1, "Should have exactly one tablet with BUCKETS=1")
+        return tablets[0][0] as Long
+    }
+
+    def tablet_id = get_tablet_id("customer")
+    println "Tablet ID: ${tablet_id}"
+
+    def cache_info = sql "select * from information_schema.file_cache_info"
+    
+    assertTrue(cache_info.size() > 0, "file_cache_info should not be empty for tablet_id ${tablet_id}")
+    
+    println "First query - File cache info for tablet_id ${tablet_id}:"
+    cache_info.each { row ->
+        println "  ${row}"
+    }
+
+    def clearResults = []
+    backendSockets.each { socket ->
+        httpTest {
+            endpoint ""
+            uri socket + "/api/file_cache?op=clear&sync=true"
+            op "get"
+            check {respCode, body ->
+                assertEquals(respCode, 200, "clear local cache fail, maybe you 
can find something in respond: " + parseJson(body))
+                clearResults.add(true)
+            }
+        }
+    }
+    assertEquals(clearResults.size(), backendSockets.size(), "Failed to clear cache on some backends")
+
+    Thread.sleep(5000)
+
+    def cache_info_after_clear = sql "select * from information_schema.file_cache_info where tablet_id = ${tablet_id}"
+    assertEquals(cache_info_after_clear.size(), 0, "file_cache_info should be empty after clearing cache")
+
+    println "After clearing cache - File cache info is empty as expected"
+
+    sql "select * from customer"
+
+    Thread.sleep(10000)
+
+    def cache_info_reloaded = sql "select * from information_schema.file_cache_info where tablet_id = ${tablet_id}"
+    assertTrue(cache_info_reloaded.size() > 0, "file_cache_info should not be empty after reloading data")
+
+    println "After reloading data - File cache info for tablet_id 
${tablet_id}:"
+    cache_info_reloaded.each { row ->
+        println "  ${row}"
+    }
+
+    }
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
