This is an automated email from the ASF dual-hosted git repository.
gavinchou pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 4cbae8a97f7 [enhancement](filecache) fine-grained cache space
observation (#57783)
4cbae8a97f7 is described below
commit 4cbae8a97f7d0448e0b79d61fb958165059f27d6
Author: zhengyu <[email protected]>
AuthorDate: Thu Nov 13 13:36:14 2025 +0800
[enhancement](filecache) fine-grained cache space observation (#57783)
---
be/src/exec/schema_scanner.cpp | 3 +
.../schema_file_cache_info_scanner.cpp | 189 +++++++++++++++++++++
.../schema_file_cache_info_scanner.h | 47 +++++
be/src/io/cache/block_file_cache.h | 3 +
be/src/io/cache/block_file_cache_factory.h | 3 +
.../org/apache/doris/analysis/SchemaTableType.java | 2 +
.../java/org/apache/doris/catalog/SchemaTable.java | 11 ++
gensrc/thrift/Descriptors.thrift | 1 +
.../cloud_p0/cache/test_file_cache_info.groovy | 129 ++++++++++++++
9 files changed, 388 insertions(+)
diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp
index a39af09cc51..caadef76377 100644
--- a/be/src/exec/schema_scanner.cpp
+++ b/be/src/exec/schema_scanner.cpp
@@ -39,6 +39,7 @@
#include "exec/schema_scanner/schema_columns_scanner.h"
#include "exec/schema_scanner/schema_dummy_scanner.h"
#include "exec/schema_scanner/schema_encryption_keys_scanner.h"
+#include "exec/schema_scanner/schema_file_cache_info_scanner.h"
#include "exec/schema_scanner/schema_file_cache_statistics.h"
#include "exec/schema_scanner/schema_files_scanner.h"
#include "exec/schema_scanner/schema_load_job_scanner.h"
@@ -257,6 +258,8 @@ std::unique_ptr<SchemaScanner>
SchemaScanner::create(TSchemaTableType::type type
return SchemaClusterSnapshotPropertiesScanner::create_unique();
case TSchemaTableType::SCH_COLUMN_DATA_SIZES:
return SchemaColumnDataSizesScanner::create_unique();
+ case TSchemaTableType::SCH_FILE_CACHE_INFO:
+ return SchemaFileCacheInfoScanner::create_unique();
default:
return SchemaDummyScanner::create_unique();
break;
diff --git a/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp
b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp
new file mode 100644
index 00000000000..9734dbfe44b
--- /dev/null
+++ b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.cpp
@@ -0,0 +1,189 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "exec/schema_scanner/schema_file_cache_info_scanner.h"
+
+#include "io/cache/file_cache_common.h"
+#include "runtime/exec_env.h"
+#include "runtime/runtime_state.h"
+#include "vec/common/string_ref.h"
+#include "vec/core/block.h"
+#include "vec/data_types/data_type_factory.hpp"
+
+namespace doris {
+#include "common/compile_check_begin.h"
+
+// Column layout of the FILE_CACHE_INFO schema table.
+// The position of each entry is significant: _fill_block_impl() picks the
+// value to emit by column index (0 = HASH, 1 = TABLET_ID, 2 = SIZE, 3 = TYPE,
+// 4 = REMOTE_PATH, 5 = CACHE_PATH, 6 = BE_ID), so reordering or inserting a
+// column here requires updating that switch as well. The same seven columns
+// are declared, in the same order, for "file_cache_info" in the FE
+// SchemaTable.java.
+std::vector<SchemaScanner::ColumnDesc>
SchemaFileCacheInfoScanner::_s_tbls_columns = {
+ // name, type, size, is_null
+ {"HASH", TYPE_STRING, sizeof(StringRef), true},
+ {"TABLET_ID", TYPE_BIGINT, sizeof(int64_t), true},
+ {"SIZE", TYPE_BIGINT, sizeof(int64_t), true},
+ {"TYPE", TYPE_STRING, sizeof(StringRef), true},
+ {"REMOTE_PATH", TYPE_STRING, sizeof(StringRef), true},
+ {"CACHE_PATH", TYPE_STRING, sizeof(StringRef), true},
+ {"BE_ID", TYPE_BIGINT, sizeof(int64_t), true}};
+
+// Hands the static column list and the SCH_FILE_CACHE_INFO table type to the
+// SchemaScanner base class; the scanner keeps no extra state of its own.
+SchemaFileCacheInfoScanner::SchemaFileCacheInfoScanner()
+ : SchemaScanner(_s_tbls_columns,
TSchemaTableType::SCH_FILE_CACHE_INFO) {}
+
+// Nothing to release: the scanner owns no resources beyond its base class.
+SchemaFileCacheInfoScanner::~SchemaFileCacheInfoScanner() = default;
+
+// No per-query preparation is performed: `state` is not used and OK is
+// returned unconditionally. All work happens in get_next_block_internal().
+Status SchemaFileCacheInfoScanner::start(RuntimeState* state) {
+ return Status::OK();
+}
+
+// Emits the complete result set in a single call.
+//
+// Fails with InternalError when the scanner has not been initialized or when
+// either output pointer is null. Otherwise `*eos` is set to true before the
+// block is filled, and the status of _fill_block_impl() is returned as-is.
+Status SchemaFileCacheInfoScanner::get_next_block_internal(vectorized::Block* block, bool* eos) {
+    if (!_is_init) {
+        return Status::InternalError("Used before initialized.");
+    }
+
+    const bool has_null_argument = (block == nullptr) || (eos == nullptr);
+    if (has_null_argument) {
+        return Status::InternalError("input pointer is nullptr.");
+    }
+
+    // The whole table is produced by this one call, so the stream ends here.
+    *eos = true;
+    return _fill_block_impl(block);
+}
+
+// Fills `block` with one row per cached block recorded in the meta store of
+// every file cache instance known to the file cache factory.
+//
+// Returns OK without touching `block` when there is no file cache factory or
+// when no entry was collected; otherwise returns the first error reported by
+// fill_dest_column_for_range(). Cache instances are skipped when they have no
+// storage, when the storage is not an FSFileCacheStorage, or when no meta
+// store / iterator is available. Records whose key or value failed to
+// deserialize are logged at WARNING level and skipped.
+//
+// NOTE(review): every row is materialized before the block is filled, so the
+// memory used grows with the number of cached blocks -- confirm this is
+// acceptable for very large caches.
+Status SchemaFileCacheInfoScanner::_fill_block_impl(vectorized::Block* block) {
+    SCOPED_TIMER(_fill_block_timer);
+
+    auto* file_cache_factory = ExecEnv::GetInstance()->file_cache_factory();
+    if (!file_cache_factory) {
+        return Status::OK();
+    }
+
+    // One tuple per cached block: (hash, tablet_id, size, cache type, cache base path).
+    std::vector<std::tuple<std::string, int64_t, int64_t, int, std::string>> cache_entries;
+
+    for (const auto& cache : file_cache_factory->get_caches()) {
+        const std::string& cache_path = cache->get_base_path();
+
+        auto* storage = cache->get_storage();
+        if (!storage) {
+            continue;
+        }
+
+        // The meta store is reached through FSFileCacheStorage; any other
+        // storage implementation is skipped.
+        auto* fs_storage = dynamic_cast<doris::io::FSFileCacheStorage*>(storage);
+        if (!fs_storage) {
+            continue;
+        }
+
+        auto* meta_store = fs_storage->get_meta_store();
+        if (!meta_store) {
+            continue;
+        }
+
+        // Iterator over all BlockMeta records of this cache instance.
+        auto iterator = meta_store->get_all();
+        if (!iterator) {
+            continue;
+        }
+
+        for (; iterator->valid(); iterator->next()) {
+            const auto& key = iterator->key();
+            const auto& value = iterator->value();
+
+            // key()/value() report deserialization problems through the
+            // "last error" accessors; such records are skipped.
+            if (!iterator->get_last_key_error().ok() || !iterator->get_last_value_error().ok()) {
+                LOG(WARNING) << "Failed to deserialize cache block metadata: "
+                             << "key_error=" << iterator->get_last_key_error().to_string()
+                             << ", value_error=" << iterator->get_last_value_error().to_string();
+                continue;
+            }
+
+            // Build the hash string directly inside the tuple instead of
+            // copying a named temporary.
+            cache_entries.emplace_back(key.hash.to_string(), key.tablet_id, value.size, value.type,
+                                       cache_path);
+        }
+    }
+
+    const size_t row_num = cache_entries.size();
+    if (row_num == 0) {
+        return Status::OK();
+    }
+
+    // Loop-invariant: the same backend id is written to every BE_ID row.
+    const auto backend_id = ExecEnv::GetInstance()->cluster_info()->backend_id;
+
+    // Scratch buffers reused by every column instead of being reallocated per
+    // column. `column_values` only backs strings that are not already stored in
+    // `cache_entries` (TYPE and the empty placeholders).
+    std::vector<StringRef> str_refs(row_num);
+    std::vector<int64_t> int64_vals(row_num);
+    std::vector<std::string> column_values(row_num);
+    std::vector<void*> datas(row_num);
+
+    for (size_t col_idx = 0; col_idx < _s_tbls_columns.size(); ++col_idx) {
+        const auto& col_desc = _s_tbls_columns[col_idx];
+
+        // Reset to all-null so a column of any other type passes null
+        // pointers, exactly as a freshly constructed vector would.
+        datas.assign(row_num, nullptr);
+
+        for (size_t row_idx = 0; row_idx < row_num; ++row_idx) {
+            const auto& [hash, tablet_id, size, type, cache_path] = cache_entries[row_idx];
+
+            if (col_desc.type == TYPE_STRING) {
+                // Points either at a string owned by `cache_entries` (no copy)
+                // or at the per-row scratch string.
+                const std::string* text = &column_values[row_idx];
+                switch (col_idx) {
+                case 0: // HASH
+                    text = &hash;
+                    break;
+                case 3: // TYPE
+                    column_values[row_idx] = doris::io::cache_type_to_string(
+                            static_cast<doris::io::FileCacheType>(type));
+                    break;
+                case 4: // REMOTE_PATH
+                    column_values[row_idx].clear(); // TODO: Implement remote path retrieval
+                    break;
+                case 5: // CACHE_PATH
+                    text = &cache_path;
+                    break;
+                default:
+                    column_values[row_idx].clear();
+                    break;
+                }
+                str_refs[row_idx] = StringRef(text->data(), text->size());
+                datas[row_idx] = &str_refs[row_idx];
+            } else if (col_desc.type == TYPE_BIGINT) {
+                switch (col_idx) {
+                case 1: // TABLET_ID
+                    int64_vals[row_idx] = tablet_id;
+                    break;
+                case 2: // SIZE
+                    int64_vals[row_idx] = size;
+                    break;
+                case 6: // BE_ID
+                    int64_vals[row_idx] = backend_id;
+                    break;
+                default:
+                    int64_vals[row_idx] = 0;
+                    break;
+                }
+                datas[row_idx] = &int64_vals[row_idx];
+            }
+        }
+
+        RETURN_IF_ERROR(fill_dest_column_for_range(block, col_idx, datas));
+    }
+
+    return Status::OK();
+}
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/exec/schema_scanner/schema_file_cache_info_scanner.h
b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.h
new file mode 100644
index 00000000000..2efe6f76cdf
--- /dev/null
+++ b/be/src/exec/schema_scanner/schema_file_cache_info_scanner.h
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "common/status.h"
+#include "exec/schema_scanner.h"
+
+namespace doris {
+class RuntimeState;
+namespace vectorized {
+class Block;
+} // namespace vectorized
+
+// Schema scanner for the FILE_CACHE_INFO schema table
+// (TSchemaTableType::SCH_FILE_CACHE_INFO). The implementation emits one row
+// per file cache block found on this backend and returns the whole result
+// from a single get_next_block_internal() call.
+class SchemaFileCacheInfoScanner : public SchemaScanner {
+ ENABLE_FACTORY_CREATOR(SchemaFileCacheInfoScanner);
+
+public:
+ SchemaFileCacheInfoScanner();
+ ~SchemaFileCacheInfoScanner() override;
+
+ // Currently a no-op that always returns OK.
+ Status start(RuntimeState* state) override;
+ // Sets `*eos` to true and fills `block` with every row; returns
+ // InternalError when the scanner is not initialized or a pointer is null.
+ Status get_next_block_internal(vectorized::Block* block, bool* eos)
override;
+
+ // Column descriptors of the table; the order must match the column
+ // indexes used by _fill_block_impl().
+ static std::vector<SchemaScanner::ColumnDesc> _s_tbls_columns;
+
+private:
+ // Collects the cache entries and writes them into `block` column by column.
+ Status _fill_block_impl(vectorized::Block* block);
+};
+
+} // namespace doris
\ No newline at end of file
diff --git a/be/src/io/cache/block_file_cache.h
b/be/src/io/cache/block_file_cache.h
index dcaf81ec2a9..19715fb0445 100644
--- a/be/src/io/cache/block_file_cache.h
+++ b/be/src/io/cache/block_file_cache.h
@@ -171,6 +171,9 @@ public:
[[nodiscard]] const std::string& get_base_path() const { return
_cache_base_path; }
+ // Get storage for inspection.
+ // Returns the raw pointer held by `_storage` (via `.get()`); ownership is
+ // not transferred. NOTE(review): the schema scanner added in this change
+ // null-checks the result, so presumably it may be null -- confirm.
+ FileCacheStorage* get_storage() const { return _storage.get(); }
+
/**
* Given an `offset` and `size` representing [offset, offset + size)
bytes interval,
* return list of cached non-overlapping non-empty
diff --git a/be/src/io/cache/block_file_cache_factory.h
b/be/src/io/cache/block_file_cache_factory.h
index 837feac7f68..3031076336c 100644
--- a/be/src/io/cache/block_file_cache_factory.h
+++ b/be/src/io/cache/block_file_cache_factory.h
@@ -102,6 +102,9 @@ public:
void get_cache_stats_block(vectorized::Block* block);
+ // Get all cache instances for inspection.
+ // Returns a const reference to `_caches` itself, so nothing is copied and
+ // ownership stays with the factory. NOTE(review): no locking is visible
+ // here -- confirm `_caches` is not modified while a caller iterates it.
+ const std::vector<std::unique_ptr<BlockFileCache>>& get_caches() const {
return _caches; }
+
FileCacheFactory() = default;
FileCacheFactory& operator=(const FileCacheFactory&) = delete;
FileCacheFactory(const FileCacheFactory&) = delete;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java
index 5576c6d294b..ca58b311ec4 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/SchemaTableType.java
@@ -98,6 +98,8 @@ public enum SchemaTableType {
TSchemaTableType.SCH_ROUTINE_LOAD_JOBS),
SCH_LOAD_JOBS("LOAD_JOBS", "LOAD_JOBS",
TSchemaTableType.SCH_LOAD_JOBS),
+ SCH_FILE_CACHE_INFO("FILE_CACHE_INFO", "FILE_CACHE_INFO",
+ TSchemaTableType.SCH_FILE_CACHE_INFO),
SCH_VIEW_DEPENDENCY("VIEW_DEPENDENCY", "VIEW_DEPENDENCY",
TSchemaTableType.SCH_VIEW_DEPENDENCY),
SQL_BLOCK_RULE_STATUS("SQL_BLOCK_RULE_STATUS", "SQL_BLOCK_RULE_STATUS",
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java
index 4a2c2560850..1f2b25d4c16 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/SchemaTable.java
@@ -702,6 +702,17 @@ public class SchemaTable extends Table {
.column("FIRST_ERROR_MSG",
ScalarType.createStringType())
.build())
)
+ .put("file_cache_info",
+ new SchemaTable(SystemIdGenerator.getNextId(),
"file_cache_info", TableType.SCHEMA,
+ builder().column("HASH",
ScalarType.createStringType())
+ .column("TABLET_ID",
ScalarType.createType(PrimitiveType.BIGINT))
+ .column("SIZE",
ScalarType.createType(PrimitiveType.BIGINT))
+ .column("TYPE",
ScalarType.createStringType())
+ .column("REMOTE_PATH",
ScalarType.createStringType())
+ .column("CACHE_PATH",
ScalarType.createStringType())
+ .column("BE_ID",
ScalarType.createType(PrimitiveType.BIGINT))
+ .build())
+ )
.put("backend_tablets", new
SchemaTable(SystemIdGenerator.getNextId(), "backend_tablets", TableType.SCHEMA,
builder().column("BE_ID",
ScalarType.createType(PrimitiveType.BIGINT))
.column("TABLET_ID",
ScalarType.createType(PrimitiveType.BIGINT))
diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift
index ec93c1a26f4..4011009887c 100644
--- a/gensrc/thrift/Descriptors.thrift
+++ b/gensrc/thrift/Descriptors.thrift
@@ -162,6 +162,7 @@ enum TSchemaTableType {
SCH_BLACKHOLE = 62;
SCH_COLUMN_DATA_SIZES = 63;
SCH_LOAD_JOBS = 64;
+ SCH_FILE_CACHE_INFO = 65;
}
enum THdfsCompression {
diff --git a/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy
b/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy
new file mode 100644
index 00000000000..f018889bb7a
--- /dev/null
+++ b/regression-test/suites/cloud_p0/cache/test_file_cache_info.groovy
@@ -0,0 +1,129 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Regression test for information_schema.file_cache_info:
+//   1. write and read a single-tablet table, expect cache entries to be listed;
+//   2. clear the file cache on every alive backend, expect no entries for the tablet;
+//   3. read the table again, expect entries for the tablet to reappear.
+suite("test_file_cache_info") {
+ // BE config overrides passed to setBeConfigTemporary for the body below.
+ // NOTE(review): `custoBeConfig` looks like a typo for `customBeConfig`.
+ def custoBeConfig = [
+ enable_evict_file_cache_in_advance : false,
+ file_cache_enter_disk_resource_limit_mode_percent : 99
+ ]
+
+ setBeConfigTemporary(custoBeConfig) {
+
+ // Build "ip:http_port" endpoints for the backends whose column 9 is "true".
+ // NOTE(review): columns 0/1/4/9 of `show backends` are presumably
+ // id / host / http port / alive -- the indexes are hard-coded, confirm.
+ String[][] backends = sql """ show backends """
+ def backendSockets = []
+ def backendIdToBackendIP = [:]
+ def backendIdToBackendHttpPort = [:]
+ for (String[] backend in backends) {
+ if (backend[9].equals("true")) {
+ backendIdToBackendIP.put(backend[0], backend[1])
+ backendIdToBackendHttpPort.put(backend[0], backend[4])
+ }
+ }
+ assertTrue(backendIdToBackendIP.size() > 0, "No alive backends found")
+
+ backendIdToBackendIP.each { backendId, ip ->
+ def socket = ip + ":" + backendIdToBackendHttpPort.get(backendId)
+ backendSockets.add(socket)
+ }
+
+ // Recreate the test table with a single bucket so it has exactly one tablet.
+ sql "drop table IF EXISTS customer"
+
+ sql """
+ CREATE TABLE IF NOT EXISTS customer (
+ `c_custkey` int NULL,
+ `c_name` string NULL,
+ `c_address` string NULL,
+ `c_city` string NULL,
+ `c_nation` string NULL,
+ `c_region` string NULL,
+ `c_phone` string NULL,
+ `c_mktsegment` string NULL
+ )
+ DUPLICATE KEY(`c_custkey`)
+ DISTRIBUTED BY HASH(`c_custkey`) BUCKETS 1 // only 1 tablet
+ PROPERTIES (
+ "file_cache_ttl_seconds" = "3600"
+ )
+ """
+
+ sql """
+ insert into customer values
+ (1, 'Customer#000000001', 'address1', 'city1', 'nation1', 'region1',
'phone1', 'segment1'),
+ (2, 'Customer#000000002', 'address2', 'city2', 'nation2', 'region2',
'phone2', 'segment2'),
+ (3, 'Customer#000000003', 'address3', 'city3', 'nation3', 'region3',
'phone3', 'segment3'),
+ (4, 'Customer#000000004', 'address4', 'city4', 'nation4', 'region4',
'phone4', 'segment4'),
+ (5, 'Customer#000000005', 'address5', 'city5', 'nation5', 'region5',
'phone5', 'segment5')
+ """
+ sql "sync"
+
+ // Read the table, then wait 10 s before inspecting the cache.
+ // NOTE(review): the sleep presumably gives the cache metadata time to
+ // be written -- confirm the required delay.
+ sql "select count(*) from customer"
+
+ Thread.sleep(10000)
+
+ // Returns the id of the table's only tablet (first column of `show tablets`).
+ def get_tablet_id = { String tbl_name ->
+ def tablets = sql "show tablets from ${tbl_name}"
+ assertEquals(tablets.size(), 1, "Should have exactly one tablet with
BUCKETS=1")
+ return tablets[0][0] as Long
+ }
+
+ def tablet_id = get_tablet_id("customer")
+ println "Tablet ID: ${tablet_id}"
+
+ // NOTE(review): this first query is NOT filtered by tablet_id although the
+ // assertion message mentions it, so rows belonging to other tablets also
+ // satisfy the check.
+ def cache_info = sql "select * from information_schema.file_cache_info"
+
+ assertTrue(cache_info.size() > 0, "file_cache_info should not be empty for
tablet_id ${tablet_id}")
+
+ println "First query - File cache info for tablet_id ${tablet_id}:"
+ cache_info.each { row ->
+ println " ${row}"
+ }
+
+ // Clear the file cache on every collected backend through the HTTP API and
+ // count the backends that answered with HTTP 200.
+ def clearResults = []
+ backendSockets.each { socket ->
+ httpTest {
+ endpoint ""
+ uri socket + "/api/file_cache?op=clear&sync=true"
+ op "get"
+ check {respCode, body ->
+ assertEquals(respCode, 200, "clear local cache fail, maybe you
can find something in respond: " + parseJson(body))
+ clearResults.add(true)
+ }
+ }
+ }
+ assertEquals(clearResults.size(), backendSockets.size(), "Failed to clear
cache on some backends")
+
+ Thread.sleep(5000)
+
+ // After the clear, no row may remain for this tablet.
+ def cache_info_after_clear = sql "select * from
information_schema.file_cache_info where tablet_id = ${tablet_id}"
+ assertEquals(cache_info_after_clear.size(), 0, "file_cache_info should be
empty after clearing cache")
+
+ println "After clearing cache - File cache info is empty as expected"
+
+ // Read the table again and expect rows for the tablet to be listed again.
+ sql "select * from customer"
+
+ Thread.sleep(10000)
+
+ def cache_info_reloaded = sql "select * from
information_schema.file_cache_info where tablet_id = ${tablet_id}"
+ assertTrue(cache_info_reloaded.size() > 0, "file_cache_info should not be
empty after reloading data")
+
+ println "After reloading data - File cache info for tablet_id
${tablet_id}:"
+ cache_info_reloaded.each { row ->
+ println " ${row}"
+ }
+
+ }
+}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]