This is an automated email from the ASF dual-hosted git repository.

lijibing pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 38313fcc30a [improvement](statistics)Drop expired external stats only 
when the catalog is dropped. (#42244)
38313fcc30a is described below

commit 38313fcc30a8fb78698b99128eb7849a91746e9e
Author: Jibing-Li <[email protected]>
AuthorDate: Thu Oct 24 16:06:27 2024 +0800

    [improvement](statistics)Drop expired external stats only when the catalog 
is dropped. (#42244)
    
    Drop expired external stats only when the catalog is dropped, to reduce
    meta store access.
    Previously, we went through all external catalogs and their DBs and tables
    to check for expired stats, which could cause a lot of meta store access.
---
 .../apache/doris/statistics/StatisticsCleaner.java | 18 +++--
 .../statistics/test_drop_expired_stats.groovy      | 76 ++++++++++++++++++++++
 2 files changed, 90 insertions(+), 4 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java
index 9775b6ecb73..15be395e590 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java
@@ -37,6 +37,7 @@ import org.apache.commons.text.StringSubstitutor;
 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
+import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -176,10 +177,9 @@ public class StatisticsCleaner extends MasterDaemon {
 
     private Map<Long, DatabaseIf<? extends TableIf>> constructDbMap() {
         Map<Long, DatabaseIf<? extends TableIf>> idToDb = Maps.newHashMap();
-        for (CatalogIf<? extends DatabaseIf<? extends TableIf>> ctl : 
idToCatalog.values()) {
-            for (DatabaseIf<? extends TableIf> db : ctl.getAllDbs()) {
-                idToDb.put(db.getId(), db);
-            }
+        Collection<DatabaseIf<? extends TableIf>> internalDBs = 
Env.getCurrentEnv().getInternalCatalog().getAllDbs();
+        for (DatabaseIf<? extends TableIf> db : internalDBs) {
+            idToDb.put(db.getId(), db);
         }
         return idToDb;
     }
@@ -268,6 +268,16 @@ public class StatisticsCleaner extends MasterDaemon {
                         expiredStats.expiredCatalog.add(catalogId);
                         continue;
                     }
+                    // Skip check external DBs and tables to avoid fetch too 
much metadata.
+                    // Remove expired external table stats only when the 
external catalog is dropped.
+                    // TODO: Need to check external database and table exist 
or not. But for now, we only check catalog.
+                    // Because column_statistics table only keep table id and 
db id.
+                    // But meta data doesn't always cache all external tables' 
ids.
+                    // So we may fail to find the external table only by id. 
Need to use db name and table name instead.
+                    // Have to store db name and table name in 
column_statistics in the future.
+                    if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID) {
+                        continue;
+                    }
                     long dbId = statsId.dbId;
                     if (!idToDb.containsKey(dbId)) {
                         expiredStats.expiredDatabase.add(dbId);
diff --git a/regression-test/suites/statistics/test_drop_expired_stats.groovy 
b/regression-test/suites/statistics/test_drop_expired_stats.groovy
new file mode 100644
index 00000000000..23067f670b5
--- /dev/null
+++ b/regression-test/suites/statistics/test_drop_expired_stats.groovy
@@ -0,0 +1,76 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_drop_expired_stats") {
+
+    sql """drop database if exists test_drop_expired_stats"""
+    sql """create database test_drop_expired_stats"""
+    sql """use test_drop_expired_stats"""
+    sql """set global enable_auto_analyze=false"""
+
+    sql """CREATE TABLE table1 (
+            key1 bigint NOT NULL,
+            key2 bigint NOT NULL,
+            value1 int NOT NULL,
+            value2 int NOT NULL,
+            value3 int NOT NULL
+        )ENGINE=OLAP
+        DUPLICATE KEY(`key1`, `key2`)
+        COMMENT "OLAP"
+        DISTRIBUTED BY HASH(`key1`) BUCKETS 1
+        PROPERTIES (
+            "replication_num" = "1"
+        )
+    """
+
+    sql """CREATE TABLE table2 (
+            key1 bigint NOT NULL,
+            key2 bigint NOT NULL,
+            value1 int NOT NULL
+        )ENGINE=OLAP
+        DUPLICATE KEY(`key1`, `key2`)
+        COMMENT "OLAP"
+        DISTRIBUTED BY HASH(`key1`) BUCKETS 1
+        PROPERTIES (
+            "replication_num" = "1"
+        )
+    """
+
+    def id1 = getTableId("test_drop_expired_stats", "table1")
+    def id2 = getTableId("test_drop_expired_stats", "table2")
+
+    sql """analyze table table1 with sync"""
+    sql """analyze table table2 with sync"""
+    def result = sql """select * from __internal_schema.column_statistics 
where tbl_id = ${id1}"""
+    assertEquals(5, result.size())
+    result = sql """select * from __internal_schema.column_statistics where 
tbl_id = ${id2}"""
+    assertEquals(3, result.size())
+    sql """drop table table1"""
+    sql """drop expired stats"""
+    result = sql """select * from __internal_schema.column_statistics where 
tbl_id = ${id1}"""
+    assertEquals(0, result.size())
+    result = sql """select * from __internal_schema.column_statistics where 
tbl_id = ${id2}"""
+    assertEquals(3, result.size())
+
+    sql """drop database if exists test_drop_expired_stats"""
+    sql """drop expired stats"""
+    result = sql """select * from __internal_schema.column_statistics where 
tbl_id = ${id1}"""
+    assertEquals(0, result.size())
+    result = sql """select * from __internal_schema.column_statistics where 
tbl_id = ${id2}"""
+    assertEquals(0, result.size())
+}
+


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to