This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d6450a3f1c [Fix](statistics)Fix external table auto analyze bugs 
(#23574)
d6450a3f1c is described below

commit d6450a3f1cfed491fa05c3c9b61aaf97a7a223c3
Author: Jibing-Li <64681310+jibing...@users.noreply.github.com>
AuthorDate: Fri Sep 1 10:58:14 2023 +0800

    [Fix](statistics)Fix external table auto analyze bugs (#23574)
    
    1. Fix auto analyze external table recursively load schema cache bug.
    2. Move some function in StatisticsAutoAnalyzer class to TableIf. So that 
external table and internal table could implement the logic separately.
    3. Disable external catalog auto analyze by default, could open it by 
adding catalog property "enable.auto.analyze"="true"
---
 .../java/org/apache/doris/catalog/OlapTable.java   | 28 ++++++++++++++++
 .../main/java/org/apache/doris/catalog/Table.java  | 12 ++++++-
 .../java/org/apache/doris/catalog/TableIf.java     |  5 +++
 .../doris/catalog/external/ExternalTable.java      | 19 +++++++++++
 .../doris/catalog/external/HMSExternalTable.java   |  8 ++---
 .../org/apache/doris/datasource/CatalogIf.java     |  2 ++
 .../apache/doris/datasource/ExternalCatalog.java   | 15 +++++++++
 .../apache/doris/datasource/InternalCatalog.java   |  5 +++
 .../doris/statistics/StatisticsAutoAnalyzer.java   | 37 +++++-----------------
 .../doris/statistics/util/StatisticsUtil.java      |  5 +--
 .../statistics/StatisticsAutoAnalyzerTest.java     | 30 ++++++++++--------
 .../hive/test_hive_statistic.groovy                |  4 +++
 12 files changed, 120 insertions(+), 50 deletions(-)

diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
index 97a88aad40..5f58ab277f 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
@@ -56,6 +56,8 @@ import org.apache.doris.statistics.BaseAnalysisTask;
 import org.apache.doris.statistics.HistogramTask;
 import org.apache.doris.statistics.MVAnalysisTask;
 import org.apache.doris.statistics.OlapAnalysisTask;
+import org.apache.doris.statistics.TableStats;
+import org.apache.doris.statistics.util.StatisticsUtil;
 import org.apache.doris.system.Backend;
 import org.apache.doris.system.SystemInfoService;
 import org.apache.doris.thrift.TColumn;
@@ -1122,6 +1124,32 @@ public class OlapTable extends Table {
         return new MVAnalysisTask(info);
     }
 
+    @Override
+    public boolean needReAnalyzeTable(TableStats tblStats) {
+        long rowCount = getRowCount();
+        // TODO: Do we need to analyze an empty table?
+        if (rowCount == 0) {
+            return false;
+        }
+        long updateRows = tblStats.updatedRows.get();
+        int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows);
+        return tblHealth < Config.table_stats_health_threshold;
+    }
+
+    @Override
+    public Set<String> findReAnalyzeNeededPartitions(TableStats tableStats) {
+        if (tableStats == null) {
+            return getPartitionNames().stream().map(this::getPartition)
+                
.filter(Partition::hasData).map(Partition::getName).collect(Collectors.toSet());
+        }
+        return getPartitionNames().stream()
+            .map(this::getPartition)
+            .filter(Partition::hasData)
+            .filter(partition ->
+                partition.getVisibleVersionTime() >= 
tableStats.updatedTime).map(Partition::getName)
+            .collect(Collectors.toSet());
+    }
+
     @Override
     public long getRowCount() {
         long rowCount = 0;
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java
index 24d9dcc800..12689894b4 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java
@@ -30,6 +30,7 @@ import org.apache.doris.external.hudi.HudiTable;
 import org.apache.doris.statistics.AnalysisInfo;
 import org.apache.doris.statistics.BaseAnalysisTask;
 import org.apache.doris.statistics.ColumnStatistic;
+import org.apache.doris.statistics.TableStats;
 import org.apache.doris.thrift.TTableDescriptor;
 
 import com.google.common.base.Preconditions;
@@ -571,6 +572,15 @@ public abstract class Table extends MetaObject implements 
Writable, TableIf {
         return Optional.empty();
     }
 
-    public void analyze(String dbName) {
+    public void analyze(String dbName) {}
+
+    @Override
+    public boolean needReAnalyzeTable(TableStats tblStats) {
+        return true;
+    }
+
+    @Override
+    public Set<String> findReAnalyzeNeededPartitions(TableStats tableStats) {
+        return Collections.emptySet();
     }
 }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java
index f23f839898..21e2ddd154 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java
@@ -23,6 +23,7 @@ import org.apache.doris.common.MetaNotFoundException;
 import org.apache.doris.statistics.AnalysisInfo;
 import org.apache.doris.statistics.BaseAnalysisTask;
 import org.apache.doris.statistics.ColumnStatistic;
+import org.apache.doris.statistics.TableStats;
 import org.apache.doris.thrift.TTableDescriptor;
 
 import com.google.common.collect.Lists;
@@ -136,6 +137,10 @@ public interface TableIf {
 
     Optional<ColumnStatistic> getColumnStatistic(String colName);
 
+    boolean needReAnalyzeTable(TableStats tblStats);
+
+    Set<String> findReAnalyzeNeededPartitions(TableStats tableStats);
+
     void write(DataOutput out) throws IOException;
 
     /**
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java
index 0f9ec3f564..6f31ac18d7 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java
@@ -35,8 +35,10 @@ import org.apache.doris.persist.gson.GsonUtils;
 import org.apache.doris.statistics.AnalysisInfo;
 import org.apache.doris.statistics.BaseAnalysisTask;
 import org.apache.doris.statistics.ColumnStatistic;
+import org.apache.doris.statistics.TableStats;
 import org.apache.doris.thrift.TTableDescriptor;
 
+import com.google.common.collect.Sets;
 import com.google.gson.annotations.SerializedName;
 import lombok.Getter;
 import org.apache.commons.lang3.NotImplementedException;
@@ -46,8 +48,10 @@ import org.apache.logging.log4j.Logger;
 import java.io.DataInput;
 import java.io.DataOutput;
 import java.io.IOException;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Optional;
+import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
 
@@ -375,4 +379,19 @@ public class ExternalTable implements TableIf, Writable, 
GsonPostProcessable {
         rwLock = new ReentrantReadWriteLock(true);
         objectCreated = false;
     }
+
+    @Override
+    public boolean needReAnalyzeTable(TableStats tblStats) {
+        // TODO: Find a way to decide if this external table need to be 
reanalyzed.
+        // For now, simply return true for all external tables.
+        return true;
+    }
+
+    @Override
+    public Set<String> findReAnalyzeNeededPartitions(TableStats tableStats) {
+        HashSet<String> partitions = Sets.newHashSet();
+        // TODO: Find a way to collect external table partitions that need to 
be analyzed.
+        partitions.add("Dummy Partition");
+        return partitions;
+    }
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
index 0459c3ef2a..c1de1ea98d 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
@@ -153,7 +153,7 @@ public class HMSExternalTable extends ExternalTable {
                 }
             }
             objectCreated = true;
-            estimatedRowCount = getRowCountFromExternalSource();
+            estimatedRowCount = getRowCountFromExternalSource(true);
         }
     }
 
@@ -277,7 +277,7 @@ public class HMSExternalTable extends ExternalTable {
     @Override
     public long getRowCount() {
         makeSureInitialized();
-        long rowCount = getRowCountFromExternalSource();
+        long rowCount = getRowCountFromExternalSource(false);
         if (rowCount == -1) {
             LOG.debug("Will estimate row count from file list.");
             rowCount = StatisticsUtil.getRowCountFromFileList(this);
@@ -285,11 +285,11 @@ public class HMSExternalTable extends ExternalTable {
         return rowCount;
     }
 
-    private long getRowCountFromExternalSource() {
+    private long getRowCountFromExternalSource(boolean isInit) {
         long rowCount;
         switch (dlaType) {
             case HIVE:
-                rowCount = StatisticsUtil.getHiveRowCount(this);
+                rowCount = StatisticsUtil.getHiveRowCount(this, isInit);
                 break;
             case ICEBERG:
                 rowCount = StatisticsUtil.getIcebergRowCount(this);
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java
index b84c769eac..d135018e75 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java
@@ -170,4 +170,6 @@ public interface CatalogIf<T extends DatabaseIf> {
     // Return a copy of all db collection.
     @SuppressWarnings({"rawtypes", "unchecked"})
     public Collection<DatabaseIf> getAllDbs();
+
+    public boolean enableAutoAnalyze();
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java
index 89f0c451a8..35d03dfabc 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java
@@ -72,6 +72,8 @@ public abstract class ExternalCatalog
             implements CatalogIf<ExternalDatabase<? extends ExternalTable>>, 
Writable, GsonPostProcessable {
     private static final Logger LOG = 
LogManager.getLogger(ExternalCatalog.class);
 
+    public static final String ENABLE_AUTO_ANALYZE = "enable.auto.analyze";
+
     // Unique id of this catalog, will be assigned after catalog is loaded.
     @SerializedName(value = "id")
     protected long id;
@@ -590,4 +592,17 @@ public abstract class ExternalCatalog
         makeSureInitialized();
         return new HashSet<>(idToDb.values());
     }
+
+    @Override
+    public boolean enableAutoAnalyze() {
+        // By default, external catalog disables auto analyze, uses could set 
catalog property to enable it:
+        // "enable.auto.analyze" = true
+        Map<String, String> properties = catalogProperty.getProperties();
+        boolean ret = false;
+        if (properties.containsKey(ENABLE_AUTO_ANALYZE)
+                && 
properties.get(ENABLE_AUTO_ANALYZE).equalsIgnoreCase("true")) {
+            ret = true;
+        }
+        return ret;
+    }
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java
index 68a819a2dd..a85d892734 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java
@@ -3134,4 +3134,9 @@ public class InternalCatalog implements 
CatalogIf<Database> {
         OlapTable olapTable = (OlapTable) 
db.getTableOrMetaException(log.getTableId(), TableType.OLAP);
         olapTable.getAutoIncrementGenerator().applyChange(log.getColumnId(), 
log.getBatchEndId());
     }
+
+    @Override
+    public boolean enableAutoAnalyze() {
+        return true;
+    }
 }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java
 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java
index ad70769427..5d704e4f3b 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java
@@ -21,7 +21,6 @@ import org.apache.doris.analysis.TableName;
 import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.DatabaseIf;
 import org.apache.doris.catalog.Env;
-import org.apache.doris.catalog.Partition;
 import org.apache.doris.catalog.TableIf;
 import org.apache.doris.catalog.View;
 import org.apache.doris.common.Config;
@@ -88,7 +87,9 @@ public class StatisticsAutoAnalyzer extends MasterDaemon {
     private void analyzeAll() {
         Set<CatalogIf> catalogs = 
Env.getCurrentEnv().getCatalogMgr().getCopyOfCatalog();
         for (CatalogIf ctl : catalogs) {
-
+            if (!ctl.enableAutoAnalyze()) {
+                continue;
+            }
             Collection<DatabaseIf> dbs = ctl.getAllDbs();
             for (DatabaseIf<TableIf> databaseIf : dbs) {
                 if 
(StatisticConstants.STATISTICS_DB_BLACK_LIST.contains(databaseIf.getFullName()))
 {
@@ -158,11 +159,11 @@ public class StatisticsAutoAnalyzer extends MasterDaemon {
                 .findTable(jobInfo.catalogName, jobInfo.dbName, 
jobInfo.tblName);
         TableStats tblStats = 
Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
 
-        if (!(tblStats == null || needReanalyzeTable(table, tblStats))) {
+        if (!(tblStats == null || table.needReAnalyzeTable(tblStats))) {
             return null;
         }
 
-        Set<String> needRunPartitions = findReAnalyzeNeededPartitions(table, 
tblStats);
+        Set<String> needRunPartitions = 
table.findReAnalyzeNeededPartitions(tblStats);
 
         if (needRunPartitions.isEmpty()) {
             return null;
@@ -171,31 +172,6 @@ public class StatisticsAutoAnalyzer extends MasterDaemon {
         return getAnalysisJobInfo(jobInfo, table, needRunPartitions);
     }
 
-    @VisibleForTesting
-    protected Set<String> findReAnalyzeNeededPartitions(TableIf table, 
TableStats tableStats) {
-        if (tableStats == null) {
-            return table.getPartitionNames().stream().map(table::getPartition)
-                    
.filter(Partition::hasData).map(Partition::getName).collect(Collectors.toSet());
-        }
-        return table.getPartitionNames().stream()
-                .map(table::getPartition)
-                .filter(Partition::hasData)
-                .filter(partition ->
-                        partition.getVisibleVersionTime() >= 
tableStats.updatedTime).map(Partition::getName)
-                .collect(Collectors.toSet());
-    }
-
-    private boolean needReanalyzeTable(TableIf table, TableStats tblStats) {
-        long rowCount = table.getRowCount();
-        // TODO: Do we need to analyze an empty table?
-        if (rowCount == 0) {
-            return false;
-        }
-        long updateRows = tblStats.updatedRows.get();
-        int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows);
-        return tblHealth < Config.table_stats_health_threshold;
-    }
-
     @VisibleForTesting
     public AnalysisInfo getAnalysisJobInfo(AnalysisInfo jobInfo, TableIf table,
             Set<String> needRunPartitions) {
@@ -247,6 +223,9 @@ public class StatisticsAutoAnalyzer extends MasterDaemon {
         Map<Long, BaseAnalysisTask> analysisTaskInfos = new HashMap<>();
         AnalysisManager analysisManager = 
Env.getCurrentEnv().getAnalysisManager();
         analysisManager.createTaskForEachColumns(jobInfo, analysisTaskInfos, 
false);
+        if (StatisticsUtil.isExternalTable(jobInfo.catalogName, 
jobInfo.dbName, jobInfo.tblName)) {
+            analysisManager.createTableLevelTaskForExternalTable(jobInfo, 
analysisTaskInfos, false);
+        }
         Env.getCurrentEnv().getAnalysisManager().registerSysJob(jobInfo, 
analysisTaskInfos);
         analysisTaskInfos.values().forEach(analysisTaskExecutor::submitTask);
     }
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
index 876be46ab6..87d8a0ba15 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
@@ -489,9 +489,10 @@ public class StatisticsUtil {
      * First get it from remote table parameters. If not found, estimate it : 
totalSize/estimatedRowSize
      *
      * @param table Hive HMSExternalTable to estimate row count.
+     * @param isInit Flag to indicate if this is called during init. To avoid 
recursively get schema.
      * @return estimated row count
      */
-    public static long getHiveRowCount(HMSExternalTable table) {
+    public static long getHiveRowCount(HMSExternalTable table, boolean isInit) 
{
         Map<String, String> parameters = 
table.getRemoteTable().getParameters();
         if (parameters == null) {
             return -1;
@@ -500,7 +501,7 @@ public class StatisticsUtil {
         if (parameters.containsKey(NUM_ROWS)) {
             return Long.parseLong(parameters.get(NUM_ROWS));
         }
-        if (!parameters.containsKey(TOTAL_SIZE)) {
+        if (!parameters.containsKey(TOTAL_SIZE) || isInit) {
             return -1;
         }
         // Table parameters doesn't contain row count but contain total size. 
Estimate row count : totalSize/rowSize
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java
 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java
index 3368a5a669..fff649a447 100644
--- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java
+++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java
@@ -39,7 +39,6 @@ import mockit.Expectations;
 import mockit.Injectable;
 import mockit.Mock;
 import mockit.MockUp;
-import mockit.Mocked;
 import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.Test;
 
@@ -140,14 +139,25 @@ public class StatisticsAutoAnalyzerTest {
     }
 
     @Test
-    public void testGetReAnalyzeRequiredPart0(@Mocked TableIf tableIf) {
+    public void testGetReAnalyzeRequiredPart0() {
 
-        new Expectations() {
-            {
-                tableIf.getRowCount();
-                result = 100;
+        TableIf tableIf = new OlapTable();
+
+        new MockUp<OlapTable>() {
+            @Mock
+            protected Set<String> findReAnalyzeNeededPartitions(TableStats 
tableStats) {
+                Set<String> partitionNames = new HashSet<>();
+                partitionNames.add("p1");
+                partitionNames.add("p2");
+                return partitionNames;
+            }
+
+            @Mock
+            public long getRowCount() {
+                return 100;
             }
         };
+
         new MockUp<StatisticsUtil>() {
             @Mock
             public TableIf findTable(String catalogName, String dbName, String 
tblName) {
@@ -176,14 +186,6 @@ public class StatisticsAutoAnalyzerTest {
         };
 
         new MockUp<StatisticsAutoAnalyzer>() {
-            @Mock
-            protected Set<String> findReAnalyzeNeededPartitions(TableIf table, 
TableStats tableStats)  {
-                Set<String> partitionNames = new HashSet<>();
-                partitionNames.add("p1");
-                partitionNames.add("p2");
-                return partitionNames;
-            }
-
             @Mock
             public AnalysisInfo getAnalysisJobInfo(AnalysisInfo jobInfo, 
TableIf table,
                     Set<String> needRunPartitions) {
diff --git 
a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy 
b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
index 85c8326382..c8163bba7e 100644
--- a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
@@ -30,6 +30,10 @@ suite("test_hive_statistic", 
"p2,external,hive,external_remote,external_remote_h
             );
         """
         logger.info("catalog " + catalog_name + " created")
+
+        // Test analyze table without init.
+        sql """analyze table ${catalog_name}.tpch_1000_parquet.region with 
sync"""
+
         sql """switch ${catalog_name};"""
         logger.info("switched to catalog " + catalog_name)
         sql """use statistics;"""


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to