This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new d6450a3f1c [Fix](statistics)Fix external table auto analyze bugs (#23574) d6450a3f1c is described below commit d6450a3f1cfed491fa05c3c9b61aaf97a7a223c3 Author: Jibing-Li <64681310+jibing...@users.noreply.github.com> AuthorDate: Fri Sep 1 10:58:14 2023 +0800 [Fix](statistics)Fix external table auto analyze bugs (#23574) 1. Fix auto analyze external table recursively load schema cache bug. 2. Move some function in StatisticsAutoAnalyzer class to TableIf. So that external table and internal table could implement the logic separately. 3. Disable external catalog auto analyze by default, could open it by adding catalog property "enable.auto.analyze"="true" --- .../java/org/apache/doris/catalog/OlapTable.java | 28 ++++++++++++++++ .../main/java/org/apache/doris/catalog/Table.java | 12 ++++++- .../java/org/apache/doris/catalog/TableIf.java | 5 +++ .../doris/catalog/external/ExternalTable.java | 19 +++++++++++ .../doris/catalog/external/HMSExternalTable.java | 8 ++--- .../org/apache/doris/datasource/CatalogIf.java | 2 ++ .../apache/doris/datasource/ExternalCatalog.java | 15 +++++++++ .../apache/doris/datasource/InternalCatalog.java | 5 +++ .../doris/statistics/StatisticsAutoAnalyzer.java | 37 +++++----------------- .../doris/statistics/util/StatisticsUtil.java | 5 +-- .../statistics/StatisticsAutoAnalyzerTest.java | 30 ++++++++++-------- .../hive/test_hive_statistic.groovy | 4 +++ 12 files changed, 120 insertions(+), 50 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 97a88aad40..5f58ab277f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -56,6 +56,8 @@ import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.HistogramTask; import org.apache.doris.statistics.MVAnalysisTask; import org.apache.doris.statistics.OlapAnalysisTask; +import org.apache.doris.statistics.TableStats; +import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.system.Backend; import org.apache.doris.system.SystemInfoService; import org.apache.doris.thrift.TColumn; @@ -1122,6 +1124,32 @@ public class OlapTable extends Table { return new MVAnalysisTask(info); } + @Override + public boolean needReAnalyzeTable(TableStats tblStats) { + long rowCount = getRowCount(); + // TODO: Do we need to analyze an empty table? + if (rowCount == 0) { + return false; + } + long updateRows = tblStats.updatedRows.get(); + int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows); + return tblHealth < Config.table_stats_health_threshold; + } + + @Override + public Set<String> findReAnalyzeNeededPartitions(TableStats tableStats) { + if (tableStats == null) { + return getPartitionNames().stream().map(this::getPartition) + .filter(Partition::hasData).map(Partition::getName).collect(Collectors.toSet()); + } + return getPartitionNames().stream() + .map(this::getPartition) + .filter(Partition::hasData) + .filter(partition -> + partition.getVisibleVersionTime() >= tableStats.updatedTime).map(Partition::getName) + .collect(Collectors.toSet()); + } + @Override public long getRowCount() { long rowCount = 0; diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java index 24d9dcc800..12689894b4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java @@ -30,6 +30,7 @@ import org.apache.doris.external.hudi.HudiTable; import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStats; import org.apache.doris.thrift.TTableDescriptor; import com.google.common.base.Preconditions; @@ -571,6 +572,15 @@ public abstract class Table extends MetaObject implements Writable, TableIf { return Optional.empty(); } - public void analyze(String dbName) { + public void analyze(String dbName) {} + + @Override + public boolean needReAnalyzeTable(TableStats tblStats) { + return true; + } + + @Override + public Set<String> findReAnalyzeNeededPartitions(TableStats tableStats) { + return Collections.emptySet(); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java index f23f839898..21e2ddd154 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java @@ -23,6 +23,7 @@ import org.apache.doris.common.MetaNotFoundException; import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStats; import org.apache.doris.thrift.TTableDescriptor; import com.google.common.collect.Lists; @@ -136,6 +137,10 @@ public interface TableIf { Optional<ColumnStatistic> getColumnStatistic(String colName); + boolean needReAnalyzeTable(TableStats tblStats); + + Set<String> findReAnalyzeNeededPartitions(TableStats tableStats); + void write(DataOutput out) throws IOException; /** diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java index 0f9ec3f564..6f31ac18d7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java @@ -35,8 +35,10 @@ import org.apache.doris.persist.gson.GsonUtils; import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStats; import org.apache.doris.thrift.TTableDescriptor; +import com.google.common.collect.Sets; import com.google.gson.annotations.SerializedName; import lombok.Getter; import org.apache.commons.lang3.NotImplementedException; @@ -46,8 +48,10 @@ import org.apache.logging.log4j.Logger; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.HashSet; import java.util.List; import java.util.Optional; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -375,4 +379,19 @@ public class ExternalTable implements TableIf, Writable, GsonPostProcessable { rwLock = new ReentrantReadWriteLock(true); objectCreated = false; } + + @Override + public boolean needReAnalyzeTable(TableStats tblStats) { + // TODO: Find a way to decide if this external table need to be reanalyzed. + // For now, simply return true for all external tables. + return true; + } + + @Override + public Set<String> findReAnalyzeNeededPartitions(TableStats tableStats) { + HashSet<String> partitions = Sets.newHashSet(); + // TODO: Find a way to collect external table partitions that need to be analyzed. + partitions.add("Dummy Partition"); + return partitions; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java index 0459c3ef2a..c1de1ea98d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java @@ -153,7 +153,7 @@ public class HMSExternalTable extends ExternalTable { } } objectCreated = true; - estimatedRowCount = getRowCountFromExternalSource(); + estimatedRowCount = getRowCountFromExternalSource(true); } } @@ -277,7 +277,7 @@ public class HMSExternalTable extends ExternalTable { @Override public long getRowCount() { makeSureInitialized(); - long rowCount = getRowCountFromExternalSource(); + long rowCount = getRowCountFromExternalSource(false); if (rowCount == -1) { LOG.debug("Will estimate row count from file list."); rowCount = StatisticsUtil.getRowCountFromFileList(this); @@ -285,11 +285,11 @@ public class HMSExternalTable extends ExternalTable { return rowCount; } - private long getRowCountFromExternalSource() { + private long getRowCountFromExternalSource(boolean isInit) { long rowCount; switch (dlaType) { case HIVE: - rowCount = StatisticsUtil.getHiveRowCount(this); + rowCount = StatisticsUtil.getHiveRowCount(this, isInit); break; case ICEBERG: rowCount = StatisticsUtil.getIcebergRowCount(this); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java index b84c769eac..d135018e75 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java @@ -170,4 +170,6 @@ public interface CatalogIf<T extends DatabaseIf> { // Return a copy of all db collection. @SuppressWarnings({"rawtypes", "unchecked"}) public Collection<DatabaseIf> getAllDbs(); + + public boolean enableAutoAnalyze(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java index 89f0c451a8..35d03dfabc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java @@ -72,6 +72,8 @@ public abstract class ExternalCatalog implements CatalogIf<ExternalDatabase<? extends ExternalTable>>, Writable, GsonPostProcessable { private static final Logger LOG = LogManager.getLogger(ExternalCatalog.class); + public static final String ENABLE_AUTO_ANALYZE = "enable.auto.analyze"; + // Unique id of this catalog, will be assigned after catalog is loaded. @SerializedName(value = "id") protected long id; @@ -590,4 +592,17 @@ public abstract class ExternalCatalog makeSureInitialized(); return new HashSet<>(idToDb.values()); } + + @Override + public boolean enableAutoAnalyze() { + // By default, external catalog disables auto analyze, uses could set catalog property to enable it: + // "enable.auto.analyze" = true + Map<String, String> properties = catalogProperty.getProperties(); + boolean ret = false; + if (properties.containsKey(ENABLE_AUTO_ANALYZE) + && properties.get(ENABLE_AUTO_ANALYZE).equalsIgnoreCase("true")) { + ret = true; + } + return ret; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java index 68a819a2dd..a85d892734 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java @@ -3134,4 +3134,9 @@ public class InternalCatalog implements CatalogIf<Database> { OlapTable olapTable = (OlapTable) db.getTableOrMetaException(log.getTableId(), TableType.OLAP); olapTable.getAutoIncrementGenerator().applyChange(log.getColumnId(), log.getBatchEndId()); } + + @Override + public boolean enableAutoAnalyze() { + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java index ad70769427..5d704e4f3b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java @@ -21,7 +21,6 @@ import org.apache.doris.analysis.TableName; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.Partition; import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.View; import org.apache.doris.common.Config; @@ -88,7 +87,9 @@ public class StatisticsAutoAnalyzer extends MasterDaemon { private void analyzeAll() { Set<CatalogIf> catalogs = Env.getCurrentEnv().getCatalogMgr().getCopyOfCatalog(); for (CatalogIf ctl : catalogs) { - + if (!ctl.enableAutoAnalyze()) { + continue; + } Collection<DatabaseIf> dbs = ctl.getAllDbs(); for (DatabaseIf<TableIf> databaseIf : dbs) { if (StatisticConstants.STATISTICS_DB_BLACK_LIST.contains(databaseIf.getFullName())) { @@ -158,11 +159,11 @@ public class StatisticsAutoAnalyzer extends MasterDaemon { .findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); TableStats tblStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId()); - if (!(tblStats == null || needReanalyzeTable(table, tblStats))) { + if (!(tblStats == null || table.needReAnalyzeTable(tblStats))) { return null; } - Set<String> needRunPartitions = findReAnalyzeNeededPartitions(table, tblStats); + Set<String> needRunPartitions = table.findReAnalyzeNeededPartitions(tblStats); if (needRunPartitions.isEmpty()) { return null; @@ -171,31 +172,6 @@ public class StatisticsAutoAnalyzer extends MasterDaemon { return getAnalysisJobInfo(jobInfo, table, needRunPartitions); } - @VisibleForTesting - protected Set<String> findReAnalyzeNeededPartitions(TableIf table, TableStats tableStats) { - if (tableStats == null) { - return table.getPartitionNames().stream().map(table::getPartition) - .filter(Partition::hasData).map(Partition::getName).collect(Collectors.toSet()); - } - return table.getPartitionNames().stream() - .map(table::getPartition) - .filter(Partition::hasData) - .filter(partition -> - partition.getVisibleVersionTime() >= tableStats.updatedTime).map(Partition::getName) - .collect(Collectors.toSet()); - } - - private boolean needReanalyzeTable(TableIf table, TableStats tblStats) { - long rowCount = table.getRowCount(); - // TODO: Do we need to analyze an empty table? - if (rowCount == 0) { - return false; - } - long updateRows = tblStats.updatedRows.get(); - int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows); - return tblHealth < Config.table_stats_health_threshold; - } - @VisibleForTesting public AnalysisInfo getAnalysisJobInfo(AnalysisInfo jobInfo, TableIf table, Set<String> needRunPartitions) { @@ -247,6 +223,9 @@ public class StatisticsAutoAnalyzer extends MasterDaemon { Map<Long, BaseAnalysisTask> analysisTaskInfos = new HashMap<>(); AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); analysisManager.createTaskForEachColumns(jobInfo, analysisTaskInfos, false); + if (StatisticsUtil.isExternalTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName)) { + analysisManager.createTableLevelTaskForExternalTable(jobInfo, analysisTaskInfos, false); + } Env.getCurrentEnv().getAnalysisManager().registerSysJob(jobInfo, analysisTaskInfos); analysisTaskInfos.values().forEach(analysisTaskExecutor::submitTask); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 876be46ab6..87d8a0ba15 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -489,9 +489,10 @@ public class StatisticsUtil { * First get it from remote table parameters. If not found, estimate it : totalSize/estimatedRowSize * * @param table Hive HMSExternalTable to estimate row count. + * @param isInit Flag to indicate if this is called during init. To avoid recursively get schema. * @return estimated row count */ - public static long getHiveRowCount(HMSExternalTable table) { + public static long getHiveRowCount(HMSExternalTable table, boolean isInit) { Map<String, String> parameters = table.getRemoteTable().getParameters(); if (parameters == null) { return -1; @@ -500,7 +501,7 @@ public class StatisticsUtil { if (parameters.containsKey(NUM_ROWS)) { return Long.parseLong(parameters.get(NUM_ROWS)); } - if (!parameters.containsKey(TOTAL_SIZE)) { + if (!parameters.containsKey(TOTAL_SIZE) || isInit) { return -1; } // Table parameters doesn't contain row count but contain total size. Estimate row count : totalSize/rowSize diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java index 3368a5a669..fff649a447 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatisticsAutoAnalyzerTest.java @@ -39,7 +39,6 @@ import mockit.Expectations; import mockit.Injectable; import mockit.Mock; import mockit.MockUp; -import mockit.Mocked; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -140,14 +139,25 @@ public class StatisticsAutoAnalyzerTest { } @Test - public void testGetReAnalyzeRequiredPart0(@Mocked TableIf tableIf) { + public void testGetReAnalyzeRequiredPart0() { - new Expectations() { - { - tableIf.getRowCount(); - result = 100; + TableIf tableIf = new OlapTable(); + + new MockUp<OlapTable>() { + @Mock + protected Set<String> findReAnalyzeNeededPartitions(TableStats tableStats) { + Set<String> partitionNames = new HashSet<>(); + partitionNames.add("p1"); + partitionNames.add("p2"); + return partitionNames; + } + + @Mock + public long getRowCount() { + return 100; } }; + new MockUp<StatisticsUtil>() { @Mock public TableIf findTable(String catalogName, String dbName, String tblName) { @@ -176,14 +186,6 @@ public class StatisticsAutoAnalyzerTest { }; new MockUp<StatisticsAutoAnalyzer>() { - @Mock - protected Set<String> findReAnalyzeNeededPartitions(TableIf table, TableStats tableStats) { - Set<String> partitionNames = new HashSet<>(); - partitionNames.add("p1"); - partitionNames.add("p2"); - return partitionNames; - } - @Mock public AnalysisInfo getAnalysisJobInfo(AnalysisInfo jobInfo, TableIf table, Set<String> needRunPartitions) { diff --git a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy index 85c8326382..c8163bba7e 100644 --- a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy +++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy @@ -30,6 +30,10 @@ suite("test_hive_statistic", "p2,external,hive,external_remote,external_remote_h ); """ logger.info("catalog " + catalog_name + " created") + + // Test analyze table without init. + sql """analyze table ${catalog_name}.tpch_1000_parquet.region with sync""" + sql """switch ${catalog_name};""" logger.info("switched to catalog " + catalog_name) sql """use statistics;""" --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org