This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new 23d7b4d4ec3 [branch2.0](backport)(statistics) Merge external stats related pr to branch2.0 (#25462) 23d7b4d4ec3 is described below commit 23d7b4d4ec3940b98b6fc8bf7d9d0bd549d8172d Author: Jibing-Li <64681310+jibing...@users.noreply.github.com> AuthorDate: Mon Oct 16 16:01:11 2023 +0800 [branch2.0](backport)(statistics) Merge external stats related pr to branch2.0 (#25462) https://github.com/apache/doris/pull/23170 https://github.com/apache/doris/pull/23574 https://github.com/apache/doris/pull/23751 https://github.com/apache/doris/pull/24414 https://github.com/apache/doris/pull/24154 https://github.com/apache/doris/pull/24376 https://github.com/apache/doris/pull/24696 https://github.com/apache/doris/pull/24540 https://github.com/apache/doris/pull/24891 https://github.com/apache/doris/pull/25175 --- fe/fe-core/src/main/cup/sql_parser.cup | 9 + .../org/apache/doris/analysis/AnalyzeTblStmt.java | 6 - .../doris/analysis/ShowAnalyzeTaskStatus.java | 5 + .../java/org/apache/doris/analysis/TableRef.java | 6 +- .../main/java/org/apache/doris/catalog/Env.java | 8 +- .../java/org/apache/doris/catalog/OlapTable.java | 40 ++-- .../main/java/org/apache/doris/catalog/Table.java | 10 +- .../java/org/apache/doris/catalog/TableIf.java | 13 +- .../doris/catalog/external/ExternalTable.java | 7 + .../doris/catalog/external/HMSExternalTable.java | 31 +++ .../org/apache/doris/datasource/CatalogIf.java | 1 - .../apache/doris/datasource/ExternalCatalog.java | 1 - .../doris/datasource/hive/HiveMetaStoreCache.java | 3 + .../glue/translator/PhysicalPlanTranslator.java | 11 +- .../doris/nereids/rules/analysis/BindRelation.java | 9 +- .../LogicalFileScanToPhysicalFileScan.java | 3 +- .../trees/copier/LogicalPlanDeepCopier.java | 2 +- .../trees/plans/logical/LogicalFileScan.java | 19 +- .../trees/plans/physical/PhysicalFileScan.java | 16 +- .../java/org/apache/doris/persist/EditLog.java | 13 +- 
.../apache/doris/planner/SingleNodePlanner.java | 1 + .../doris/planner/external/FileQueryScanNode.java | 9 + .../doris/planner/external/HiveScanNode.java | 52 ++++ .../java/org/apache/doris/qe/ConnectContext.java | 2 + .../java/org/apache/doris/qe/SessionVariable.java | 17 ++ .../java/org/apache/doris/qe/StmtExecutor.java | 7 + .../apache/doris/statistics/AnalysisManager.java | 1 - .../doris/statistics/ColumnStatisticBuilder.java | 2 +- .../apache/doris/statistics/HMSAnalysisTask.java | 81 ++++--- .../doris/statistics/StatisticsRepository.java | 9 +- .../doris/statistics/util/StatisticsUtil.java | 91 ++++--- fe/fe-core/src/main/jflex/sql_scanner.flex | 1 + .../doris/statistics/AnalysisManagerTest.java | 265 ++++++++++++++++++++- .../doris/statistics/OlapAnalysisTaskTest.java | 70 ++++++ gensrc/thrift/PaloInternalService.thrift | 1 + .../hive/test_hive_partition_statistic.out | 87 +++++++ .../hive/test_hive_statistic_timeout.out | 7 + .../jdbc/test_mysql_jdbc_statistics.groovy | 4 +- .../hive/test_hive_partition_statistic.groovy | 53 +++++ .../hive/test_hive_sample_statistic.groovy | 99 ++++++++ .../hive/test_hive_statistic.groovy | 8 +- .../hive/test_hive_statistic_auto.groovy | 87 +++++++ .../hive/test_hive_statistic_cache.groovy | 36 ++- .../hive/test_hive_statistic_sample.groovy | 150 ++++++++++++ .../hive/test_hive_statistic_timeout.groovy | 54 +++++ .../suites/statistics/test_basic_statistics.groovy | 75 ++++++ 46 files changed, 1351 insertions(+), 131 deletions(-) diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup index 6cc9a54a51f..4b0a8f3b737 100644 --- a/fe/fe-core/src/main/cup/sql_parser.cup +++ b/fe/fe-core/src/main/cup/sql_parser.cup @@ -524,6 +524,7 @@ terminal String KW_QUOTA, KW_RANDOM, KW_RANGE, + KW_RECENT, KW_READ, KW_REBALANCE, KW_RECOVER, @@ -5789,6 +5790,14 @@ partition_names ::= {: RESULT = new PartitionNames(true, Lists.newArrayList(partName)); :} + | KW_PARTITIONS LPAREN STAR RPAREN + {: + RESULT = 
new PartitionNames(true); + :} + | KW_PARTITIONS KW_WITH KW_RECENT INTEGER_LITERAL:count + {: + RESULT = new PartitionNames(count); + :} ; opt_table_sample ::= diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java index cbc66f367f2..185bee1d132 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java @@ -21,7 +21,6 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Database; import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.View; import org.apache.doris.catalog.external.ExternalTable; @@ -167,11 +166,6 @@ public class AnalyzeTblStmt extends AnalyzeStmt { } analyzeProperties.check(); - // TODO support external table - if (analyzeProperties.isSampleRows() && !(table instanceof OlapTable)) { - throw new AnalysisException("Sampling statistics " - + "collection of external tables is not supported with rows, use percent instead."); - } if (analyzeProperties.isSync() && (analyzeProperties.isAutomatic() || analyzeProperties.getPeriodTimeInMs() != 0)) { throw new AnalysisException("Automatic/Period statistics collection " diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeTaskStatus.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeTaskStatus.java index 927a56d19d2..7c6c5cf17fe 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeTaskStatus.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeTaskStatus.java @@ -59,4 +59,9 @@ public class ShowAnalyzeTaskStatus extends ShowStmt { public long getJobId() { return jobId; } + + @Override + public RedirectStatus getRedirectStatus() { + return RedirectStatus.FORWARD_NO_SYNC; 
+ } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/TableRef.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/TableRef.java index f332d269b3f..b6c47e1cbfc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/TableRef.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/TableRef.java @@ -470,9 +470,11 @@ public class TableRef implements ParseNode, Writable { } protected void analyzeSample() throws AnalysisException { - if ((sampleTabletIds != null || tableSample != null) && desc.getTable().getType() != TableIf.TableType.OLAP) { + if ((sampleTabletIds != null || tableSample != null) + && desc.getTable().getType() != TableIf.TableType.OLAP + && desc.getTable().getType() != TableIf.TableType.HMS_EXTERNAL_TABLE) { throw new AnalysisException("Sample table " + desc.getTable().getName() - + " type " + desc.getTable().getType() + " is not OLAP"); + + " type " + desc.getTable().getType() + " is not supported"); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java index 2b0f581375c..e836b69f77d 100755 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @@ -5420,10 +5420,6 @@ public class Env { return loadManagerAdapter; } - public StatisticsAutoCollector getStatisticsAutoCollector() { - return statisticsAutoCollector; - } - public QueryStats getQueryStats() { return queryStats; } @@ -5436,4 +5432,8 @@ public class Env { public ColumnIdFlushDaemon getColumnIdFlusher() { return columnIdFlusher; } + + public StatisticsAutoCollector getStatisticsAutoCollector() { + return statisticsAutoCollector; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index f59df2554d3..bc0073f7ef9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -1105,6 +1105,26 @@ public class OlapTable extends Table { return new MVAnalysisTask(info); } + public boolean needReAnalyzeTable(TableStatsMeta tblStats) { + if (tblStats == null) { + return true; + } + long rowCount = getRowCount(); + // TODO: Do we need to analyze an empty table? + if (rowCount == 0) { + return false; + } + if (!tblStats.analyzeColumns().containsAll(getBaseSchema() + .stream() + .map(Column::getName) + .collect(Collectors.toSet()))) { + return true; + } + long updateRows = tblStats.updatedRows.get(); + int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows); + return tblHealth < Config.table_stats_health_threshold; + } + @Override public long getRowCount() { long rowCount = 0; @@ -2282,24 +2302,4 @@ public class OlapTable extends Table { } return dataSize; } - - public boolean needReAnalyzeTable(TableStatsMeta tblStats) { - if (tblStats == null) { - return true; - } - long rowCount = getRowCount(); - // TODO: Do we need to analyze an empty table? 
- if (rowCount == 0) { - return false; - } - if (!tblStats.analyzeColumns().containsAll(getBaseSchema() - .stream() - .map(Column::getName) - .collect(Collectors.toSet()))) { - return true; - } - long updateRows = tblStats.updatedRows.get(); - int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows); - return tblHealth < Config.table_stats_health_threshold; - } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java index ba7e55c7d86..fdaa41d2a8f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java @@ -556,7 +556,11 @@ public abstract class Table extends MetaObject implements Writable, TableIf { return Optional.empty(); } - public void analyze(String dbName) { + public void analyze(String dbName) {} + + @Override + public boolean needReAnalyzeTable(TableStatsMeta tblStats) { + return true; } @Override @@ -565,7 +569,7 @@ public abstract class Table extends MetaObject implements Writable, TableIf { } @Override - public boolean needReAnalyzeTable(TableStatsMeta tblStats) { - return true; + public List<Long> getChunkSizes() { + throw new NotImplementedException("getChunkSized not implemented"); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java index 108d227e591..46a3b4b5973 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java @@ -138,6 +138,14 @@ public interface TableIf { Optional<ColumnStatistic> getColumnStatistic(String colName); + boolean needReAnalyzeTable(TableStatsMeta tblStats); + + Map<String, Set<String>> findReAnalyzeNeededPartitions(); + + // Get all the chunk sizes of this table. Now, only HMS external table implemented this interface. 
+ // For HMS external table, the return result is a list of all the files' size. + List<Long> getChunkSizes(); + void write(DataOutput out) throws IOException; /** @@ -239,15 +247,10 @@ public interface TableIf { return -1L; } - Map<String, Set<String>> findReAnalyzeNeededPartitions(); - default long getDataSize(boolean singleReplica) { // TODO: Each tableIf should impl it by itself. return 0; } - boolean needReAnalyzeTable(TableStatsMeta tblStats); - - } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java index 01b3ce9ee2d..a915136193c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java @@ -78,6 +78,7 @@ public class ExternalTable implements TableIf, Writable, GsonPostProcessable { // this field will be refreshed after reloading schema protected volatile long schemaUpdateTime; + protected long dbId; protected boolean objectCreated; protected ExternalCatalog catalog; protected ReentrantReadWriteLock rwLock = new ReentrantReadWriteLock(true); @@ -119,6 +120,7 @@ public class ExternalTable implements TableIf, Writable, GsonPostProcessable { try { // getDbOrAnalysisException will call makeSureInitialized in ExternalCatalog. 
ExternalDatabase db = catalog.getDbOrAnalysisException(dbName); + dbId = db.getId(); db.makeSureInitialized(); } catch (AnalysisException e) { Util.logAndThrowRuntimeException(LOG, String.format("Exception to get db %s", dbName), e); @@ -397,4 +399,9 @@ public class ExternalTable implements TableIf, Writable, GsonPostProcessable { partitions.add("Dummy Partition"); return getBaseSchema().stream().collect(Collectors.toMap(Column::getName, k -> partitions)); } + + @Override + public List<Long> getChunkSizes() { + throw new NotImplementedException("getChunkSized not implemented"); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java index 728eec3e6f9..740a35f9572 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java @@ -26,6 +26,7 @@ import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.Type; import org.apache.doris.common.AnalysisException; import org.apache.doris.datasource.HMSExternalCatalog; +import org.apache.doris.datasource.hive.HiveMetaStoreCache; import org.apache.doris.datasource.hive.PooledHiveMetaStoreClient; import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; @@ -644,6 +645,36 @@ public class HMSExternalTable extends ExternalTable { super.gsonPostProcess(); estimatedRowCount = -1; } + + @Override + public List<Long> getChunkSizes() { + HiveMetaStoreCache.HivePartitionValues partitionValues = StatisticsUtil.getPartitionValuesForTable(this); + List<HiveMetaStoreCache.FileCacheValue> filesByPartitions + = StatisticsUtil.getFilesForPartitions(this, partitionValues, 0); + List<Long> result = Lists.newArrayList(); + for (HiveMetaStoreCache.FileCacheValue files : filesByPartitions) { + for (HiveMetaStoreCache.HiveFileStatus file : 
files.getFiles()) { + result.add(file.getLength()); + } + } + return result; + } + + @Override + public long getDataSize(boolean singleReplica) { + long totalSize = StatisticsUtil.getTotalSizeFromHMS(this); + // Usually, we can get total size from HMS parameter. + if (totalSize > 0) { + return totalSize; + } + // If not found the size in HMS, calculate it by sum all files' size in table. + List<Long> chunkSizes = getChunkSizes(); + long total = 0; + for (long size : chunkSizes) { + total += size; + } + return total; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java index cebc526d14b..369fba3624b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java @@ -177,5 +177,4 @@ public interface CatalogIf<T extends DatabaseIf> { public ConcurrentHashMap<Long, DatabaseIf> getIdToDb(); public boolean enableAutoAnalyze(); - } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java index 22a6816543e..ee3f75a9f14 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java @@ -612,5 +612,4 @@ public abstract class ExternalCatalog } return ret; } - } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java index 0a85d9ff5bd..df6f48b97dc 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java @@ -1071,6 +1071,9 @@ public class HiveMetaStoreCache { long length; long blockSize; long modificationTime; + boolean splittable; + 
List<String> partitionValues; + AcidInfo acidInfo; } @Data diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java index 549a6c8c1c7..e2bdf6b92fd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/glue/translator/PhysicalPlanTranslator.java @@ -469,7 +469,12 @@ public class PhysicalPlanTranslator extends DefaultPlanVisitor<PlanFragment, Pla break; case HIVE: scanNode = new HiveScanNode(fileScan.translatePlanNodeId(), tupleDescriptor, false); - ((HiveScanNode) scanNode).setSelectedPartitions(fileScan.getSelectedPartitions()); + HiveScanNode hiveScanNode = (HiveScanNode) scanNode; + hiveScanNode.setSelectedPartitions(fileScan.getSelectedPartitions()); + if (fileScan.getTableSample().isPresent()) { + hiveScanNode.setTableSample(new TableSample(fileScan.getTableSample().get().isPercent, + fileScan.getTableSample().get().sampleValue, fileScan.getTableSample().get().seek)); + } break; default: throw new RuntimeException("do not support DLA type " + ((HMSExternalTable) table).getDlaType()); @@ -491,7 +496,9 @@ public class PhysicalPlanTranslator extends DefaultPlanVisitor<PlanFragment, Pla TableRef ref = new TableRef(tableName, null, null); BaseTableRef tableRef = new BaseTableRef(ref, table, tableName); tupleDescriptor.setRef(tableRef); - + if (fileScan.getStats() != null) { + scanNode.setCardinality((long) fileScan.getStats().getRowCount()); + } Utils.execWithUncheckedException(scanNode::init); context.addScanNode(scanNode); ScanNode finalScanNode = scanNode; diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java index cffe32df5b0..33dcbbde920 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/analysis/BindRelation.java @@ -42,7 +42,6 @@ import org.apache.doris.nereids.rules.RuleType; import org.apache.doris.nereids.trees.expressions.EqualTo; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.Slot; -import org.apache.doris.nereids.trees.expressions.StatementScopeIdGenerator; import org.apache.doris.nereids.trees.expressions.literal.TinyIntLiteral; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.PreAggStatus; @@ -214,13 +213,13 @@ public class BindRelation extends OneAnalysisRuleFactory { Plan hiveViewPlan = parseAndAnalyzeHiveView(hiveCatalog, ddlSql, cascadesContext); return new LogicalSubQueryAlias<>(tableQualifier, hiveViewPlan); } - return new LogicalFileScan(StatementScopeIdGenerator.newRelationId(), - (HMSExternalTable) table, tableQualifier); + return new LogicalFileScan(unboundRelation.getRelationId(), (HMSExternalTable) table, tableQualifier, + unboundRelation.getTableSample()); case ICEBERG_EXTERNAL_TABLE: case PAIMON_EXTERNAL_TABLE: case MAX_COMPUTE_EXTERNAL_TABLE: - return new LogicalFileScan(StatementScopeIdGenerator.newRelationId(), - (ExternalTable) table, tableQualifier); + return new LogicalFileScan(unboundRelation.getRelationId(), (ExternalTable) table, tableQualifier, + unboundRelation.getTableSample()); case SCHEMA: return new LogicalSchemaScan(unboundRelation.getRelationId(), table, tableQualifier); case JDBC_EXTERNAL_TABLE: diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalFileScanToPhysicalFileScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalFileScanToPhysicalFileScan.java index f53ce1553ae..d86e1d1667e 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalFileScanToPhysicalFileScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/rules/implementation/LogicalFileScanToPhysicalFileScan.java @@ -39,7 +39,8 @@ public class LogicalFileScanToPhysicalFileScan extends OneImplementationRuleFact Optional.empty(), fileScan.getLogicalProperties(), fileScan.getConjuncts(), - fileScan.getSelectedPartitions()) + fileScan.getSelectedPartitions(), + fileScan.getTableSample()) ).toRule(RuleType.LOGICAL_FILE_SCAN_TO_PHYSICAL_FILE_SCAN_RULE); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java index 39f7fb310b4..9c05bae461d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/copier/LogicalPlanDeepCopier.java @@ -203,7 +203,7 @@ public class LogicalPlanDeepCopier extends DefaultPlanRewriter<DeepCopierContext return context.getRelationReplaceMap().get(fileScan.getRelationId()); } LogicalFileScan newFileScan = new LogicalFileScan(StatementScopeIdGenerator.newRelationId(), - fileScan.getTable(), fileScan.getQualifier()); + fileScan.getTable(), fileScan.getQualifier(), fileScan.getTableSample()); updateReplaceMapWithOutput(fileScan, newFileScan, context.exprIdReplaceMap); context.putRelation(fileScan.getRelationId(), newFileScan); Set<Expression> conjuncts = fileScan.getConjuncts().stream() diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalFileScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalFileScan.java index c25a25efcec..390fb1c97e0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalFileScan.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/logical/LogicalFileScan.java @@ -21,6 +21,7 @@ import org.apache.doris.catalog.PartitionItem; import org.apache.doris.catalog.external.ExternalTable; import org.apache.doris.nereids.memo.GroupExpression; import org.apache.doris.nereids.properties.LogicalProperties; +import org.apache.doris.nereids.trees.TableSample; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.PlanType; @@ -49,22 +50,26 @@ public class LogicalFileScan extends LogicalCatalogRelation { private final Set<Expression> conjuncts; @Getter private final SelectedPartitions selectedPartitions; + @Getter + private final Optional<TableSample> tableSample; /** * Constructor for LogicalFileScan. */ public LogicalFileScan(RelationId id, ExternalTable table, List<String> qualifier, Optional<GroupExpression> groupExpression, Optional<LogicalProperties> logicalProperties, - Set<Expression> conjuncts, SelectedPartitions selectedPartitions) { + Set<Expression> conjuncts, SelectedPartitions selectedPartitions, Optional<TableSample> tableSample) { super(id, PlanType.LOGICAL_FILE_SCAN, table, qualifier, groupExpression, logicalProperties); this.conjuncts = conjuncts; this.selectedPartitions = selectedPartitions; + this.tableSample = tableSample; } - public LogicalFileScan(RelationId id, ExternalTable table, List<String> qualifier) { + public LogicalFileScan(RelationId id, ExternalTable table, List<String> qualifier, + Optional<TableSample> tableSample) { this(id, table, qualifier, Optional.empty(), Optional.empty(), - Sets.newHashSet(), SelectedPartitions.NOT_PRUNED); + Sets.newHashSet(), SelectedPartitions.NOT_PRUNED, tableSample); } @Override @@ -85,24 +90,24 @@ public class LogicalFileScan extends LogicalCatalogRelation { @Override public LogicalFileScan withGroupExpression(Optional<GroupExpression> groupExpression) { return new 
LogicalFileScan(relationId, (ExternalTable) table, qualifier, groupExpression, - Optional.of(getLogicalProperties()), conjuncts, selectedPartitions); + Optional.of(getLogicalProperties()), conjuncts, selectedPartitions, tableSample); } @Override public Plan withGroupExprLogicalPropChildren(Optional<GroupExpression> groupExpression, Optional<LogicalProperties> logicalProperties, List<Plan> children) { return new LogicalFileScan(relationId, (ExternalTable) table, qualifier, - groupExpression, logicalProperties, conjuncts, selectedPartitions); + groupExpression, logicalProperties, conjuncts, selectedPartitions, tableSample); } public LogicalFileScan withConjuncts(Set<Expression> conjuncts) { return new LogicalFileScan(relationId, (ExternalTable) table, qualifier, groupExpression, - Optional.of(getLogicalProperties()), conjuncts, selectedPartitions); + Optional.of(getLogicalProperties()), conjuncts, selectedPartitions, tableSample); } public LogicalFileScan withSelectedPartitions(SelectedPartitions selectedPartitions) { return new LogicalFileScan(relationId, (ExternalTable) table, qualifier, groupExpression, - Optional.of(getLogicalProperties()), conjuncts, selectedPartitions); + Optional.of(getLogicalProperties()), conjuncts, selectedPartitions, tableSample); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalFileScan.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalFileScan.java index be4f5b17983..3dc83c7f43d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalFileScan.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/plans/physical/PhysicalFileScan.java @@ -22,6 +22,7 @@ import org.apache.doris.nereids.memo.GroupExpression; import org.apache.doris.nereids.properties.DistributionSpec; import org.apache.doris.nereids.properties.LogicalProperties; import org.apache.doris.nereids.properties.PhysicalProperties; +import 
org.apache.doris.nereids.trees.TableSample; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.PlanType; @@ -47,6 +48,8 @@ public class PhysicalFileScan extends PhysicalCatalogRelation { private final Set<Expression> conjuncts; @Getter private final SelectedPartitions selectedPartitions; + @Getter + private final Optional<TableSample> tableSample; /** * Constructor for PhysicalFileScan. @@ -54,11 +57,12 @@ public class PhysicalFileScan extends PhysicalCatalogRelation { public PhysicalFileScan(RelationId id, ExternalTable table, List<String> qualifier, DistributionSpec distributionSpec, Optional<GroupExpression> groupExpression, LogicalProperties logicalProperties, Set<Expression> conjuncts, - SelectedPartitions selectedPartitions) { + SelectedPartitions selectedPartitions, Optional<TableSample> tableSample) { super(id, PlanType.PHYSICAL_FILE_SCAN, table, qualifier, groupExpression, logicalProperties); this.distributionSpec = distributionSpec; this.conjuncts = conjuncts; this.selectedPartitions = selectedPartitions; + this.tableSample = tableSample; } /** @@ -67,12 +71,14 @@ public class PhysicalFileScan extends PhysicalCatalogRelation { public PhysicalFileScan(RelationId id, ExternalTable table, List<String> qualifier, DistributionSpec distributionSpec, Optional<GroupExpression> groupExpression, LogicalProperties logicalProperties, PhysicalProperties physicalProperties, - Statistics statistics, Set<Expression> conjuncts, SelectedPartitions selectedPartitions) { + Statistics statistics, Set<Expression> conjuncts, SelectedPartitions selectedPartitions, + Optional<TableSample> tableSample) { super(id, PlanType.PHYSICAL_FILE_SCAN, table, qualifier, groupExpression, logicalProperties, physicalProperties, statistics); this.distributionSpec = distributionSpec; this.conjuncts = conjuncts; this.selectedPartitions = selectedPartitions; + this.tableSample = tableSample; } 
@Override @@ -95,14 +101,14 @@ public class PhysicalFileScan extends PhysicalCatalogRelation { @Override public PhysicalFileScan withGroupExpression(Optional<GroupExpression> groupExpression) { return new PhysicalFileScan(relationId, getTable(), qualifier, distributionSpec, - groupExpression, getLogicalProperties(), conjuncts, selectedPartitions); + groupExpression, getLogicalProperties(), conjuncts, selectedPartitions, tableSample); } @Override public Plan withGroupExprLogicalPropChildren(Optional<GroupExpression> groupExpression, Optional<LogicalProperties> logicalProperties, List<Plan> children) { return new PhysicalFileScan(relationId, getTable(), qualifier, distributionSpec, - groupExpression, logicalProperties.get(), conjuncts, selectedPartitions); + groupExpression, logicalProperties.get(), conjuncts, selectedPartitions, tableSample); } @Override @@ -115,6 +121,6 @@ public class PhysicalFileScan extends PhysicalCatalogRelation { Statistics statistics) { return new PhysicalFileScan(relationId, getTable(), qualifier, distributionSpec, groupExpression, getLogicalProperties(), physicalProperties, statistics, conjuncts, - selectedPartitions); + selectedPartitions, tableSample); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java b/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java index efe61042478..72b14427003 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java @@ -82,6 +82,7 @@ import org.apache.doris.policy.Policy; import org.apache.doris.policy.StoragePolicy; import org.apache.doris.resource.workloadgroup.WorkloadGroup; import org.apache.doris.statistics.AnalysisInfo; +import org.apache.doris.statistics.AnalysisManager; import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.system.Backend; import org.apache.doris.system.Frontend; @@ -1037,11 +1038,19 @@ public class EditLog { break; } case 
OperationType.OP_CREATE_ANALYSIS_JOB: { - env.getAnalysisManager().replayCreateAnalysisJob((AnalysisInfo) journal.getData()); + AnalysisInfo info = (AnalysisInfo) journal.getData(); + if (AnalysisManager.needAbandon(info)) { + break; + } + env.getAnalysisManager().replayCreateAnalysisJob(info); break; } case OperationType.OP_CREATE_ANALYSIS_TASK: { - env.getAnalysisManager().replayCreateAnalysisTask((AnalysisInfo) journal.getData()); + AnalysisInfo info = (AnalysisInfo) journal.getData(); + if (AnalysisManager.needAbandon(info)) { + break; + } + env.getAnalysisManager().replayCreateAnalysisTask(info); break; } case OperationType.OP_DELETE_ANALYSIS_JOB: { diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java b/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java index f33f2e483ca..e3834070d57 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/SingleNodePlanner.java @@ -2024,6 +2024,7 @@ public class SingleNodePlanner { break; case HIVE: scanNode = new HiveScanNode(ctx.getNextNodeId(), tblRef.getDesc(), true); + ((HiveScanNode) scanNode).setTableSample(tblRef.getTableSample()); break; default: throw new UserException("Not supported table type" + table.getType()); diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java index 44e113a280b..67e09cbf205 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/FileQueryScanNode.java @@ -20,6 +20,7 @@ package org.apache.doris.planner.external; import org.apache.doris.analysis.Analyzer; import org.apache.doris.analysis.SlotDescriptor; import org.apache.doris.analysis.SlotId; +import org.apache.doris.analysis.TableSample; import 
org.apache.doris.analysis.TupleDescriptor; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; @@ -71,6 +72,7 @@ import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Maps; +import lombok.Getter; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -92,6 +94,9 @@ public abstract class FileQueryScanNode extends FileScanNode { protected Map<String, SlotDescriptor> destSlotDescByName; protected TFileScanRangeParams params; + @Getter + protected TableSample tableSample; + /** * External file scan node for Query hms table * needCheckColumnPriv: Some of ExternalFileScanNode do not need to check column priv @@ -200,6 +205,10 @@ public abstract class FileQueryScanNode extends FileScanNode { setColumnPositionMapping(); } + public void setTableSample(TableSample tSample) { + this.tableSample = tSample; + } + @Override public void finalize(Analyzer analyzer) throws UserException { doFinalize(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java index 0b6e1d44466..1ba77fa5f9c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/planner/external/HiveScanNode.java @@ -63,8 +63,10 @@ import org.apache.logging.log4j.Logger; import java.io.IOException; import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Random; import java.util.stream.Collectors; public class HiveScanNode extends FileQueryScanNode { @@ -218,6 +220,11 @@ public class HiveScanNode extends FileQueryScanNode { if (ConnectContext.get().getExecutor() != null) { ConnectContext.get().getExecutor().getSummaryProfile().setGetPartitionFilesFinishTime(); } + if (tableSample != null) { + 
List<HiveMetaStoreCache.HiveFileStatus> hiveFileStatuses = selectFiles(fileCaches); + splitAllFiles(allFiles, hiveFileStatuses); + return; + } for (HiveMetaStoreCache.FileCacheValue fileCacheValue : fileCaches) { // This if branch is to support old splitter, will remove later. if (fileCacheValue.getSplits() != null) { @@ -235,6 +242,51 @@ public class HiveScanNode extends FileQueryScanNode { } } + private void splitAllFiles(List<Split> allFiles, + List<HiveMetaStoreCache.HiveFileStatus> hiveFileStatuses) throws IOException { + for (HiveMetaStoreCache.HiveFileStatus status : hiveFileStatuses) { + allFiles.addAll(splitFile(status.getPath(), status.getBlockSize(), + status.getBlockLocations(), status.getLength(), status.getModificationTime(), + status.isSplittable(), status.getPartitionValues(), + new HiveSplitCreator(status.getAcidInfo()))); + } + } + + private List<HiveMetaStoreCache.HiveFileStatus> selectFiles(List<FileCacheValue> inputCacheValue) { + List<HiveMetaStoreCache.HiveFileStatus> fileList = Lists.newArrayList(); + long totalSize = 0; + for (FileCacheValue value : inputCacheValue) { + for (HiveMetaStoreCache.HiveFileStatus file : value.getFiles()) { + file.setSplittable(value.isSplittable()); + file.setPartitionValues(value.getPartitionValues()); + file.setAcidInfo(value.getAcidInfo()); + fileList.add(file); + totalSize += file.getLength(); + } + } + long sampleSize = 0; + if (tableSample.isPercent()) { + sampleSize = totalSize * tableSample.getSampleValue() / 100; + } else { + long estimatedRowSize = 0; + for (Column column : hmsTable.getFullSchema()) { + estimatedRowSize += column.getDataType().getSlotSize(); + } + sampleSize = estimatedRowSize * tableSample.getSampleValue(); + } + long selectedSize = 0; + Collections.shuffle(fileList, new Random(tableSample.getSeek())); + int index = 0; + for (HiveMetaStoreCache.HiveFileStatus file : fileList) { + selectedSize += file.getLength(); + index += 1; + if (selectedSize >= sampleSize) { + break; + } + } + 
return fileList.subList(0, index); + } + private List<FileCacheValue> getFileSplitByTransaction(HiveMetaStoreCache cache, List<HivePartition> partitions) { for (HivePartition partition : partitions) { if (partition.getPartitionValues() == null || partition.getPartitionValues().isEmpty()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java b/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java index 62734093277..90d04c8a2dd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/ConnectContext.java @@ -717,6 +717,8 @@ public class ConnectContext { if (executor != null && executor.isInsertStmt()) { // particular for insert stmt, we can expand other type of timeout in the same way return Math.max(sessionVariable.getInsertTimeoutS(), sessionVariable.getQueryTimeoutS()); + } else if (executor != null && executor.isAnalyzeStmt()) { + return sessionVariable.getAnalyzeTimeoutS(); } else { // normal query stmt return sessionVariable.getQueryTimeoutS(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 34518114453..4e7a4893b67 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -65,6 +65,7 @@ public class SessionVariable implements Serializable, Writable { public static final String EXEC_MEM_LIMIT = "exec_mem_limit"; public static final String SCAN_QUEUE_MEM_LIMIT = "scan_queue_mem_limit"; public static final String QUERY_TIMEOUT = "query_timeout"; + public static final String ANALYZE_TIMEOUT = "analyze_timeout"; public static final String MAX_EXECUTION_TIME = "max_execution_time"; public static final String INSERT_TIMEOUT = "insert_timeout"; @@ -450,6 +451,10 @@ public class SessionVariable implements Serializable, Writable { @VariableMgr.VarAttr(name = 
QUERY_TIMEOUT) public int queryTimeoutS = 900; + // analyze statement timeout in seconds. + @VariableMgr.VarAttr(name = ANALYZE_TIMEOUT, needForward = true) + public int analyzeTimeoutS = 43200; + // The global max_execution_time value provides the default for the session value for new connections. // The session value applies to SELECT executions executed within the session that include // no MAX_EXECUTION_TIME(N) optimizer hint or for which N is 0. @@ -1335,6 +1340,10 @@ public class SessionVariable implements Serializable, Writable { return queryTimeoutS; } + public int getAnalyzeTimeoutS() { + return analyzeTimeoutS; + } + public void setEnableTwoPhaseReadOpt(boolean enable) { enableTwoPhaseReadOpt = enable; } @@ -1514,6 +1523,10 @@ public class SessionVariable implements Serializable, Writable { this.queryTimeoutS = queryTimeoutS; } + public void setAnalyzeTimeoutS(int analyzeTimeoutS) { + this.analyzeTimeoutS = analyzeTimeoutS; + } + public void setMaxExecutionTimeMS(int maxExecutionTimeMS) { this.maxExecutionTimeMS = maxExecutionTimeMS; this.queryTimeoutS = this.maxExecutionTimeMS / 1000; @@ -2433,6 +2446,9 @@ public class SessionVariable implements Serializable, Writable { if (queryOptions.isSetInsertTimeout()) { setInsertTimeoutS(queryOptions.getInsertTimeout()); } + if (queryOptions.isSetAnalyzeTimeout()) { + setAnalyzeTimeoutS(queryOptions.getAnalyzeTimeout()); + } } /** @@ -2444,6 +2460,7 @@ public class SessionVariable implements Serializable, Writable { queryOptions.setScanQueueMemLimit(Math.min(maxScanQueueMemByte, maxExecMemByte / 20)); queryOptions.setQueryTimeout(queryTimeoutS); queryOptions.setInsertTimeout(insertTimeoutS); + queryOptions.setAnalyzeTimeout(analyzeTimeoutS); return queryOptions; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java index 5d90008ff75..5dcb392fb62 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java @@ -397,6 +397,13 @@ public class StmtExecutor { return parsedStmt instanceof InsertStmt; } + public boolean isAnalyzeStmt() { + if (parsedStmt == null) { + return false; + } + return parsedStmt instanceof AnalyzeStmt; + } + /** * Used for audit in ConnectProcessor. * <p> diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java index 3eebb1ec9aa..9fa576f85cd 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java @@ -651,7 +651,6 @@ public class AnalysisManager extends Daemon implements Writable { tableStats.updateByJob(jobInfo); logCreateTableStats(tableStats); } - } public List<AnalysisInfo> showAnalysisJob(ShowAnalyzeStmt stmt) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java index fa4cf7ebc99..f97459555c8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java @@ -190,7 +190,7 @@ public class ColumnStatisticBuilder { } public ColumnStatistic build() { - dataSize = Math.max((count - numNulls + 1) * avgSizeByte, 0); + dataSize = dataSize > 0 ? 
dataSize : Math.max((count - numNulls + 1) * avgSizeByte, 0); if (original == null && !isUnknown) { original = new ColumnStatistic(count, ndv, null, avgSizeByte, numNulls, dataSize, minValue, maxValue, minExpr, maxExpr, diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 836e6e6e493..076479aae8f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -32,10 +32,12 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.util.ArrayList; +import java.util.Collections; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Random; import java.util.Set; import java.util.StringJoiner; import java.util.stream.Collectors; @@ -43,10 +45,9 @@ import java.util.stream.Collectors; public class HMSAnalysisTask extends BaseAnalysisTask { private static final Logger LOG = LogManager.getLogger(HMSAnalysisTask.class); - public static final String TOTAL_SIZE = "totalSize"; - public static final String NUM_ROWS = "numRows"; - public static final String NUM_FILES = "numFiles"; - public static final String TIMESTAMP = "transient_lastDdlTime"; + // While doing sample analysis, the sampled ndv result is multiplied by a factor (total size/sample size) + // if ndv(col)/count(col) is greater than or equal to this threshold. 
+ private static final String NDV_MULTIPLY_THRESHOLD = "0.3"; private static final String ANALYZE_TABLE_TEMPLATE = "INSERT INTO " + "${internalDB}.${columnStatTbl}" @@ -58,12 +59,15 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "${idxId} AS idx_id, " + "'${colId}' AS col_id, " + "NULL AS part_id, " - + "${countExpr} AS row_count, " - + "NDV(`${colName}`) AS ndv, " - + "${nullCountExpr} AS null_count, " + + "ROUND(COUNT(1) * ${scaleFactor}) AS row_count, " + + "case when NDV(`${colName}`)/count('${colName}') < " + + NDV_MULTIPLY_THRESHOLD + + " then NDV(`${colName}`) " + + "else NDV(`${colName}`) * ${scaleFactor} end AS ndv, " + + "ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * ${scaleFactor}) AS null_count, " + "MIN(`${colName}`) AS min, " + "MAX(`${colName}`) AS max, " - + "${dataSizeFunction} AS data_size, " + + "${dataSizeFunction} * ${scaleFactor} AS data_size, " + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; @@ -83,7 +87,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "${dataSizeFunction} AS data_size, " + "NOW() FROM `${catalogName}`.`${dbName}`.`${tblName}` where "; - private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ${countExpr} as rowCount " + private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ROUND(COUNT(1) * ${scaleFactor}) as rowCount " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; // cache stats for each partition, it would be inserted into column_statistics in a batch. 
@@ -160,7 +164,6 @@ public class HMSAnalysisTask extends BaseAnalysisTask { params.put("colName", col.getName()); params.put("colId", info.colName); params.put("dataSizeFunction", getDataSizeFunction(col)); - params.put("nullCountExpr", getNullCountExpression()); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(sb.toString()); executeInsertSql(sql); @@ -277,7 +280,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask { commonParams.put("catalogName", catalog.getName()); commonParams.put("dbName", db.getFullName()); commonParams.put("tblName", tbl.getName()); - commonParams.put("countExpr", getCountExpression()); + commonParams.put("sampleExpr", getSampleExpression()); + commonParams.put("scaleFactor", getSampleScaleFactor()); if (col != null) { commonParams.put("type", col.getType().toString()); } @@ -285,30 +289,51 @@ public class HMSAnalysisTask extends BaseAnalysisTask { return commonParams; } - protected String getCountExpression() { - if (info.samplePercent > 0) { - return String.format("ROUND(COUNT(1) * 100 / %d)", info.samplePercent); - } else { - return "COUNT(1)"; + protected String getSampleExpression() { + if (tableSample == null) { + return ""; } - } - - protected String getNullCountExpression() { - if (info.samplePercent > 0) { - return String.format("ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 100 / %d)", - info.samplePercent); + if (tableSample.isPercent()) { + return String.format("TABLESAMPLE(%d PERCENT)", tableSample.getSampleValue()); } else { - return "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END)"; + return String.format("TABLESAMPLE(%d ROWS)", tableSample.getSampleValue()); } } - protected String getDataSizeFunction(Column column) { - String originFunction = super.getDataSizeFunction(column); - if (info.samplePercent > 0 && !isPartitionOnly) { - return String.format("ROUND((%s) * 100 / %d)", originFunction, info.samplePercent); + // Get the sample scale 
factor. While analyzing, the result of count, null count and data size need to + // multiply this factor to get more accurate result. + protected String getSampleScaleFactor() { + if (tableSample == null) { + return "1"; + } + long target = 0; + // Get list of all files' size in this HMS table. + List<Long> chunkSizes = table.getChunkSizes(); + Collections.shuffle(chunkSizes, new Random(tableSample.getSeek())); + long total = 0; + // Calculate the total size of this HMS table. + for (long size : chunkSizes) { + total += size; + } + // Calculate the sample target size for percent and rows sample. + if (tableSample.isPercent()) { + target = total * tableSample.getSampleValue() / 100; } else { - return originFunction; + int columnSize = 0; + for (Column column : table.getFullSchema()) { + columnSize += column.getDataType().getSlotSize(); + } + target = columnSize * tableSample.getSampleValue(); + } + // Calculate the actual sample size (cumulate). + long cumulate = 0; + for (long size : chunkSizes) { + cumulate += size; + if (cumulate >= target) { + break; + } } + return Double.toString(Math.max(((double) total) / cumulate, 1)); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java index cd3cc67f3c9..63953f5bfb5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java @@ -248,7 +248,14 @@ public class StatisticsRepository { builder.setMaxValue(StatisticsUtil.convertToDouble(column.getType(), max)); } if (dataSize != null) { - builder.setDataSize(Double.parseDouble(dataSize)); + double size = Double.parseDouble(dataSize); + double rows = Double.parseDouble(rowCount); + if (size > 0) { + builder.setDataSize(size); + if (rows > 0) { + builder.setAvgSizeByte(size / rows); + } + } } ColumnStatistic columnStatistic = 
builder.build(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 40ae13a0e0e..f482c812879 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -539,6 +539,19 @@ public class StatisticsUtil { return totalSize / estimatedRowSize; } + /** + * Get total size parameter from HMS. + * @param table Hive HMSExternalTable to get HMS total size parameter. + * @return Long value of table total size, return 0 if not found. + */ + public static long getTotalSizeFromHMS(HMSExternalTable table) { + Map<String, String> parameters = table.getRemoteTable().getParameters(); + if (parameters == null) { + return 0; + } + return parameters.containsKey(TOTAL_SIZE) ? Long.parseLong(parameters.get(TOTAL_SIZE)) : 0; + } + /** * Estimate iceberg table row count. * Get the row count by adding all task file recordCount. @@ -574,13 +587,42 @@ public class StatisticsUtil { if (table.isView()) { return 0; } + HiveMetaStoreCache.HivePartitionValues partitionValues = getPartitionValuesForTable(table); + int totalPartitionSize = partitionValues == null ? 1 : partitionValues.getIdToPartitionItem().size(); + + // Get files for all partitions. + int samplePartitionSize = Config.hive_stats_partition_sample_size; + List<HiveMetaStoreCache.FileCacheValue> filesByPartitions + = getFilesForPartitions(table, partitionValues, samplePartitionSize); + long totalSize = 0; + // Calculate the total file size. 
+ for (HiveMetaStoreCache.FileCacheValue files : filesByPartitions) { + for (HiveMetaStoreCache.HiveFileStatus file : files.getFiles()) { + totalSize += file.getLength(); + } + } + // Estimate row count: totalSize/estimatedRowSize + long estimatedRowSize = 0; + for (Column column : table.getFullSchema()) { + estimatedRowSize += column.getDataType().getSlotSize(); + } + if (estimatedRowSize == 0) { + return 1; + } + if (samplePartitionSize < totalPartitionSize) { + totalSize = totalSize * totalPartitionSize / samplePartitionSize; + } + return totalSize / estimatedRowSize; + } + + public static HiveMetaStoreCache.HivePartitionValues getPartitionValuesForTable(HMSExternalTable table) { + if (table.isView()) { + return null; + } HiveMetaStoreCache cache = Env.getCurrentEnv().getExtMetaCacheMgr() .getMetaStoreCache((HMSExternalCatalog) table.getCatalog()); List<Type> partitionColumnTypes = table.getPartitionColumnTypes(); HiveMetaStoreCache.HivePartitionValues partitionValues = null; - List<HivePartition> hivePartitions = Lists.newArrayList(); - int samplePartitionSize = Config.hive_stats_partition_sample_size; - int totalPartitionSize = 1; // Get table partitions from cache. if (!partitionColumnTypes.isEmpty()) { // It is ok to get partition values from cache, @@ -588,17 +630,28 @@ public class StatisticsUtil { // because it has enough space to keep partition info of all tables in cache. 
partitionValues = cache.getPartitionValues(table.getDbName(), table.getName(), partitionColumnTypes); } + return partitionValues; + } + + public static List<HiveMetaStoreCache.FileCacheValue> getFilesForPartitions( + HMSExternalTable table, HiveMetaStoreCache.HivePartitionValues partitionValues, int sampleSize) { + if (table.isView()) { + return Lists.newArrayList(); + } + HiveMetaStoreCache cache = Env.getCurrentEnv().getExtMetaCacheMgr() + .getMetaStoreCache((HMSExternalCatalog) table.getCatalog()); + List<HivePartition> hivePartitions = Lists.newArrayList(); if (partitionValues != null) { Map<Long, PartitionItem> idToPartitionItem = partitionValues.getIdToPartitionItem(); - totalPartitionSize = idToPartitionItem.size(); + int totalPartitionSize = idToPartitionItem.size(); Collection<PartitionItem> partitionItems; List<List<String>> partitionValuesList; // If partition number is too large, randomly choose part of them to estimate the whole table. - if (samplePartitionSize < totalPartitionSize) { + if (sampleSize > 0 && sampleSize < totalPartitionSize) { List<PartitionItem> items = new ArrayList<>(idToPartitionItem.values()); Collections.shuffle(items); - partitionItems = items.subList(0, samplePartitionSize); - partitionValuesList = Lists.newArrayListWithCapacity(samplePartitionSize); + partitionItems = items.subList(0, sampleSize); + partitionValuesList = Lists.newArrayListWithCapacity(sampleSize); } else { partitionItems = idToPartitionItem.values(); partitionValuesList = Lists.newArrayListWithCapacity(totalPartitionSize); @@ -609,34 +662,14 @@ public class StatisticsUtil { // get partitions without cache, so that it will not invalid the cache when executing // non query request such as `show table status` hivePartitions = cache.getAllPartitionsWithoutCache(table.getDbName(), table.getName(), - partitionValuesList); + partitionValuesList); } else { hivePartitions.add(new HivePartition(table.getDbName(), table.getName(), true, 
table.getRemoteTable().getSd().getInputFormat(), table.getRemoteTable().getSd().getLocation(), null)); } // Get files for all partitions. - List<HiveMetaStoreCache.FileCacheValue> filesByPartitions = cache.getFilesByPartitionsWithoutCache( - hivePartitions, true); - long totalSize = 0; - // Calculate the total file size. - for (HiveMetaStoreCache.FileCacheValue files : filesByPartitions) { - for (HiveMetaStoreCache.HiveFileStatus file : files.getFiles()) { - totalSize += file.getLength(); - } - } - // Estimate row count: totalSize/estimatedRowSize - long estimatedRowSize = 0; - for (Column column : table.getFullSchema()) { - estimatedRowSize += column.getDataType().getSlotSize(); - } - if (estimatedRowSize == 0) { - return 1; - } - if (samplePartitionSize < totalPartitionSize) { - totalSize = totalSize * totalPartitionSize / samplePartitionSize; - } - return totalSize / estimatedRowSize; + return cache.getFilesByPartitionsWithoutCache(hivePartitions, true); } /** diff --git a/fe/fe-core/src/main/jflex/sql_scanner.flex b/fe/fe-core/src/main/jflex/sql_scanner.flex index c4791093b48..ca3d2eea319 100644 --- a/fe/fe-core/src/main/jflex/sql_scanner.flex +++ b/fe/fe-core/src/main/jflex/sql_scanner.flex @@ -377,6 +377,7 @@ import org.apache.doris.qe.SqlModeHelper; keywordMap.put("read", new Integer(SqlParserSymbols.KW_READ)); keywordMap.put("real", new Integer(SqlParserSymbols.KW_DOUBLE)); keywordMap.put("rebalance", new Integer(SqlParserSymbols.KW_REBALANCE)); + keywordMap.put("recent", new Integer(SqlParserSymbols.KW_RECENT)); keywordMap.put("recover", new Integer(SqlParserSymbols.KW_RECOVER)); keywordMap.put("recycle", new Integer(SqlParserSymbols.KW_RECYCLE)); keywordMap.put("refresh", new Integer(SqlParserSymbols.KW_REFRESH)); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java index 253f9c9332a..636e32ea4e1 100644 --- 
a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisManagerTest.java @@ -17,13 +17,32 @@ package org.apache.doris.statistics; +import org.apache.doris.analysis.AnalyzeProperties; +import org.apache.doris.analysis.AnalyzeTblStmt; +import org.apache.doris.analysis.PartitionNames; +import org.apache.doris.analysis.TableName; +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.PrimitiveType; +import org.apache.doris.common.DdlException; +import org.apache.doris.statistics.AnalysisInfo.ScheduleType; +import org.apache.doris.statistics.util.StatisticsUtil; + +import com.google.common.annotations.VisibleForTesting; +import mockit.Expectations; +import mockit.Injectable; import mockit.Mock; import mockit.MockUp; import mockit.Mocked; -import org.junit.Test; +import org.apache.hadoop.util.Lists; import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; +import java.util.ArrayList; +import java.util.Collection; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import java.util.Map; public class AnalysisManagerTest { @@ -70,4 +89,248 @@ public class AnalysisManagerTest { manager.updateTaskStatus(taskInfo2, AnalysisState.FINISHED, "", 0); Assertions.assertEquals(job.state, AnalysisState.FINISHED); } + + // test build sync job + @Test + public void testBuildAndAssignJob1() throws Exception { + AnalysisInfo analysisInfo = new AnalysisInfoBuilder().setColToPartitions(new HashMap<>()).build(); + new MockUp<StatisticsUtil>() { + + @Mock + public boolean statsTblAvailable() { + return true; + } + }; + new MockUp<AnalysisManager>() { + + @Mock + public AnalysisInfo buildAnalysisJobInfo(AnalyzeTblStmt stmt) throws DdlException { + return analysisInfo; + } + + @Mock + @VisibleForTesting + public void createTaskForExternalTable(AnalysisInfo jobInfo, + Map<Long, BaseAnalysisTask> 
analysisTasks, + boolean isSync) throws DdlException { + // DO NOTHING + } + + @Mock + public void createTaskForEachColumns(AnalysisInfo jobInfo, Map<Long, BaseAnalysisTask> analysisTasks, + boolean isSync) throws DdlException { + // DO NOTHING + } + + @Mock + public void syncExecute(Collection<BaseAnalysisTask> tasks) { + // DO NOTHING + } + + @Mock + public void updateTableStats(AnalysisInfo jobInfo) { + // DO NOTHING + } + }; + AnalyzeTblStmt analyzeTblStmt = new AnalyzeTblStmt(new TableName("test"), + new PartitionNames(false, new ArrayList<String>() { + { + add("p1"); + add("p2"); + } + }), new ArrayList<String>() { + { + add("c1"); + add("c2"); + } + }, new AnalyzeProperties(new HashMap<String, String>() { + { + put(AnalyzeProperties.PROPERTY_SYNC, "true"); + } + })); + + AnalysisManager analysisManager = new AnalysisManager(); + Assertions.assertNull(analysisManager.buildAndAssignJob(analyzeTblStmt)); + analysisInfo.colToPartitions.put("c1", new HashSet<String>() { + { + add("p1"); + add("p2"); + } + }); + analysisManager.buildAndAssignJob(analyzeTblStmt); + new Expectations() { + { + analysisManager.syncExecute((Collection<BaseAnalysisTask>) any); + times = 1; + analysisManager.updateTableStats((AnalysisInfo) any); + times = 1; + // JMockit would try to invoke this method with `null` when initializing the Expectations instance + // and cause an NPE; these lines stay commented out until we find another way to verify that a method is not invoked. 
+ // analysisManager.persistAnalysisJob((AnalysisInfo) any); + // times = 0; + } + }; + } + + // test build async job + @Test + public void testBuildAndAssignJob2(@Injectable OlapAnalysisTask analysisTask) throws Exception { + AnalysisInfo analysisInfo = new AnalysisInfoBuilder().setColToPartitions(new HashMap<>()) + .setScheduleType(ScheduleType.PERIOD) + .build(); + new MockUp<StatisticsUtil>() { + + @Mock + public boolean statsTblAvailable() { + return true; + } + }; + new MockUp<AnalysisManager>() { + + @Mock + public AnalysisInfo buildAnalysisJobInfo(AnalyzeTblStmt stmt) throws DdlException { + return analysisInfo; + } + + @Mock + @VisibleForTesting + public void createTaskForExternalTable(AnalysisInfo jobInfo, + Map<Long, BaseAnalysisTask> analysisTasks, + boolean isSync) throws DdlException { + // DO NOTHING + } + + @Mock + public void createTaskForEachColumns(AnalysisInfo jobInfo, Map<Long, BaseAnalysisTask> analysisTasks, + boolean isSync) throws DdlException { + analysisTasks.put(1L, analysisTask); + } + + @Mock + public void syncExecute(Collection<BaseAnalysisTask> tasks) { + // DO NOTHING + } + + @Mock + public void updateTableStats(AnalysisInfo jobInfo) { + // DO NOTHING + } + + @Mock + public void logCreateAnalysisJob(AnalysisInfo analysisJob) { + + } + }; + AnalyzeTblStmt analyzeTblStmt = new AnalyzeTblStmt(new TableName("test"), + new PartitionNames(false, new ArrayList<String>() { + { + add("p1"); + add("p2"); + } + }), new ArrayList<String>() { + { + add("c1"); + add("c2"); + } + }, new AnalyzeProperties(new HashMap<String, String>() { + { + put(AnalyzeProperties.PROPERTY_SYNC, "false"); + put(AnalyzeProperties.PROPERTY_PERIOD_SECONDS, "100"); + } + })); + AnalysisManager analysisManager = new AnalysisManager(); + analysisInfo.colToPartitions.put("c1", new HashSet<String>() { + { + add("p1"); + add("p2"); + } + }); + analysisManager.buildAndAssignJob(analyzeTblStmt); + new Expectations() { + { + analysisManager.recordAnalysisJob(analysisInfo); + 
times = 1; + } + }; + } + + @Test + public void testSystemJobStatusUpdater() { + new MockUp<BaseAnalysisTask>() { + + @Mock + protected void init(AnalysisInfo info) { + + } + }; + + new MockUp<AnalysisManager>() { + @Mock + public void updateTableStats(AnalysisInfo jobInfo) {} + + @Mock + protected void logAutoJob(AnalysisInfo autoJob) { + + } + }; + + AnalysisManager analysisManager = new AnalysisManager(); + AnalysisInfo job = new AnalysisInfoBuilder() + .setJobId(0) + .setColName("col1, col2").build(); + analysisManager.systemJobInfoMap.put(job.jobId, job); + AnalysisInfo task1 = new AnalysisInfoBuilder() + .setJobId(0) + .setTaskId(1) + .setState(AnalysisState.RUNNING) + .setColName("col1").build(); + AnalysisInfo task2 = new AnalysisInfoBuilder() + .setJobId(0) + .setTaskId(1) + .setState(AnalysisState.FINISHED) + .setColName("col2").build(); + OlapAnalysisTask ot1 = new OlapAnalysisTask(task1); + OlapAnalysisTask ot2 = new OlapAnalysisTask(task2); + Map<Long, BaseAnalysisTask> taskMap = new HashMap<>(); + taskMap.put(ot1.info.taskId, ot1); + taskMap.put(ot2.info.taskId, ot2); + analysisManager.analysisJobIdToTaskMap.put(job.jobId, taskMap); + + // test invalid job + AnalysisInfo invalidJob = new AnalysisInfoBuilder().setJobId(-1).build(); + analysisManager.systemJobStatusUpdater.apply(new TaskStatusWrapper(invalidJob, + AnalysisState.FAILED, "", 0)); + + // test finished + analysisManager.systemJobStatusUpdater.apply(new TaskStatusWrapper(task1, AnalysisState.FAILED, "", 0)); + analysisManager.systemJobStatusUpdater.apply(new TaskStatusWrapper(task1, AnalysisState.FINISHED, "", 0)); + Assertions.assertEquals(1, analysisManager.autoJobs.size()); + Assertions.assertTrue(analysisManager.systemJobInfoMap.isEmpty()); + } + + @Test + public void testReAnalyze() { + new MockUp<OlapTable>() { + + int count = 0; + int[] rowCount = new int[]{100, 200}; + @Mock + public long getRowCount() { + return rowCount[count++]; + } + + @Mock + public List<Column> getBaseSchema() 
{ + return Lists.newArrayList(new Column("col1", PrimitiveType.INT)); + } + + }; + OlapTable olapTable = new OlapTable(); + TableStatsMeta stats1 = new TableStatsMeta(0, 50, new AnalysisInfoBuilder().setColName("col1").build()); + Assertions.assertTrue(olapTable.needReAnalyzeTable(stats1)); + TableStatsMeta stats2 = new TableStatsMeta(0, 190, new AnalysisInfoBuilder().setColName("col1").build()); + Assertions.assertFalse(olapTable.needReAnalyzeTable(stats2)); + + } + } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java new file mode 100644 index 00000000000..d618a5fa538 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -0,0 +1,70 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +import org.apache.doris.analysis.TableSample; +import org.apache.doris.catalog.DatabaseIf; +import org.apache.doris.catalog.TableIf; +import org.apache.doris.common.Config; +import org.apache.doris.datasource.CatalogIf; +import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod; + +import mockit.Expectations; +import mockit.Mocked; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class OlapAnalysisTaskTest { + + @Test + public void testAutoSample(@Mocked CatalogIf catalogIf, @Mocked DatabaseIf databaseIf, @Mocked TableIf tableIf) { + new Expectations() { + { + tableIf.getDataSize(true); + result = 60_0000_0000L; + } + }; + + AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder() + .setAnalysisMethod(AnalysisMethod.FULL); + OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask(); + olapAnalysisTask.info = analysisInfoBuilder.build(); + olapAnalysisTask.tbl = tableIf; + Config.enable_auto_sample = true; + TableSample tableSample = olapAnalysisTask.getTableSample(); + Assertions.assertEquals(4194304, tableSample.getSampleValue()); + Assertions.assertFalse(tableSample.isPercent()); + + new Expectations() { + { + tableIf.getDataSize(true); + result = 1_0000_0000L; + } + }; + tableSample = olapAnalysisTask.getTableSample(); + Assertions.assertNull(tableSample); + + analysisInfoBuilder.setSampleRows(10); + analysisInfoBuilder.setAnalysisMethod(AnalysisMethod.SAMPLE); + olapAnalysisTask.info = analysisInfoBuilder.build(); + tableSample = olapAnalysisTask.getTableSample(); + Assertions.assertEquals(10, tableSample.getSampleValue()); + Assertions.assertFalse(tableSample.isPercent()); + } + +} diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index b4c852b4ce1..818bc538b20 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -246,6 +246,7 @@ struct TQueryOptions { // use 
is_report_success any more 84: optional bool enable_profile = false; 85: optional bool enable_page_cache = false; + 86: optional i32 analyze_timeout = 43200 } diff --git a/regression-test/data/external_table_p2/hive/test_hive_partition_statistic.out b/regression-test/data/external_table_p2/hive/test_hive_partition_statistic.out new file mode 100644 index 00000000000..0e32ebe4775 --- /dev/null +++ b/regression-test/data/external_table_p2/hive/test_hive_partition_statistic.out @@ -0,0 +1,87 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !01 -- +event_day=1956-09-07 39 +event_day=2008-09-25 39 + +-- !1 -- +event_day=2008-09-25 10000 1 0 0 0 +event_day=2008-09-25 10000 1 0 2008-09-25 2008-09-25 +event_day=2008-09-25 10000 11 0 0 10 +event_day=2008-09-25 10000 13 0 MFGR#12 MFGR#52 +event_day=2008-09-25 10000 13 0 antique wheat +event_day=2008-09-25 10000 16 0 JUMBO BAG WRAP PACK +event_day=2008-09-25 10000 17 0 1 48 +event_day=2008-09-25 10000 17 0 64078 113087 +event_day=2008-09-25 10000 17 0 754035 763603 +event_day=2008-09-25 10000 17 0 ECONOMY ANODIZED BRASS STANDARD POLISHED TIN +event_day=2008-09-25 10000 17 0 MFGR#1221 MFGR#528 +event_day=2008-09-25 10000 17 0 burnished drab violet firebrick +event_day=2008-09-25 10000 2362 0 19920101 19980802 +event_day=2008-09-25 10000 2382 0 19920203 19981027 +event_day=2008-09-25 10000 25 0 ALGERIA VIETNAM +event_day=2008-09-25 10000 25 0 ALGERIA VIETNAM +event_day=2008-09-25 10000 250 0 ALGERIA 0 VIETNAM 9 +event_day=2008-09-25 10000 250 0 ALGERIA 0 VIETNAM 9 +event_day=2008-09-25 10000 5 0 1-URGENT 5-LOW +event_day=2008-09-25 10000 5 0 AFRICA MIDDLE EAST +event_day=2008-09-25 10000 5 0 AFRICA MIDDLE EAST +event_day=2008-09-25 10000 5 0 AUTOMOBILE MACHINERY +event_day=2008-09-25 10000 5 0 MFGR#1 MFGR#5 +event_day=2008-09-25 10000 50 0 1 50 +event_day=2008-09-25 10000 6074 0 96748 9388900 +event_day=2008-09-25 10000 7 0 1 7 +event_day=2008-09-25 10000 7 0 AIR TRUCK 
+event_day=2008-09-25 10000 845 0 106797 9423950 +event_day=2008-09-25 10000 9 0 0 8 +event_day=2008-09-25 10000 9775 0 119 2999848 +event_day=2008-09-25 10000 9794 0 107970 45833194 +event_day=2008-09-25 10000 9837 0 MGHV8XBriO zzlztYTFMFW +event_day=2008-09-25 10000 9846 0 Customer#000000119 Customer#002999848 +event_day=2008-09-25 10000 9861 0 13091 599962401 +event_day=2008-09-25 10000 9879 0 10-100-337-6599 34-999-684-2905 +event_day=2008-09-25 10000 9883 0 Supplier#000000001 Supplier#000199983 +event_day=2008-09-25 10000 9896 0 B5YhCdkaxR232CrXx zyxtAvAViHMabnr,1UQybiW +event_day=2008-09-25 10000 9927 0 10-105-800-9296 34-998-982-7450 +event_day=2008-09-25 10000 9971 0 1 199983 + +-- !2 -- +event_day=1956-09-07 10000 1 0 0 0 +event_day=1956-09-07 10000 1 0 1956-09-07 1956-09-07 +event_day=1956-09-07 10000 11 0 0 10 +event_day=1956-09-07 10000 13 0 MFGR#12 MFGR#52 +event_day=1956-09-07 10000 13 0 antique wheat +event_day=1956-09-07 10000 16 0 JUMBO BAG WRAP PACK +event_day=1956-09-07 10000 17 0 1 48 +event_day=1956-09-07 10000 17 0 64078 113087 +event_day=1956-09-07 10000 17 0 754035 763603 +event_day=1956-09-07 10000 17 0 ECONOMY ANODIZED BRASS STANDARD POLISHED TIN +event_day=1956-09-07 10000 17 0 MFGR#1221 MFGR#528 +event_day=1956-09-07 10000 17 0 burnished drab violet firebrick +event_day=1956-09-07 10000 2362 0 19920101 19980802 +event_day=1956-09-07 10000 2382 0 19920203 19981027 +event_day=1956-09-07 10000 25 0 ALGERIA VIETNAM +event_day=1956-09-07 10000 25 0 ALGERIA VIETNAM +event_day=1956-09-07 10000 250 0 ALGERIA 0 VIETNAM 9 +event_day=1956-09-07 10000 250 0 ALGERIA 0 VIETNAM 9 +event_day=1956-09-07 10000 5 0 1-URGENT 5-LOW +event_day=1956-09-07 10000 5 0 AFRICA MIDDLE EAST +event_day=1956-09-07 10000 5 0 AFRICA MIDDLE EAST +event_day=1956-09-07 10000 5 0 AUTOMOBILE MACHINERY +event_day=1956-09-07 10000 5 0 MFGR#1 MFGR#5 +event_day=1956-09-07 10000 50 0 1 50 +event_day=1956-09-07 10000 6074 0 96748 9388900 +event_day=1956-09-07 10000 7 0 1 7 
+event_day=1956-09-07 10000 7 0 AIR TRUCK +event_day=1956-09-07 10000 845 0 106797 9423950 +event_day=1956-09-07 10000 9 0 0 8 +event_day=1956-09-07 10000 9775 0 119 2999848 +event_day=1956-09-07 10000 9794 0 107970 45833194 +event_day=1956-09-07 10000 9837 0 MGHV8XBriO zzlztYTFMFW +event_day=1956-09-07 10000 9846 0 Customer#000000119 Customer#002999848 +event_day=1956-09-07 10000 9861 0 13091 599962401 +event_day=1956-09-07 10000 9879 0 10-100-337-6599 34-999-684-2905 +event_day=1956-09-07 10000 9883 0 Supplier#000000001 Supplier#000199983 +event_day=1956-09-07 10000 9896 0 B5YhCdkaxR232CrXx zyxtAvAViHMabnr,1UQybiW +event_day=1956-09-07 10000 9927 0 10-105-800-9296 34-998-982-7450 +event_day=1956-09-07 10000 9971 0 1 199983 + diff --git a/regression-test/data/external_table_p2/hive/test_hive_statistic_timeout.out b/regression-test/data/external_table_p2/hive/test_hive_statistic_timeout.out new file mode 100644 index 00000000000..e906deea593 --- /dev/null +++ b/regression-test/data/external_table_p2/hive/test_hive_statistic_timeout.out @@ -0,0 +1,7 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !01 -- +p_container 200000000 40 0 JUMBO BAG WRAP PKG +p_partkey 200000000 200778064 0 1 200000000 +p_retailprice 200000000 120014 0 900.00 2099.00 +p_type 200000000 150 0 ECONOMY ANODIZED BRASS STANDARD POLISHED TIN + diff --git a/regression-test/suites/external_table_p0/jdbc/test_mysql_jdbc_statistics.groovy b/regression-test/suites/external_table_p0/jdbc/test_mysql_jdbc_statistics.groovy index d6f0ca351dd..e58e17cfcd6 100644 --- a/regression-test/suites/external_table_p0/jdbc/test_mysql_jdbc_statistics.groovy +++ b/regression-test/suites/external_table_p0/jdbc/test_mysql_jdbc_statistics.groovy @@ -42,7 +42,7 @@ suite("test_mysql_jdbc_statistics", "p0,external,mysql,external_docker,external_ assertTrue(result[0][1] == "5.0") assertTrue(result[0][2] == "5.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "18.0") + assertTrue(result[0][4] == "15.0") assertTrue(result[0][5] == "3.0") assertTrue(result[0][6] == "'abc'") assertTrue(result[0][7] == "'abg'") @@ -53,7 +53,7 @@ suite("test_mysql_jdbc_statistics", "p0,external,mysql,external_docker,external_ assertTrue(result[0][1] == "5.0") assertTrue(result[0][2] == "5.0") assertTrue(result[0][3] == "0.0") - assertTrue(result[0][4] == "24.0") + assertTrue(result[0][4] == "20.0") assertTrue(result[0][5] == "4.0") assertTrue(result[0][6] == "111") assertTrue(result[0][7] == "115") diff --git a/regression-test/suites/external_table_p2/hive/test_hive_partition_statistic.groovy b/regression-test/suites/external_table_p2/hive/test_hive_partition_statistic.groovy new file mode 100644 index 00000000000..9f4b462237f --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_hive_partition_statistic.groovy @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_hive_partition_statistic", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String catalog_name = "test_hive_partition_statistic" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + + sql """use ${catalog_name}.multi_partition""" + sql """analyze table multi_partition_orc partitions (`event_day=2008-09-25`, `event_day=1956-09-07`) with sync""" + + def ctlId + def result = sql """show proc '/catalogs'""" + + for (int i = 0; i < result.size(); i++) { + if (result[i][1] == catalog_name) { + ctlId = result[i][0] + } + } + + qt_01 """select part_id, count(*) from internal.__internal_schema.column_statistics where catalog_id='$ctlId' group by part_id order by part_id;""" + order_qt_1 """select part_id, count, ndv, null_count, min, max from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and 
part_id='event_day=2008-09-25'""" + order_qt_2 """select part_id, count, ndv, null_count, min, max from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and part_id='event_day=1956-09-07'""" + + sql """drop catalog ${catalog_name}"""; + } +} + diff --git a/regression-test/suites/external_table_p2/hive/test_hive_sample_statistic.groovy b/regression-test/suites/external_table_p2/hive/test_hive_sample_statistic.groovy new file mode 100644 index 00000000000..c2a21e3994b --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_hive_sample_statistic.groovy @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_hive_sample_statistic", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String catalog_name = "test_hive_sample_statistic" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + + sql """use ${catalog_name}.tpch_1000_parquet""" + sql """analyze table part with sample percent 10 with sync;""" + + def result = sql """show table stats part""" + assertTrue(result.size() == 1) + assertTrue(Long.parseLong(result[0][2]) >= 200000000) + assertTrue(Long.parseLong(result[0][2]) < 220000000) + + def ctlId + result = sql """show proc '/catalogs'""" + + for (int i = 0; i < result.size(); i++) { + if (result[i][1] == catalog_name) { + ctlId = result[i][0] + } + } + + result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_partkey'""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] >= 200000000) + assertTrue(result[0][0] < 220000000) + + result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_name'""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] >= 200000000) + assertTrue(result[0][0] < 220000000) + + result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_mfgr'""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] >= 200000000) + assertTrue(result[0][0] < 220000000) + + result = sql """select count 
from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_brand'""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] >= 200000000) + assertTrue(result[0][0] < 220000000) + + result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_type'""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] >= 200000000) + assertTrue(result[0][0] < 220000000) + + result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_size'""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] >= 200000000) + assertTrue(result[0][0] < 220000000) + + result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_container'""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] >= 200000000) + assertTrue(result[0][0] < 220000000) + + result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_retailprice'""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] >= 200000000) + assertTrue(result[0][0] < 220000000) + + result = sql """select count from internal.__internal_schema.column_statistics where catalog_id='$ctlId' and col_id='p_comment'""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] >= 200000000) + assertTrue(result[0][0] < 220000000) + + sql """drop catalog ${catalog_name}"""; + } +} + diff --git a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy index 1160a2f8dd6..49142b12e36 100644 --- a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy +++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy @@ -30,6 +30,10 @@ suite("test_hive_statistic", "p2") { ); """ logger.info("catalog " + catalog_name + " created") + + // Test analyze table 
without init. + sql """analyze table ${catalog_name}.tpch_1000_parquet.region with sync""" + sql """switch ${catalog_name};""" logger.info("switched to catalog " + catalog_name) sql """use statistics;""" @@ -234,11 +238,11 @@ suite("test_hive_statistic", "p2") { sql """analyze database `statistics` with sync""" result = sql """show table stats statistics""" assertTrue(result.size() == 1) - assertTrue(result[0][0] == "100") + assertTrue(result[0][2] == "100") result = sql """show table cached stats statistics""" assertTrue(result.size() == 1) - assertTrue(result[0][0] == "100") + assertTrue(result[0][2] == "100") sql """drop stats statistics""" result = sql """show column cached stats statistics""" diff --git a/regression-test/suites/external_table_p2/hive/test_hive_statistic_auto.groovy b/regression-test/suites/external_table_p2/hive/test_hive_statistic_auto.groovy new file mode 100644 index 00000000000..f766069346e --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic_auto.groovy @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_hive_statistic_auto", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String catalog_name = "test_hive_statistic_auto" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + + // Test analyze table without init. + sql """analyze database ${catalog_name}.statistics PROPERTIES("use.auto.analyzer"="true")""" + sql """use ${catalog_name}.statistics""" + + for (int i = 0; i < 10; i++) { + Thread.sleep(1000) + def result = sql """show column stats `statistics` (lo_quantity)""" + if (result.size <= 0) { + continue; + } + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "lo_quantity") + assertTrue(result[0][1] == "100.0") + assertTrue(result[0][2] == "46.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "404.0") + assertTrue(result[0][5] == "4.0") + assertTrue(result[0][6] == "1") + assertTrue(result[0][7] == "50") + + result = sql """show column stats `statistics` (lo_orderkey)""" + if (result.size <= 0) { + continue; + } + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "lo_orderkey") + assertTrue(result[0][1] == "100.0") + assertTrue(result[0][2] == "26.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "404.0") + assertTrue(result[0][5] == "4.0") + assertTrue(result[0][6] == "1") + assertTrue(result[0][7] == "98") + + result = sql """show column stats `statistics` (lo_linenumber)""" + if (result.size <= 0) { + continue; + } + 
assertTrue(result.size() == 1) + assertTrue(result[0][0] == "lo_linenumber") + assertTrue(result[0][1] == "100.0") + assertTrue(result[0][2] == "7.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "404.0") + assertTrue(result[0][5] == "4.0") + assertTrue(result[0][6] == "1") + assertTrue(result[0][7] == "7") + } + + sql """drop catalog ${catalog_name}""" + + } +} + diff --git a/regression-test/suites/external_table_p2/hive/test_hive_statistic_cache.groovy b/regression-test/suites/external_table_p2/hive/test_hive_statistic_cache.groovy index d1399ef49b1..d7b8f00e3e1 100644 --- a/regression-test/suites/external_table_p2/hive/test_hive_statistic_cache.groovy +++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic_cache.groovy @@ -29,6 +29,40 @@ suite("test_hive_statistic_cache", "p2") { 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' ); """ + sql """use ${catalog_name}.tpch_1000_parquet""" + sql """desc customer"""; + sql """desc lineitem"""; + sql """desc region"""; + sql """desc nation"""; + sql """desc orders"""; + sql """desc part"""; + sql """desc partsupp"""; + sql """desc supplier"""; + Thread.sleep(1000); + def result = sql """show table cached stats customer""" + assertTrue(result[0][2] == "150000000") + + result = sql """show table cached stats lineitem""" + assertTrue(result[0][2] == "5999989709") + + result = sql """show table cached stats region""" + assertTrue(result[0][2] == "5") + + result = sql """show table cached stats nation""" + assertTrue(result[0][2] == "25") + + result = sql """show table cached stats orders""" + assertTrue(result[0][2] == "1500000000") + + result = sql """show table cached stats part""" + assertTrue(result[0][2] == "200000000") + + result = sql """show table cached stats partsupp""" + assertTrue(result[0][2] == "800000000") + + result = sql """show table cached stats supplier""" + assertTrue(result[0][2] == "10000000") + logger.info("catalog " + catalog_name + " 
created") sql """switch ${catalog_name};""" logger.info("switched to catalog " + catalog_name) @@ -37,7 +71,7 @@ suite("test_hive_statistic_cache", "p2") { sql """analyze table `stats` with sync;""" sql """select count(*) from stats""" Thread.sleep(5000); - def result = sql """show column cached stats `stats` (lo_orderkey)""" + result = sql """show column cached stats `stats` (lo_orderkey)""" assertTrue(result[0][0] == "lo_orderkey") assertTrue(result[0][1] == "100.0") assertTrue(result[0][2] == "26.0") diff --git a/regression-test/suites/external_table_p2/hive/test_hive_statistic_sample.groovy b/regression-test/suites/external_table_p2/hive/test_hive_statistic_sample.groovy new file mode 100644 index 00000000000..960bec31df5 --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic_sample.groovy @@ -0,0 +1,150 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_hive_statistic_sample", "p2,external,hive,external_remote,external_remote_hive") { + String enabled = context.config.otherConfigs.get("enableExternalHiveTest") + if (enabled != null && enabled.equalsIgnoreCase("true")) { + String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost") + String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort") + String catalog_name = "test_hive_statistic_sample" + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hadoop.username' = 'hadoop', + 'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}' + ); + """ + logger.info("catalog " + catalog_name + " created") + + sql """analyze table ${catalog_name}.tpch_1000_parquet.region with sample percent 10 with sync""" + sql """analyze table ${catalog_name}.tpch_1000_parquet.supplier with sample percent 10 with sync""" + sql """use ${catalog_name}.tpch_1000_parquet""" + def result = sql """show column stats region (r_regionkey)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "r_regionkey") + assertTrue(result[0][1] == "5.0") + assertTrue(result[0][2] == "5.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "20.0") + assertTrue(result[0][5] == "4.0") + assertTrue(result[0][6] == "0") + assertTrue(result[0][7] == "4") + + result = sql """show column stats region (r_name)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "r_name") + assertTrue(result[0][1] == "5.0") + assertTrue(result[0][2] == "5.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "34.0") + assertTrue(result[0][5] == "6.8") + assertTrue(result[0][6] == "\'AFRICA\'") + assertTrue(result[0][7] == "\'MIDDLE EAST\'") + + result = sql """show column stats region (r_comment)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "r_comment") + assertTrue(result[0][1] == "5.0") + assertTrue(result[0][2] == 
"5.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "330.0") + assertTrue(result[0][5] == "66.0") + assertTrue(result[0][6] == "\'ges. thinly even pinto beans ca\'") + assertTrue(result[0][7] == "\'uickly special accounts cajole carefully blithely close requests. carefully final asymptotes haggle furiousl\'") + + result = sql """show column stats supplier (s_suppkey)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_suppkey") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "9970222.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "3.9995194E7") + assertTrue(result[0][5] == "3.9999997999759773") + assertTrue(result[0][6] == "1885331") + assertTrue(result[0][7] == "9395153") + + result = sql """show column stats supplier (s_name)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_name") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "1.004004E7") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "1.79978374E8") + assertTrue(result[0][5] == "17.999999199903908") + assertTrue(result[0][6] == "\'Supplier#001885331\'") + assertTrue(result[0][7] == "\'Supplier#009395153\'") + + result = sql """show column stats supplier (s_address)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_address") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "9998862.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "2.50070604E8") + assertTrue(result[0][5] == "25.010064108699456") + assertTrue(result[0][6] == "\' E,WAW2ZEx\'") + assertTrue(result[0][7] == "\'zzzw X3bpxu,OCpzgv6BdyMVMKzaB1DbH\'") + + result = sql """show column stats supplier (s_nationkey)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_nationkey") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "25.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "3.9995194E7") + 
assertTrue(result[0][5] == "3.9999997999759773") + assertTrue(result[0][6] == "0") + assertTrue(result[0][7] == "24") + + result = sql """show column stats supplier (s_phone)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_phone") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "9928006.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "1.49981978E8") + assertTrue(result[0][5] == "14.99999929991592") + assertTrue(result[0][6] == "\'10-100-128-4513\'") + assertTrue(result[0][7] == "\'34-999-967-7296\'") + + result = sql """show column stats supplier (s_acctbal)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_acctbal") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "4766937.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "7.9990388E7") + assertTrue(result[0][5] == "7.999999599951955") + assertTrue(result[0][6] == "-999.99") + assertTrue(result[0][7] == "9999.99") + + result = sql """show column stats supplier (s_comment)""" + assertTrue(result.size() == 1) + assertTrue(result[0][0] == "s_comment") + assertTrue(result[0][1] == "9998799.0") + assertTrue(result[0][2] == "9931298.0") + assertTrue(result[0][3] == "0.0") + assertTrue(result[0][4] == "6.24883849E8") + assertTrue(result[0][5] == "62.49589065646784") + assertTrue(result[0][6] == "\' Customer across the pinto beans. pinRecommends\'") + assertTrue(result[0][7] == "\'zzle? 
express, regular foxes haggle final ac\'") + + sql """drop catalog ${catalog_name}""" + } +} + diff --git a/regression-test/suites/external_table_p2/hive/test_hive_statistic_timeout.groovy b/regression-test/suites/external_table_p2/hive/test_hive_statistic_timeout.groovy new file mode 100644 index 00000000000..a3754d9d2a4 --- /dev/null +++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic_timeout.groovy @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+
+suite("test_hive_statistic_timeout", "p2,external,hive,external_remote,external_remote_hive") { // Runs a synchronous ANALYZE on four columns of a table in a Hive (HMS) catalog with query_timeout set to 1, then checks the collected stats; presumably guards against the session timeout aborting ANALYZE -- confirm
+    String enabled = context.config.otherConfigs.get("enableExternalHiveTest") // opt-in switch read from the regression config
+    if (enabled != null && enabled.equalsIgnoreCase("true")) { // the body runs only when the switch is "true" (case-insensitive); otherwise the suite does nothing
+        String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
+        String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
+        String catalog_name = "test_hive_statistic_timeout"
+        sql """drop catalog if exists ${catalog_name};""" // remove any catalog left under this name before creating it
+        sql """
+            create catalog if not exists ${catalog_name} properties (
+                'type'='hms',
+                'hadoop.username' = 'hadoop',
+                'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
+            );
+        """
+        logger.info("catalog " + catalog_name + " created")
+
+        sql """use ${catalog_name}.tpch_1000_parquet"""
+        sql """set query_timeout=1""" // lowered right before ANALYZE on purpose; unit is presumably seconds -- confirm
+        sql """analyze table part (p_partkey, p_container, p_type, p_retailprice) with sync;""" // four columns are analyzed
+
+        def result = sql """show column stats part"""
+        assertTrue(result.size() == 4) // 4 matches the number of columns named in the ANALYZE above
+
+        def ctlId // stays null if no row of the listing below matches catalog_name
+        result = sql """show proc '/catalogs'""" // `result` is reused for the catalog listing
+
+        for (int i = 0; i < result.size(); i++) {
+            if (result[i][1] == catalog_name) { // column 1 is compared with the catalog name; column 0 is taken as its id
+                ctlId = result[i][0]
+            }
+        }
+
+        qt_01 """select col_id, count, ndv, null_count, min, max from internal.__internal_schema.column_statistics where catalog_id='$ctlId' order by col_id;""" // reads the internal statistics table for this catalog id; output is presumably checked against the suite's expected-result file -- confirm
+        sql """drop catalog ${catalog_name}"""; // cleanup; not reached if an earlier statement throws
+    }
+}
+
diff --git a/regression-test/suites/statistics/test_basic_statistics.groovy b/regression-test/suites/statistics/test_basic_statistics.groovy
new file mode 100644
index 00000000000..a885ac1c11c
--- /dev/null
+++ b/regression-test/suites/statistics/test_basic_statistics.groovy
@@ -0,0 +1,75 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.
The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_basic_statistics") { // Creates a 9-row table, runs a synchronous ANALYZE on it, and checks the SHOW COLUMN STATS row for each of its two columns
+    String db = "test_basic_statistics"
+    String tbl = "test_table_1"
+
+    sql """
+        DROP DATABASE IF EXISTS `${db}`
+    """
+
+    sql """
+        CREATE DATABASE `${db}`
+    """
+
+    sql """ use `${db}`"""
+
+    sql """
+        DROP TABLE IF EXISTS `${tbl}`
+    """
+
+    sql """
+        CREATE TABLE IF NOT EXISTS `${tbl}` (
+            `id` int(11) not null comment "",
+            `name` varchar(100) null comment ""
+        ) engine=olap
+        DUPLICATE KEY(`id`)
+        DISTRIBUTED BY HASH(`id`) BUCKETS 1 properties("replication_num" = "1")
+    """
+
+    sql """
+        INSERT INTO `${tbl}` VALUES (1, 'name1'), (2, 'name2'), (3, 'name3'), (4, 'name4'), (5, 'name5'), (6, 'name6'), (7, 'name7'), (8, 'name8'), (9, 'name9')
+    """
+
+    sql """ analyze table ${tbl} with sync""" // no column list is given here, unlike the per-column SHOW statements below
+    def result = sql """show column stats ${tbl} (id)"""
+    assertTrue(result.size() == 1) // exactly one stats row is expected for the single requested column
+    assertTrue(result[0][0] == "id") // column 0 holds the column name
+    assertTrue(result[0][1] == "9.0") // presumably the row count (9 rows were inserted) -- confirm against SHOW COLUMN STATS column order
+    assertTrue(result[0][2] == "9.0") // presumably the distinct-value count (ids 1..9 are all different) -- confirm
+    assertTrue(result[0][3] == "0.0") // presumably the null count -- confirm
+    assertTrue(result[0][4] == "36.0") // presumably total data size; 36 = 9 rows x 4 -- confirm
+    assertTrue(result[0][5] == "4.0") // presumably average size per value -- confirm
+    assertTrue(result[0][6] == "1") // smallest inserted id
+    assertTrue(result[0][7] == "9") // largest inserted id
+
+    result = sql """show column stats ${tbl} (name)"""
+    assertTrue(result.size() == 1)
+    assertTrue(result[0][0] == "name")
+    assertTrue(result[0][1] == "9.0")
+    assertTrue(result[0][2] == "9.0")
+    assertTrue(result[0][3] == "0.0")
+    assertTrue(result[0][4] == "45.0") // presumably total data size; 45 = 9 values x 5 characters -- confirm
+    assertTrue(result[0][5] == "5.0")
+    assertTrue(result[0][6] == "\'name1\'") // string bounds are compared including the surrounding single quotes
+    assertTrue(result[0][7] == "\'name9\'")
+
+    sql """drop table ${tbl}""" // cleanup; not reached if an assertion above throws
+    sql """drop database ${db}"""
+
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org