This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 5b09254fac [improvement](external statistics)Fix external stats
collection bugs (#22788)
5b09254fac is described below
commit 5b09254facc8221be8b32a46b6c9d02c51b140e7
Author: Jibing-Li <[email protected]>
AuthorDate: Fri Aug 11 21:58:24 2023 +0800
[improvement](external statistics)Fix external stats collection bugs
(#22788)
1. Collect external table row count when executing analyze database.
2. Support show cached table stats (row count)
3. Support alter external table column stats.
4. Refresh/invalidate the in-memory table row count cache when an analyze task
finishes and when table stats are dropped.
---
docs/en/docs/lakehouse/external-statistics.md | 7 ++++++-
docs/zh-CN/docs/lakehouse/external-statistics.md | 7 ++++++-
fe/fe-core/src/main/cup/sql_parser.cup | 4 ++--
.../doris/analysis/AlterColumnStatsStmt.java | 11 +++--------
.../org/apache/doris/analysis/AnalyzeTblStmt.java | 7 +++++--
.../apache/doris/analysis/ShowTableStatsStmt.java | 8 +++++++-
.../java/org/apache/doris/qe/ShowExecutor.java | 15 +++++++++++++-
.../apache/doris/statistics/AnalysisManager.java | 7 ++++---
.../apache/doris/statistics/HMSAnalysisTask.java | 10 ++++++++++
.../apache/doris/statistics/StatisticsCache.java | 8 ++++++++
.../statistics/TableStatisticsCacheLoader.java | 4 +++-
.../hive/test_hive_statistic.groovy | 23 +++++++++++++++++-----
12 files changed, 86 insertions(+), 25 deletions(-)
diff --git a/docs/en/docs/lakehouse/external-statistics.md
b/docs/en/docs/lakehouse/external-statistics.md
index 6c469961a8..33724fc238 100644
--- a/docs/en/docs/lakehouse/external-statistics.md
+++ b/docs/en/docs/lakehouse/external-statistics.md
@@ -191,6 +191,11 @@ DROP ANALYZE JOB [JOB_ID]
Show statistics includes show table statistics (number of rows) and column
statistics. Please refer to View statistics in [Internal Table
Statistics](../query-acceleration/statistics.md)
#### Table statistics
+```
+SHOW TABLE [cached] stats TABLE_NAME;
+```
+
+View row count of the given table. If the cached parameter is specified, the
row count of the specified table that has been loaded into the cache is
displayed.
```
mysql> SHOW TABLE STATS hive.tpch100.orders;
@@ -203,7 +208,7 @@ mysql> SHOW TABLE STATS hive.tpch100.orders;
#### Column statistics
```
-SHOW COLUMN [cached] stats hive.tpch100.orders;
+SHOW COLUMN [cached] stats TABLE_NAME;
```
View the column statistics of a table. If the cached parameter is specified,
the column information of the specified table that has been loaded into the
cache is displayed.
diff --git a/docs/zh-CN/docs/lakehouse/external-statistics.md
b/docs/zh-CN/docs/lakehouse/external-statistics.md
index 0b47ed5329..f4f331b287 100644
--- a/docs/zh-CN/docs/lakehouse/external-statistics.md
+++ b/docs/zh-CN/docs/lakehouse/external-statistics.md
@@ -191,6 +191,11 @@ DROP ANALYZE JOB [JOB_ID]
信息的查看包括表的统计信息(表的行数)查看和列统计信息查看,请参考[内表统计信息](../query-acceleration/statistics.md)查看统计信息部分。
#### 表统计信息
+```
+SHOW TABLE [cached] stats TABLE_NAME;
+```
+
+查看statistics表中指定table的行数,如果指定cached参数,则展示的是指定表已加载到缓存中的行数信息。
```
mysql> SHOW TABLE STATS hive.tpch100.orders;
@@ -203,7 +208,7 @@ mysql> SHOW TABLE STATS hive.tpch100.orders;
#### 列统计信息
```
-SHOW COLUMN [cached] stats hive.tpch100.orders;
+SHOW COLUMN [cached] stats TABLE_NAME;
```
查看statistics表中指定table的列统计信息,如果指定cached参数,则展示的是指定表已加载到缓存中的列信息。
diff --git a/fe/fe-core/src/main/cup/sql_parser.cup
b/fe/fe-core/src/main/cup/sql_parser.cup
index e7b081ba93..d589ff3937 100644
--- a/fe/fe-core/src/main/cup/sql_parser.cup
+++ b/fe/fe-core/src/main/cup/sql_parser.cup
@@ -4180,9 +4180,9 @@ show_param ::=
RESULT = new ShowSyncJobStmt(dbName);
:}
/* show table stats */
- | KW_TABLE KW_STATS table_name:tbl opt_partition_names:partitionNames
+ | KW_TABLE opt_cached:cached KW_STATS table_name:tbl
opt_partition_names:partitionNames
{:
- RESULT = new ShowTableStatsStmt(tbl, partitionNames);
+ RESULT = new ShowTableStatsStmt(tbl, partitionNames, cached);
:}
/* show column stats */
| KW_COLUMN opt_cached:cached KW_STATS table_name:tbl opt_col_list:cols
opt_partition_names:partitionNames
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/AlterColumnStatsStmt.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/AlterColumnStatsStmt.java
index 0e7892dcd1..58b8121267 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/analysis/AlterColumnStatsStmt.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/analysis/AlterColumnStatsStmt.java
@@ -22,7 +22,6 @@ import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.OlapTable;
import org.apache.doris.catalog.Partition;
import org.apache.doris.catalog.PartitionType;
-import org.apache.doris.catalog.Table;
import org.apache.doris.catalog.TableIf;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.Config;
@@ -148,17 +147,13 @@ public class AlterColumnStatsStmt extends DdlStmt {
DatabaseIf db = catalog.getDbOrAnalysisException(tableName.getDb());
TableIf table = db.getTableOrAnalysisException(tableName.getTbl());
- if (table.getType() != Table.TableType.OLAP) {
- throw new AnalysisException("Only OLAP table statistics are
supported");
- }
-
- OlapTable olapTable = (OlapTable) table;
- if (olapTable.getColumn(columnName) == null) {
+ if (table.getColumn(columnName) == null) {
ErrorReport.reportAnalysisException(ErrorCode.ERR_WRONG_COLUMN_NAME,
columnName, FeNameFormat.getColumnNameRegex());
}
- if (optPartitionNames != null) {
+ if (optPartitionNames != null && table instanceof OlapTable) {
+ OlapTable olapTable = (OlapTable) table;
if
(olapTable.getPartitionInfo().getType().equals(PartitionType.UNPARTITIONED)) {
throw new AnalysisException("Not a partitioned table: " +
olapTable.getName());
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java
index 527f802748..fb4c3bb39a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java
@@ -142,8 +142,11 @@ public class AnalyzeTblStmt extends AnalyzeStmt {
}
checkAnalyzePriv(tableName.getDb(), tableName.getTbl());
if (columnNames == null) {
- columnNames = table.getBaseSchema(false)
-
.stream().map(Column::getName).collect(Collectors.toList());
+ // Filter unsupported type columns.
+ columnNames = table.getBaseSchema(false).stream()
+ .filter(c -> !StatisticsUtil.isUnsupportedType(c.getType()))
+ .map(Column::getName)
+ .collect(Collectors.toList());
}
table.readLock();
try {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java
b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java
index e462c8585c..da10d5c492 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java
@@ -53,12 +53,14 @@ public class ShowTableStatsStmt extends ShowStmt {
private final TableName tableName;
private final PartitionNames partitionNames;
+ private final boolean cached;
private TableIf table;
- public ShowTableStatsStmt(TableName tableName, PartitionNames
partitionNames) {
+ public ShowTableStatsStmt(TableName tableName, PartitionNames
partitionNames, boolean cached) {
this.tableName = tableName;
this.partitionNames = partitionNames;
+ this.cached = cached;
}
public TableName getTableName() {
@@ -133,4 +135,8 @@ public class ShowTableStatsStmt extends ShowStmt {
result.add(row);
return new ShowResultSet(getMetaData(), result);
}
+
+ public boolean isCached() {
+ return cached;
+ }
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java
b/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java
index 7b0d6c4d8a..4bfc6c61b1 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java
@@ -138,6 +138,7 @@ import org.apache.doris.catalog.Tablet;
import org.apache.doris.catalog.TabletInvertedIndex;
import org.apache.doris.catalog.TabletMeta;
import org.apache.doris.catalog.View;
+import org.apache.doris.catalog.external.ExternalTable;
import org.apache.doris.catalog.external.HMSExternalTable;
import org.apache.doris.clone.DynamicPartitionScheduler;
import org.apache.doris.cluster.ClusterNamespace;
@@ -240,6 +241,7 @@ import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Objects;
+import java.util.Optional;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.function.Predicate;
@@ -2411,8 +2413,19 @@ public class ShowExecutor {
ShowTableStatsStmt showTableStatsStmt = (ShowTableStatsStmt) stmt;
TableIf tableIf = showTableStatsStmt.getTable();
long partitionId = showTableStatsStmt.getPartitionId();
+ boolean showCache = showTableStatsStmt.isCached();
try {
- if (partitionId > 0) {
+ if (tableIf instanceof ExternalTable && showCache) {
+ Optional<TableStatistic> tableStatistics =
Env.getCurrentEnv().getStatisticsCache().getTableStatistics(
+ tableIf.getDatabase().getCatalog().getId(),
+ tableIf.getDatabase().getId(),
+ tableIf.getId());
+ if (tableStatistics.isPresent()) {
+ resultSet =
showTableStatsStmt.constructResultSet(tableStatistics.get());
+ } else {
+ resultSet =
showTableStatsStmt.constructResultSet(TableStatistic.UNKNOWN);
+ }
+ } else if (partitionId > 0) {
TableStatistic partStats =
StatisticsRepository.fetchTableLevelOfPartStats(partitionId);
resultSet = showTableStatsStmt.constructResultSet(partStats);
} else {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
index b5c6ebf602..82daf0e614 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java
@@ -286,10 +286,9 @@ public class AnalysisManager extends Daemon implements
Writable {
TableName tableName = new TableName(db.getCatalog().getName(),
db.getFullName(),
table.getName());
// columnNames null means to add all visible columns.
+ // Will get all the visible columns in analyzeTblStmt.check()
AnalyzeTblStmt analyzeTblStmt = new
AnalyzeTblStmt(analyzeProperties, tableName,
- table.getBaseSchema().stream().filter(c ->
!StatisticsUtil.isUnsupportedType(c.getType())).map(
- Column::getName).collect(
- Collectors.toList()), db.getId(), table);
+ null, db.getId(), table);
try {
analyzeTblStmt.check();
} catch (AnalysisException analysisException) {
@@ -816,6 +815,8 @@ public class AnalysisManager extends Daemon implements
Writable {
}
if (dropStatsStmt.dropTableRowCount()) {
StatisticsRepository.dropExternalTableStatistics(tblId);
+ // Table cache key doesn't care about catalog id and db id,
because the table id is globally unique.
+ Env.getCurrentEnv().getStatisticsCache().invalidateTableStats(-1,
-1, tblId);
}
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
index 119368d91d..d569cd79bd 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
@@ -17,6 +17,7 @@
package org.apache.doris.statistics;
+import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.external.HMSExternalTable;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.util.TimeUtils;
@@ -291,4 +292,13 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
LocalDateTime.ofInstant(Instant.ofEpochMilli(Long.parseLong(timestamp) * 1000),
ZoneId.systemDefault())));
}
+
+ @Override
+ protected void afterExecution() {
+ if (isTableLevelTask) {
+
Env.getCurrentEnv().getStatisticsCache().refreshTableStatsSync(catalog.getId(),
db.getId(), tbl.getId());
+ } else {
+
Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1,
col.getName());
+ }
+ }
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
index 12e306ad0c..b405647ad8 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java
@@ -177,6 +177,14 @@ public class StatisticsCache {
columnStatisticsCache.synchronous().refresh(new
StatisticsCacheKey(catalogId, dbId, tblId, idxId, colName));
}
+ public void invalidateTableStats(long catalogId, long dbId, long tblId) {
+ tableStatisticsCache.synchronous().invalidate(new
StatisticsCacheKey(catalogId, dbId, tblId));
+ }
+
+ public void refreshTableStatsSync(long catalogId, long dbId, long tblId) {
+ tableStatisticsCache.synchronous().refresh(new
StatisticsCacheKey(catalogId, dbId, tblId));
+ }
+
public void refreshHistogramSync(long tblId, long idxId, String colName) {
histogramCache.synchronous().refresh(new StatisticsCacheKey(tblId,
idxId, colName));
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticsCacheLoader.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticsCacheLoader.java
index 817e74540f..953bc9a427 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticsCacheLoader.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticsCacheLoader.java
@@ -36,7 +36,9 @@ public class TableStatisticsCacheLoader extends
StatisticsCacheLoader<Optional<T
protected Optional<TableStatistic> doLoad(StatisticsCacheKey key) {
try {
TableStatistic tableStatistic =
StatisticsRepository.fetchTableLevelStats(key.tableId);
- return Optional.of(tableStatistic);
+ if (tableStatistic != TableStatistic.UNKNOWN) {
+ return Optional.of(tableStatistic);
+ }
} catch (DdlException e) {
LOG.debug("Fail to get table line number from table_statistics
table. "
+ "Will try to get from data source.", e);
diff --git
a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
index 90da0738a9..0d783c13ad 100644
--- a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
@@ -221,15 +221,28 @@ suite("test_hive_statistic",
"p2,external,hive,external_remote,external_remote_h
assertTrue(result[0][6] == "'AIR'")
assertTrue(result[0][7] == "'TRUCK'")
- // sql """ALTER TABLE statistics MODIFY COLUMN lo_shipmode SET STATS
('row_count'='6001215')"""
- // result = sql "show column stats `statistics` (lo_shipmode)"
- // assertTrue(result.size() == 1)
- // assertTrue(result[0][0] == "lo_shipmode")
- // assertTrue(result[0][1] == "6001215.0")
+ sql """ALTER TABLE statistics MODIFY COLUMN lo_shipmode SET STATS
('row_count'='6001215')"""
+ result = sql "show column stats `statistics` (lo_shipmode)"
+ assertTrue(result.size() == 1)
+ assertTrue(result[0][0] == "lo_shipmode")
+ assertTrue(result[0][1] == "6001215.0")
sql """drop stats statistics"""
result = sql """show column stats statistics"""
assertTrue(result.size() == 0)
+
+ sql """analyze database `statistics` with sync"""
+ result = sql """show table stats statistics"""
+ assertTrue(result.size() == 1)
+ assertTrue(result[0][0] == "100")
+
+ result = sql """show table cached stats statistics"""
+ assertTrue(result.size() == 1)
+ assertTrue(result[0][0] == "100")
+
+ sql """drop stats statistics"""
+ result = sql """show column cached stats statistics"""
+ assertTrue(result.size() == 0)
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]