This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 1f9ae3427f1 [feature](statistics)support statistics for
iceberg/paimon/hudi table (#29868)
1f9ae3427f1 is described below
commit 1f9ae3427f155c9ae487bb2f353492ad9fc0252c
Author: wuwenchi <[email protected]>
AuthorDate: Wed Jan 17 17:43:05 2024 +0800
[feature](statistics)support statistics for iceberg/paimon/hudi table
(#29868)
---
.../docker-compose/iceberg/iceberg.env | 2 +-
.../docker-compose/iceberg/iceberg.yaml.tpl | 2 +
.../catalog/external/IcebergExternalTable.java | 9 +
.../catalog/external/PaimonExternalTable.java | 9 +
.../apache/doris/common/proc/CatalogsProcDir.java | 10 +-
...AnalysisTask.java => ExternalAnalysisTask.java} | 173 ++------------
.../apache/doris/statistics/HMSAnalysisTask.java | 252 ++-------------------
.../doris/statistics/util/StatisticsUtil.java | 10 +-
.../doris/statistics/HMSAnalysisTaskTest.java | 4 +-
regression-test/conf/regression-conf.groovy | 1 +
.../iceberg/test_iceberg_statistics.out | 39 ++++
.../paimon/test_paimon_statistics.out | 21 ++
.../hive/test_hive_hudi_statistics.out | 16 ++
.../pipeline/p0/conf/regression-conf.groovy | 1 +
.../tpch/tpch-sf100/conf/regression-conf.groovy | 1 +
.../plugins/plugins_get_ids_from_proc.groovy | 62 +++++
.../iceberg/test_iceberg_statistics.groovy | 57 +++++
.../paimon/test_paimon_catalog.groovy | 3 +
.../paimon/test_paimon_statistics.groovy | 47 ++++
.../external_table_p0/test_catalog_ddl.groovy | 2 +
.../hive/test_hive_hudi_statistics.groovy | 47 ++++
21 files changed, 372 insertions(+), 396 deletions(-)
diff --git a/docker/thirdparties/docker-compose/iceberg/iceberg.env
b/docker/thirdparties/docker-compose/iceberg/iceberg.env
index 4cc8b42eaf9..6bebd49f437 100644
--- a/docker/thirdparties/docker-compose/iceberg/iceberg.env
+++ b/docker/thirdparties/docker-compose/iceberg/iceberg.env
@@ -21,4 +21,4 @@ SPARK_DRIVER_UI_PORT=8080
SPARK_HISTORY_UI_PORT=10000
REST_CATALOG_PORT=18181
MINIO_UI_PORT=9000
-MINIO_API_PORT=9001
+MINIO_API_PORT=19001
diff --git a/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl
b/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl
index d7220f24376..bc217c1dd6e 100644
--- a/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl
+++ b/docker/thirdparties/docker-compose/iceberg/iceberg.yaml.tpl
@@ -58,6 +58,8 @@ services:
minio:
image: minio/minio
container_name: doris--minio
+ ports:
+ - ${MINIO_API_PORT}:9000
environment:
- MINIO_ROOT_USER=admin
- MINIO_ROOT_PASSWORD=password
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java
index 7398ff19c9e..be99e26de62 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/IcebergExternalTable.java
@@ -23,7 +23,10 @@ import org.apache.doris.catalog.HiveMetaStoreClientHelper;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.datasource.iceberg.IcebergExternalCatalog;
+import org.apache.doris.statistics.AnalysisInfo;
+import org.apache.doris.statistics.BaseAnalysisTask;
import org.apache.doris.statistics.ColumnStatistic;
+import org.apache.doris.statistics.ExternalAnalysisTask;
import org.apache.doris.statistics.util.StatisticsUtil;
import org.apache.doris.thrift.THiveTable;
import org.apache.doris.thrift.TIcebergTable;
@@ -149,4 +152,10 @@ public class IcebergExternalTable extends ExternalTable {
() -> StatisticsUtil.getIcebergColumnStats(colName,
((IcebergExternalCatalog)
catalog).getIcebergTable(dbName, name)));
}
+
+ @Override
+ public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) {
+ makeSureInitialized();
+ return new ExternalAnalysisTask(info);
+ }
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/PaimonExternalTable.java
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/PaimonExternalTable.java
index c8ea253671d..b517265df6a 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/PaimonExternalTable.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/PaimonExternalTable.java
@@ -21,6 +21,9 @@ import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.ScalarType;
import org.apache.doris.catalog.Type;
import org.apache.doris.datasource.paimon.PaimonExternalCatalog;
+import org.apache.doris.statistics.AnalysisInfo;
+import org.apache.doris.statistics.BaseAnalysisTask;
+import org.apache.doris.statistics.ExternalAnalysisTask;
import org.apache.doris.thrift.THiveTable;
import org.apache.doris.thrift.TTableDescriptor;
import org.apache.doris.thrift.TTableType;
@@ -154,4 +157,10 @@ public class PaimonExternalTable extends ExternalTable {
+ getPaimonCatalogType());
}
}
+
+ @Override
+ public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) {
+ makeSureInitialized();
+ return new ExternalAnalysisTask(info);
+ }
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/common/proc/CatalogsProcDir.java
b/fe/fe-core/src/main/java/org/apache/doris/common/proc/CatalogsProcDir.java
index 854b4dddc79..e6163645c28 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/proc/CatalogsProcDir.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/proc/CatalogsProcDir.java
@@ -27,6 +27,7 @@ import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
+import org.apache.log4j.Logger;
import java.util.ArrayList;
import java.util.Collections;
@@ -37,6 +38,7 @@ import java.util.List;
* show all catalogs' info
*/
public class CatalogsProcDir implements ProcDirInterface {
+ private static final Logger LOG = Logger.getLogger(CatalogsProcDir.class);
public static final ImmutableList<String> TITLE_NAMES = new
ImmutableList.Builder<String>()
.add("CatalogIds").add("CatalogName").add("DatabaseNum").add("LastUpdateTime")
.build();
@@ -90,7 +92,13 @@ public class CatalogsProcDir implements ProcDirInterface {
List<Comparable> catalogInfo = Lists.newArrayList();
catalogInfo.add(catalog.getId());
catalogInfo.add(catalog.getName());
- catalogInfo.add(catalog.getDbNames().size());
+ int size = -1;
+ try {
+ size = catalog.getDbNames().size();
+ } catch (Exception e) {
+ LOG.warn("failed to get database: ", e);
+ }
+ catalogInfo.add(size);
catalogInfo.add(TimeUtils.longToTimeString(catalog.getLastUpdateTime()));
catalogInfos.add(catalogInfo);
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
similarity index 59%
copy from
fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
copy to
fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
index fd0a4c82538..15848c013d6 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ExternalAnalysisTask.java
@@ -19,15 +19,11 @@ package org.apache.doris.statistics;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Env;
-import org.apache.doris.catalog.external.HMSExternalTable;
-import org.apache.doris.common.AnalysisException;
+import org.apache.doris.catalog.external.ExternalTable;
import org.apache.doris.common.FeConstants;
import org.apache.doris.common.Pair;
-import org.apache.doris.datasource.hive.HiveMetaStoreCache;
-import org.apache.doris.external.hive.util.HiveUtil;
import org.apache.doris.statistics.util.StatisticsUtil;
-import com.google.common.collect.Sets;
import org.apache.commons.text.StringSubstitutor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -39,42 +35,43 @@ import java.util.Map;
import java.util.Random;
import java.util.Set;
-public class HMSAnalysisTask extends BaseAnalysisTask {
- private static final Logger LOG =
LogManager.getLogger(HMSAnalysisTask.class);
+public class ExternalAnalysisTask extends BaseAnalysisTask {
+ private static final Logger LOG =
LogManager.getLogger(ExternalAnalysisTask.class);
private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT
ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
+ "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";
private boolean isTableLevelTask;
private boolean isPartitionOnly;
- private HMSExternalTable table;
+ private ExternalTable table;
- public HMSAnalysisTask() {
+ // For test
+ public ExternalAnalysisTask() {
}
- public HMSAnalysisTask(AnalysisInfo info) {
+ public ExternalAnalysisTask(AnalysisInfo info) {
super(info);
isTableLevelTask = info.externalTableLevelTask;
isPartitionOnly = info.partitionOnly;
- table = (HMSExternalTable) tbl;
+ table = (ExternalTable) tbl;
}
public void doExecute() throws Exception {
if (isTableLevelTask) {
getTableStats();
} else {
- getTableColumnStats();
+ getOrdinaryColumnStats();
}
}
// For test
- protected void setTable(HMSExternalTable table) {
+ protected void setTable(ExternalTable table) {
this.table = table;
}
/**
* Get table row count
*/
- private void getTableStats() throws Exception {
+ private void getTableStats() {
Map<String, String> params = buildStatsParams(null);
List<ResultRow> columnResult =
StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
@@ -86,35 +83,8 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
job.rowCountDone(this);
}
- /**
- * Get column statistics and insert the result to
__internal_schema.column_statistics
- */
- protected void getTableColumnStats() throws Exception {
- if (!info.usingSqlForPartitionColumn) {
- try {
- if (isPartitionColumn()) {
- getPartitionColumnStats();
- } else {
- getHmsColumnStats();
- }
- } catch (Exception e) {
- LOG.warn("Failed to collect stats for {}col {} using metadata,
"
- + "fallback to normal collection",
- isPartitionColumn() ? "partition " : "",
col.getName(), e);
- /* retry using sql way! */
- getOrdinaryColumnStats();
- }
- } else {
- getOrdinaryColumnStats();
- }
- }
-
- private boolean isPartitionColumn() {
- return table.getPartitionColumns().stream().anyMatch(c ->
c.getName().equals(col.getName()));
- }
-
- // Get ordinary column stats. Ordinary column means not partition column.
- private void getOrdinaryColumnStats() throws Exception {
+ // Get ordinary column stats
+ protected void getOrdinaryColumnStats() throws Exception {
StringBuilder sb = new StringBuilder();
Map<String, String> params = buildStatsParams("NULL");
params.put("min", getMinFunction());
@@ -169,122 +139,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
runQuery(sql);
}
- // Collect the partition column stats through HMS metadata.
- // Get all the partition values and calculate the stats based on the
values.
- private void getPartitionColumnStats() throws Exception {
- Set<String> partitionNames = table.getPartitionNames();
- Set<String> ndvPartValues = Sets.newHashSet();
- long numNulls = 0;
- long dataSize = 0;
- String min = null;
- String max = null;
- for (String names : partitionNames) {
- // names is like "date=20230101" for one level partition
- // and like "date=20230101/hour=12" for two level partition
- String[] parts = names.split("/");
- for (String part : parts) {
- if (part.startsWith(col.getName())) {
- String value = HiveUtil.getHivePartitionValue(part);
- // HIVE_DEFAULT_PARTITION hive partition value when the
partition name is not specified.
- if (value == null || value.isEmpty() ||
value.equals(HiveMetaStoreCache.HIVE_DEFAULT_PARTITION)) {
- numNulls += 1;
- continue;
- }
- ndvPartValues.add(value);
- dataSize += col.getType().isStringType() ? value.length()
: col.getType().getSlotSize();
- min = updateMinValue(min, value);
- max = updateMaxValue(max, value);
- }
- }
- }
- // Estimate the row count. This value is inaccurate if the table stats
is empty.
- TableStatsMeta tableStatsStatus =
Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
- long count = tableStatsStatus == null ? table.estimatedRowCount() :
tableStatsStatus.rowCount;
- dataSize = dataSize * count / partitionNames.size();
- numNulls = numNulls * count / partitionNames.size();
- int ndv = ndvPartValues.size();
-
- Map<String, String> params = buildStatsParams("NULL");
- params.put("row_count", String.valueOf(count));
- params.put("ndv", String.valueOf(ndv));
- params.put("null_count", String.valueOf(numNulls));
- params.put("min", StatisticsUtil.quote(min));
- params.put("max", StatisticsUtil.quote(max));
- params.put("data_size", String.valueOf(dataSize));
- StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
- String sql =
stringSubstitutor.replace(ANALYZE_PARTITION_COLUMN_TEMPLATE);
- runQuery(sql);
- }
-
- // Collect the spark analyzed column stats through HMS metadata.
- private void getHmsColumnStats() throws Exception {
- TableStatsMeta tableStatsStatus =
Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
- long count = tableStatsStatus == null ? table.estimatedRowCount() :
tableStatsStatus.rowCount;
-
- Map<String, String> params = buildStatsParams("NULL");
- Map<StatsType, String> statsParams = new HashMap<>();
- statsParams.put(StatsType.NDV, "ndv");
- statsParams.put(StatsType.NUM_NULLS, "null_count");
- statsParams.put(StatsType.MIN_VALUE, "min");
- statsParams.put(StatsType.MAX_VALUE, "max");
- statsParams.put(StatsType.AVG_SIZE, "avg_len");
-
- if (table.fillColumnStatistics(info.colName, statsParams, params)) {
- throw new AnalysisException("some column stats not available");
- }
-
- long dataSize = Long.valueOf(params.get("avg_len")) * count;
- params.put("row_count", String.valueOf(count));
- params.put("data_size", String.valueOf(dataSize));
-
- StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
- String sql =
stringSubstitutor.replace(ANALYZE_PARTITION_COLUMN_TEMPLATE);
- runQuery(sql);
- }
-
- private String updateMinValue(String currentMin, String value) {
- if (currentMin == null) {
- return value;
- }
- if (col.getType().isFixedPointType()) {
- if (Long.parseLong(value) < Long.parseLong(currentMin)) {
- return value;
- } else {
- return currentMin;
- }
- }
- if (col.getType().isFloatingPointType() || col.getType().isDecimalV2()
|| col.getType().isDecimalV3()) {
- if (Double.parseDouble(value) < Double.parseDouble(currentMin)) {
- return value;
- } else {
- return currentMin;
- }
- }
- return value.compareTo(currentMin) < 0 ? value : currentMin;
- }
-
- private String updateMaxValue(String currentMax, String value) {
- if (currentMax == null) {
- return value;
- }
- if (col.getType().isFixedPointType()) {
- if (Long.parseLong(value) > Long.parseLong(currentMax)) {
- return value;
- } else {
- return currentMax;
- }
- }
- if (col.getType().isFloatingPointType() || col.getType().isDecimalV2()
|| col.getType().isDecimalV3()) {
- if (Double.parseDouble(value) > Double.parseDouble(currentMax)) {
- return value;
- } else {
- return currentMax;
- }
- }
- return value.compareTo(currentMax) > 0 ? value : currentMax;
- }
-
- private Map<String, String> buildStatsParams(String partId) {
+ protected Map<String, String> buildStatsParams(String partId) {
Map<String, String> commonParams = new HashMap<>();
String id = StatisticsUtil.constructId(tbl.getId(), -1);
if (partId == null) {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
index fd0a4c82538..9e8be622824 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
@@ -17,12 +17,10 @@
package org.apache.doris.statistics;
-import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Env;
+import org.apache.doris.catalog.external.ExternalTable;
import org.apache.doris.catalog.external.HMSExternalTable;
import org.apache.doris.common.AnalysisException;
-import org.apache.doris.common.FeConstants;
-import org.apache.doris.common.Pair;
import org.apache.doris.datasource.hive.HiveMetaStoreCache;
import org.apache.doris.external.hive.util.HiveUtil;
import org.apache.doris.statistics.util.StatisticsUtil;
@@ -32,64 +30,36 @@ import org.apache.commons.text.StringSubstitutor;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
-import java.util.Collections;
import java.util.HashMap;
-import java.util.List;
import java.util.Map;
-import java.util.Random;
import java.util.Set;
-public class HMSAnalysisTask extends BaseAnalysisTask {
+public class HMSAnalysisTask extends ExternalAnalysisTask {
private static final Logger LOG =
LogManager.getLogger(HMSAnalysisTask.class);
+ private HMSExternalTable hmsExternalTable;
- private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT
ROUND(COUNT(1) * ${scaleFactor}) as rowCount "
- + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleHints}";
- private boolean isTableLevelTask;
- private boolean isPartitionOnly;
- private HMSExternalTable table;
-
+ // for test
public HMSAnalysisTask() {
}
public HMSAnalysisTask(AnalysisInfo info) {
super(info);
- isTableLevelTask = info.externalTableLevelTask;
- isPartitionOnly = info.partitionOnly;
- table = (HMSExternalTable) tbl;
+ hmsExternalTable = (HMSExternalTable) tbl;
}
- public void doExecute() throws Exception {
- if (isTableLevelTask) {
- getTableStats();
- } else {
- getTableColumnStats();
- }
+ private boolean isPartitionColumn() {
+ return hmsExternalTable.getPartitionColumns().stream().anyMatch(c ->
c.getName().equals(col.getName()));
}
// For test
protected void setTable(HMSExternalTable table) {
- this.table = table;
+ setTable((ExternalTable) table);
+ this.hmsExternalTable = table;
}
- /**
- * Get table row count
- */
- private void getTableStats() throws Exception {
- Map<String, String> params = buildStatsParams(null);
- List<ResultRow> columnResult =
- StatisticsUtil.execStatisticQuery(new StringSubstitutor(params)
- .replace(ANALYZE_TABLE_COUNT_TEMPLATE));
- String rowCount = columnResult.get(0).get(0);
- Env.getCurrentEnv().getAnalysisManager()
- .updateTableStatsStatus(
- new TableStatsMeta(Long.parseLong(rowCount), info,
tbl));
- job.rowCountDone(this);
- }
- /**
- * Get column statistics and insert the result to
__internal_schema.column_statistics
- */
- protected void getTableColumnStats() throws Exception {
+ @Override
+ protected void getOrdinaryColumnStats() throws Exception {
if (!info.usingSqlForPartitionColumn) {
try {
if (isPartitionColumn()) {
@@ -102,77 +72,17 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
+ "fallback to normal collection",
isPartitionColumn() ? "partition " : "",
col.getName(), e);
/* retry using sql way! */
- getOrdinaryColumnStats();
+ super.getOrdinaryColumnStats();
}
} else {
- getOrdinaryColumnStats();
+ super.getOrdinaryColumnStats();
}
}
- private boolean isPartitionColumn() {
- return table.getPartitionColumns().stream().anyMatch(c ->
c.getName().equals(col.getName()));
- }
-
- // Get ordinary column stats. Ordinary column means not partition column.
- private void getOrdinaryColumnStats() throws Exception {
- StringBuilder sb = new StringBuilder();
- Map<String, String> params = buildStatsParams("NULL");
- params.put("min", getMinFunction());
- params.put("max", getMaxFunction());
- params.put("dataSizeFunction", getDataSizeFunction(col, false));
- Pair<Double, Long> sampleInfo = getSampleInfo();
- params.put("scaleFactor", String.valueOf(sampleInfo.first));
- StringSubstitutor stringSubstitutor;
- if (tableSample == null) {
- // Do full analyze
- LOG.debug("Will do full collection for column {}", col.getName());
- sb.append(COLLECT_COL_STATISTICS);
- } else {
- // Do sample analyze
- LOG.debug("Will do sample collection for column {}",
col.getName());
- boolean limitFlag = false;
- boolean bucketFlag = false;
- // If sample size is too large, use limit to control the sample
size.
- if (needLimit(sampleInfo.second, sampleInfo.first)) {
- limitFlag = true;
- long columnSize = 0;
- for (Column column : table.getFullSchema()) {
- columnSize += column.getDataType().getSlotSize();
- }
- double targetRows = (double) sampleInfo.second / columnSize;
- // Estimate the new scaleFactor based on the schema.
- if (targetRows > StatisticsUtil.getHugeTableSampleRows()) {
- params.put("limit", "limit " +
StatisticsUtil.getHugeTableSampleRows());
- params.put("scaleFactor",
- String.valueOf(sampleInfo.first * targetRows /
StatisticsUtil.getHugeTableSampleRows()));
- }
- }
- // Single distribution column is not fit for DUJ1 estimator, use
linear estimator.
- Set<String> distributionColumns = tbl.getDistributionColumnNames();
- if (distributionColumns.size() == 1 &&
distributionColumns.contains(col.getName().toLowerCase())) {
- bucketFlag = true;
- sb.append(LINEAR_ANALYZE_TEMPLATE);
- params.put("ndvFunction", "ROUND(NDV(`${colName}`) *
${scaleFactor})");
- params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
- } else {
- sb.append(DUJ1_ANALYZE_TEMPLATE);
- params.put("dataSizeFunction", getDataSizeFunction(col, true));
- params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count)
* ${scaleFactor})"));
- params.put("rowCount", "ROUND(SUM(t1.count) *
${scaleFactor})");
- }
- LOG.info("Sample for column [{}]. Scale factor [{}], "
- + "limited [{}], is distribute column [{}]",
- col.getName(), params.get("scaleFactor"), limitFlag,
bucketFlag);
- }
- stringSubstitutor = new StringSubstitutor(params);
- String sql = stringSubstitutor.replace(sb.toString());
- runQuery(sql);
- }
-
// Collect the partition column stats through HMS metadata.
// Get all the partition values and calculate the stats based on the
values.
private void getPartitionColumnStats() throws Exception {
- Set<String> partitionNames = table.getPartitionNames();
+ Set<String> partitionNames = hmsExternalTable.getPartitionNames();
Set<String> ndvPartValues = Sets.newHashSet();
long numNulls = 0;
long dataSize = 0;
@@ -198,8 +108,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
}
}
// Estimate the row count. This value is inaccurate if the table stats
is empty.
- TableStatsMeta tableStatsStatus =
Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
- long count = tableStatsStatus == null ? table.estimatedRowCount() :
tableStatsStatus.rowCount;
+ TableStatsMeta tableStatsStatus =
Env.getCurrentEnv().getAnalysisManager()
+ .findTableStatsStatus(hmsExternalTable.getId());
+ long count = tableStatsStatus == null ?
hmsExternalTable.estimatedRowCount() : tableStatsStatus.rowCount;
dataSize = dataSize * count / partitionNames.size();
numNulls = numNulls * count / partitionNames.size();
int ndv = ndvPartValues.size();
@@ -218,8 +129,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
// Collect the spark analyzed column stats through HMS metadata.
private void getHmsColumnStats() throws Exception {
- TableStatsMeta tableStatsStatus =
Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId());
- long count = tableStatsStatus == null ? table.estimatedRowCount() :
tableStatsStatus.rowCount;
+ TableStatsMeta tableStatsStatus =
Env.getCurrentEnv().getAnalysisManager()
+ .findTableStatsStatus(hmsExternalTable.getId());
+ long count = tableStatsStatus == null ?
hmsExternalTable.estimatedRowCount() : tableStatsStatus.rowCount;
Map<String, String> params = buildStatsParams("NULL");
Map<StatsType, String> statsParams = new HashMap<>();
@@ -229,7 +141,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
statsParams.put(StatsType.MAX_VALUE, "max");
statsParams.put(StatsType.AVG_SIZE, "avg_len");
- if (table.fillColumnStatistics(info.colName, statsParams, params)) {
+ if (hmsExternalTable.fillColumnStatistics(info.colName, statsParams,
params)) {
throw new AnalysisException("some column stats not available");
}
@@ -283,126 +195,4 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
}
return value.compareTo(currentMax) > 0 ? value : currentMax;
}
-
- private Map<String, String> buildStatsParams(String partId) {
- Map<String, String> commonParams = new HashMap<>();
- String id = StatisticsUtil.constructId(tbl.getId(), -1);
- if (partId == null) {
- commonParams.put("partId", "NULL");
- } else {
- id = StatisticsUtil.constructId(id, partId);
- commonParams.put("partId", "\'" + partId + "\'");
- }
- commonParams.put("internalDB", FeConstants.INTERNAL_DB_NAME);
- commonParams.put("columnStatTbl",
StatisticConstants.STATISTIC_TBL_NAME);
- commonParams.put("id", id);
- commonParams.put("catalogId", String.valueOf(catalog.getId()));
- commonParams.put("dbId", String.valueOf(db.getId()));
- commonParams.put("tblId", String.valueOf(tbl.getId()));
- commonParams.put("indexId", "-1");
- commonParams.put("idxId", "-1");
- commonParams.put("colName", info.colName);
- commonParams.put("colId", info.colName);
- commonParams.put("catalogName", catalog.getName());
- commonParams.put("dbName", db.getFullName());
- commonParams.put("tblName", tbl.getName());
- commonParams.put("sampleHints", getSampleHint());
- commonParams.put("limit", "");
- commonParams.put("scaleFactor", "1");
- if (col != null) {
- commonParams.put("type", col.getType().toString());
- }
- commonParams.put("lastAnalyzeTimeInMs",
String.valueOf(System.currentTimeMillis()));
- return commonParams;
- }
-
- protected String getSampleHint() {
- if (tableSample == null) {
- return "";
- }
- if (tableSample.isPercent()) {
- return String.format("TABLESAMPLE(%d PERCENT)",
tableSample.getSampleValue());
- } else {
- return String.format("TABLESAMPLE(%d ROWS)",
tableSample.getSampleValue());
- }
- }
-
- /**
- * Get the pair of sample scale factor and the file size going to sample.
- * While analyzing, the result of count, null count and data size need to
- * multiply this scale factor to get more accurate result.
- * @return Pair of sample scale factor and the file size going to sample.
- */
- protected Pair<Double, Long> getSampleInfo() {
- if (tableSample == null) {
- return Pair.of(1.0, 0L);
- }
- long target;
- // Get list of all files' size in this HMS table.
- List<Long> chunkSizes = table.getChunkSizes();
- Collections.shuffle(chunkSizes, new Random(tableSample.getSeek()));
- long total = 0;
- // Calculate the total size of this HMS table.
- for (long size : chunkSizes) {
- total += size;
- }
- if (total == 0) {
- return Pair.of(1.0, 0L);
- }
- // Calculate the sample target size for percent and rows sample.
- if (tableSample.isPercent()) {
- target = total * tableSample.getSampleValue() / 100;
- } else {
- int columnSize = 0;
- for (Column column : table.getFullSchema()) {
- columnSize += column.getDataType().getSlotSize();
- }
- target = columnSize * tableSample.getSampleValue();
- }
- // Calculate the actual sample size (cumulate).
- long cumulate = 0;
- for (long size : chunkSizes) {
- cumulate += size;
- if (cumulate >= target) {
- break;
- }
- }
- return Pair.of(Math.max(((double) total) / cumulate, 1), cumulate);
- }
-
- @Override
- protected void afterExecution() {
- // Table level task doesn't need to sync any value to sync stats, it
stores the value in metadata.
- // Partition only task doesn't need to refresh cached.
- if (isTableLevelTask || isPartitionOnly) {
- return;
- }
- Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(),
-1, col.getName());
- }
-
- /**
- * If the size to sample is larger than LIMIT_SIZE (1GB)
- * and is much larger (1.2*) than the size user want to sample,
- * use limit to control the total sample size.
- * @param sizeToRead The file size to sample.
- * @param factor sizeToRead * factor = Table total size.
- * @return True if need to limit.
- */
- protected boolean needLimit(long sizeToRead, double factor) {
- long total = (long) (sizeToRead * factor);
- long target;
- if (tableSample.isPercent()) {
- target = total * tableSample.getSampleValue() / 100;
- } else {
- int columnSize = 0;
- for (Column column : table.getFullSchema()) {
- columnSize += column.getDataType().getSlotSize();
- }
- target = columnSize * tableSample.getSampleValue();
- }
- if (sizeToRead > LIMIT_SIZE && sizeToRead > target * LIMIT_FACTOR) {
- return true;
- }
- return false;
- }
}
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
index 5c8aec3fbf6..6176ec13bd6 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java
@@ -84,10 +84,12 @@ import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;
import org.apache.iceberg.TableScan;
+import org.apache.iceberg.io.CloseableIterable;
import org.apache.iceberg.types.Types;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
+import java.io.IOException;
import java.net.InetSocketAddress;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
@@ -735,8 +737,12 @@ public class StatisticsUtil {
columnStatisticBuilder.setDataSize(0);
columnStatisticBuilder.setAvgSizeByte(0);
columnStatisticBuilder.setNumNulls(0);
- for (FileScanTask task : tableScan.planFiles()) {
- processDataFile(task.file(), task.spec(), colName,
columnStatisticBuilder);
+ try (CloseableIterable<FileScanTask> fileScanTasks =
tableScan.planFiles()) {
+ for (FileScanTask task : fileScanTasks) {
+ processDataFile(task.file(), task.spec(), colName,
columnStatisticBuilder);
+ }
+ } catch (IOException e) {
+ LOG.warn("Error to close FileScanTask.", e);
}
if (columnStatisticBuilder.getCount() > 0) {
columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getDataSize()
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
index fb0a3b3c2ca..e1016864525 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
@@ -252,7 +252,7 @@ public class HMSAnalysisTaskTest {
analysisInfoBuilder.setUsingSqlForPartitionColumn(true);
task.info = analysisInfoBuilder.build();
- task.getTableColumnStats();
+ task.getOrdinaryColumnStats();
}
@@ -309,6 +309,6 @@ public class HMSAnalysisTaskTest {
analysisInfoBuilder.setUsingSqlForPartitionColumn(false);
task.info = analysisInfoBuilder.build();
- task.getTableColumnStats();
+ task.getOrdinaryColumnStats();
}
}
diff --git a/regression-test/conf/regression-conf.groovy
b/regression-test/conf/regression-conf.groovy
index 39c41f7c11a..6d17bd032fd 100644
--- a/regression-test/conf/regression-conf.groovy
+++ b/regression-test/conf/regression-conf.groovy
@@ -191,6 +191,7 @@ extArrowFlightSqlPassword= ""
// iceberg rest catalog config
iceberg_rest_uri_port=18181
+iceberg_minio_port=19001
// If the failure suite num exceeds this config
// all following suite will be skipped to fast quit the run.
diff --git
a/regression-test/data/external_table_p0/iceberg/test_iceberg_statistics.out
b/regression-test/data/external_table_p0/iceberg/test_iceberg_statistics.out
new file mode 100644
index 00000000000..c094d171479
--- /dev/null
+++ b/regression-test/data/external_table_p0/iceberg/test_iceberg_statistics.out
@@ -0,0 +1,39 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !s1 --
+city 1000 4 0 Beijing Shanghai 6973
+col_binary 1000 867 0 0 1111101100100001001 15356
+col_boolean 1000 2 0 0 1 1000
+col_byte 1000 251 0 -128 127 4000
+col_char 1000 963 0 ! zy@notj#fkedb($ 9348
+col_date 1000 3 0 1969-09-21 2969-02-03 4000
+col_decimal 1000 1006 0 4.028284 9999.512216 8000
+col_double 1000 990 0 0.005217837593576302
9.996285421163707 8000
+col_float 1000 995 0 0.013126845 9.99709 4000
+col_integer 1000 999 0 -21468189 2108484 4000
+col_long 1000 996 0 -92193877774291102
92127291905311066 8000
+col_short 1000 985 0 -32554 32525 4000
+col_string 1000 992 0 0 zx70Jyeb6TfQ1YUaIGC 10714
+col_timestamp 1000 4 0 1970-01-01 08:00:01.000001
1970-01-04 08:00:01.000001 8000
+col_timestamp_ntz 1000 4 0 2017-12-01 10:12:55.038194
2017-12-04 10:12:55.038194 8000
+col_varchar 1000 988 0 0 zvnZ6bBxh 10764
+id 1000 1001 0 -99567408 99854631 8000
+
+-- !s2 --
+city 1000 4 0 Beijing Shanghai 6973
+col_binary 1000 867 0 0 1111101100100001001 15356
+col_boolean 1000 2 0 0 1 1000
+col_byte 1000 251 0 -128 127 4000
+col_char 1000 973 0 ! zy@notj#fkedb($ 9324
+col_date 1000 3 0 1969-09-21 2969-02-03 4000
+col_decimal 1000 1006 0 4.028284 9999.512216 8000
+col_double 1000 990 0 0.005217837593576302
9.996285421163707 8000
+col_float 1000 995 0 0.013126845 9.99709 4000
+col_integer 1000 999 0 -21468189 2108484 4000
+col_long 1000 996 0 -92193877774291102
92127291905311066 8000
+col_short 1000 985 0 -32554 32525 4000
+col_string 1000 992 0 0 zx70Jyeb6TfQ1YUaIGC 10714
+col_timestamp 1000 4 0 1970-01-01 08:00:01.000001
1970-01-04 08:00:01.000001 8000
+col_timestamp_ntz 1000 4 0 2017-12-01 10:12:55.038194
2017-12-04 10:12:55.038194 8000
+col_varchar 1000 988 0 0 zvnZ6bBxh 10764
+id 1000 1001 0 -99567408 99854631 8000
+
diff --git
a/regression-test/data/external_table_p0/paimon/test_paimon_statistics.out
b/regression-test/data/external_table_p0/paimon/test_paimon_statistics.out
new file mode 100644
index 00000000000..0f9f20d4782
--- /dev/null
+++ b/regression-test/data/external_table_p0/paimon/test_paimon_statistics.out
@@ -0,0 +1,21 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !s1 --
+c1 2 2 0 1 10 2
+c10 2 2 0 10.1 100.1 16
+c11 2 2 0 11.10 110.10 16
+c12 2 2 0 2020-02-02 2020-03-02 8
+c13 2 2 0 130str 13str 11
+c14 2 2 0 140varchar 14varchar 19
+c15 2 2 0 a b 2
+c16 2 2 0 0 1 2
+c17 2 2 0 aaaa bbbb 8
+c18 2 2 0 2023-08-13 09:32:38.530000 2023-08-14
08:32:52.821000 16
+c2 2 2 0 2 20 2
+c3 2 2 0 3 30 4
+c4 2 2 0 4 40 4
+c5 2 2 0 5 50 8
+c6 2 2 0 6 60 8
+c7 2 2 0 7 70 16
+c8 2 2 0 8 80 16
+c9 2 2 0 9.1 90.1 8
+
diff --git
a/regression-test/data/external_table_p2/hive/test_hive_hudi_statistics.out
b/regression-test/data/external_table_p2/hive/test_hive_hudi_statistics.out
new file mode 100644
index 00000000000..66a36a81afc
--- /dev/null
+++ b/regression-test/data/external_table_p2/hive/test_hive_hudi_statistics.out
@@ -0,0 +1,16 @@
+-- This file is automatically generated. You should know what you did if you
want to edit this
+-- !s1 --
+_hoodie_commit_seqno 4 4 0 20230605145009209_0_1
20230801201335031_1_1 84
+_hoodie_commit_time 4 3 0 20230605145009209
20230801201335031 68
+_hoodie_file_name 4 4 0
65ffc5d9-397a-456e-a735-30f3ad37466f-0
e33d645c-6e2f-41f3-b8d6-f658771bd460-0_1-83-220_20230605145403388.parquet
221
+_hoodie_partition_path 4 3 0
partitionId=2011-11-11/versionId=v_1 partitionId=2021-02-01/versionId=v_4
144
+_hoodie_record_key 4 3 0 rowId:row_1 rowId:row_4
44
+inttolong 4 2 0 0 1 16
+longtoint 4 3 0 1000000 1000004 32
+name 4 3 0 ashin john 15
+partitionid 4 3 0 2011-11-11 2021-02-01 40
+precomb 4 3 0 0 4 32
+rowid 4 3 0 row_1 row_4 20
+tobedeletedstr 4 3 0 toBeDel0 toBeDel4 32
+versionid 4 3 0 v_0 v_4 12
+
diff --git a/regression-test/pipeline/p0/conf/regression-conf.groovy
b/regression-test/pipeline/p0/conf/regression-conf.groovy
index be70ab0f730..b99e21c4e7c 100644
--- a/regression-test/pipeline/p0/conf/regression-conf.groovy
+++ b/regression-test/pipeline/p0/conf/regression-conf.groovy
@@ -96,6 +96,7 @@ kafka_port=19193
// iceberg test config
iceberg_rest_uri_port=18181
+iceberg_minio_port=19001
enableEsTest=false
es_6_port=19200
diff --git
a/regression-test/pipeline/tpch/tpch-sf100/conf/regression-conf.groovy
b/regression-test/pipeline/tpch/tpch-sf100/conf/regression-conf.groovy
index 364a7103fe8..5234ccc4241 100644
--- a/regression-test/pipeline/tpch/tpch-sf100/conf/regression-conf.groovy
+++ b/regression-test/pipeline/tpch/tpch-sf100/conf/regression-conf.groovy
@@ -94,6 +94,7 @@ kafka_port=19193
// iceberg test config
iceberg_rest_uri_port=18181
+iceberg_minio_port=19001
enableEsTest=false
es_6_port=19200
diff --git a/regression-test/plugins/plugins_get_ids_from_proc.groovy
b/regression-test/plugins/plugins_get_ids_from_proc.groovy
new file mode 100644
index 00000000000..74a4d4d2010
--- /dev/null
+++ b/regression-test/plugins/plugins_get_ids_from_proc.groovy
@@ -0,0 +1,62 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.apache.doris.regression.suite.Suite
+
// Resolve a catalog's id by name from `show proc '/catalogs'`.
// Each result row is [id, name, ...]; returns the id as a String,
// or null when no catalog with that name exists.
Suite.metaClass.get_catalog_id = { String catalog_name /* param */ ->
    def rows = sql """show proc '/catalogs'"""
    def hit = rows.find { it[1].equals(catalog_name) }
    String catalog_id = hit == null ? null : hit[0]
    log.info("get catalogid: " + catalog_id)
    return catalog_id
}
+
+
// Resolve a database's id inside the named catalog by walking
// `show proc '/catalogs/<catalog_id>'`. Each row is [id, name, ...];
// returns the id as a String, or null when the database is absent.
Suite.metaClass.get_database_id = { String catalog_name, String db_name /* param */ ->
    def catalog_id = get_catalog_id(catalog_name)
    def rows = sql """show proc '/catalogs/${catalog_id}'"""
    def hit = rows.find { it[1].equals(db_name) }
    String database_id = hit == null ? null : hit[0]
    log.info("get database_id: " + database_id)
    return database_id
}
+
+
// Resolve a table's id by name via `show proc '/catalogs/<cid>/<did>'`.
// Each row is [id, name, ...]; returns the id as a String, or null
// when the table cannot be found under the given catalog/database.
Suite.metaClass.get_table_id = { String catalog_name, String db_name, String tb_name /* param */ ->
    def catalog_id = get_catalog_id(catalog_name)
    def database_id = get_database_id(catalog_name, db_name)
    def rows = sql """show proc '/catalogs/${catalog_id}/${database_id}'"""
    def hit = rows.find { it[1].equals(tb_name) }
    String table_id = hit == null ? null : hit[0]
    log.info("get table_id: " + table_id)
    return table_id
}
diff --git
a/regression-test/suites/external_table_p0/iceberg/test_iceberg_statistics.groovy
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_statistics.groovy
new file mode 100644
index 00000000000..24b27eb70b4
--- /dev/null
+++
b/regression-test/suites/external_table_p0/iceberg/test_iceberg_statistics.groovy
@@ -0,0 +1,57 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
// Verifies that ANALYZE on Iceberg (rest catalog) tables populates
// internal.__internal_schema.column_statistics for both MOR and COW tables.
suite("test_iceberg_statistics", "p0,external,doris,external_docker,external_docker_doris") {
    String enabled = context.config.otherConfigs.get("enableIcebergTest")
    if (enabled != null && enabled.equalsIgnoreCase("true")) {
        String catalog_name = "test_iceberg_rest_catalog"
        try {
            String rest_port = context.config.otherConfigs.get("iceberg_rest_uri_port")
            String minio_port = context.config.otherConfigs.get("iceberg_minio_port")
            String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
            String db_name = "format_v2"

            sql """drop catalog if exists ${catalog_name}"""
            sql """CREATE CATALOG ${catalog_name} PROPERTIES (
                    'type'='iceberg',
                    'iceberg.catalog.type'='rest',
                    'uri' = 'http://${externalEnvIp}:${rest_port}',
                    "s3.access_key" = "admin",
                    "s3.secret_key" = "password",
                    "s3.endpoint" = "http://${externalEnvIp}:${minio_port}",
                    "s3.region" = "us-east-1"
            );"""

            def table_id_mor = get_table_id(catalog_name, db_name, "sample_mor_parquet")
            def table_id_cow = get_table_id(catalog_name, db_name, "sample_cow_parquet")

            // analyze
            sql """use `${catalog_name}`.`${db_name}`"""
            sql """analyze table sample_mor_parquet with sync"""
            sql """analyze table sample_cow_parquet with sync"""

            // select the collected statistics and compare against the .out file
            def s1 = """select col_id,count,ndv,null_count,min,max,data_size_in_bytes from internal.__internal_schema.column_statistics where tbl_id = ${table_id_mor} order by id;"""
            def s2 = """select col_id,count,ndv,null_count,min,max,data_size_in_bytes from internal.__internal_schema.column_statistics where tbl_id = ${table_id_cow} order by id;"""

            qt_s1 s1
            qt_s2 s2
        } finally {
            // was an empty finally block: drop the catalog created above so
            // repeated runs (and later suites) start from a clean state
            sql """drop catalog if exists ${catalog_name}"""
        }
    }
}
+
diff --git
a/regression-test/suites/external_table_p0/paimon/test_paimon_catalog.groovy
b/regression-test/suites/external_table_p0/paimon/test_paimon_catalog.groovy
index 87ea14ad2fd..ce8c9b5e849 100644
--- a/regression-test/suites/external_table_p0/paimon/test_paimon_catalog.groovy
+++ b/regression-test/suites/external_table_p0/paimon/test_paimon_catalog.groovy
@@ -52,6 +52,9 @@ suite("test_paimon_catalog",
"p0,external,doris,external_docker,external_docker_
);
"""
+ sql """drop catalog ${file_ctl_name}""";
+ sql """drop catalog ${hms_ctl_name}""";
+
String enabled = context.config.otherConfigs.get("enablePaimonTest")
if (enabled != null && enabled.equalsIgnoreCase("true")) {
def all = """select * from all_table order by c1;"""
diff --git
a/regression-test/suites/external_table_p0/paimon/test_paimon_statistics.groovy
b/regression-test/suites/external_table_p0/paimon/test_paimon_statistics.groovy
new file mode 100644
index 00000000000..c75e7b797d9
--- /dev/null
+++
b/regression-test/suites/external_table_p0/paimon/test_paimon_statistics.groovy
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
// Verifies that ANALYZE on a Paimon (filesystem catalog) table populates
// internal.__internal_schema.column_statistics.
suite("test_paimon_statistics", "p0,external,doris,external_docker,external_docker_doris") {
    String enabled = context.config.otherConfigs.get("enablePaimonTest")
    if (enabled != null && enabled.equalsIgnoreCase("true")) {
        String catalog_name = "paimon1"
        try {
            String hdfs_port = context.config.otherConfigs.get("hdfs_port")
            String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")

            sql """drop catalog if exists ${catalog_name}"""
            sql """create catalog if not exists ${catalog_name} properties (
                "type" = "paimon",
                "paimon.catalog.type"="filesystem",
                "warehouse" = "hdfs://${externalEnvIp}:${hdfs_port}/user/doris/paimon1"
            );"""

            def table_id = get_table_id(catalog_name, "db1", "all_table")

            // analyze
            sql """use `${catalog_name}`.`db1`"""
            sql """analyze table all_table with sync"""

            // select the collected statistics and compare against the .out file
            def s1 = """select col_id,count,ndv,null_count,min,max,data_size_in_bytes from internal.__internal_schema.column_statistics where tbl_id = ${table_id} order by id;"""

            qt_s1 s1
        } finally {
            // was an empty finally block: drop the catalog created above so
            // repeated runs (and later suites) start from a clean state
            sql """drop catalog if exists ${catalog_name}"""
        }
    }
}
+
diff --git a/regression-test/suites/external_table_p0/test_catalog_ddl.groovy
b/regression-test/suites/external_table_p0/test_catalog_ddl.groovy
index b236567e8bd..a9a67f51853 100644
--- a/regression-test/suites/external_table_p0/test_catalog_ddl.groovy
+++ b/regression-test/suites/external_table_p0/test_catalog_ddl.groovy
@@ -44,4 +44,6 @@ suite("test_catalog_ddl", "p0,external,external_docker") {
result = sql """show create catalog ${catalog1};"""
assertEquals(result.size(), 1)
assertTrue(result[0][1].contains("COMMENT \"alter_comment\""))
+
+ sql """drop catalog ${catalog1}"""
}
diff --git
a/regression-test/suites/external_table_p2/hive/test_hive_hudi_statistics.groovy
b/regression-test/suites/external_table_p2/hive/test_hive_hudi_statistics.groovy
new file mode 100644
index 00000000000..55e5037de45
--- /dev/null
+++
b/regression-test/suites/external_table_p2/hive/test_hive_hudi_statistics.groovy
@@ -0,0 +1,47 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
// Verifies that ANALYZE on a Hudi table (via HMS catalog) populates
// internal.__internal_schema.column_statistics.
suite("test_hive_hudi_statistics", "p2,external,hive,hudi") {
    String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
    if (enabled != null && enabled.equalsIgnoreCase("true")) {
        String extHiveHmsHost = context.config.otherConfigs.get("extHiveHmsHost")
        String extHiveHmsPort = context.config.otherConfigs.get("extHiveHmsPort")
        String catalog_name = "test_hive_hudi_statistics"
        String db_name = "hudi_catalog"

        sql """drop catalog if exists ${catalog_name};"""
        sql """
            create catalog if not exists ${catalog_name} properties (
                'hadoop.username'='hadoop',
                'type'='hms',
                'hive.metastore.uris' = 'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
            );
        """

        def table_id = get_table_id(catalog_name, db_name, "partitioned_mor_rt")

        // analyze
        sql """use `${catalog_name}`.`${db_name}`"""
        sql """analyze table partitioned_mor_rt with sync"""

        // select the collected statistics and compare against the .out file
        def s1 = """select col_id,count,ndv,null_count,min,max,data_size_in_bytes from internal.__internal_schema.column_statistics where tbl_id = ${table_id} order by id;"""

        qt_s1 s1

        // clean up the catalog created above, consistent with the cleanup
        // done by the iceberg/paimon statistics suites
        sql """drop catalog if exists ${catalog_name}"""
    }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]