This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 9f44fa16a62 [improvement](statistics)Multi bucket columns using DUJ1 to collect ndv #26950 (#26976)
9f44fa16a62 is described below
commit 9f44fa16a62e7b9ce8dd20b0b3d79c88977f47ec
Author: Jibing-Li <[email protected]>
AuthorDate: Tue Nov 14 20:14:05 2023 +0800
[improvement](statistics)Multi bucket columns using DUJ1 to collect ndv #26950 (#26976)
backport #26950
---
.../java/org/apache/doris/catalog/OlapTable.java | 4 +-
.../java/org/apache/doris/catalog/TableIf.java | 5 ++
.../doris/catalog/external/HMSExternalTable.java | 8 ++-
.../apache/doris/statistics/HMSAnalysisTask.java | 5 +-
.../apache/doris/statistics/OlapAnalysisTask.java | 5 +-
.../doris/statistics/OlapAnalysisTaskTest.java | 70 ++++++++++++++++++++++
6 files changed, 90 insertions(+), 7 deletions(-)
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
index 3161b5a3371..73e0ff7ed95 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java
@@ -88,7 +88,6 @@ import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
-import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Optional;
@@ -770,6 +769,7 @@ public class OlapTable extends Table {
defaultDistributionInfo.markAutoBucket();
}
+ @Override
public Set<String> getDistributionColumnNames() {
Set<String> distributionColumnNames = Sets.newHashSet();
if (defaultDistributionInfo instanceof RandomDistributionInfo) {
@@ -2308,7 +2308,7 @@ public class OlapTable extends Table {
public boolean isDistributionColumn(String columnName) {
Set<String> distributeColumns = getDistributionColumnNames()
.stream().map(String::toLowerCase).collect(Collectors.toSet());
- return distributeColumns.contains(columnName.toLowerCase(Locale.ROOT));
+ return distributeColumns.contains(columnName.toLowerCase());
}
@Override
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java
index f39ce5a13e3..3539d17e269 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java
@@ -27,6 +27,7 @@ import org.apache.doris.statistics.TableStatsMeta;
import org.apache.doris.thrift.TTableDescriptor;
import com.google.common.collect.Lists;
+import com.google.common.collect.Sets;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
@@ -259,5 +260,9 @@ public interface TableIf {
default boolean isPartitionColumn(String columnName) {
return false;
}
+
+ default Set<String> getDistributionColumnNames() {
+ return Sets.newHashSet();
+ }
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
index 96e2d573c61..2729fdf7a95 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
@@ -703,7 +703,13 @@ public class HMSExternalTable extends ExternalTable {
@Override
public boolean isDistributionColumn(String columnName) {
return
getRemoteTable().getSd().getBucketCols().stream().map(String::toLowerCase)
-
.collect(Collectors.toSet()).contains(columnName.toLowerCase(Locale.ROOT));
+ .collect(Collectors.toSet()).contains(columnName.toLowerCase());
+ }
+
+ @Override
+ public Set<String> getDistributionColumnNames() {
+ return
getRemoteTable().getSd().getBucketCols().stream().map(String::toLowerCase)
+ .collect(Collectors.toSet());
}
}
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
index e4555dcd8bc..812bd615a69 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
@@ -140,8 +140,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
String.valueOf(sampleInfo.first * targetRows /
StatisticsUtil.getHugeTableSampleRows()));
}
}
- // Distribution columns don't fit for DUJ1 estimator, use linear estimator.
- if (tbl.isDistributionColumn(col.getName())) {
+ // Single distribution column is not fit for DUJ1 estimator, use linear estimator.
+ Set<String> distributionColumns = tbl.getDistributionColumnNames();
+ if (distributionColumns.size() == 1 &&
distributionColumns.contains(col.getName().toLowerCase())) {
bucketFlag = true;
sb.append(LINEAR_ANALYZE_TEMPLATE);
params.put("rowCount", "ROUND(count(1) * ${scaleFactor})");
diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index 42b43f52a7f..d7037580595 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -129,8 +129,9 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
}
StringSubstitutor stringSubstitutor = new
StringSubstitutor(params);
String sql;
- // Distribution columns don't fit for DUJ1 estimator, use linear estimator.
- if (tbl.isDistributionColumn(col.getName())) {
+ // Single distribution column is not fit for DUJ1 estimator, use linear estimator.
+ Set<String> distributionColumns = tbl.getDistributionColumnNames();
+ if (distributionColumns.size() == 1 &&
distributionColumns.contains(col.getName().toLowerCase())) {
params.put("min", StatisticsUtil.quote(min));
params.put("max", StatisticsUtil.quote(max));
sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE);
diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index 0431e373d48..9437d2d0787 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -256,6 +256,76 @@ public class OlapAnalysisTaskTest {
olapAnalysisTask.doSample();
}
+ @Test
+ public void testManualSampleTwoDistributeKey(@Mocked CatalogIf catalogIf,
@Mocked DatabaseIf databaseIf, @Mocked OlapTable tableIf)
+ throws Exception {
+
+ new Expectations() {
+ {
+ tableIf.getRowCount();
+ result = 500;
+ tableIf.getId();
+ result = 30001;
+ catalogIf.getId();
+ result = 10001;
+ catalogIf.getName();
+ result = "catalogName";
+ databaseIf.getId();
+ result = 20001;
+ }
+ };
+
+ new MockUp<OlapAnalysisTask>() {
+ @Mock
+ public Pair<List<Long>, Long> calcActualSampleTablets() {
+ return Pair.of(Lists.newArrayList(), 100L);
+ }
+
+ @Mock
+ public ResultRow collectBasicStat(AutoCloseConnectContext context)
{
+ List<String> values = Lists.newArrayList();
+ values.add("1");
+ values.add("2");
+ return new ResultRow(values);
+ }
+
+ @Mock
+ public void runQuery(String sql, boolean needEncode) {
+ Assertions.assertFalse(needEncode);
+ Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1',
'-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS
`row_count`, SUM(t1.count) * COUNT(1) / (SUM(t1.count) - SUM(IF(t1.count = 1,
1, 0)) + SUM(IF(t1.count = 1, 1, 0)) * SUM(t1.count) / 500) as `ndv`,
IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.count, 0)), 0) * 5.0 as
`null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LEN [...]
+ return;
+ }
+ };
+
+ new MockUp<StatisticsUtil>() {
+ @Mock
+ public AutoCloseConnectContext buildConnectContext(boolean
scanLimit) {
+ return null;
+ }
+ };
+
+ new MockUp<OlapTable>() {
+ @Mock
+ public Set<String> getDistributionColumnNames() {
+ HashSet<String> cols = Sets.newHashSet();
+ cols.add("test1");
+ cols.add("test2");
+ return cols;
+ }
+ };
+
+ OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask();
+ olapAnalysisTask.col = new Column("test1", PrimitiveType.STRING);
+ olapAnalysisTask.tbl = tableIf;
+ AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder();
+ analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL);
+ olapAnalysisTask.info = analysisInfoBuilder.build();
+ olapAnalysisTask.catalog = catalogIf;
+ olapAnalysisTask.db = databaseIf;
+ olapAnalysisTask.tableSample = new TableSample(false, 100L);
+ olapAnalysisTask.doSample();
+ }
+
@Test
public void testNeedLimitFalse(@Mocked CatalogIf catalogIf, @Mocked
DatabaseIf databaseIf, @Mocked OlapTable tableIf)
throws Exception {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]