This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 1d3c2e42da1 [fix](stats)min and max return NaN when table is empty
(#27863)
1d3c2e42da1 is described below
commit 1d3c2e42da126db1264bbb576529f8684daca251
Author: Jibing-Li <[email protected]>
AuthorDate: Fri Dec 1 15:42:36 2023 +0800
[fix](stats)min and max return NaN when table is empty (#27863)
fix analyze empty table and min/max null value bug:
1. Skip empty analyze task for sample analyze task. (Full analyze task
already skipped).
2. Check sample rows is not 0 before calculate the scale factor.
3. Remove ' in sql template after remove base64 encoding for min/max value.
backport https://github.com/apache/doris/pull/27862
---
.../apache/doris/statistics/BaseAnalysisTask.java | 10 +++----
.../apache/doris/statistics/HMSAnalysisTask.java | 11 +++++---
.../apache/doris/statistics/JdbcAnalysisTask.java | 2 +-
.../apache/doris/statistics/OlapAnalysisTask.java | 32 ++++++++++++----------
.../org/apache/doris/statistics/AnalyzeTest.java | 2 +-
.../doris/statistics/HMSAnalysisTaskTest.java | 22 +++++++++++++++
.../doris/statistics/OlapAnalysisTaskTest.java | 9 ++----
7 files changed, 56 insertions(+), 32 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index 4c0e07ce6cb..449a6fc15e5 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -92,8 +92,8 @@ public abstract class BaseAnalysisTask {
+ "${rowCount} AS `row_count`, "
+ "${ndvFunction} as `ndv`, "
+ "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0)
* ${scaleFactor} as `null_count`, "
- + "'${min}' AS `min`, "
- + "'${max}' AS `max`, "
+ + "${min} AS `min`, "
+ + "${max} AS `max`, "
+ "${dataSizeFunction} * ${scaleFactor} AS `data_size`, "
+ "NOW() "
+ "FROM ( "
@@ -115,8 +115,8 @@ public abstract class BaseAnalysisTask {
+ "${row_count} AS `row_count`, "
+ "${ndv} AS `ndv`, "
+ "${null_count} AS `null_count`, "
- + "'${min}' AS `min`, "
- + "'${max}' AS `max`, "
+ + "${min} AS `min`, "
+ + "${max} AS `max`, "
+ "${data_size} AS `data_size`, "
+ "NOW() ";
@@ -311,7 +311,7 @@ public abstract class BaseAnalysisTask {
this.job = job;
}
- protected void runQuery(String sql, boolean needEncode) {
+ protected void runQuery(String sql) {
long startTime = System.currentTimeMillis();
try (AutoCloseConnectContext a =
StatisticsUtil.buildConnectContext()) {
stmtExecutor = new StmtExecutor(a.connectContext, sql);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
index 7bd540de2c7..efd99d1eca9 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java
@@ -159,7 +159,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
}
stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
- runQuery(sql, true);
+ runQuery(sql);
}
// Collect the partition column stats through HMS metadata.
@@ -201,12 +201,12 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
params.put("row_count", String.valueOf(count));
params.put("ndv", String.valueOf(ndv));
params.put("null_count", String.valueOf(numNulls));
- params.put("min", min);
- params.put("max", max);
+ params.put("min", StatisticsUtil.quote(min));
+ params.put("max", StatisticsUtil.quote(max));
params.put("data_size", String.valueOf(dataSize));
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql =
stringSubstitutor.replace(ANALYZE_PARTITION_COLUMN_TEMPLATE);
- runQuery(sql, true);
+ runQuery(sql);
}
private String updateMinValue(String currentMin, String value) {
@@ -313,6 +313,9 @@ public class HMSAnalysisTask extends BaseAnalysisTask {
for (long size : chunkSizes) {
total += size;
}
+ if (total == 0) {
+ return Pair.of(1.0, 0L);
+ }
// Calculate the sample target size for percent and rows sample.
if (tableSample.isPercent()) {
target = total * tableSample.getSampleValue() / 100;
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java
index e2e83aa8fa6..50c437fa8f9 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java
@@ -110,7 +110,7 @@ public class JdbcAnalysisTask extends BaseAnalysisTask {
params.put("dataSizeFunction", getDataSizeFunction(col, false));
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String sql = stringSubstitutor.replace(sb.toString());
- runQuery(sql, true);
+ runQuery(sql);
}
private Map<String, String> buildTableStatsParams(String partId) {
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
index 45ccdbb0d68..bf144b1a116 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java
@@ -59,7 +59,13 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
}
public void doExecute() throws Exception {
-
+ Set<String> partitionNames = info.colToPartitions.get(info.colName);
+ if (partitionNames.isEmpty()) {
+ LOG.debug("Skip empty empty partition task for column {} in
{}.{}.{}",
+ info.catalogId, info.dbId, info.tblId, info.colName);
+ job.appendBuf(this, Collections.emptyList());
+ return;
+ }
if (tableSample != null) {
doSample();
} else {
@@ -113,24 +119,25 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
params.put("scaleFactor", String.valueOf(scaleFactor));
params.put("sampleHints", tabletStr.isEmpty() ? "" :
String.format("TABLET(%s)", tabletStr));
params.put("ndvFunction",
getNdvFunction(String.valueOf(rowCount)));
- params.put("min", min);
- params.put("max", max);
+ params.put("min", StatisticsUtil.quote(min));
+ params.put("max", StatisticsUtil.quote(max));
params.put("rowCount", String.valueOf(rowCount));
params.put("type", col.getType().toString());
params.put("limit", "");
if (needLimit()) {
// If the tablets to be sampled are too large, use limit to
control the rows to read, and re-calculate
// the scaleFactor.
- limitFlag = true;
rowsToSample = Math.min(getSampleRows(), pair.second);
- params.put("limit", "limit " + rowsToSample);
- params.put("scaleFactor", String.valueOf(scaleFactor *
(double) pair.second / rowsToSample));
+ // Empty table doesn't need to limit.
+ if (rowsToSample > 0) {
+ limitFlag = true;
+ params.put("limit", "limit " + rowsToSample);
+ params.put("scaleFactor", String.valueOf(scaleFactor *
(double) pair.second / rowsToSample));
+ }
}
StringSubstitutor stringSubstitutor = new
StringSubstitutor(params);
String sql;
if (useLinearAnalyzeTemplate()) {
- params.put("min", StatisticsUtil.quote(min));
- params.put("max", StatisticsUtil.quote(max));
// For single unique key, use count as ndv.
if (isSingleUniqueKey()) {
params.put("ndvFunction", String.valueOf(rowCount));
@@ -148,7 +155,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
col.getName(), params.get("rowCount"), rowsToSample,
params.get("scaleFactor"),
limitFlag, tbl.isDistributionColumn(col.getName()),
tbl.isPartitionColumn(col.getName()), col.isKey(),
isSingleUniqueKey());
- runQuery(sql, false);
+ runQuery(sql);
}
}
@@ -169,11 +176,6 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
*/
protected void doFull() throws Exception {
LOG.debug("Will do full collection for column {}", col.getName());
- Set<String> partitionNames = info.colToPartitions.get(info.colName);
- if (partitionNames.isEmpty()) {
- job.appendBuf(this, Collections.emptyList());
- return;
- }
Map<String, String> params = new HashMap<>();
params.put("internalDB", FeConstants.INTERNAL_DB_NAME);
params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME);
@@ -189,7 +191,7 @@ public class OlapAnalysisTask extends BaseAnalysisTask {
params.put("tblName", String.valueOf(tbl.getName()));
StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
String collectColStats =
stringSubstitutor.replace(COLLECT_COL_STATISTICS);
- runQuery(collectColStats, true);
+ runQuery(collectColStats);
}
// Get sample tablets id and scale up scaleFactor
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java
index 1487d192466..9cebb3283d5 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalyzeTest.java
@@ -157,7 +157,7 @@ public class AnalyzeTest extends TestWithFeService {
new MockUp<BaseAnalysisTask>() {
@Mock
- protected void runQuery(String sql, boolean needEncode) {}
+ protected void runQuery(String sql) {}
};
HashMap<String, Set<String>> colToPartitions = Maps.newHashMap();
colToPartitions.put("col1", Collections.singleton("t1"));
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
index 24a74053bb6..a569a5cb06d 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/HMSAnalysisTaskTest.java
@@ -21,6 +21,7 @@ import org.apache.doris.analysis.TableSample;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.PrimitiveType;
import org.apache.doris.catalog.external.HMSExternalTable;
+import org.apache.doris.common.Pair;
import org.apache.doris.statistics.util.StatisticsUtil;
import com.google.common.collect.Lists;
@@ -138,4 +139,25 @@ public class HMSAnalysisTaskTest {
Assertions.assertEquals(1000, tableSample.getSampleValue());
}
+ @Test
+ public void testGetSampleInfo(@Mocked HMSExternalTable tableIf)
+ throws Exception {
+ new MockUp<HMSExternalTable>() {
+ @Mock
+ public List<Long> getChunkSizes() {
+ return Lists.newArrayList();
+ }
+ };
+ HMSAnalysisTask task = new HMSAnalysisTask();
+ task.setTable(tableIf);
+ task.tableSample = null;
+ Pair<Double, Long> info1 = task.getSampleInfo();
+ Assertions.assertEquals(1.0, info1.first);
+ Assertions.assertEquals(0, info1.second);
+ task.tableSample = new TableSample(false, 100L);
+ Pair<Double, Long> info2 = task.getSampleInfo();
+ Assertions.assertEquals(1.0, info2.first);
+ Assertions.assertEquals(0, info2.second);
+ }
+
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
index 7b7894e54b4..ed9122f70b6 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java
@@ -149,8 +149,7 @@ public class OlapAnalysisTaskTest {
}
@Mock
- public void runQuery(String sql, boolean needEncode) {
- Assertions.assertFalse(needEncode);
+ public void runQuery(String sql) {
Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1',
'-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS
`row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) -
SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) *
SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL,
`t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' A [...]
return;
}
@@ -216,8 +215,7 @@ public class OlapAnalysisTaskTest {
}
@Mock
- public void runQuery(String sql, boolean needEncode) {
- Assertions.assertFalse(needEncode);
+ public void runQuery(String sql) {
Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-',
'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1
AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`,
ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS
NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, '1' AS `min`, '2' AS `max`,
SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM
`catalogName`.`${dbName}`.`${tblName}` limit 100", sql);
return;
}
@@ -290,8 +288,7 @@ public class OlapAnalysisTaskTest {
}
@Mock
- public void runQuery(String sql, boolean needEncode) {
- Assertions.assertFalse(needEncode);
+ public void runQuery(String sql) {
Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1',
'-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS
`tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS
`row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) -
SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) *
SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL,
`t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' A [...]
return;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]