This is an automated email from the ASF dual-hosted git repository.
lijibing pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 70b5a83d65c [fix](statistics)Skip analyze if the collected info is
invalid. (#42028) (#42086)
70b5a83d65c is described below
commit 70b5a83d65c6cfb99d16ce69f485168497bfc12e
Author: Jibing-Li <[email protected]>
AuthorDate: Fri Oct 18 22:24:50 2024 +0800
[fix](statistics)Skip analyze if the collected info is invalid. (#42028)
(#42086)
backport: https://github.com/apache/doris/pull/42028
---
.../apache/doris/statistics/BaseAnalysisTask.java | 5 ++
.../org/apache/doris/statistics/ColStatsData.java | 21 ++++++
.../doris/statistics/BaseAnalysisTaskTest.java | 45 +++++++++++
.../apache/doris/statistics/ColStatsDataTest.java | 87 ++++++++++++++++++++++
4 files changed, 158 insertions(+)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
index 6ab65677e7a..f6553b7c485 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java
@@ -293,6 +293,11 @@ public abstract class BaseAnalysisTask {
try (AutoCloseConnectContext a =
StatisticsUtil.buildConnectContext()) {
stmtExecutor = new StmtExecutor(a.connectContext, sql);
ColStatsData colStatsData = new
ColStatsData(stmtExecutor.executeInternalQuery().get(0));
+ if (!colStatsData.isValid()) {
+ String message = String.format("ColStatsData is invalid, skip
analyzing. %s", colStatsData.toSQL(true));
+ LOG.warn(message);
+ throw new RuntimeException(message);
+ }
// Update index row count after analyze.
if (this instanceof OlapAnalysisTask) {
AnalysisInfo jobInfo =
Env.getCurrentEnv().getAnalysisManager().findJobInfo(job.getJobInfo().jobId);
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
index 7cf75462fee..bb826399458 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java
@@ -186,4 +186,25 @@ public class ColStatsData {
return ColumnStatistic.UNKNOWN;
}
}
+
+ public boolean isNull(String value) {
+ // Checking "NULL" as null is a historical bug which treat literal
value "NULL" as null. Will fix it soon.
+ return value == null || value.equalsIgnoreCase("NULL");
+ }
+
+ public boolean isValid() {
+ if (ndv > 10 * count) {
+ LOG.debug("Ndv {} is much larger than count {}", ndv, count);
+ return false;
+ }
+ if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit))) {
+ LOG.debug("Ndv is 0 but min or max exists");
+ return false;
+ }
+ if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) &&
(nullCount == 0 || count > nullCount * 10)) {
+ LOG.debug("count {} not 0, ndv is 0, min and max are all null,
null count {} is too small", count, count);
+ return false;
+ }
+ return true;
+ }
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
index 86ccfff26e0..de2cf522366 100644
---
a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
+++
b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java
@@ -20,10 +20,16 @@ package org.apache.doris.statistics;
import org.apache.doris.analysis.TableSample;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.PrimitiveType;
+import org.apache.doris.qe.StmtExecutor;
+import com.google.common.collect.Lists;
+import mockit.Mock;
+import mockit.MockUp;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
+import java.util.List;
+
public class BaseAnalysisTaskTest {
@Test
@@ -60,4 +66,43 @@ public class BaseAnalysisTaskTest {
System.out.println(ndvFunction);
}
+ @Test
+ public void testInvalidColStats() {
+ List<String> values = Lists.newArrayList();
+ values.add("id");
+ values.add("10000");
+ values.add("20000");
+ values.add("30000");
+ values.add("0");
+ values.add("col");
+ values.add(null);
+ values.add("100"); // count
+ values.add("1100"); // ndv
+ values.add("300"); // null
+ values.add("min");
+ values.add("max");
+ values.add("400");
+ values.add("500");
+ ResultRow row = new ResultRow(values);
+ List<ResultRow> result = Lists.newArrayList();
+ result.add(row);
+
+ new MockUp<StmtExecutor>() {
+ @Mock
+ public List<ResultRow> executeInternalQuery() {
+ return result;
+ }
+ };
+ BaseAnalysisTask task = new OlapAnalysisTask();
+ try {
+ task.runQuery("test");
+ } catch (Exception e) {
+ Assertions.assertEquals(e.getMessage(),
+ "ColStatsData is invalid, skip analyzing. "
+ +
"('id',10000,20000,30000,0,'col',null,100,1100,300,'min','max',400,'500')");
+ return;
+ }
+ Assertions.fail();
+ }
+
}
diff --git
a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
index 8743105a644..a7c84370153 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/ColStatsDataTest.java
@@ -164,4 +164,91 @@ public class ColStatsDataTest {
Assertions.assertEquals(400, columnStatistic.dataSize);
Assertions.assertEquals("500", columnStatistic.updatedTime);
}
+
+ @Test
+ public void testIsNull() {
+ ColStatsData stats = new ColStatsData();
+ Assertions.assertTrue(stats.isNull(null));
+ Assertions.assertTrue(stats.isNull("null"));
+ Assertions.assertTrue(stats.isNull("NuLl"));
+ Assertions.assertFalse(stats.isNull(""));
+ Assertions.assertFalse(stats.isNull(" "));
+ Assertions.assertFalse(stats.isNull("123"));
+ }
+
+ @Test
+ public void testIsValid() {
+ List<String> values = Lists.newArrayList();
+ values.add("id");
+ values.add("10000");
+ values.add("20000");
+ values.add("30000");
+ values.add("0");
+ values.add("col");
+ values.add(null);
+ values.add("100"); // count
+ values.add("1100"); // ndv
+ values.add("300"); // null
+ values.add("min");
+ values.add("max");
+ values.add("400");
+ values.add("500");
+ ResultRow row = new ResultRow(values);
+ ColStatsData data = new ColStatsData(row);
+ Assertions.assertFalse(data.isValid());
+
+ // Set count = 200
+ values.set(7, "200");
+ row = new ResultRow(values);
+ data = new ColStatsData(row);
+ Assertions.assertTrue(data.isValid());
+
+ // Set ndv = 0, min/max is not null
+ values.set(8, "0");
+ row = new ResultRow(values);
+ data = new ColStatsData(row);
+ Assertions.assertFalse(data.isValid());
+
+ // Set min to null, min/max is not null
+ values.set(10, null);
+ row = new ResultRow(values);
+ data = new ColStatsData(row);
+ Assertions.assertFalse(data.isValid());
+
+ // Set max to null, min/max is not null
+ values.set(11, null);
+ row = new ResultRow(values);
+ data = new ColStatsData(row);
+ Assertions.assertTrue(data.isValid());
+
+ // Set min to not null, min/max is not null
+ values.set(10, "min");
+ row = new ResultRow(values);
+ data = new ColStatsData(row);
+ Assertions.assertFalse(data.isValid());
+
+ // Set min and max to null, nullNum = 0
+ values.set(9, "0");
+ values.set(10, "nuLl");
+ values.set(11, null);
+ row = new ResultRow(values);
+ data = new ColStatsData(row);
+ Assertions.assertFalse(data.isValid());
+
+ // nullNum = 19, count = 200
+ values.set(9, "19");
+ row = new ResultRow(values);
+ data = new ColStatsData(row);
+ Assertions.assertFalse(data.isValid());
+
+ // nullNum = 21, count = 200, so count < nullNum * 10
+ values.set(9, "21");
+ row = new ResultRow(values);
+ data = new ColStatsData(row);
+ Assertions.assertTrue(data.isValid());
+
+ // Empty table stats is valid.
+ data = new ColStatsData();
+ Assertions.assertTrue(data.isValid());
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]