This is an automated email from the ASF dual-hosted git repository.
kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push:
new 7f3a1e8902 [Improvement](statistics)Show column stats even when error
occurred (#23703) (#23985)
7f3a1e8902 is described below
commit 7f3a1e8902ee12348ce90c0fcfa42d1b799966bc
Author: Jibing-Li <[email protected]>
AuthorDate: Wed Sep 6 20:33:42 2023 +0800
[Improvement](statistics)Show column stats even when error occurred
(#23703) (#23985)
Previously, show column stats ignored any column that hit an error.
In this PR, when the min or max value fails to deserialize, show column stats
uses N/A as the value of min or max and still shows the rest of the stats (count,
null_count, ndv and so on).
---
.../apache/doris/statistics/ColumnStatistic.java | 25 ++++++++----
.../hive/test_hive_statistic.groovy | 47 ++++++++++++++++++++++
2 files changed, 65 insertions(+), 7 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
index 129a3c0e7a..3f928ee72a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
@@ -21,6 +21,7 @@ import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Env;
import org.apache.doris.catalog.Type;
+import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.DdlException;
import org.apache.doris.statistics.util.InternalQueryResult.ResultRow;
import org.apache.doris.statistics.util.StatisticsUtil;
@@ -196,15 +197,25 @@ public class ColumnStatistic {
}
String min = resultRow.getColumnValue("min");
String max = resultRow.getColumnValue("max");
- if (min != null) {
-
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(),
min));
-
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(),
min));
+ if (min != null && !min.equalsIgnoreCase("NULL")) {
+ try {
+
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(),
min));
+
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(),
min));
+ } catch (AnalysisException e) {
+ LOG.warn("Failed to deserialize column {} min value {}.",
col, min, e);
+ columnStatisticBuilder.setMinValue(Double.MIN_VALUE);
+ }
} else {
columnStatisticBuilder.setMinValue(Double.MIN_VALUE);
}
- if (max != null) {
-
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(),
max));
-
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(),
max));
+ if (max != null && !max.equalsIgnoreCase("NULL")) {
+ try {
+
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(),
max));
+
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(),
max));
+ } catch (AnalysisException e) {
+ LOG.warn("Failed to deserialize column {} max value {}.",
col, max, e);
+ columnStatisticBuilder.setMaxValue(Double.MAX_VALUE);
+ }
} else {
columnStatisticBuilder.setMaxValue(Double.MAX_VALUE);
}
@@ -215,7 +226,7 @@ public class ColumnStatistic {
columnStatisticBuilder.setUpdatedTime(resultRow.getColumnValue("update_time"));
return columnStatisticBuilder.build();
} catch (Exception e) {
- LOG.warn("Failed to deserialize column statistics, column not
exists", e);
+ LOG.warn("Failed to deserialize column statistics.", e);
return ColumnStatistic.UNKNOWN;
}
}
diff --git
a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
index 97745ce1fe..1160a2f8dd 100644
--- a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
@@ -243,6 +243,53 @@ suite("test_hive_statistic", "p2") {
sql """drop stats statistics"""
result = sql """show column cached stats statistics"""
assertTrue(result.size() == 0)
+
+ sql """use multi_catalog"""
+ sql """analyze table logs1_parquet (log_time) with sync"""
+ def ctlId
+ def dbId
+ def tblId
+ result = sql """show proc '/catalogs'"""
+
+ for (int i = 0; i < result.size(); i++) {
+ if (result[i][1] == catalog_name) {
+ ctlId = result[i][0]
+ }
+ }
+ result = sql """show proc '/catalogs/$ctlId'"""
+ for (int i = 0; i < result.size(); i++) {
+ if (result[i][1] == 'multi_catalog') {
+ dbId = result[i][0]
+ }
+ }
+ result = sql """show proc '/catalogs/$ctlId/$dbId'"""
+ for (int i = 0; i < result.size(); i++) {
+ if (result[i][1] == 'logs1_parquet') {
+ tblId = result[i][0]
+ }
+ }
+
+ result = sql """select * from
internal.__internal_schema.column_statistics where id =
'${tblId}--1-log_time'"""
+ assertTrue(result.size() == 1)
+ def id = result[0][0]
+ def catalog_id = result[0][1]
+ def db_id = result[0][2]
+ def tbl_id = result[0][3]
+ def idx_id = result[0][4]
+ def col_id = result[0][5]
+ def count = result[0][7]
+ def ndv = result[0][8]
+ def null_count = result[0][9]
+ def data_size_in_bytes = result[0][12]
+ def update_time = result[0][13]
+
+ sql """insert into internal.__internal_schema.column_statistics values
('$id', '$catalog_id', '$db_id', '$tbl_id', '$idx_id', '$col_id', NULL, $count,
$ndv, $null_count, '', '', '$data_size_in_bytes', '$update_time')"""
+
+ result = sql """show column stats logs1_parquet (log_time)"""
+ assertTrue(result.size() == 1)
+ assertTrue(result[0][6] == "N/A")
+ assertTrue(result[0][7] == "N/A")
+ sql """drop catalog ${catalog_name}"""
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]