This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
new 9a7e8b298a [Improvement](statistics)Show column stats even when error
occurred (#23703)
9a7e8b298a is described below
commit 9a7e8b298a2bd1bdf80dfa38d23c52fcbfa55cd2
Author: Jibing-Li <[email protected]>
AuthorDate: Fri Sep 1 10:57:37 2023 +0800
[Improvement](statistics)Show column stats even when error occurred (#23703)
Previously, show column stats would ignore any column with an error.
In this PR, when a min or max value fails to deserialize, show column stats
uses N/A as the value of min or max and still shows the rest of the stats
(count, null_count, ndv, and so on).
---
.../apache/doris/statistics/ColumnStatistic.java | 21 +++++++---
.../hive/test_hive_statistic.groovy | 47 ++++++++++++++++++++++
2 files changed, 63 insertions(+), 5 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
index 85965e9513..80d33e7c85 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
@@ -21,6 +21,7 @@ import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.PartitionInfo;
import org.apache.doris.catalog.Type;
+import org.apache.doris.common.AnalysisException;
import org.apache.doris.statistics.util.StatisticsUtil;
import com.google.common.base.Preconditions;
@@ -168,21 +169,31 @@ public class ColumnStatistic {
String min = row.get(10);
String max = row.get(11);
if (min != null && !min.equalsIgnoreCase("NULL")) {
-
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(),
min));
-
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(),
min));
+ try {
+
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(),
min));
+
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(),
min));
+ } catch (AnalysisException e) {
+ LOG.warn("Failed to deserialize column {} min value {}.",
col, min, e);
+ columnStatisticBuilder.setMinValue(Double.MIN_VALUE);
+ }
} else {
columnStatisticBuilder.setMinValue(Double.MIN_VALUE);
}
if (max != null && !max.equalsIgnoreCase("NULL")) {
-
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(),
max));
-
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(),
max));
+ try {
+
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(),
max));
+
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(),
max));
+ } catch (AnalysisException e) {
+ LOG.warn("Failed to deserialize column {} max value {}.",
col, max, e);
+ columnStatisticBuilder.setMaxValue(Double.MAX_VALUE);
+ }
} else {
columnStatisticBuilder.setMaxValue(Double.MAX_VALUE);
}
columnStatisticBuilder.setUpdatedTime(row.get(13));
return columnStatisticBuilder.build();
} catch (Exception e) {
- LOG.warn("Failed to deserialize column statistics, column not
exists", e);
+ LOG.warn("Failed to deserialize column statistics.", e);
return ColumnStatistic.UNKNOWN;
}
}
diff --git
a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
index 2366267a27..85c8326382 100644
--- a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
@@ -243,6 +243,53 @@ suite("test_hive_statistic",
"p2,external,hive,external_remote,external_remote_h
sql """drop stats statistics"""
result = sql """show column cached stats statistics"""
assertTrue(result.size() == 0)
+
+ sql """use multi_catalog"""
+ sql """analyze table logs1_parquet (log_time) with sync"""
+ def ctlId
+ def dbId
+ def tblId
+ result = sql """show proc '/catalogs'"""
+
+ for (int i = 0; i < result.size(); i++) {
+ if (result[i][1] == catalog_name) {
+ ctlId = result[i][0]
+ }
+ }
+ result = sql """show proc '/catalogs/$ctlId'"""
+ for (int i = 0; i < result.size(); i++) {
+ if (result[i][1] == 'multi_catalog') {
+ dbId = result[i][0]
+ }
+ }
+ result = sql """show proc '/catalogs/$ctlId/$dbId'"""
+ for (int i = 0; i < result.size(); i++) {
+ if (result[i][1] == 'logs1_parquet') {
+ tblId = result[i][0]
+ }
+ }
+
+ result = sql """select * from
internal.__internal_schema.column_statistics where id =
'${tblId}--1-log_time'"""
+ assertTrue(result.size() == 1)
+ def id = result[0][0]
+ def catalog_id = result[0][1]
+ def db_id = result[0][2]
+ def tbl_id = result[0][3]
+ def idx_id = result[0][4]
+ def col_id = result[0][5]
+ def count = result[0][7]
+ def ndv = result[0][8]
+ def null_count = result[0][9]
+ def data_size_in_bytes = result[0][12]
+ def update_time = result[0][13]
+
+ sql """insert into internal.__internal_schema.column_statistics values
('$id', '$catalog_id', '$db_id', '$tbl_id', '$idx_id', '$col_id', NULL, $count,
$ndv, $null_count, '', '', '$data_size_in_bytes', '$update_time')"""
+
+ result = sql """show column stats logs1_parquet (log_time)"""
+ assertTrue(result.size() == 1)
+ assertTrue(result[0][6] == "N/A")
+ assertTrue(result[0][7] == "N/A")
+ sql """drop catalog ${catalog_name}"""
}
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]