This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 9a7e8b298a [Improvement](statistics)Show column stats even when error 
occurred (#23703)
9a7e8b298a is described below

commit 9a7e8b298a2bd1bdf80dfa38d23c52fcbfa55cd2
Author: Jibing-Li <[email protected]>
AuthorDate: Fri Sep 1 10:57:37 2023 +0800

    [Improvement](statistics)Show column stats even when error occurred (#23703)
    
    Previously, show column stats skipped any column that hit an error.
    With this PR, when the min or max value fails to deserialize, show
    column stats reports N/A for that value and still shows the remaining
    stats (count, null_count, ndv and so on).
---
 .../apache/doris/statistics/ColumnStatistic.java   | 21 +++++++---
 .../hive/test_hive_statistic.groovy                | 47 ++++++++++++++++++++++
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java 
b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
index 85965e9513..80d33e7c85 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java
@@ -21,6 +21,7 @@ import org.apache.doris.analysis.LiteralExpr;
 import org.apache.doris.catalog.Column;
 import org.apache.doris.catalog.PartitionInfo;
 import org.apache.doris.catalog.Type;
+import org.apache.doris.common.AnalysisException;
 import org.apache.doris.statistics.util.StatisticsUtil;
 
 import com.google.common.base.Preconditions;
@@ -168,21 +169,31 @@ public class ColumnStatistic {
             String min = row.get(10);
             String max = row.get(11);
             if (min != null && !min.equalsIgnoreCase("NULL")) {
-                
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(),
 min));
-                
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), 
min));
+                try {
+                    
columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(),
 min));
+                    
columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), 
min));
+                } catch (AnalysisException e) {
+                    LOG.warn("Failed to deserialize column {} min value {}.", 
col, min, e);
+                    columnStatisticBuilder.setMinValue(Double.MIN_VALUE);
+                }
             } else {
                 columnStatisticBuilder.setMinValue(Double.MIN_VALUE);
             }
             if (max != null && !max.equalsIgnoreCase("NULL")) {
-                
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(),
 max));
-                
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), 
max));
+                try {
+                    
columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(),
 max));
+                    
columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), 
max));
+                } catch (AnalysisException e) {
+                    LOG.warn("Failed to deserialize column {} max value {}.", 
col, max, e);
+                    columnStatisticBuilder.setMaxValue(Double.MAX_VALUE);
+                }
             } else {
                 columnStatisticBuilder.setMaxValue(Double.MAX_VALUE);
             }
             columnStatisticBuilder.setUpdatedTime(row.get(13));
             return columnStatisticBuilder.build();
         } catch (Exception e) {
-            LOG.warn("Failed to deserialize column statistics, column not 
exists", e);
+            LOG.warn("Failed to deserialize column statistics.", e);
             return ColumnStatistic.UNKNOWN;
         }
     }
diff --git 
a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy 
b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
index 2366267a27..85c8326382 100644
--- a/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
+++ b/regression-test/suites/external_table_p2/hive/test_hive_statistic.groovy
@@ -243,6 +243,53 @@ suite("test_hive_statistic", 
"p2,external,hive,external_remote,external_remote_h
         sql """drop stats statistics"""
         result = sql """show column cached stats statistics"""
         assertTrue(result.size() == 0)
+
+        sql """use multi_catalog"""
+        sql """analyze table logs1_parquet (log_time) with sync"""
+        def ctlId
+        def dbId
+        def tblId
+        result = sql """show proc '/catalogs'"""
+
+        for (int i = 0; i < result.size(); i++) {
+            if (result[i][1] == catalog_name) {
+                ctlId = result[i][0]
+            }
+        }
+        result = sql """show proc '/catalogs/$ctlId'"""
+        for (int i = 0; i < result.size(); i++) {
+            if (result[i][1] == 'multi_catalog') {
+                dbId = result[i][0]
+            }
+        }
+        result = sql """show proc '/catalogs/$ctlId/$dbId'"""
+        for (int i = 0; i < result.size(); i++) {
+            if (result[i][1] == 'logs1_parquet') {
+                tblId = result[i][0]
+            }
+        }
+
+        result = sql """select * from 
internal.__internal_schema.column_statistics where id = 
'${tblId}--1-log_time'"""
+        assertTrue(result.size() == 1)
+        def id = result[0][0]
+        def catalog_id = result[0][1]
+        def db_id = result[0][2]
+        def tbl_id = result[0][3]
+        def idx_id = result[0][4]
+        def col_id = result[0][5]
+        def count = result[0][7]
+        def ndv = result[0][8]
+        def null_count = result[0][9]
+        def data_size_in_bytes = result[0][12]
+        def update_time = result[0][13]
+
+        sql """insert into internal.__internal_schema.column_statistics values 
('$id', '$catalog_id', '$db_id', '$tbl_id', '$idx_id', '$col_id', NULL, $count, 
$ndv, $null_count, '', '', '$data_size_in_bytes', '$update_time')"""
+
+        result = sql """show column stats logs1_parquet (log_time)"""
+        assertTrue(result.size() == 1)
+        assertTrue(result[0][6] == "N/A")
+        assertTrue(result[0][7] == "N/A")
+        sql """drop catalog ${catalog_name}"""
     }
 }
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to