This is an automated email from the ASF dual-hosted git repository.

lijibing pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.0 by this push:
     new cd689b6c32a Disable fetching min/max column stats through HMS, because 
the values may be inaccurate and misleading. (#35124) (#35293)
cd689b6c32a is described below

commit cd689b6c32ae469e89406de08562e991c331dc28
Author: Jibing-Li <[email protected]>
AuthorDate: Thu May 23 18:58:13 2024 +0800

    Disable fetching min/max column stats through HMS, because the values may 
be inaccurate and misleading. (#35124) (#35293)
---
 .../doris/catalog/external/HMSExternalTable.java   | 63 +++-------------------
 .../hive/test_hive_statistics_from_hms.groovy      | 52 ++++++++++--------
 2 files changed, 38 insertions(+), 77 deletions(-)

diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
index a268916d33d..e8a45c66cfb 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java
@@ -47,7 +47,6 @@ import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
 import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
 import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
-import org.apache.hadoop.hive.metastore.api.Decimal;
 import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData;
 import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
 import org.apache.hadoop.hive.metastore.api.FieldSchema;
@@ -61,9 +60,6 @@ import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 
 import java.io.IOException;
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.time.LocalDate;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
@@ -555,8 +551,11 @@ public class HMSExternalTable extends ExternalTable {
             return Optional.empty();
         }
         Map<String, String> parameters = remoteTable.getParameters();
+        if (!parameters.containsKey(NUM_ROWS) || 
Long.parseLong(parameters.get(NUM_ROWS)) == 0) {
+            return Optional.empty();
+        }
         ColumnStatisticBuilder columnStatisticBuilder = new 
ColumnStatisticBuilder();
-        double count = parameters.containsKey(NUM_ROWS) ? 
Double.parseDouble(parameters.get(NUM_ROWS)) : 0;
+        long count = Long.parseLong(parameters.get(NUM_ROWS));
         columnStatisticBuilder.setCount(count);
         // The tableStats length is at most 1.
         for (ColumnStatisticsObj tableStat : tableStats) {
@@ -575,12 +574,10 @@ public class HMSExternalTable extends ExternalTable {
         return Optional.of(columnStatisticBuilder.build());
     }
 
-    private void setStatData(Column col, ColumnStatisticsData data, 
ColumnStatisticBuilder builder, double count)
+    private void setStatData(Column col, ColumnStatisticsData data, 
ColumnStatisticBuilder builder, long count)
             throws AnalysisException {
         long ndv = 0;
         long nulls = 0;
-        String min = "";
-        String max = "";
         double colSize = 0;
         if (!data.isSetStringStats()) {
             colSize = count * col.getType().getSlotSize();
@@ -590,8 +587,6 @@ public class HMSExternalTable extends ExternalTable {
             LongColumnStatsData longStats = data.getLongStats();
             ndv = longStats.getNumDVs();
             nulls = longStats.getNumNulls();
-            min = String.valueOf(longStats.getLowValue());
-            max = String.valueOf(longStats.getHighValue());
         } else if (data.isSetStringStats()) {
             StringColumnStatsData stringStats = data.getStringStats();
             ndv = stringStats.getNumDVs();
@@ -602,65 +597,23 @@ public class HMSExternalTable extends ExternalTable {
             DecimalColumnStatsData decimalStats = data.getDecimalStats();
             ndv = decimalStats.getNumDVs();
             nulls = decimalStats.getNumNulls();
-            if (decimalStats.isSetLowValue()) {
-                Decimal lowValue = decimalStats.getLowValue();
-                if (lowValue != null) {
-                    BigDecimal lowDecimal = new BigDecimal(new 
BigInteger(lowValue.getUnscaled()), lowValue.getScale());
-                    min = lowDecimal.toString();
-                }
-            }
-            if (decimalStats.isSetHighValue()) {
-                Decimal highValue = decimalStats.getHighValue();
-                if (highValue != null) {
-                    BigDecimal highDecimal =
-                            new BigDecimal(new 
BigInteger(highValue.getUnscaled()), highValue.getScale());
-                    max = highDecimal.toString();
-                }
-            }
         } else if (data.isSetDoubleStats()) {
             DoubleColumnStatsData doubleStats = data.getDoubleStats();
             ndv = doubleStats.getNumDVs();
             nulls = doubleStats.getNumNulls();
-            min = String.valueOf(doubleStats.getLowValue());
-            max = String.valueOf(doubleStats.getHighValue());
         } else if (data.isSetDateStats()) {
             DateColumnStatsData dateStats = data.getDateStats();
             ndv = dateStats.getNumDVs();
             nulls = dateStats.getNumNulls();
-            if (dateStats.isSetLowValue()) {
-                org.apache.hadoop.hive.metastore.api.Date lowValue = 
dateStats.getLowValue();
-                if (lowValue != null) {
-                    LocalDate lowDate = 
LocalDate.ofEpochDay(lowValue.getDaysSinceEpoch());
-                    min = lowDate.toString();
-                }
-            }
-            if (dateStats.isSetHighValue()) {
-                org.apache.hadoop.hive.metastore.api.Date highValue = 
dateStats.getHighValue();
-                if (highValue != null) {
-                    LocalDate highDate = 
LocalDate.ofEpochDay(highValue.getDaysSinceEpoch());
-                    max = highDate.toString();
-                }
-            }
         } else {
-            LOG.debug(String.format("Not suitable data type for column %s", 
col.getName()));
-            throw new RuntimeException("Not supported data type.");
+            LOG.warn(String.format("Not suitable data type for column %s", 
col.getName()));
         }
         builder.setNdv(ndv);
         builder.setNumNulls(nulls);
         builder.setDataSize(colSize);
         builder.setAvgSizeByte(colSize / count);
-        if (!min.equals("")) {
-            builder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), 
min));
-            builder.setMinExpr(StatisticsUtil.readableValue(col.getType(), 
min));
-        } else {
-            builder.setMinValue(Double.MIN_VALUE);
-        }
-        if (!max.equals("")) {
-            builder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), 
max));
-            builder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), 
max));
-        } else {
-            builder.setMaxValue(Double.MAX_VALUE);
-        }
+        builder.setMinValue(Double.NEGATIVE_INFINITY);
+        builder.setMaxValue(Double.POSITIVE_INFINITY);
     }
 
     public void setEventUpdateTime(long updateTime) {
diff --git 
a/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy
 
b/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy
index c3c671bb035..3a067fa42f9 100644
--- 
a/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy
+++ 
b/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy
@@ -55,8 +55,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "2.400486E7")
         assertTrue(result[0][6] == "4.0")
-        assertTrue(result[0][7] == "\'1992-01-04\'")
-        assertTrue(result[0][8] == "\'1998-12-31\'")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_tax)"""
         assertTrue(result.size() == 1)
@@ -66,8 +66,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "4.800972E7")
         assertTrue(result[0][6] == "8.0")
-        assertTrue(result[0][7] == "0")
-        assertTrue(result[0][8] == "0.08")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_shipmode)"""
         assertTrue(result.size() == 1)
@@ -77,6 +77,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "2.5717007E7")
         assertTrue(result[0][6] == "4.285300060071169")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_suppkey)"""
         assertTrue(result.size() == 1)
@@ -86,8 +88,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "2.400486E7")
         assertTrue(result[0][6] == "4.0")
-        assertTrue(result[0][7] == "1")
-        assertTrue(result[0][8] == "7")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_shipdate)"""
         assertTrue(result.size() == 1)
@@ -97,8 +99,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "2.400486E7")
         assertTrue(result[0][6] == "4.0")
-        assertTrue(result[0][7] == "\'1992-01-02\'")
-        assertTrue(result[0][8] == "\'1998-12-01\'")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_commitdate)"""
         assertTrue(result.size() == 1)
@@ -108,8 +110,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "2.400486E7")
         assertTrue(result[0][6] == "4.0")
-        assertTrue(result[0][7] == "\'1992-01-31\'")
-        assertTrue(result[0][8] == "\'1998-10-31\'")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_partkey)"""
         assertTrue(result.size() == 1)
@@ -119,8 +121,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "2.400486E7")
         assertTrue(result[0][6] == "4.0")
-        assertTrue(result[0][7] == "1")
-        assertTrue(result[0][8] == "10000")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_orderkey)"""
         assertTrue(result.size() == 1)
@@ -130,8 +132,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "2.400486E7")
         assertTrue(result[0][6] == "4.0")
-        assertTrue(result[0][7] == "1")
-        assertTrue(result[0][8] == "6000000")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_quantity)"""
         assertTrue(result.size() == 1)
@@ -141,8 +143,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "4.800972E7")
         assertTrue(result[0][6] == "8.0")
-        assertTrue(result[0][7] == "1")
-        assertTrue(result[0][8] == "50")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_linestatus)"""
         assertTrue(result.size() == 1)
@@ -152,6 +154,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "6001215.0")
         assertTrue(result[0][6] == "1.0")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_comment)"""
         assertTrue(result.size() == 1)
@@ -161,6 +165,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "1.5899739E8")
         assertTrue(result[0][6] == "26.494199924515286")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_extendedprice)"""
         assertTrue(result.size() == 1)
@@ -170,8 +176,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "4.800972E7")
         assertTrue(result[0][6] == "8.0")
-        assertTrue(result[0][7] == "901")
-        assertTrue(result[0][8] == "104949.5")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_linenumber)"""
         assertTrue(result.size() == 1)
@@ -181,8 +187,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "2.400486E7")
         assertTrue(result[0][6] == "4.0")
-        assertTrue(result[0][7] == "1")
-        assertTrue(result[0][8] == "200000")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_discount)"""
         assertTrue(result.size() == 1)
@@ -192,8 +198,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "4.800972E7")
         assertTrue(result[0][6] == "8.0")
-        assertTrue(result[0][7] == "0")
-        assertTrue(result[0][8] == "0.1")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         result = sql """show column cached stats lineitem (l_shipinstruct)"""
         assertTrue(result.size() == 1)
@@ -203,6 +209,8 @@ suite("test_hive_statistics_from_hms", 
"p2,external,hive,external_remote,externa
         assertTrue(result[0][4] == "0.0")
         assertTrue(result[0][5] == "7.2006178E7")
         assertTrue(result[0][6] == "11.998599950176756")
+        assertTrue(result[0][7] == "N/A")
+        assertTrue(result[0][8] == "N/A")
 
         for (int i = 0; i < 10; i++) {
             result = sql """show table stats lineitem"""


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to