This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
new 37f1bf317ce [fix](statistics)Disable fetch min/max column stats
through HMS, because the value may be inaccurate and misleading. (#35124) (#35145)
37f1bf317ce is described below
commit 37f1bf317cebc1cef749e95b07a87a8efd92fcf6
Author: Jibing-Li <[email protected]>
AuthorDate: Tue May 21 22:58:12 2024 +0800
[fix](statistics)Disable fetch min/max column stats through HMS, because
the value may be inaccurate and misleading. (#35124) (#35145)
backport #35124
---
.../doris/datasource/hive/HMSExternalTable.java | 65 +++-------------------
.../hive/test_hive_statistics_from_hms.groovy | 52 +++++++++--------
2 files changed, 38 insertions(+), 79 deletions(-)
diff --git
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
index e5624fb58b5..4d3f963aa55 100644
---
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
+++
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
@@ -69,7 +69,6 @@ import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
-import org.apache.hadoop.hive.metastore.api.Decimal;
import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData;
import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
@@ -82,9 +81,6 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.IOException;
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.time.LocalDate;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
@@ -697,8 +693,11 @@ public class HMSExternalTable extends ExternalTable
implements MTMVRelatedTableI
return Optional.empty();
}
Map<String, String> parameters = remoteTable.getParameters();
+ if (!parameters.containsKey(NUM_ROWS) ||
Long.parseLong(parameters.get(NUM_ROWS)) == 0) {
+ return Optional.empty();
+ }
ColumnStatisticBuilder columnStatisticBuilder = new
ColumnStatisticBuilder();
- double count = parameters.containsKey(NUM_ROWS) ?
Double.parseDouble(parameters.get(NUM_ROWS)) : 0;
+ long count = Long.parseLong(parameters.get(NUM_ROWS));
columnStatisticBuilder.setCount(count);
// The tableStats length is at most 1.
for (ColumnStatisticsObj tableStat : tableStats) {
@@ -719,12 +718,10 @@ public class HMSExternalTable extends ExternalTable
implements MTMVRelatedTableI
return Optional.of(columnStatisticBuilder.build());
}
- private void setStatData(Column col, ColumnStatisticsData data,
ColumnStatisticBuilder builder, double count)
+ private void setStatData(Column col, ColumnStatisticsData data,
ColumnStatisticBuilder builder, long count)
throws AnalysisException {
long ndv = 0;
long nulls = 0;
- String min = "";
- String max = "";
double colSize = 0;
if (!data.isSetStringStats()) {
colSize = count * col.getType().getSlotSize();
@@ -734,8 +731,6 @@ public class HMSExternalTable extends ExternalTable
implements MTMVRelatedTableI
LongColumnStatsData longStats = data.getLongStats();
ndv = longStats.getNumDVs();
nulls = longStats.getNumNulls();
- min = String.valueOf(longStats.getLowValue());
- max = String.valueOf(longStats.getHighValue());
} else if (data.isSetStringStats()) {
StringColumnStatsData stringStats = data.getStringStats();
ndv = stringStats.getNumDVs();
@@ -746,67 +741,23 @@ public class HMSExternalTable extends ExternalTable
implements MTMVRelatedTableI
DecimalColumnStatsData decimalStats = data.getDecimalStats();
ndv = decimalStats.getNumDVs();
nulls = decimalStats.getNumNulls();
- if (decimalStats.isSetLowValue()) {
- Decimal lowValue = decimalStats.getLowValue();
- if (lowValue != null) {
- BigDecimal lowDecimal = new BigDecimal(new
BigInteger(lowValue.getUnscaled()), lowValue.getScale());
- min = lowDecimal.toString();
- }
- }
- if (decimalStats.isSetHighValue()) {
- Decimal highValue = decimalStats.getHighValue();
- if (highValue != null) {
- BigDecimal highDecimal =
- new BigDecimal(new
BigInteger(highValue.getUnscaled()), highValue.getScale());
- max = highDecimal.toString();
- }
- }
} else if (data.isSetDoubleStats()) {
DoubleColumnStatsData doubleStats = data.getDoubleStats();
ndv = doubleStats.getNumDVs();
nulls = doubleStats.getNumNulls();
- min = String.valueOf(doubleStats.getLowValue());
- max = String.valueOf(doubleStats.getHighValue());
} else if (data.isSetDateStats()) {
DateColumnStatsData dateStats = data.getDateStats();
ndv = dateStats.getNumDVs();
nulls = dateStats.getNumNulls();
- if (dateStats.isSetLowValue()) {
- org.apache.hadoop.hive.metastore.api.Date lowValue =
dateStats.getLowValue();
- if (lowValue != null) {
- LocalDate lowDate =
LocalDate.ofEpochDay(lowValue.getDaysSinceEpoch());
- min = lowDate.toString();
- }
- }
- if (dateStats.isSetHighValue()) {
- org.apache.hadoop.hive.metastore.api.Date highValue =
dateStats.getHighValue();
- if (highValue != null) {
- LocalDate highDate =
LocalDate.ofEpochDay(highValue.getDaysSinceEpoch());
- max = highDate.toString();
- }
- }
} else {
- if (LOG.isDebugEnabled()) {
- LOG.debug(String.format("Not suitable data type for column
%s", col.getName()));
- }
- throw new RuntimeException("Not supported data type.");
+ LOG.warn(String.format("Not suitable data type for column %s",
col.getName()));
}
builder.setNdv(ndv);
builder.setNumNulls(nulls);
builder.setDataSize(colSize);
builder.setAvgSizeByte(colSize / count);
- if (!min.equals("")) {
- builder.setMinValue(StatisticsUtil.convertToDouble(col.getType(),
min));
- builder.setMinExpr(StatisticsUtil.readableValue(col.getType(),
min));
- } else {
- builder.setMinValue(Double.MIN_VALUE);
- }
- if (!max.equals("")) {
- builder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(),
max));
- builder.setMaxExpr(StatisticsUtil.readableValue(col.getType(),
max));
- } else {
- builder.setMaxValue(Double.MAX_VALUE);
- }
+ builder.setMinValue(Double.NEGATIVE_INFINITY);
+ builder.setMaxValue(Double.POSITIVE_INFINITY);
}
public void setEventUpdateTime(long updateTime) {
diff --git
a/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy
b/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy
index c3c671bb035..3a067fa42f9 100644
---
a/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy
+++
b/regression-test/suites/external_table_p2/hive/test_hive_statistics_from_hms.groovy
@@ -55,8 +55,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
- assertTrue(result[0][7] == "\'1992-01-04\'")
- assertTrue(result[0][8] == "\'1998-12-31\'")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_tax)"""
assertTrue(result.size() == 1)
@@ -66,8 +66,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "4.800972E7")
assertTrue(result[0][6] == "8.0")
- assertTrue(result[0][7] == "0")
- assertTrue(result[0][8] == "0.08")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_shipmode)"""
assertTrue(result.size() == 1)
@@ -77,6 +77,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.5717007E7")
assertTrue(result[0][6] == "4.285300060071169")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_suppkey)"""
assertTrue(result.size() == 1)
@@ -86,8 +88,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
- assertTrue(result[0][7] == "1")
- assertTrue(result[0][8] == "7")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_shipdate)"""
assertTrue(result.size() == 1)
@@ -97,8 +99,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
- assertTrue(result[0][7] == "\'1992-01-02\'")
- assertTrue(result[0][8] == "\'1998-12-01\'")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_commitdate)"""
assertTrue(result.size() == 1)
@@ -108,8 +110,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
- assertTrue(result[0][7] == "\'1992-01-31\'")
- assertTrue(result[0][8] == "\'1998-10-31\'")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_partkey)"""
assertTrue(result.size() == 1)
@@ -119,8 +121,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
- assertTrue(result[0][7] == "1")
- assertTrue(result[0][8] == "10000")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_orderkey)"""
assertTrue(result.size() == 1)
@@ -130,8 +132,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
- assertTrue(result[0][7] == "1")
- assertTrue(result[0][8] == "6000000")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_quantity)"""
assertTrue(result.size() == 1)
@@ -141,8 +143,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "4.800972E7")
assertTrue(result[0][6] == "8.0")
- assertTrue(result[0][7] == "1")
- assertTrue(result[0][8] == "50")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_linestatus)"""
assertTrue(result.size() == 1)
@@ -152,6 +154,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "6001215.0")
assertTrue(result[0][6] == "1.0")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_comment)"""
assertTrue(result.size() == 1)
@@ -161,6 +165,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "1.5899739E8")
assertTrue(result[0][6] == "26.494199924515286")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_extendedprice)"""
assertTrue(result.size() == 1)
@@ -170,8 +176,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "4.800972E7")
assertTrue(result[0][6] == "8.0")
- assertTrue(result[0][7] == "901")
- assertTrue(result[0][8] == "104949.5")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_linenumber)"""
assertTrue(result.size() == 1)
@@ -181,8 +187,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "2.400486E7")
assertTrue(result[0][6] == "4.0")
- assertTrue(result[0][7] == "1")
- assertTrue(result[0][8] == "200000")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_discount)"""
assertTrue(result.size() == 1)
@@ -192,8 +198,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "4.800972E7")
assertTrue(result[0][6] == "8.0")
- assertTrue(result[0][7] == "0")
- assertTrue(result[0][8] == "0.1")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
result = sql """show column cached stats lineitem (l_shipinstruct)"""
assertTrue(result.size() == 1)
@@ -203,6 +209,8 @@ suite("test_hive_statistics_from_hms",
"p2,external,hive,external_remote,externa
assertTrue(result[0][4] == "0.0")
assertTrue(result[0][5] == "7.2006178E7")
assertTrue(result[0][6] == "11.998599950176756")
+ assertTrue(result[0][7] == "N/A")
+ assertTrue(result[0][8] == "N/A")
for (int i = 0; i < 10; i++) {
result = sql """show table stats lineitem"""
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]