This is an automated email from the ASF dual-hosted git repository.
zabetak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new b6cbb2e6a2f HIVE-26277: NPEs and rounding issues in
ColumnStatsAggregator classes (Alessandro Solimando reviewed by Stamatis
Zampetakis)
b6cbb2e6a2f is described below
commit b6cbb2e6a2f3d3c5de565492c3f658cbf94d96fb
Author: Alessandro Solimando <[email protected]>
AuthorDate: Fri May 13 17:29:30 2022 +0200
HIVE-26277: NPEs and rounding issues in ColumnStatsAggregator classes
(Alessandro Solimando reviewed by Stamatis Zampetakis)
1. Add and invoke checkStatisticsList to prevent NPEs in aggregators;
they all rely on a non-empty list of statistics.
2. Cast integers to double in divisions to make computations more
accurate and avoid rounding issues.
3. Align loggers names to match the class they are in and avoid
misleading log messages.
4. Add documentation for ndvTuner based on current understanding of how
it should work.
Closes #3339
Move (and complete) ndvTuner documentation from tests to production classes
---
.../aggr/BinaryColumnStatsAggregator.java | 2 +
.../aggr/BooleanColumnStatsAggregator.java | 2 +
.../columnstats/aggr/ColumnStatsAggregator.java | 19 ++
.../aggr/DateColumnStatsAggregator.java | 14 +-
.../aggr/DecimalColumnStatsAggregator.java | 5 +-
.../aggr/DoubleColumnStatsAggregator.java | 2 +
.../aggr/LongColumnStatsAggregator.java | 10 +-
.../aggr/StringColumnStatsAggregator.java | 4 +-
.../aggr/TimestampColumnStatsAggregator.java | 14 +-
.../hadoop/hive/metastore/StatisticsTestUtils.java | 112 +++++++++
.../metastore/columnstats/ColStatsBuilder.java | 187 ++++++++++++++
.../aggr/BinaryColumnStatsAggregatorTest.java | 101 ++++++++
.../aggr/BooleanColumnStatsAggregatorTest.java | 101 ++++++++
.../aggr/DateColumnStatsAggregatorTest.java | 270 ++++++++++++++++++++
.../aggr/DecimalColumnStatsAggregatorTest.java | 256 +++++++++++++++++++
.../aggr/DoubleColumnStatsAggregatorTest.java | 242 ++++++++++++++++++
.../aggr/LongColumnStatsAggregatorTest.java | 242 ++++++++++++++++++
.../aggr/StringColumnStatsAggregatorTest.java | 188 ++++++++++++++
.../aggr/TimestampColumnStatsAggregatorTest.java | 273 +++++++++++++++++++++
19 files changed, 2028 insertions(+), 16 deletions(-)
diff --git
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BinaryColumnStatsAggregator.java
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BinaryColumnStatsAggregator.java
index c885cf2d44f..552c91835f7 100644
---
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BinaryColumnStatsAggregator.java
+++
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BinaryColumnStatsAggregator.java
@@ -32,6 +32,8 @@ public class BinaryColumnStatsAggregator extends
ColumnStatsAggregator {
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo>
colStatsWithSourceInfo,
List<String> partNames, boolean areAllPartsFound) throws MetaException {
+ checkStatisticsList(colStatsWithSourceInfo);
+
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
diff --git
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BooleanColumnStatsAggregator.java
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BooleanColumnStatsAggregator.java
index 6fafab53e0f..9babeea8510 100644
---
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BooleanColumnStatsAggregator.java
+++
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BooleanColumnStatsAggregator.java
@@ -32,6 +32,8 @@ public class BooleanColumnStatsAggregator extends
ColumnStatsAggregator {
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo>
colStatsWithSourceInfo,
List<String> partNames, boolean areAllPartsFound) throws MetaException {
+ checkStatisticsList(colStatsWithSourceInfo);
+
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
diff --git
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/ColumnStatsAggregator.java
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/ColumnStatsAggregator.java
index c4325763beb..144e71c69ec 100644
---
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/ColumnStatsAggregator.java
+++
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/ColumnStatsAggregator.java
@@ -27,9 +27,28 @@ import
org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWi
public abstract class ColumnStatsAggregator {
public boolean useDensityFunctionForNDVEstimation;
+ /**
+ * The tuner controls the derivation of the NDV value when aggregating
statistics from multiple partitions. It accepts
+ * values in the range [0, 1] pushing the aggregated NDV closer to the
lower, or upper bound respectively.
+ * <p>
+ * For example, consider the aggregation of three partitions with NDV values
2, 3, and 4, respectively. The NDV
+ * lower bound is 4 (the highest among individual NDVs), and the upper bound
is 9 (the sum of individual NDVs). In
+ * this case the aggregated NDV will be in the range [4, 9] touching the
bounds when the tuner is equal to 0, or 1
+ * respectively.
+ * </p>
+ * <p>
+ * It is optional and concrete implementations can choose to ignore it
completely.
+ * </p>
+ */
public double ndvTuner;
public abstract ColumnStatisticsObj aggregate(
List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo, List<String>
partNames,
boolean areAllPartsFound) throws MetaException;
+
+ void checkStatisticsList(List<ColStatsObjWithSourceInfo>
colStatsWithSourceInfo) {
+ if (colStatsWithSourceInfo.isEmpty()) {
+ throw new IllegalArgumentException("Column statistics list must not be
empty when aggregating");
+ }
+ }
}
diff --git
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java
index 281ddaa90f3..a0dcbe9d6a7 100644
---
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java
+++
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java
@@ -49,6 +49,8 @@ public class DateColumnStatsAggregator extends
ColumnStatsAggregator implements
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo>
colStatsWithSourceInfo,
List<String> partNames, boolean areAllPartsFound) throws
MetaException {
+ checkStatisticsList(colStatsWithSourceInfo);
+
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
@@ -99,9 +101,10 @@ public class DateColumnStatsAggregator extends
ColumnStatsAggregator implements
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
DateColumnStatsDataInspector newData = dateInspectorFromStats(cso);
+ lowerBound = Math.max(lowerBound, newData.getNumDVs());
higherBound += newData.getNumDVs();
if (newData.isSetLowValue() && newData.isSetHighValue()) {
- densityAvgSum += (diff(newData.getHighValue(),
newData.getLowValue())) / newData.getNumDVs();
+ densityAvgSum += ((double) diff(newData.getHighValue(),
newData.getLowValue())) / newData.getNumDVs();
}
if (ndvEstimator != null) {
ndvEstimator.mergeEstimators(newData.getNdvEstimator());
@@ -124,7 +127,8 @@ public class DateColumnStatsAggregator extends
ColumnStatsAggregator implements
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
} else {
long estimation;
- if (useDensityFunctionForNDVEstimation) {
+ if (useDensityFunctionForNDVEstimation && aggregateData != null
+ && aggregateData.isSetLowValue() &&
aggregateData.isSetHighValue()) {
// We have estimation, lowerbound and higherbound. We use estimation
// if it is between lowerbound and higherbound.
double densityAvg = densityAvgSum / partNames.size();
@@ -161,7 +165,7 @@ public class DateColumnStatsAggregator extends
ColumnStatsAggregator implements
String partName = csp.getPartName();
DateColumnStatsData newData = cso.getStatsData().getDateStats();
if (useDensityFunctionForNDVEstimation) {
- densityAvgSum += diff(newData.getHighValue(),
newData.getLowValue()) / newData.getNumDVs();
+ densityAvgSum += ((double) diff(newData.getHighValue(),
newData.getLowValue())) / newData.getNumDVs();
}
adjustedIndexMap.put(partName, (double) indexMap.get(partName));
adjustedStatsMap.put(partName, cso.getStatsData());
@@ -190,7 +194,7 @@ public class DateColumnStatsAggregator extends
ColumnStatsAggregator implements
csd.setDateStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
if (useDensityFunctionForNDVEstimation) {
- densityAvgSum += diff(aggregateData.getHighValue(),
aggregateData.getLowValue())
+ densityAvgSum += ((double) diff(aggregateData.getHighValue(),
aggregateData.getLowValue()))
/ aggregateData.getNumDVs();
}
// reset everything
@@ -223,7 +227,7 @@ public class DateColumnStatsAggregator extends
ColumnStatsAggregator implements
csd.setDateStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
if (useDensityFunctionForNDVEstimation) {
- densityAvgSum += diff(aggregateData.getHighValue(),
aggregateData.getLowValue())
+ densityAvgSum += ((double) diff(aggregateData.getHighValue(),
aggregateData.getLowValue()))
/ aggregateData.getNumDVs();
}
}
diff --git
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java
index 63bc3fdc5ce..3e2093829b7 100644
---
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java
+++
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java
@@ -50,6 +50,8 @@ public class DecimalColumnStatsAggregator extends
ColumnStatsAggregator implemen
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo>
colStatsWithSourceInfo,
List<String> partNames, boolean areAllPartsFound) throws MetaException {
+ checkStatisticsList(colStatsWithSourceInfo);
+
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
@@ -128,7 +130,8 @@ public class DecimalColumnStatsAggregator extends
ColumnStatsAggregator implemen
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
} else {
long estimation;
- if (useDensityFunctionForNDVEstimation) {
+ if (useDensityFunctionForNDVEstimation && aggregateData != null
+ && aggregateData.isSetLowValue() &&
aggregateData.isSetHighValue()) {
// We have estimation, lowerbound and higherbound. We use estimation
// if it is between lowerbound and higherbound.
double densityAvg = densityAvgSum / partNames.size();
diff --git
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java
index 6d4e6472aa7..2caa2f32a3c 100644
---
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java
+++
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java
@@ -48,6 +48,8 @@ public class DoubleColumnStatsAggregator extends
ColumnStatsAggregator implement
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo>
colStatsWithSourceInfo,
List<String> partNames, boolean areAllPartsFound) throws MetaException {
+ checkStatisticsList(colStatsWithSourceInfo);
+
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
diff --git
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java
index ffde02455ae..dd35e0b35c8 100644
---
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java
+++
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java
@@ -48,6 +48,8 @@ public class LongColumnStatsAggregator extends
ColumnStatsAggregator implements
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo>
colStatsWithSourceInfo,
List<String> partNames, boolean areAllPartsFound) throws MetaException {
+ checkStatisticsList(colStatsWithSourceInfo);
+
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
@@ -100,7 +102,7 @@ public class LongColumnStatsAggregator extends
ColumnStatsAggregator implements
LongColumnStatsDataInspector newData = longInspectorFromStats(cso);
lowerBound = Math.max(lowerBound, newData.getNumDVs());
higherBound += newData.getNumDVs();
- densityAvgSum += (newData.getHighValue() - newData.getLowValue()) /
newData.getNumDVs();
+ densityAvgSum += ((double) (newData.getHighValue() -
newData.getLowValue())) / newData.getNumDVs();
if (ndvEstimator != null) {
ndvEstimator.mergeEstimators(newData.getNdvEstimator());
}
@@ -159,7 +161,7 @@ public class LongColumnStatsAggregator extends
ColumnStatsAggregator implements
String partName = csp.getPartName();
LongColumnStatsData newData = cso.getStatsData().getLongStats();
if (useDensityFunctionForNDVEstimation) {
- densityAvgSum += (newData.getHighValue() - newData.getLowValue())
/ newData.getNumDVs();
+ densityAvgSum += ((double) (newData.getHighValue() -
newData.getLowValue())) / newData.getNumDVs();
}
adjustedIndexMap.put(partName, (double) indexMap.get(partName));
adjustedStatsMap.put(partName, cso.getStatsData());
@@ -188,7 +190,7 @@ public class LongColumnStatsAggregator extends
ColumnStatsAggregator implements
csd.setLongStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
if (useDensityFunctionForNDVEstimation) {
- densityAvgSum += (aggregateData.getHighValue() -
aggregateData.getLowValue()) / aggregateData.getNumDVs();
+ densityAvgSum += ((double) (aggregateData.getHighValue() -
aggregateData.getLowValue())) / aggregateData.getNumDVs();
}
// reset everything
pseudoPartName = new StringBuilder();
@@ -221,7 +223,7 @@ public class LongColumnStatsAggregator extends
ColumnStatsAggregator implements
csd.setLongStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
if (useDensityFunctionForNDVEstimation) {
- densityAvgSum += (aggregateData.getHighValue() -
aggregateData.getLowValue()) / aggregateData.getNumDVs();
+ densityAvgSum += ((double) (aggregateData.getHighValue() -
aggregateData.getLowValue())) / aggregateData.getNumDVs();
}
}
}
diff --git
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java
index 6fb0fb5d8f9..bb38b8cfaa7 100644
---
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java
+++
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java
@@ -42,11 +42,13 @@ import static
org.apache.hadoop.hive.metastore.columnstats.ColumnsStatsUtils.str
public class StringColumnStatsAggregator extends ColumnStatsAggregator
implements
IExtrapolatePartStatus {
- private static final Logger LOG =
LoggerFactory.getLogger(LongColumnStatsAggregator.class);
+ private static final Logger LOG =
LoggerFactory.getLogger(StringColumnStatsAggregator.class);
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo>
colStatsWithSourceInfo,
List<String> partNames, boolean areAllPartsFound) throws MetaException {
+ checkStatisticsList(colStatsWithSourceInfo);
+
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
diff --git
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/TimestampColumnStatsAggregator.java
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/TimestampColumnStatsAggregator.java
index 8828f89ebfe..95e8db9fdf8 100644
---
a/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/TimestampColumnStatsAggregator.java
+++
b/standalone-metastore/metastore-server/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/TimestampColumnStatsAggregator.java
@@ -49,6 +49,8 @@ public class TimestampColumnStatsAggregator extends
ColumnStatsAggregator implem
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo>
colStatsWithSourceInfo,
List<String> partNames, boolean
areAllPartsFound) throws MetaException {
+ checkStatisticsList(colStatsWithSourceInfo);
+
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
@@ -99,9 +101,10 @@ public class TimestampColumnStatsAggregator extends
ColumnStatsAggregator implem
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
TimestampColumnStatsDataInspector newData =
timestampInspectorFromStats(cso);
+ lowerBound = Math.max(lowerBound, newData.getNumDVs());
higherBound += newData.getNumDVs();
if (newData.isSetLowValue() && newData.isSetHighValue()) {
- densityAvgSum += (diff(newData.getHighValue(),
newData.getLowValue())) / newData.getNumDVs();
+ densityAvgSum += ((double) (diff(newData.getHighValue(),
newData.getLowValue())) / newData.getNumDVs());
}
if (ndvEstimator != null) {
ndvEstimator.mergeEstimators(newData.getNdvEstimator());
@@ -124,7 +127,8 @@ public class TimestampColumnStatsAggregator extends
ColumnStatsAggregator implem
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
} else {
long estimation;
- if (useDensityFunctionForNDVEstimation) {
+ if (useDensityFunctionForNDVEstimation && aggregateData != null
+ && aggregateData.isSetLowValue() && aggregateData.isSetHighValue()
) {
// We have estimation, lowerbound and higherbound. We use estimation
// if it is between lowerbound and higherbound.
double densityAvg = densityAvgSum / partNames.size();
@@ -161,7 +165,7 @@ public class TimestampColumnStatsAggregator extends
ColumnStatsAggregator implem
String partName = csp.getPartName();
TimestampColumnStatsData newData =
cso.getStatsData().getTimestampStats();
if (useDensityFunctionForNDVEstimation) {
- densityAvgSum += diff(newData.getHighValue(),
newData.getLowValue()) / newData.getNumDVs();
+ densityAvgSum += ((double) diff(newData.getHighValue(),
newData.getLowValue()) / newData.getNumDVs());
}
adjustedIndexMap.put(partName, (double) indexMap.get(partName));
adjustedStatsMap.put(partName, cso.getStatsData());
@@ -190,7 +194,7 @@ public class TimestampColumnStatsAggregator extends
ColumnStatsAggregator implem
csd.setTimestampStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
if (useDensityFunctionForNDVEstimation) {
- densityAvgSum += diff(aggregateData.getHighValue(),
aggregateData.getLowValue())
+ densityAvgSum += ((double) diff(aggregateData.getHighValue(),
aggregateData.getLowValue()))
/ aggregateData.getNumDVs();
}
// reset everything
@@ -223,7 +227,7 @@ public class TimestampColumnStatsAggregator extends
ColumnStatsAggregator implem
csd.setTimestampStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
if (useDensityFunctionForNDVEstimation) {
- densityAvgSum += diff(aggregateData.getHighValue(),
aggregateData.getLowValue())
+ densityAvgSum += ((double) diff(aggregateData.getHighValue(),
aggregateData.getLowValue()))
/ aggregateData.getNumDVs();
}
}
diff --git
a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/StatisticsTestUtils.java
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/StatisticsTestUtils.java
new file mode 100644
index 00000000000..5520f04a4ff
--- /dev/null
+++
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/StatisticsTestUtils.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hive.metastore;
+
+import org.apache.hadoop.hive.common.ndv.fm.FMSketch;
+import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.Table;
+import
org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;
+
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
+
+import static org.apache.hadoop.hive.metastore.Warehouse.DEFAULT_CATALOG_NAME;
+
+public class StatisticsTestUtils {
+
+ private static final String HIVE_ENGINE = "hive";
+
+ private StatisticsTestUtils() {
+ throw new AssertionError("Suppress default constructor for non
instantiation");
+ }
+
+ /**
+ * Creates a {@link ColStatsObjWithSourceInfo} object for a given table,
partition and column information,
+ * using the given statistics data.
+ * @param data the column statistics data
+ * @param tbl the target table for stats
+ * @param column the target column for stats
+ * @param partName the target partition for stats
+ * @return column statistics objects with source info.
+ */
+ public static ColStatsObjWithSourceInfo
createStatsWithInfo(ColumnStatisticsData data, Table tbl,
+ FieldSchema column, String partName) {
+ ColumnStatisticsObj statObj = new ColumnStatisticsObj(column.getName(),
column.getType(), data);
+ return new ColStatsObjWithSourceInfo(statObj, tbl.getCatName(),
tbl.getDbName(), column.getName(), partName);
+ }
+
+ /**
+ * Creates an FM sketch object initialized with the given values.
+ * @param values the values to be added
+ * @return an FM sketch initialized with the given values.
+ */
+ public static FMSketch createFMSketch(long... values) {
+ FMSketch fm = new FMSketch(1);
+ for (long value : values) {
+ fm.addToEstimator(value);
+ }
+ return fm;
+ }
+
+ /**
+ * Creates an FM sketch object initialized with the given values.
+ * @param values the values to be added
+ * @return an FM sketch initialized with the given values.
+ */
+ public static FMSketch createFMSketch(String... values) {
+ FMSketch fm = new FMSketch(1);
+ for (String value : values) {
+ fm.addToEstimator(value);
+ }
+ return fm;
+ }
+
+ /**
+ * Creates an HLL object initialized with the given values.
+ * @param values the values to be added
+ * @return an HLL object initialized with the given values.
+ */
+ public static HyperLogLog createHll(long... values) {
+ HyperLogLog hll = HyperLogLog.builder().build();
+ for (long value : values) {
+ hll.addLong(value);
+ }
+ return hll;
+ }
+
+ /**
+ * Creates an HLL object initialized with the given values.
+ * @param values the values to be added
+ * @return an HLL object initialized with the given values.
+ */
+ public static HyperLogLog createHll(String... values) {
+ HyperLogLog hll = HyperLogLog.builder().build();
+ for (String value : values) {
+ hll.addBytes(value.getBytes());
+ }
+ return hll;
+ }
+}
diff --git
a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/ColStatsBuilder.java
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/ColStatsBuilder.java
new file mode 100644
index 00000000000..6683d323ecc
--- /dev/null
+++
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/ColStatsBuilder.java
@@ -0,0 +1,187 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.metastore.columnstats;
+
+import org.apache.hadoop.hive.common.ndv.fm.FMSketch;
+import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog;
+import org.apache.hadoop.hive.metastore.StatisticsTestUtils;
+import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData;
+import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.Date;
+import org.apache.hadoop.hive.metastore.api.Decimal;
+import org.apache.hadoop.hive.metastore.api.Timestamp;
+import
org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector;
+import
org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector;
+import
org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector;
+import
org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector;
+import
org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector;
+import
org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector;
+
+import java.lang.reflect.InvocationTargetException;
+
+public class ColStatsBuilder<T> {
+
+ private final Class<T> type;
+ private T lowValue;
+ private T highValue;
+ private Double avgColLen;
+ private Long maxColLen;
+ private Long numTrues;
+ private Long numFalses;
+ private Long numNulls;
+ private Long numDVs;
+ private byte[] bitVector;
+
+ public ColStatsBuilder(Class<T> type) {
+ this.type = type;
+ }
+
+ public ColStatsBuilder<T> numNulls(long num) {
+ this.numNulls = num;
+ return this;
+ }
+
+ public ColStatsBuilder<T> numDVs(long num) {
+ this.numDVs = num;
+ return this;
+ }
+
+ public ColStatsBuilder<T> numFalses(long num) {
+ this.numFalses = num;
+ return this;
+ }
+
+ public ColStatsBuilder<T> numTrues(long num) {
+ this.numTrues = num;
+ return this;
+ }
+
+ public ColStatsBuilder<T> avgColLen(double val) {
+ this.avgColLen = val;
+ return this;
+ }
+
+ public ColStatsBuilder<T> maxColLen(long val) {
+ this.maxColLen = val;
+ return this;
+ }
+
+ public ColStatsBuilder<T> low(T val) {
+ this.lowValue = val;
+ return this;
+ }
+
+ public ColStatsBuilder<T> high(T val) {
+ this.highValue = val;
+ return this;
+ }
+
+ public ColStatsBuilder<T> hll(long... values) {
+ HyperLogLog hll = StatisticsTestUtils.createHll(values);
+ this.bitVector = hll.serialize();
+ return this;
+ }
+
+ public ColStatsBuilder<T> hll(String... values) {
+ HyperLogLog hll = StatisticsTestUtils.createHll(values);
+ this.bitVector = hll.serialize();
+ return this;
+ }
+
+ public ColStatsBuilder<T> fmSketch(long... values) {
+ FMSketch fm = StatisticsTestUtils.createFMSketch(values);
+ this.bitVector = fm.serialize();
+ return this;
+ }
+
+ public ColStatsBuilder<T> fmSketch(String... values) {
+ FMSketch fm = StatisticsTestUtils.createFMSketch(values);
+ this.bitVector = fm.serialize();
+ return this;
+ }
+
+ public ColumnStatisticsData build() {
+ ColumnStatisticsData data = new ColumnStatisticsData();
+ if (type == byte[].class) {
+ data.setBinaryStats(newColData(BinaryColumnStatsData.class));
+ } else if (type == Boolean.class) {
+ data.setBooleanStats(newColData(BooleanColumnStatsData.class));
+ } else if (type == Date.class) {
+ data.setDateStats(newColData(DateColumnStatsDataInspector.class));
+ } else if (type == Decimal.class) {
+ data.setDecimalStats(newColData(DecimalColumnStatsDataInspector.class));
+ } else if (type == double.class) {
+ data.setDoubleStats(newColData(DoubleColumnStatsDataInspector.class));
+ } else if (type == long.class) {
+ data.setLongStats(newColData(LongColumnStatsDataInspector.class));
+ } else if (type == String.class) {
+ data.setStringStats(newColData(StringColumnStatsDataInspector.class));
+ } else if (type == Timestamp.class) {
+
data.setTimestampStats(newColData(TimestampColumnStatsDataInspector.class));
+ } else {
+ throw new IllegalStateException(type.getSimpleName() + " is not
supported");
+ }
+ return data;
+ }
+
+ private <X> X newColData(Class<X> clazz) {
+ try {
+ X data = clazz.getDeclaredConstructor().newInstance();
+ if (numNulls != null) {
+ clazz.getMethod("setNumNulls", long.class).invoke(data, numNulls);
+ }
+ if (numDVs != null) {
+ clazz.getMethod("setNumDVs", long.class).invoke(data, numDVs);
+ }
+ if (bitVector != null) {
+ clazz.getMethod("setBitVectors", byte[].class).invoke(data, bitVector);
+ }
+ if (avgColLen != null) {
+ clazz.getMethod("setAvgColLen", double.class).invoke(data, avgColLen);
+ }
+ if (maxColLen != null) {
+ clazz.getMethod("setMaxColLen", long.class).invoke(data, maxColLen);
+ }
+ if (numFalses != null) {
+ clazz.getMethod("setNumFalses", long.class).invoke(data, numFalses);
+ }
+ if (numTrues != null) {
+ clazz.getMethod("setNumTrues", long.class).invoke(data, numTrues);
+ }
+
+ if (lowValue != null) {
+ if (type.isPrimitive()) {
+ clazz.getMethod("setLowValue", type).invoke(data, lowValue);
+ } else {
+ clazz.getMethod("setLowValue", type).invoke(data,
type.cast(lowValue));
+ }
+ }
+ if (highValue != null) {
+ if (type.isPrimitive()) {
+ clazz.getMethod("setHighValue", type).invoke(data, highValue);
+ } else {
+ clazz.getMethod("setHighValue", type).invoke(data,
type.cast(highValue));
+ }
+ }
+ clazz.getMethod("validate").invoke(data);
+ return data;
+ } catch (NoSuchMethodException | InstantiationException |
IllegalAccessException | InvocationTargetException e) {
+ throw new RuntimeException("Reflection error", e);
+ }
+ }
+}
diff --git
a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BinaryColumnStatsAggregatorTest.java
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BinaryColumnStatsAggregatorTest.java
new file mode 100644
index 00000000000..cc9d4ca4a87
--- /dev/null
+++
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BinaryColumnStatsAggregatorTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hive.metastore.columnstats.aggr;
+
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder;
+import
org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static
org.apache.hadoop.hive.metastore.StatisticsTestUtils.createStatsWithInfo;
+
+@Category(MetastoreUnitTest.class)
+public class BinaryColumnStatsAggregatorTest {
+
+ private static final Table TABLE = new Table("dummy", "db", "hive", 0, 0,
+ 0, null, null, Collections.emptyMap(), null, null,
+ TableType.MANAGED_TABLE.toString());
+ private static final FieldSchema COL = new FieldSchema("col", "binary", "");
+
+ @Test
+ public void testAggregateSingleStat() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(byte[].class).numNulls(1).avgColLen(8.5).maxColLen(13).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ BinaryColumnStatsAggregator aggregator = new BinaryColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(byte[].class).numNulls(1).avgColLen(20.0 /
3).maxColLen(13).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(byte[].class).numNulls(2).avgColLen(14).maxColLen(18).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(byte[].class).numNulls(3).avgColLen(17.5).maxColLen(18).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ BinaryColumnStatsAggregator aggregator = new BinaryColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(byte[].class).numNulls(6).avgColLen(17.5).maxColLen(18).build();
+
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenOnlySomeAvailable() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3",
"part4");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(byte[].class).numNulls(1).avgColLen(20.0 /
3).maxColLen(13).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(byte[].class).numNulls(3).avgColLen(17.5).maxColLen(18).build();
+ ColumnStatisticsData data4 = new
ColStatsBuilder<>(byte[].class).numNulls(2).avgColLen(14).maxColLen(18).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)),
+ createStatsWithInfo(data4, TABLE, COL, partitions.get(3)));
+
+ BinaryColumnStatsAggregator aggregator = new BinaryColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(byte[].class).numNulls(6).avgColLen(17.5).maxColLen(18).build();
+
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+}
diff --git
a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BooleanColumnStatsAggregatorTest.java
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BooleanColumnStatsAggregatorTest.java
new file mode 100644
index 00000000000..1676d1350d5
--- /dev/null
+++
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/BooleanColumnStatsAggregatorTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hive.metastore.columnstats.aggr;
+
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder;
+import
org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static
org.apache.hadoop.hive.metastore.StatisticsTestUtils.createStatsWithInfo;
+
+@Category(MetastoreUnitTest.class)
+public class BooleanColumnStatsAggregatorTest {
+
+ private static final Table TABLE = new Table("dummy", "db", "hive", 0, 0,
+ 0, null, null, Collections.emptyMap(), null, null,
+ TableType.MANAGED_TABLE.toString());
+ private static final FieldSchema COL = new FieldSchema("col", "boolean", "");
+
+ @Test
+ public void testAggregateSingleStat() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Boolean.class).numNulls(1).numFalses(2).numTrues(13).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ BooleanColumnStatsAggregator aggregator = new
BooleanColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Boolean.class).numNulls(1).numFalses(3).numTrues(13).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(Boolean.class).numNulls(2).numFalses(6).numTrues(18).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Boolean.class).numNulls(3).numFalses(2).numTrues(18).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ BooleanColumnStatsAggregator aggregator = new
BooleanColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Boolean.class).numNulls(6).numFalses(11).numTrues(49).build();
+
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenOnlySomeAvailable() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3",
"part4");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Boolean.class).numNulls(1).numFalses(3).numTrues(13).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Boolean.class).numNulls(3).numFalses(2).numTrues(18).build();
+ ColumnStatisticsData data4 = new
ColStatsBuilder<>(Boolean.class).numNulls(2).numFalses(6).numTrues(18).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)),
+ createStatsWithInfo(data4, TABLE, COL, partitions.get(3)));
+
+ BooleanColumnStatsAggregator aggregator = new
BooleanColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Boolean.class).numNulls(6).numFalses(11).numTrues(49).build();
+
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+}
diff --git
a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregatorTest.java
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregatorTest.java
new file mode 100644
index 00000000000..07a5d49c179
--- /dev/null
+++
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregatorTest.java
@@ -0,0 +1,270 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hive.metastore.columnstats.aggr;
+
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.Date;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder;
+import
org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static
org.apache.hadoop.hive.metastore.StatisticsTestUtils.createStatsWithInfo;
+
+@Category(MetastoreUnitTest.class)
+public class DateColumnStatsAggregatorTest {
+
+ private static final Table TABLE = new Table("dummy", "db", "hive", 0, 0,
+ 0, null, null, Collections.emptyMap(), null, null,
+ TableType.MANAGED_TABLE.toString());
+ private static final FieldSchema COL = new FieldSchema("col", "date", "");
+
+ private static final Date DATE_1 = new Date(1);
+ private static final Date DATE_2 = new Date(2);
+ private static final Date DATE_3 = new Date(3);
+ private static final Date DATE_4 = new Date(4);
+ private static final Date DATE_5 = new Date(5);
+ private static final Date DATE_6 = new Date(6);
+ private static final Date DATE_7 = new Date(7);
+ private static final Date DATE_8 = new Date(8);
+ private static final Date DATE_9 = new Date(9);
+
+ @Test
+ public void testAggregateSingleStat() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2).low(DATE_1).high(DATE_4)
+ .hll(DATE_1.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch()).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateSingleStatWhenNullValues() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ aggregator.ndvTuner = 1;
+ // ndv tuner does not have any effect because min numDVs and max numDVs
coincide (we have a single stat)
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultipleStatsWhenSomeNullValues() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2");
+
+ long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch()
};
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(2)
+ .low(DATE_1).high(DATE_2).hll(values1).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)));
+
+ DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Date.class).numNulls(3).numDVs(3)
+ .low(DATE_1).high(DATE_2).hll(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(4)
+ .low(DATE_1).high(DATE_2).hll(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ aggregator.ndvTuner = 1;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new ColStatsBuilder<>(Date.class).numNulls(3).numDVs(5)
+ .low(DATE_1).high(DATE_2).hll(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(),
DATE_3.getDaysSinceEpoch() };
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(3)
+ .low(DATE_1).high(DATE_3).hll(values1).build();
+
+ long[] values2 = { DATE_3.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch(),
DATE_5.getDaysSinceEpoch() };
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3)
+ .low(DATE_3).high(DATE_5).hll(values2).build();
+
+ long[] values3 = { DATE_6.getDaysSinceEpoch(), DATE_7.getDaysSinceEpoch()
};
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Date.class).numNulls(3).numDVs(2)
+ .low(DATE_6).high(DATE_7).hll(values3).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ // the aggregation does not update the hll, only numDVs is updated; it keeps the first
hll
+ // notice that numDVs is computed by using HLL, it can detect that
'DATE_3' appears twice
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Date.class).numNulls(6).numDVs(7)
+ .low(DATE_1).high(DATE_7).hll(values1).build();
+
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenUnmergeableBitVectors() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(),
DATE_3.getDaysSinceEpoch() };
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(3)
+ .low(DATE_1).high(DATE_3).fmSketch(values1).build();
+ long[] values2 = { DATE_3.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch(),
DATE_5.getDaysSinceEpoch() };
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3)
+ .low(DATE_3).high(DATE_5).hll(values2).build();
+ long[] values3 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(),
DATE_6.getDaysSinceEpoch(),
+ DATE_8.getDaysSinceEpoch() };
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Date.class).numNulls(3).numDVs(4)
+ .low(DATE_1).high(DATE_8).hll(values3).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ // the aggregation does not update the bitvector, only numDVs is updated; it keeps
the first bitvector;
+ // numDVs is set to the maximum among all stats when non-mergeable
bitvectors are detected
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Date.class).numNulls(6).numDVs(4)
+ .low(DATE_1).high(DATE_8).fmSketch(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ // the use of the density function leads to a different estimation for
numNDV
+ expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(6)
+ .low(DATE_1).high(DATE_8).fmSketch(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ double[] tunerValues = new double[] { 0, 0.5, 0.75, 1 };
+ long[] expectedNDVs = new long[] { 4, 7, 8, 10 };
+ for (int i = 0; i < tunerValues.length; i++) {
+ aggregator.ndvTuner = tunerValues[i];
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new
ColStatsBuilder<>(Date.class).numNulls(6).numDVs(expectedNDVs[i])
+ .low(DATE_1).high(DATE_8).fmSketch(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenOnlySomeAvailable() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3",
"part4");
+
+ long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(),
DATE_3.getDaysSinceEpoch() };
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(3)
+ .low(DATE_1).high(DATE_3).hll(values1).build();
+
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Date.class).numNulls(3).numDVs(1).low(DATE_7).high(DATE_7)
+ .hll(DATE_7.getDaysSinceEpoch()).build();
+
+ long[] values4 = { DATE_3.getDaysSinceEpoch(), DATE_4.getDaysSinceEpoch(),
DATE_5.getDaysSinceEpoch() };
+ ColumnStatisticsData data4 = new
ColStatsBuilder<>(Date.class).numNulls(2).numDVs(3)
+ .low(DATE_3).high(DATE_5).hll(values4).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)),
+ createStatsWithInfo(data4, TABLE, COL, partitions.get(3)));
+
+ DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Date.class).numNulls(8).numDVs(4)
+ .low(DATE_1).high(DATE_9).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void
testAggregateMultiStatsOnlySomeAvailableButUnmergeableBitVector() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ long[] values1 = { DATE_1.getDaysSinceEpoch(), DATE_2.getDaysSinceEpoch(),
DATE_6.getDaysSinceEpoch() };
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Date.class).numNulls(1).numDVs(3)
+ .low(DATE_1).high(DATE_6).fmSketch(values1).build();
+
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Date.class).numNulls(3).numDVs(1)
+ .low(DATE_7).high(DATE_7).hll(DATE_7.getDaysSinceEpoch()).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ DateColumnStatsAggregator aggregator = new DateColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Date.class).numNulls(6).numDVs(3)
+ .low(DATE_1).high(DATE_7).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, false);
+ // the use of the density function leads to a different estimation for
numNDV
+ expectedStats = new ColStatsBuilder<>(Date.class).numNulls(6).numDVs(4)
+ .low(DATE_1).high(DATE_7).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+}
diff --git
a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregatorTest.java
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregatorTest.java
new file mode 100644
index 00000000000..a3a2730be9e
--- /dev/null
+++
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregatorTest.java
@@ -0,0 +1,256 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hive.metastore.columnstats.aggr;
+
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.Decimal;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.api.utils.DecimalUtils;
+import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder;
+import
org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static
org.apache.hadoop.hive.metastore.StatisticsTestUtils.createStatsWithInfo;
+
+@Category(MetastoreUnitTest.class)
+public class DecimalColumnStatsAggregatorTest {
+
+ private static final Table TABLE = new Table("dummy", "db", "hive", 0, 0,
+ 0, null, null, Collections.emptyMap(), null, null,
+ TableType.MANAGED_TABLE.toString());
+ private static final FieldSchema COL = new FieldSchema("col", "decimal", "");
+
+ private static final Decimal ONE = DecimalUtils.createThriftDecimal("1.0");
+ private static final Decimal TWO = DecimalUtils.createThriftDecimal("2.0");
+ private static final Decimal THREE = DecimalUtils.createThriftDecimal("3.0");
+ private static final Decimal FOUR = DecimalUtils.createThriftDecimal("4.0");
+ private static final Decimal FIVE = DecimalUtils.createThriftDecimal("5.0");
+ private static final Decimal SIX = DecimalUtils.createThriftDecimal("6.0");
+ private static final Decimal SEVEN = DecimalUtils.createThriftDecimal("7.0");
+ private static final Decimal EIGHT = DecimalUtils.createThriftDecimal("8.0");
+
+ @Test
+ public void testAggregateSingleStat() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Decimal.class).numNulls(1).numDVs(2)
+ .low(ONE).high(FOUR).hll(1, 4).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ DecimalColumnStatsAggregator aggregator = new
DecimalColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateSingleStatWhenNullValues() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Decimal.class).numNulls(1).numDVs(2).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ DecimalColumnStatsAggregator aggregator = new
DecimalColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ aggregator.ndvTuner = 1;
+ // ndv tuner does not have any effect because min numDVs and max numDVs
coincide (we have a single stat)
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultipleStatsWhenSomeNullValues() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Decimal.class).numNulls(1).numDVs(2)
+ .low(ONE).high(TWO).hll(1, 2).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(Decimal.class).numNulls(2).numDVs(3).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)));
+
+ DecimalColumnStatsAggregator aggregator = new
DecimalColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Decimal.class).numNulls(3).numDVs(3)
+ .low(ONE).high(TWO).hll(1, 2).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new ColStatsBuilder<>(Decimal.class).numNulls(3).numDVs(4)
+ .low(ONE).high(TWO).hll(1, 2).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ aggregator.ndvTuner = 1;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new ColStatsBuilder<>(Decimal.class).numNulls(3).numDVs(5)
+ .low(ONE).high(TWO).hll(1, 2).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Decimal.class).numNulls(1).numDVs(3)
+ .low(ONE).high(THREE).hll(1, 2, 3).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(Decimal.class).numNulls(2).numDVs(3)
+ .low(THREE).high(FIVE).hll(3, 4, 5).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Decimal.class).numNulls(3).numDVs(2)
+ .low(SIX).high(SEVEN).hll(6, 7).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ DecimalColumnStatsAggregator aggregator = new
DecimalColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ // the aggregation does not update the hll, it only updates numDVs and keeps
the first hll;
+ // notice that numDVs is computed via HLL, which can detect that '3'
appears twice
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Decimal.class).numNulls(6).numDVs(7)
+ .low(ONE).high(SEVEN).hll(1, 2, 3).build();
+
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenUnmergeableBitVectors() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Decimal.class).numNulls(1).numDVs(3)
+ .low(ONE).high(THREE).fmSketch(1, 2, 3).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(Decimal.class).numNulls(2).numDVs(3)
+ .low(THREE).high(FIVE).hll(3, 4, 5).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Decimal.class).numNulls(3).numDVs(4)
+ .low(ONE).high(EIGHT).hll(1, 2, 6, 8).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ DecimalColumnStatsAggregator aggregator = new
DecimalColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ // the aggregation does not update the bitvector, it only updates numDVs and
keeps the first bitvector;
+ // numDVs is set to the maximum among all stats when non-mergeable
bitvectors are detected
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Decimal.class).numNulls(6).numDVs(4)
+ .low(ONE).high(EIGHT).fmSketch(1, 2, 3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ // the use of the density function leads to a different estimation for
numDVs
+ expectedStats = new ColStatsBuilder<>(Decimal.class).numNulls(6).numDVs(6)
+ .low(ONE).high(EIGHT).fmSketch(1, 2, 3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ double[] tunerValues = new double[] { 0, 0.5, 0.75, 1 };
+ long[] expectedDVs = new long[] { 4, 7, 8, 10 };
+ for (int i = 0; i < tunerValues.length; i++) {
+ aggregator.ndvTuner = tunerValues[i];
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new
ColStatsBuilder<>(Decimal.class).numNulls(6).numDVs(expectedDVs[i])
+ .low(ONE).high(EIGHT).fmSketch(1, 2, 3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenOnlySomeAvailable() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3",
"part4");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Decimal.class).numNulls(1).numDVs(3)
+ .low(ONE).high(THREE).hll(1, 2, 3).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Decimal.class).numNulls(3).numDVs(1)
+ .low(SEVEN).high(SEVEN).hll(7).build();
+ ColumnStatisticsData data4 = new
ColStatsBuilder<>(Decimal.class).numNulls(2).numDVs(3)
+ .low(THREE).high(FIVE).hll(3, 4, 5).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)),
+ createStatsWithInfo(data4, TABLE, COL, partitions.get(3)));
+
+ DecimalColumnStatsAggregator aggregator = new
DecimalColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Decimal.class).numNulls(8).numDVs(4)
+ .low(ONE).high(DecimalUtils.createThriftDecimal("9.4")).build();
+
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void
testAggregateMultiStatsOnlySomeAvailableButUnmergeableBitVector() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Decimal.class).numNulls(1).numDVs(3)
+ .low(ONE).high(SIX).fmSketch(1, 2, 6).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Decimal.class).numNulls(3).numDVs(1)
+ .low(SEVEN).high(SEVEN).hll(7).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ DecimalColumnStatsAggregator aggregator = new
DecimalColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Decimal.class).numNulls(6).numDVs(3)
+ .low(ONE).high(DecimalUtils.createThriftDecimal("7.5")).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, false);
+ // the use of the density function leads to a different estimation for
numDVs
+ expectedStats = new ColStatsBuilder<>(Decimal.class).numNulls(6).numDVs(4)
+ .low(ONE).high(DecimalUtils.createThriftDecimal("7.5")).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+}
diff --git
a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregatorTest.java
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregatorTest.java
new file mode 100644
index 00000000000..d38d5324e00
--- /dev/null
+++
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregatorTest.java
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hive.metastore.columnstats.aggr;
+
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder;
+import
org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static
org.apache.hadoop.hive.metastore.StatisticsTestUtils.createStatsWithInfo;
+
+@Category(MetastoreUnitTest.class)
+public class DoubleColumnStatsAggregatorTest {
+
+ private static final Table TABLE = new Table("dummy", "db", "hive", 0, 0,
+ 0, null, null, Collections.emptyMap(), null, null,
+ TableType.MANAGED_TABLE.toString());
+ private static final FieldSchema COL = new FieldSchema("col", "double", "");
+
+ @Test
+ public void testAggregateSingleStat() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(double.class).numNulls(1).numDVs(2)
+ .low(1d).high(4d).hll(1, 4).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ DoubleColumnStatsAggregator aggregator = new DoubleColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateSingleStatWhenNullValues() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(double.class).numNulls(1).numDVs(2).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ DoubleColumnStatsAggregator aggregator = new DoubleColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ aggregator.ndvTuner = 1;
+ // the ndv tuner has no effect because min numDVs and max numDVs
coincide (we have a single stat)
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultipleStatsWhenSomeNullValues() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(double.class).numNulls(1).numDVs(2)
+ .low(1d).high(2d).hll(1, 2).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(double.class).numNulls(2).numDVs(3).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)));
+
+ DoubleColumnStatsAggregator aggregator = new DoubleColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(double.class).numNulls(3).numDVs(3)
+ .low(1d).high(2d).hll(1, 2).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new ColStatsBuilder<>(double.class).numNulls(3).numDVs(4)
+ .low(1d).high(2d).hll(1, 2).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ aggregator.ndvTuner = 1;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new ColStatsBuilder<>(double.class).numNulls(3).numDVs(5)
+ .low(1d).high(2d).hll(1, 2).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(double.class).numNulls(1).numDVs(3)
+ .low(1d).high(3d).hll(1, 2, 3).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(double.class).numNulls(2).numDVs(3)
+ .low(3d).high(5d).hll(3, 4, 5).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(double.class).numNulls(3).numDVs(2)
+ .low(6d).high(7d).hll(6, 7).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ DoubleColumnStatsAggregator aggregator = new DoubleColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ // the aggregation does not update the hll, it only updates numDVs and keeps
the first hll;
+ // notice that numDVs is computed via HLL, which can detect that '3'
appears twice
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(double.class).numNulls(6).numDVs(7)
+ .low(1d).high(7d).hll(1, 2, 3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenUnmergeableBitVectors() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(double.class).numNulls(1).numDVs(3)
+ .low(1d).high(3d).fmSketch(1, 2, 3).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(double.class).numNulls(2).numDVs(3)
+ .low(3d).high(5d).hll(3, 4, 5).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(double.class).numNulls(3).numDVs(4)
+ .low(1d).high(8d).hll(1, 2, 6, 8).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ DoubleColumnStatsAggregator aggregator = new DoubleColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ // the aggregation does not update the bitvector, it only updates numDVs and
keeps the first bitvector;
+ // numDVs is set to the maximum among all stats when non-mergeable
bitvectors are detected
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(double.class).numNulls(6).numDVs(4)
+ .low(1d).high(8d).fmSketch(1, 2, 3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ // the use of the density function leads to a different estimation for
numDVs
+ expectedStats = new ColStatsBuilder<>(double.class).numNulls(6).numDVs(6)
+ .low(1d).high(8d).fmSketch(1, 2, 3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ double[] tunerValues = new double[] { 0, 0.5, 0.75, 1 };
+ long[] expectedDVs = new long[] { 4, 7, 8, 10 };
+ for (int i = 0; i < tunerValues.length; i++) {
+ aggregator.ndvTuner = tunerValues[i];
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new
ColStatsBuilder<>(double.class).numNulls(6).numDVs(expectedDVs[i])
+ .low(1d).high(8d).fmSketch(1, 2, 3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenOnlySomeAvailable() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3",
"part4");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(double.class).numNulls(1).numDVs(3)
+ .low(1d).high(3d).hll(1, 2, 3).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(double.class).numNulls(3).numDVs(1)
+ .low(7d).high(7d).hll(7).build();
+ ColumnStatisticsData data4 = new
ColStatsBuilder<>(double.class).numNulls(2).numDVs(3)
+ .low(3d).high(5d).hll(3, 4, 5).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)),
+ createStatsWithInfo(data4, TABLE, COL, partitions.get(3)));
+
+ DoubleColumnStatsAggregator aggregator = new DoubleColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(double.class).numNulls(8).numDVs(4)
+ .low(1d).high(9.4).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void
testAggregateMultiStatsOnlySomeAvailableButUnmergeableBitVector() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(double.class).numNulls(1).numDVs(3)
+ .low(1d).high(6d).fmSketch(1, 2, 6).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(double.class).numNulls(3).numDVs(1)
+ .low(7d).high(7d).hll(7).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ DoubleColumnStatsAggregator aggregator = new DoubleColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(double.class).numNulls(6).numDVs(3)
+ .low(1d).high(7.5).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, false);
+ // the use of the density function leads to a different estimation for
numDVs
+ expectedStats = new ColStatsBuilder<>(double.class).numNulls(6).numDVs(4)
+ .low(1d).high(7.5).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+}
diff --git
a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregatorTest.java
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregatorTest.java
new file mode 100644
index 00000000000..126c9868bc6
--- /dev/null
+++
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregatorTest.java
@@ -0,0 +1,242 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hive.metastore.columnstats.aggr;
+
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder;
+import
org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static
org.apache.hadoop.hive.metastore.StatisticsTestUtils.createStatsWithInfo;
+
+@Category(MetastoreUnitTest.class)
+public class LongColumnStatsAggregatorTest {
+
+ private static final Table TABLE = new Table("dummy", "db", "hive", 0, 0,
+ 0, null, null, Collections.emptyMap(), null, null,
+ TableType.MANAGED_TABLE.toString());
+ private static final FieldSchema COL = new FieldSchema("col", "int", "");
+
+ @Test
+ public void testAggregateSingleStat() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(long.class).numNulls(1).numDVs(2)
+ .low(1L).high(4L).hll(1, 4).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ LongColumnStatsAggregator aggregator = new LongColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateSingleStatWhenNullValues() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(long.class).numNulls(1).numDVs(2).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ LongColumnStatsAggregator aggregator = new LongColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ aggregator.ndvTuner = 1;
+ // the ndv tuner has no effect because min numDVs and max numDVs
coincide (we have a single stat)
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultipleStatsWhenSomeNullValues() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(long.class).numNulls(1).numDVs(2)
+ .low(1L).high(2L).hll(1, 2).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(long.class).numNulls(2).numDVs(3).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)));
+
+ LongColumnStatsAggregator aggregator = new LongColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(long.class).numNulls(3).numDVs(3)
+ .low(1L).high(2L).hll(1, 2).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new ColStatsBuilder<>(long.class).numNulls(3).numDVs(4)
+ .low(1L).high(2L).hll(1, 2).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ aggregator.ndvTuner = 1;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new ColStatsBuilder<>(long.class).numNulls(3).numDVs(5)
+ .low(1L).high(2L).hll(1, 2).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(long.class).numNulls(1).numDVs(3)
+ .low(1L).high(3L).hll(1, 2, 3).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(long.class).numNulls(2).numDVs(3)
+ .low(3L).high(5L).hll(3, 4, 5).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(long.class).numNulls(3).numDVs(2)
+ .low(6L).high(7L).hll(6, 7).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ LongColumnStatsAggregator aggregator = new LongColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ // the aggregation does not update the hll, it only updates numDVs and keeps
the first hll;
+ // notice that numDVs is computed via HLL, which can detect that '3'
appears twice
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(long.class).numNulls(6).numDVs(7)
+ .low(1L).high(7L).hll(1, 2, 3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenUnmergeableBitVectors() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(long.class).numNulls(1).numDVs(3)
+ .low(1L).high(3L).fmSketch(1, 2, 3).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(long.class).numNulls(2).numDVs(3)
+ .low(3L).high(5L).hll(3, 4, 5).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(long.class).numNulls(3).numDVs(4)
+ .low(1L).high(8L).hll(1, 2, 6, 8).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ LongColumnStatsAggregator aggregator = new LongColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ // the aggregation does not update the bitvector, it only updates numDVs and
keeps the first bitvector;
+ // numDVs is set to the maximum among all stats when non-mergeable
bitvectors are detected
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(long.class).numNulls(6).numDVs(4)
+ .low(1L).high(8L).fmSketch(1, 2, 3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ // the use of the density function leads to a different estimation for
numDVs
+ expectedStats = new ColStatsBuilder<>(long.class).numNulls(6).numDVs(6)
+ .low(1L).high(8L).fmSketch(1, 2, 3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ double[] tunerValues = new double[] { 0, 0.5, 0.75, 1 };
+ long[] expectedDVs = new long[] { 4, 7, 8, 10 };
+ for (int i = 0; i < tunerValues.length; i++) {
+ aggregator.ndvTuner = tunerValues[i];
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new
ColStatsBuilder<>(long.class).numNulls(6).numDVs(expectedDVs[i])
+ .low(1L).high(8L).fmSketch(1, 2, 3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenOnlySomeAvailable() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3",
"part4");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(long.class).numNulls(1).numDVs(3)
+ .low(1L).high(3L).hll(1, 2, 3).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(long.class).numNulls(3).numDVs(1)
+ .low(7L).high(7L).hll(7).build();
+ ColumnStatisticsData data4 = new
ColStatsBuilder<>(long.class).numNulls(2).numDVs(3)
+ .low(3L).high(5L).hll(3, 4, 5).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)),
+ createStatsWithInfo(data4, TABLE, COL, partitions.get(3)));
+
+ LongColumnStatsAggregator aggregator = new LongColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(long.class).numNulls(8).numDVs(4)
+ .low(1L).high(9L).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void
testAggregateMultiStatsOnlySomeAvailableButUnmergeableBitVector() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(long.class).numNulls(1).numDVs(3)
+ .low(1L).high(6L).fmSketch(1, 2, 6).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(long.class).numNulls(3).numDVs(1)
+ .low(7L).high(7L).hll(7).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ LongColumnStatsAggregator aggregator = new LongColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(long.class).numNulls(6).numDVs(3)
+ .low(1L).high(7L).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, false);
+ // the use of the density function leads to a different estimation for
numDVs
+ expectedStats = new ColStatsBuilder<>(long.class).numNulls(6).numDVs(4)
+ .low(1L).high(7L).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+}
diff --git
a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregatorTest.java
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregatorTest.java
new file mode 100644
index 00000000000..b27092090a9
--- /dev/null
+++
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregatorTest.java
@@ -0,0 +1,188 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hive.metastore.columnstats.aggr;
+
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder;
+import
org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static
org.apache.hadoop.hive.metastore.StatisticsTestUtils.createStatsWithInfo;
+
+@Category(MetastoreUnitTest.class)
+public class StringColumnStatsAggregatorTest {
+
+ private static final Table TABLE = new Table("dummy", "db", "hive", 0, 0,
+ 0, null, null, Collections.emptyMap(), null, null,
+ TableType.MANAGED_TABLE.toString());
+ private static final FieldSchema COL = new FieldSchema("col", "string", "");
+
+ private static final String S_1 = "test";
+ private static final String S_2 = "try";
+ private static final String S_3 = "longer string";
+ private static final String S_4 = "even longer string";
+ private static final String S_5 = "some string";
+ private static final String S_6 = "some other string";
+ private static final String S_7 = "yet another string";
+
+ @Test
+ public void testAggregateSingleStat() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(String.class).numNulls(1).numDVs(2).avgColLen(8.5).maxColLen(13)
+ .hll(S_1, S_3).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ StringColumnStatsAggregator aggregator = new StringColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(String.class).numNulls(1).numDVs(3).avgColLen(20.0 /
3).maxColLen(13)
+ .hll(S_1, S_2, S_3).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(String.class).numNulls(2).numDVs(3).avgColLen(14).maxColLen(18)
+ .hll(S_3, S_4, S_5).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(String.class).numNulls(3).numDVs(2).avgColLen(17.5).maxColLen(18)
+ .hll(S_6, S_7).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ StringColumnStatsAggregator aggregator = new StringColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ // the aggregation does not update hll, only numNDVs is, it keeps the
first hll
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(String.class).numNulls(6).numDVs(7).avgColLen(17.5).maxColLen(18)
+ .hll(S_1, S_2, S_3).build();
+
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenUnmergeableBitVectors() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(String.class).numNulls(1).numDVs(3).avgColLen(20.0 /
3).maxColLen(13)
+ .fmSketch(S_1, S_2, S_3).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(String.class).numNulls(2).numDVs(3).avgColLen(14).maxColLen(18)
+ .hll(S_3, S_4, S_5).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(String.class).numNulls(3).numDVs(2).avgColLen(17.5).maxColLen(18)
+ .hll(S_6, S_7).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ StringColumnStatsAggregator aggregator = new StringColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ // the aggregation does not update the bitvector, only numDVs is, it keeps
the first bitvector;
+ // numDVs is set to the maximum among all stats when non-mergeable
bitvectors are detected
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(String.class).numNulls(6).numDVs(3).avgColLen(17.5).maxColLen(18)
+ .fmSketch(S_1, S_2, S_3).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ // both useDensityFunctionForNDVEstimation and ndvTuner are ignored by
StringColumnStatsAggregator
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ double[] tunerValues = new double[] { 0, 0.5, 0.75, 1 };
+ for (int i = 0; i < tunerValues.length; i++) {
+ aggregator.ndvTuner = tunerValues[i];
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenOnlySomeAvailable() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3",
"part4");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(String.class).numNulls(1).numDVs(3).avgColLen(20.0 /
3).maxColLen(13)
+ .hll(S_1, S_2, S_3).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(String.class).numNulls(3).numDVs(2).avgColLen(17.5).maxColLen(18)
+ .hll(S_6, S_7).build();
+ ColumnStatisticsData data4 = new
ColStatsBuilder<>(String.class).numNulls(2).numDVs(3).avgColLen(14).maxColLen(18)
+ .hll(S_3, S_4, S_5).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)),
+ createStatsWithInfo(data4, TABLE, COL, partitions.get(3)));
+
+ StringColumnStatsAggregator aggregator = new StringColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(String.class).numNulls(8).numDVs(6)
+ .avgColLen(24).maxColLen(24).build();
+
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void
testAggregateMultiStatsOnlySomeAvailableButUnmergeableBitVector() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(String.class).numNulls(1).numDVs(3).avgColLen(20.0 /
3).maxColLen(13)
+ .fmSketch(S_1, S_2, S_3).build();
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(String.class).numNulls(3).numDVs(2).avgColLen(17.5).maxColLen(18)
+ .hll(S_6, S_7).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ StringColumnStatsAggregator aggregator = new StringColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(String.class).numNulls(6).numDVs(3)
+ .avgColLen(22.916666666666668).maxColLen(22).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ // both useDensityFunctionForNDVEstimation and ndvTuner are ignored by
StringColumnStatsAggregator
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, false);
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+}
diff --git
a/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/TimestampColumnStatsAggregatorTest.java
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/TimestampColumnStatsAggregatorTest.java
new file mode 100644
index 00000000000..e6217eb118b
--- /dev/null
+++
b/standalone-metastore/metastore-server/src/test/java/org/apache/hadoop/hive/metastore/columnstats/aggr/TimestampColumnStatsAggregatorTest.java
@@ -0,0 +1,273 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hadoop.hive.metastore.columnstats.aggr;
+
+import org.apache.hadoop.hive.metastore.TableType;
+import org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.Table;
+import org.apache.hadoop.hive.metastore.api.Timestamp;
+import org.apache.hadoop.hive.metastore.columnstats.ColStatsBuilder;
+import
org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+
+import static
org.apache.hadoop.hive.metastore.StatisticsTestUtils.createStatsWithInfo;
+
+@Category(MetastoreUnitTest.class)
+public class TimestampColumnStatsAggregatorTest {
+
+ private static final Table TABLE = new Table("dummy", "db", "hive", 0, 0,
+ 0, null, null, Collections.emptyMap(), null, null,
+ TableType.MANAGED_TABLE.toString());
+ private static final FieldSchema COL = new FieldSchema("col", "timestamp",
"");
+
+ private static final Timestamp TS_1 = new Timestamp(1);
+ private static final Timestamp TS_2 = new Timestamp(2);
+ private static final Timestamp TS_3 = new Timestamp(3);
+ private static final Timestamp TS_4 = new Timestamp(4);
+ private static final Timestamp TS_5 = new Timestamp(5);
+ private static final Timestamp TS_6 = new Timestamp(6);
+ private static final Timestamp TS_7 = new Timestamp(7);
+ private static final Timestamp TS_8 = new Timestamp(8);
+ private static final Timestamp TS_9 = new Timestamp(9);
+
+ @Test
+ public void testAggregateSingleStat() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Timestamp.class).numNulls(1).numDVs(2).low(TS_1)
+ .high(TS_3).hll(TS_1.getSecondsSinceEpoch(),
TS_3.getSecondsSinceEpoch()).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ TimestampColumnStatsAggregator aggregator = new
TimestampColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateSingleStatWhenNullValues() throws MetaException {
+ List<String> partitions = Collections.singletonList("part1");
+
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Timestamp.class).numNulls(1).numDVs(2).build();
+ List<ColStatsObjWithSourceInfo> statsList =
+ Collections.singletonList(createStatsWithInfo(data1, TABLE, COL,
partitions.get(0)));
+
+ TimestampColumnStatsAggregator aggregator = new
TimestampColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ aggregator.ndvTuner = 1;
+ // ndv tuner does not have any effect because min numDVs and max numDVs
coincide (we have a single stats)
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ Assert.assertEquals(data1, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultipleStatsWhenSomeNullValues() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2");
+
+ long[] values1 = { TS_1.getSecondsSinceEpoch(),
TS_2.getSecondsSinceEpoch() };
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Timestamp.class).numNulls(1).numDVs(2)
+ .low(TS_1).high(TS_2).hll(values1).build();
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(Timestamp.class).numNulls(2).numDVs(3).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)));
+
+ TimestampColumnStatsAggregator aggregator = new
TimestampColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Timestamp.class).numNulls(3).numDVs(3)
+ .low(TS_1).high(TS_2).hll(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new
ColStatsBuilder<>(Timestamp.class).numNulls(3).numDVs(4)
+ .low(TS_1).high(TS_2).hll(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ aggregator.ndvTuner = 1;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new
ColStatsBuilder<>(Timestamp.class).numNulls(3).numDVs(5)
+ .low(TS_1).high(TS_2).hll(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenAllAvailable() throws MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ long[] values1 = { TS_1.getSecondsSinceEpoch(),
TS_2.getSecondsSinceEpoch(), TS_3.getSecondsSinceEpoch() };
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Timestamp.class).numNulls(1).numDVs(2)
+ .low(TS_1).high(TS_3).hll(values1).build();
+
+ long[] values2 = { TS_3.getSecondsSinceEpoch(),
TS_4.getSecondsSinceEpoch(), TS_5.getSecondsSinceEpoch() };
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(Timestamp.class).numNulls(2).numDVs(3)
+ .low(TS_3).high(TS_5).hll(values2).build();
+
+ long[] values3 = { TS_6.getSecondsSinceEpoch(),
TS_7.getSecondsSinceEpoch() };
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Timestamp.class).numNulls(3).numDVs(2)
+ .low(TS_6).high(TS_7).hll(values3).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ TimestampColumnStatsAggregator aggregator = new
TimestampColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+
+ // the aggregation does not update hll, only numDVs is, it keeps the first
hll
+ // notice that numDVs is computed by using HLL, it can detect that 'TS_3'
appears twice
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Timestamp.class).numNulls(6).numDVs(7)
+ .low(TS_1).high(TS_7).hll(values1).build();
+
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenUnmergeableBitVectors() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ long[] values1 = { TS_1.getSecondsSinceEpoch(),
TS_2.getSecondsSinceEpoch(), TS_3.getSecondsSinceEpoch() };
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Timestamp.class).numNulls(1).numDVs(3)
+ .low(TS_1).high(TS_3).fmSketch(values1).build();
+
+ long[] values2 = { TS_3.getSecondsSinceEpoch(),
TS_4.getSecondsSinceEpoch(), TS_5.getSecondsSinceEpoch() };
+ ColumnStatisticsData data2 = new
ColStatsBuilder<>(Timestamp.class).numNulls(2).numDVs(3).low(TS_3).high(TS_5)
+ .hll(values2).build();
+
+ long[] values3 = { TS_1.getSecondsSinceEpoch(),
TS_2.getSecondsSinceEpoch(), TS_6.getSecondsSinceEpoch(),
+ TS_8.getSecondsSinceEpoch() };
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Timestamp.class).numNulls(3).numDVs(4).low(TS_1)
+ .high(TS_8).hll(values3).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data2, TABLE, COL, partitions.get(1)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ TimestampColumnStatsAggregator aggregator = new
TimestampColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, true);
+ // the aggregation does not update the bitvector, only numDVs is, it keeps
the first bitvector;
+ // numDVs is set to the maximum among all stats when non-mergeable
bitvectors are detected
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Timestamp.class).numNulls(6).numDVs(4).low(TS_1)
+ .high(TS_8).fmSketch(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ // the use of the density function leads to a different estimation for
numNDV
+ expectedStats = new
ColStatsBuilder<>(Timestamp.class).numNulls(6).numDVs(6).low(TS_1).high(TS_8)
+ .fmSketch(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = false;
+ double[] tunerValues = new double[] { 0, 0.5, 0.75, 1 };
+ long[] expectedDVs = new long[] { 4, 7, 8, 10 };
+ for (int i = 0; i < tunerValues.length; i++) {
+ aggregator.ndvTuner = tunerValues[i];
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ expectedStats = new
ColStatsBuilder<>(Timestamp.class).numNulls(6).numDVs(expectedDVs[i])
+ .low(TS_1).high(TS_8).fmSketch(values1).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+ }
+
+ @Test
+ public void testAggregateMultiStatsWhenOnlySomeAvailable() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3",
"part4");
+
+ long[] values1 = { TS_1.getSecondsSinceEpoch(),
TS_2.getSecondsSinceEpoch(), TS_3.getSecondsSinceEpoch() };
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Timestamp.class).numNulls(1).numDVs(3)
+ .low(TS_1).high(TS_3).hll(values1).build();
+
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Timestamp.class).numNulls(3).numDVs(1)
+ .low(TS_7).high(TS_7).hll(TS_7.getSecondsSinceEpoch()).build();
+
+ long[] values4 = { TS_3.getSecondsSinceEpoch(),
TS_4.getSecondsSinceEpoch(), TS_5.getSecondsSinceEpoch() };
+ ColumnStatisticsData data4 = new
ColStatsBuilder<>(Timestamp.class).numNulls(2).numDVs(3).low(TS_3).high(TS_5)
+ .hll(values4).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)),
+ createStatsWithInfo(data4, TABLE, COL, partitions.get(3)));
+
+ TimestampColumnStatsAggregator aggregator = new
TimestampColumnStatsAggregator();
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Timestamp.class).numNulls(8).numDVs(4).low(TS_1)
+ .high(TS_9).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+
+ @Test
+ public void
testAggregateMultiStatsOnlySomeAvailableButUnmergeableBitVector() throws
MetaException {
+ List<String> partitions = Arrays.asList("part1", "part2", "part3");
+
+ long[] values1 = { TS_1.getSecondsSinceEpoch(),
TS_2.getSecondsSinceEpoch(), TS_6.getSecondsSinceEpoch() };
+ ColumnStatisticsData data1 = new
ColStatsBuilder<>(Timestamp.class).numNulls(1).numDVs(3)
+ .low(TS_1).high(TS_6).hll(values1).build();
+
+ ColumnStatisticsData data3 = new
ColStatsBuilder<>(Timestamp.class).numNulls(3).numDVs(1)
+ .low(TS_7).high(TS_7).hll(TS_7.getSecondsSinceEpoch()).build();
+
+ List<ColStatsObjWithSourceInfo> statsList = Arrays.asList(
+ createStatsWithInfo(data1, TABLE, COL, partitions.get(0)),
+ createStatsWithInfo(data3, TABLE, COL, partitions.get(2)));
+
+ TimestampColumnStatsAggregator aggregator = new
TimestampColumnStatsAggregator();
+
+ ColumnStatisticsObj computedStatsObj = aggregator.aggregate(statsList,
partitions, false);
+ // hll in case of missing stats is left as null, only numDVs is updated
+ ColumnStatisticsData expectedStats = new
ColStatsBuilder<>(Timestamp.class).numNulls(6).numDVs(3).low(TS_1)
+ .high(TS_7).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+
+ aggregator.useDensityFunctionForNDVEstimation = true;
+ computedStatsObj = aggregator.aggregate(statsList, partitions, true);
+ // the use of the density function leads to a different estimation for
numNDV
+ expectedStats = new
ColStatsBuilder<>(Timestamp.class).numNulls(6).numDVs(4).low(TS_1)
+ .high(TS_7).build();
+ Assert.assertEquals(expectedStats, computedStatsObj.getStatsData());
+ }
+}