http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java deleted file mode 100644 index ac7e8e3..0000000 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DecimalColumnStatsAggregator.java +++ /dev/null @@ -1,375 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.hadoop.hive.metastore.columnstats.aggr; - -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; -import org.apache.hadoop.hive.metastore.StatObjectConverter; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; -import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData; -import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.utils.DecimalUtils; -import org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector; -import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; -import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.ColStatsObjWithSourceInfo; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class DecimalColumnStatsAggregator extends ColumnStatsAggregator implements - IExtrapolatePartStatus { - - private static final Logger LOG = LoggerFactory.getLogger(DecimalColumnStatsAggregator.class); - - @Override - public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo, - List<String> partNames, boolean areAllPartsFound) throws MetaException { - ColumnStatisticsObj statsObj = null; - String colType = null; - String colName = null; - // check if all the ColumnStatisticsObjs contain stats and all the ndv are - // bitvectors - boolean doAllPartitionContainStats = partNames.size() == colStatsWithSourceInfo.size(); - NumDistinctValueEstimator ndvEstimator = null; - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - if (statsObj == null) { - colName = cso.getColName(); - 
colType = cso.getColType(); - statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, - cso.getStatsData().getSetField()); - LOG.trace("doAllPartitionContainStats for column: {} is: {}", colName, - doAllPartitionContainStats); - } - DecimalColumnStatsDataInspector decimalColumnStatsData = - (DecimalColumnStatsDataInspector) cso.getStatsData().getDecimalStats(); - if (decimalColumnStatsData.getNdvEstimator() == null) { - ndvEstimator = null; - break; - } else { - // check if all of the bit vectors can merge - NumDistinctValueEstimator estimator = decimalColumnStatsData.getNdvEstimator(); - if (ndvEstimator == null) { - ndvEstimator = estimator; - } else { - if (ndvEstimator.canMerge(estimator)) { - continue; - } else { - ndvEstimator = null; - break; - } - } - } - } - if (ndvEstimator != null) { - ndvEstimator = NumDistinctValueEstimatorFactory - .getEmptyNumDistinctValueEstimator(ndvEstimator); - } - LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); - ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); - if (doAllPartitionContainStats || colStatsWithSourceInfo.size() < 2) { - DecimalColumnStatsDataInspector aggregateData = null; - long lowerBound = 0; - long higherBound = 0; - double densityAvgSum = 0.0; - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - DecimalColumnStatsDataInspector newData = - (DecimalColumnStatsDataInspector) cso.getStatsData().getDecimalStats(); - lowerBound = Math.max(lowerBound, newData.getNumDVs()); - higherBound += newData.getNumDVs(); - densityAvgSum += (MetaStoreUtils.decimalToDouble(newData.getHighValue()) - MetaStoreUtils - .decimalToDouble(newData.getLowValue())) / newData.getNumDVs(); - if (ndvEstimator != null) { - ndvEstimator.mergeEstimators(newData.getNdvEstimator()); - } - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - if (MetaStoreUtils.decimalToDouble(aggregateData.getLowValue()) < MetaStoreUtils - .decimalToDouble(newData.getLowValue())) { - aggregateData.setLowValue(aggregateData.getLowValue()); - } else { - aggregateData.setLowValue(newData.getLowValue()); - } - if (MetaStoreUtils.decimalToDouble(aggregateData.getHighValue()) > MetaStoreUtils - .decimalToDouble(newData.getHighValue())) { - aggregateData.setHighValue(aggregateData.getHighValue()); - } else { - aggregateData.setHighValue(newData.getHighValue()); - } - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs())); - } - } - if (ndvEstimator != null) { - // if all the ColumnStatisticsObjs contain bitvectors, we do not need to - // use uniform distribution assumption because we can merge bitvectors - // to get a good estimation. - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - } else { - long estimation; - if (useDensityFunctionForNDVEstimation) { - // We have estimation, lowerbound and higherbound. We use estimation - // if it is between lowerbound and higherbound. 
- double densityAvg = densityAvgSum / partNames.size(); - estimation = (long) ((MetaStoreUtils.decimalToDouble(aggregateData.getHighValue()) - MetaStoreUtils - .decimalToDouble(aggregateData.getLowValue())) / densityAvg); - if (estimation < lowerBound) { - estimation = lowerBound; - } else if (estimation > higherBound) { - estimation = higherBound; - } - } else { - estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); - } - aggregateData.setNumDVs(estimation); - } - columnStatisticsData.setDecimalStats(aggregateData); - } else { - // we need extrapolation - LOG.debug("start extrapolation for " + colName); - Map<String, Integer> indexMap = new HashMap<>(); - for (int index = 0; index < partNames.size(); index++) { - indexMap.put(partNames.get(index), index); - } - Map<String, Double> adjustedIndexMap = new HashMap<>(); - Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<>(); - // while we scan the css, we also get the densityAvg, lowerbound and - // higerbound when useDensityFunctionForNDVEstimation is true. - double densityAvgSum = 0.0; - if (ndvEstimator == null) { - // if not every partition uses bitvector for ndv, we just fall back to - // the traditional extrapolation methods. - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - String partName = csp.getPartName(); - DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats(); - if (useDensityFunctionForNDVEstimation) { - densityAvgSum += (MetaStoreUtils.decimalToDouble(newData.getHighValue()) - MetaStoreUtils - .decimalToDouble(newData.getLowValue())) / newData.getNumDVs(); - } - adjustedIndexMap.put(partName, (double) indexMap.get(partName)); - adjustedStatsMap.put(partName, cso.getStatsData()); - } - } else { - // we first merge all the adjacent bitvectors that we could merge and - // derive new partition names and index. - StringBuilder pseudoPartName = new StringBuilder(); - double pseudoIndexSum = 0; - int length = 0; - int curIndex = -1; - DecimalColumnStatsDataInspector aggregateData = null; - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - String partName = csp.getPartName(); - DecimalColumnStatsDataInspector newData = - (DecimalColumnStatsDataInspector) cso.getStatsData().getDecimalStats(); - // newData.isSetBitVectors() should be true for sure because we - // already checked it before. - if (indexMap.get(partName) != curIndex) { - // There is bitvector, but it is not adjacent to the previous ones. 
- if (length > 0) { - // we have to set ndv - adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - ColumnStatisticsData csd = new ColumnStatisticsData(); - csd.setDecimalStats(aggregateData); - adjustedStatsMap.put(pseudoPartName.toString(), csd); - if (useDensityFunctionForNDVEstimation) { - densityAvgSum += (MetaStoreUtils.decimalToDouble(aggregateData.getHighValue()) - MetaStoreUtils - .decimalToDouble(aggregateData.getLowValue())) / aggregateData.getNumDVs(); - } - // reset everything - pseudoPartName = new StringBuilder(); - pseudoIndexSum = 0; - length = 0; - ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator); - } - aggregateData = null; - } - curIndex = indexMap.get(partName); - pseudoPartName.append(partName); - pseudoIndexSum += curIndex; - length++; - curIndex++; - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - if (MetaStoreUtils.decimalToDouble(aggregateData.getLowValue()) < MetaStoreUtils - .decimalToDouble(newData.getLowValue())) { - aggregateData.setLowValue(aggregateData.getLowValue()); - } else { - aggregateData.setLowValue(newData.getLowValue()); - } - if (MetaStoreUtils.decimalToDouble(aggregateData.getHighValue()) > MetaStoreUtils - .decimalToDouble(newData.getHighValue())) { - aggregateData.setHighValue(aggregateData.getHighValue()); - } else { - aggregateData.setHighValue(newData.getHighValue()); - } - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - } - ndvEstimator.mergeEstimators(newData.getNdvEstimator()); - } - if (length > 0) { - // we have to set ndv - adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - ColumnStatisticsData csd = new ColumnStatisticsData(); - csd.setDecimalStats(aggregateData); - adjustedStatsMap.put(pseudoPartName.toString(), csd); - if (useDensityFunctionForNDVEstimation) { - densityAvgSum += (MetaStoreUtils.decimalToDouble(aggregateData.getHighValue()) - MetaStoreUtils - .decimalToDouble(aggregateData.getLowValue())) / aggregateData.getNumDVs(); - } - } - } - extrapolate(columnStatisticsData, partNames.size(), colStatsWithSourceInfo.size(), - adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size()); - } - LOG.debug( - "Ndv estimatation for {} is {} # of partitions requested: {} # of partitions found: {}", - colName, columnStatisticsData.getDecimalStats().getNumDVs(), partNames.size(), - colStatsWithSourceInfo.size()); - statsObj.setStatsData(columnStatisticsData); - return statsObj; - } - - @Override - public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, - int numPartsWithStats, Map<String, Double> adjustedIndexMap, - Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) { - int rightBorderInd = numParts; - DecimalColumnStatsDataInspector extrapolateDecimalData = new DecimalColumnStatsDataInspector(); - Map<String, DecimalColumnStatsData> extractedAdjustedStatsMap = new HashMap<>(); - for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) { - extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getDecimalStats()); - } - List<Map.Entry<String, DecimalColumnStatsData>> list = new LinkedList<>( - extractedAdjustedStatsMap.entrySet()); - // get the lowValue - Collections.sort(list, new Comparator<Map.Entry<String, DecimalColumnStatsData>>() { - @Override - 
public int compare(Map.Entry<String, DecimalColumnStatsData> o1, - Map.Entry<String, DecimalColumnStatsData> o2) { - return o1.getValue().getLowValue().compareTo(o2.getValue().getLowValue()); - } - }); - double minInd = adjustedIndexMap.get(list.get(0).getKey()); - double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - double lowValue = 0; - double min = MetaStoreUtils.decimalToDouble(list.get(0).getValue().getLowValue()); - double max = MetaStoreUtils.decimalToDouble(list.get(list.size() - 1).getValue().getLowValue()); - if (minInd == maxInd) { - lowValue = min; - } else if (minInd < maxInd) { - // left border is the min - lowValue = (max - (max - min) * maxInd / (maxInd - minInd)); - } else { - // right border is the min - lowValue = (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd)); - } - - // get the highValue - Collections.sort(list, new Comparator<Map.Entry<String, DecimalColumnStatsData>>() { - @Override - public int compare(Map.Entry<String, DecimalColumnStatsData> o1, - Map.Entry<String, DecimalColumnStatsData> o2) { - return o1.getValue().getHighValue().compareTo(o2.getValue().getHighValue()); - } - }); - minInd = adjustedIndexMap.get(list.get(0).getKey()); - maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - double highValue = 0; - min = MetaStoreUtils.decimalToDouble(list.get(0).getValue().getHighValue()); - max = MetaStoreUtils.decimalToDouble(list.get(list.size() - 1).getValue().getHighValue()); - if (minInd == maxInd) { - highValue = min; - } else if (minInd < maxInd) { - // right border is the max - highValue = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); - } else { - // left border is the max - highValue = (min + (max - min) * minInd / (minInd - maxInd)); - } - - // get the #nulls - long numNulls = 0; - for (Map.Entry<String, DecimalColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) { - numNulls += entry.getValue().getNumNulls(); - } - // we scale up sumNulls based on the number of partitions - numNulls = numNulls * numParts / numPartsWithStats; - - // get the ndv - long ndv = 0; - long ndvMin = 0; - long ndvMax = 0; - Collections.sort(list, new Comparator<Map.Entry<String, DecimalColumnStatsData>>() { - @Override - public int compare(Map.Entry<String, DecimalColumnStatsData> o1, - Map.Entry<String, DecimalColumnStatsData> o2) { - return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs()); - } - }); - long lowerBound = list.get(list.size() - 1).getValue().getNumDVs(); - long higherBound = 0; - for (Map.Entry<String, DecimalColumnStatsData> entry : list) { - higherBound += entry.getValue().getNumDVs(); - } - if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) { - ndv = (long) ((highValue - lowValue) / densityAvg); - if (ndv < lowerBound) { - ndv = lowerBound; - } else if (ndv > higherBound) { - ndv = higherBound; - } - } else { - minInd = adjustedIndexMap.get(list.get(0).getKey()); - maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - ndvMin = list.get(0).getValue().getNumDVs(); - ndvMax = list.get(list.size() - 1).getValue().getNumDVs(); - if (minInd == maxInd) { - ndv = ndvMin; - } else if (minInd < maxInd) { - // right border is the max - ndv = (long) (ndvMin + (ndvMax - ndvMin) * (rightBorderInd - minInd) / (maxInd - minInd)); - } else { - // left border is the max - ndv = (long) (ndvMin + (ndvMax - ndvMin) * minInd / (minInd - maxInd)); - } - } - extrapolateDecimalData.setLowValue(DecimalUtils.createThriftDecimal(String - 
.valueOf(lowValue))); - extrapolateDecimalData.setHighValue(DecimalUtils.createThriftDecimal(String - .valueOf(highValue))); - extrapolateDecimalData.setNumNulls(numNulls); - extrapolateDecimalData.setNumDVs(ndv); - extrapolateData.setDecimalStats(extrapolateDecimalData); - } -}
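The aggregators removed in this change all share one NDV fallback: when the per-partition bitvectors cannot be merged, the aggregate NDV is either derived from the average density ((highValue - lowValue) / numDVs, averaged over the partitions) and clamped to the interval [max of per-partition NDVs, sum of per-partition NDVs], or linearly interpolated between those two bounds by ndvTuner. A minimal, self-contained sketch of that fallback follows; the class and method names here are illustrative only, not part of the metastore API:

import java.util.List;

// Standalone sketch of the NDV fallback; illustrative names only.
public class NdvEstimationSketch {

  // Density-based estimate: ndv ~ (high - low) / avg(range_i / ndv_i),
  // clamped to [lowerBound, higherBound].
  static long densityEstimate(double low, double high, List<double[]> partStats) {
    long lowerBound = 0;   // max of the per-partition NDVs
    long higherBound = 0;  // sum of the per-partition NDVs
    double densityAvgSum = 0.0;
    for (double[] p : partStats) { // p = {low, high, ndv}
      long ndv = (long) p[2];
      lowerBound = Math.max(lowerBound, ndv);
      higherBound += ndv;
      densityAvgSum += (p[1] - p[0]) / ndv;
    }
    double densityAvg = densityAvgSum / partStats.size();
    long estimation = (long) ((high - low) / densityAvg);
    // use the density estimate only when it falls between the bounds
    return Math.max(lowerBound, Math.min(higherBound, estimation));
  }

  // Tuner-based estimate: linear interpolation between the bounds.
  // ndvTuner = 0 assumes every partition holds the same values;
  // ndvTuner = 1 assumes the partitions' value sets are disjoint.
  static long tunerEstimate(long lowerBound, long higherBound, double ndvTuner) {
    return (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
  }

  public static void main(String[] args) {
    List<double[]> parts = List.of(
        new double[] {0, 100, 50},    // partition 1: range 100, ndv 50
        new double[] {50, 150, 50});  // partition 2: range 100, ndv 50
    System.out.println(densityEstimate(0, 150, parts)); // 75, inside [50, 100]
    System.out.println(tunerEstimate(50, 100, 0.5));    // 75
  }
}

The two bounds encode the extreme assumptions that the partitions share all values (lower bound) or none (higher bound); the density estimate is trusted only when it lands between them.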
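The extrapolate() implementations likewise share a single geometric idea: among the partitions that do have stats, take the ones holding the smallest and largest value of a statistic, then extend that linear trend across the whole partition index range [0, numParts). A simplified sketch of the highValue case, again under the caveat that the names are illustrative:

// Standalone sketch of the index-based extrapolation used by the
// extrapolate() methods for highValue; illustrative names only.
public class ExtrapolationSketch {

  // min/max are the smallest and largest observed highValue among the
  // partitions with stats; minInd/maxInd are their (adjusted) partition
  // indices; rightBorder is the total number of partitions.
  static double extrapolateHigh(double min, double minInd,
                                double max, double maxInd, int rightBorder) {
    if (minInd == maxInd) {
      return min; // a single data point, nothing to extend
    } else if (minInd < maxInd) {
      // values grow with the partition index: project onto the right border
      return min + (max - min) * (rightBorder - minInd) / (maxInd - minInd);
    } else {
      // values shrink with the partition index: project onto the left border
      return min + (max - min) * minInd / (minInd - maxInd);
    }
  }

  public static void main(String[] args) {
    // highValue 20.0 observed at partition index 2 and 70.0 at index 7,
    // out of 10 partitions: the rising trend is extended to index 10.
    System.out.println(extrapolateHigh(20.0, 2, 70.0, 7, 10)); // 100.0
  }
}

When the trend rises toward higher partition indices, the maximum is projected onto the right border (numParts); when it falls, onto the left border (index 0). The same formula, with the borders swapped, produces lowValue, avgColLen, maxColLen, and the non-density NDV estimate in the files below.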
http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java
----------------------------------------------------------------------
diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java
deleted file mode 100644
index ece77dd..0000000
--- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DoubleColumnStatsAggregator.java
+++ /dev/null
@@ -1,348 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.hadoop.hive.metastore.columnstats.aggr;
-
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
-import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
-import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
-import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
-import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData;
-import org.apache.hadoop.hive.metastore.api.MetaException;
-import org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector;
-import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.ColStatsObjWithSourceInfo;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class DoubleColumnStatsAggregator extends ColumnStatsAggregator implements
-    IExtrapolatePartStatus {
-
-  private static final Logger LOG = LoggerFactory.getLogger(DoubleColumnStatsAggregator.class);
-
-  @Override
-  public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo,
-      List<String> partNames, boolean areAllPartsFound) throws MetaException {
-    ColumnStatisticsObj statsObj = null;
-    String colType = null;
-    String colName = null;
-    // check if all the ColumnStatisticsObjs contain stats and all the ndv are
-    // bitvectors
-    boolean doAllPartitionContainStats = partNames.size() == colStatsWithSourceInfo.size();
-    NumDistinctValueEstimator ndvEstimator = null;
-    for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
-      ColumnStatisticsObj cso = csp.getColStatsObj();
-      if (statsObj == null) {
-        colName = cso.getColName();
-        colType = cso.getColType();
-        statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType,
-            cso.getStatsData().getSetField());
-        LOG.trace("doAllPartitionContainStats for column: {} 
is: {}", colName, - doAllPartitionContainStats); - } - DoubleColumnStatsDataInspector doubleColumnStatsData = - (DoubleColumnStatsDataInspector) cso.getStatsData().getDoubleStats(); - if (doubleColumnStatsData.getNdvEstimator() == null) { - ndvEstimator = null; - break; - } else { - // check if all of the bit vectors can merge - NumDistinctValueEstimator estimator = doubleColumnStatsData.getNdvEstimator(); - if (ndvEstimator == null) { - ndvEstimator = estimator; - } else { - if (ndvEstimator.canMerge(estimator)) { - continue; - } else { - ndvEstimator = null; - break; - } - } - } - } - if (ndvEstimator != null) { - ndvEstimator = NumDistinctValueEstimatorFactory - .getEmptyNumDistinctValueEstimator(ndvEstimator); - } - LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); - ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); - if (doAllPartitionContainStats || colStatsWithSourceInfo.size() < 2) { - DoubleColumnStatsDataInspector aggregateData = null; - long lowerBound = 0; - long higherBound = 0; - double densityAvgSum = 0.0; - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - DoubleColumnStatsDataInspector newData = - (DoubleColumnStatsDataInspector) cso.getStatsData().getDoubleStats(); - lowerBound = Math.max(lowerBound, newData.getNumDVs()); - higherBound += newData.getNumDVs(); - densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); - if (ndvEstimator != null) { - ndvEstimator.mergeEstimators(newData.getNdvEstimator()); - } - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue())); - aggregateData - .setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue())); - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs())); - } - } - if (ndvEstimator != null) { - // if all the ColumnStatisticsObjs contain bitvectors, we do not need to - // use uniform distribution assumption because we can merge bitvectors - // to get a good estimation. - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - } else { - long estimation; - if (useDensityFunctionForNDVEstimation) { - // We have estimation, lowerbound and higherbound. We use estimation - // if it is between lowerbound and higherbound. - double densityAvg = densityAvgSum / partNames.size(); - estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg); - if (estimation < lowerBound) { - estimation = lowerBound; - } else if (estimation > higherBound) { - estimation = higherBound; - } - } else { - estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); - } - aggregateData.setNumDVs(estimation); - } - columnStatisticsData.setDoubleStats(aggregateData); - } else { - // we need extrapolation - LOG.debug("start extrapolation for " + colName); - Map<String, Integer> indexMap = new HashMap<>(); - for (int index = 0; index < partNames.size(); index++) { - indexMap.put(partNames.get(index), index); - } - Map<String, Double> adjustedIndexMap = new HashMap<>(); - Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<>(); - // while we scan the css, we also get the densityAvg, lowerbound and - // higerbound when useDensityFunctionForNDVEstimation is true. 
- double densityAvgSum = 0.0; - if (ndvEstimator == null) { - // if not every partition uses bitvector for ndv, we just fall back to - // the traditional extrapolation methods. - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - String partName = csp.getPartName(); - DoubleColumnStatsData newData = cso.getStatsData().getDoubleStats(); - if (useDensityFunctionForNDVEstimation) { - densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); - } - adjustedIndexMap.put(partName, (double) indexMap.get(partName)); - adjustedStatsMap.put(partName, cso.getStatsData()); - } - } else { - // we first merge all the adjacent bitvectors that we could merge and - // derive new partition names and index. - StringBuilder pseudoPartName = new StringBuilder(); - double pseudoIndexSum = 0; - int length = 0; - int curIndex = -1; - DoubleColumnStatsData aggregateData = null; - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - String partName = csp.getPartName(); - DoubleColumnStatsDataInspector newData = - (DoubleColumnStatsDataInspector) cso.getStatsData().getDoubleStats(); - // newData.isSetBitVectors() should be true for sure because we - // already checked it before. - if (indexMap.get(partName) != curIndex) { - // There is bitvector, but it is not adjacent to the previous ones. - if (length > 0) { - // we have to set ndv - adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - ColumnStatisticsData csd = new ColumnStatisticsData(); - csd.setDoubleStats(aggregateData); - adjustedStatsMap.put(pseudoPartName.toString(), csd); - if (useDensityFunctionForNDVEstimation) { - densityAvgSum += (aggregateData.getHighValue() - aggregateData.getLowValue()) / aggregateData.getNumDVs(); - } - // reset everything - pseudoPartName = new StringBuilder(); - pseudoIndexSum = 0; - length = 0; - ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator); - } - aggregateData = null; - } - curIndex = indexMap.get(partName); - pseudoPartName.append(partName); - pseudoIndexSum += curIndex; - length++; - curIndex++; - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue())); - aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), - newData.getHighValue())); - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - } - ndvEstimator.mergeEstimators(newData.getNdvEstimator()); - } - if (length > 0) { - // we have to set ndv - adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - ColumnStatisticsData csd = new ColumnStatisticsData(); - csd.setDoubleStats(aggregateData); - adjustedStatsMap.put(pseudoPartName.toString(), csd); - if (useDensityFunctionForNDVEstimation) { - densityAvgSum += (aggregateData.getHighValue() - aggregateData.getLowValue()) / aggregateData.getNumDVs(); - } - } - } - extrapolate(columnStatisticsData, partNames.size(), colStatsWithSourceInfo.size(), - adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size()); - } - LOG.debug( - "Ndv estimatation for {} is {}. # of partitions requested: {}. 
# of partitions found: {}", - colName, columnStatisticsData.getDoubleStats().getNumDVs(), partNames.size(), - colStatsWithSourceInfo.size()); - statsObj.setStatsData(columnStatisticsData); - return statsObj; - } - - @Override - public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, - int numPartsWithStats, Map<String, Double> adjustedIndexMap, - Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) { - int rightBorderInd = numParts; - DoubleColumnStatsDataInspector extrapolateDoubleData = new DoubleColumnStatsDataInspector(); - Map<String, DoubleColumnStatsData> extractedAdjustedStatsMap = new HashMap<>(); - for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) { - extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getDoubleStats()); - } - List<Map.Entry<String, DoubleColumnStatsData>> list = new LinkedList<>( - extractedAdjustedStatsMap.entrySet()); - // get the lowValue - Collections.sort(list, new Comparator<Map.Entry<String, DoubleColumnStatsData>>() { - @Override - public int compare(Map.Entry<String, DoubleColumnStatsData> o1, - Map.Entry<String, DoubleColumnStatsData> o2) { - return Double.compare(o1.getValue().getLowValue(), o2.getValue().getLowValue()); - } - }); - double minInd = adjustedIndexMap.get(list.get(0).getKey()); - double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - double lowValue = 0; - double min = list.get(0).getValue().getLowValue(); - double max = list.get(list.size() - 1).getValue().getLowValue(); - if (minInd == maxInd) { - lowValue = min; - } else if (minInd < maxInd) { - // left border is the min - lowValue = (max - (max - min) * maxInd / (maxInd - minInd)); - } else { - // right border is the min - lowValue = (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd)); - } - - // get the highValue - Collections.sort(list, new Comparator<Map.Entry<String, DoubleColumnStatsData>>() { - @Override - public int compare(Map.Entry<String, DoubleColumnStatsData> o1, - Map.Entry<String, DoubleColumnStatsData> o2) { - return Double.compare(o1.getValue().getHighValue(), o2.getValue().getHighValue()); - } - }); - minInd = adjustedIndexMap.get(list.get(0).getKey()); - maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - double highValue = 0; - min = list.get(0).getValue().getHighValue(); - max = list.get(list.size() - 1).getValue().getHighValue(); - if (minInd == maxInd) { - highValue = min; - } else if (minInd < maxInd) { - // right border is the max - highValue = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); - } else { - // left border is the max - highValue = (min + (max - min) * minInd / (minInd - maxInd)); - } - - // get the #nulls - long numNulls = 0; - for (Map.Entry<String, DoubleColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) { - numNulls += entry.getValue().getNumNulls(); - } - // we scale up sumNulls based on the number of partitions - numNulls = numNulls * numParts / numPartsWithStats; - - // get the ndv - long ndv = 0; - long ndvMin = 0; - long ndvMax = 0; - Collections.sort(list, new Comparator<Map.Entry<String, DoubleColumnStatsData>>() { - @Override - public int compare(Map.Entry<String, DoubleColumnStatsData> o1, - Map.Entry<String, DoubleColumnStatsData> o2) { - return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs()); - } - }); - long lowerBound = list.get(list.size() - 1).getValue().getNumDVs(); - long higherBound = 0; - for (Map.Entry<String, DoubleColumnStatsData> entry : 
list) { - higherBound += entry.getValue().getNumDVs(); - } - if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) { - ndv = (long) ((highValue - lowValue) / densityAvg); - if (ndv < lowerBound) { - ndv = lowerBound; - } else if (ndv > higherBound) { - ndv = higherBound; - } - } else { - minInd = adjustedIndexMap.get(list.get(0).getKey()); - maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - ndvMin = list.get(0).getValue().getNumDVs(); - ndvMax = list.get(list.size() - 1).getValue().getNumDVs(); - if (minInd == maxInd) { - ndv = ndvMin; - } else if (minInd < maxInd) { - // right border is the max - ndv = (long) (ndvMin + (ndvMax - ndvMin) * (rightBorderInd - minInd) / (maxInd - minInd)); - } else { - // left border is the max - ndv = (long) (ndvMin + (ndvMax - ndvMin) * minInd / (minInd - maxInd)); - } - } - extrapolateDoubleData.setLowValue(lowValue); - extrapolateDoubleData.setHighValue(highValue); - extrapolateDoubleData.setNumNulls(numNulls); - extrapolateDoubleData.setNumDVs(ndv); - extrapolateData.setDoubleStats(extrapolateDoubleData); - } - -} http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/IExtrapolatePartStatus.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/IExtrapolatePartStatus.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/IExtrapolatePartStatus.java deleted file mode 100644 index 98a121b..0000000 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/IExtrapolatePartStatus.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.metastore.columnstats.aggr; - -import java.util.Map; - -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; - -public interface IExtrapolatePartStatus { - // The following function will extrapolate the stats when the column stats of - // some partitions are missing. - /** - * @param extrapolateData - * it will carry back the specific stats, e.g., DOUBLE_STATS or - * LONG_STATS - * @param numParts - * the total number of partitions - * @param numPartsWithStats - * the number of partitions that have stats - * @param adjustedIndexMap - * the partition name to index map - * @param adjustedStatsMap - * the partition name to its stats map - * @param densityAvg - * the average of ndv density, which is useful when - * useDensityFunctionForNDVEstimation is true. 
- */ - void extrapolate(ColumnStatisticsData extrapolateData, int numParts, - int numPartsWithStats, Map<String, Double> adjustedIndexMap, - Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg); - -} http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java deleted file mode 100644 index e6823d3..0000000 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/LongColumnStatsAggregator.java +++ /dev/null @@ -1,348 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -package org.apache.hadoop.hive.metastore.columnstats.aggr; - -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; - -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; -import org.apache.hadoop.hive.metastore.api.ColumnStatistics; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; -import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; -import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector; -import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.ColStatsObjWithSourceInfo; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -public class LongColumnStatsAggregator extends ColumnStatsAggregator implements - IExtrapolatePartStatus { - - private static final Logger LOG = LoggerFactory.getLogger(LongColumnStatsAggregator.class); - - @Override - public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo, - List<String> partNames, boolean areAllPartsFound) throws MetaException { - ColumnStatisticsObj statsObj = null; - String colType = null; - String colName = null; - // check if all the ColumnStatisticsObjs contain stats and all the ndv are - // bitvectors - boolean doAllPartitionContainStats = partNames.size() == colStatsWithSourceInfo.size(); - NumDistinctValueEstimator ndvEstimator = null; - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - if (statsObj == null) { - colName = cso.getColName(); - colType = cso.getColType(); - statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, - cso.getStatsData().getSetField()); - LOG.trace("doAllPartitionContainStats for column: {} is: {}", colName, - doAllPartitionContainStats); - } - LongColumnStatsDataInspector longColumnStatsData = - (LongColumnStatsDataInspector) cso.getStatsData().getLongStats(); - if (longColumnStatsData.getNdvEstimator() == null) { - ndvEstimator = null; - break; - } else { - // check if all of the bit vectors can merge - NumDistinctValueEstimator estimator = longColumnStatsData.getNdvEstimator(); - if (ndvEstimator == null) { - ndvEstimator = estimator; - } else { - if (ndvEstimator.canMerge(estimator)) { - continue; - } else { - ndvEstimator = null; - break; - } - } - } - } - if (ndvEstimator != null) { - ndvEstimator = NumDistinctValueEstimatorFactory - .getEmptyNumDistinctValueEstimator(ndvEstimator); - } - LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); - ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); - if (doAllPartitionContainStats || colStatsWithSourceInfo.size() < 2) { - LongColumnStatsDataInspector aggregateData = null; - long lowerBound = 0; - long higherBound = 0; - double densityAvgSum = 0.0; - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - LongColumnStatsDataInspector newData = - (LongColumnStatsDataInspector) cso.getStatsData().getLongStats(); - lowerBound = Math.max(lowerBound, newData.getNumDVs()); - higherBound += newData.getNumDVs(); - densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); - if (ndvEstimator != null) 
{ - ndvEstimator.mergeEstimators(newData.getNdvEstimator()); - } - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue())); - aggregateData - .setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue())); - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs())); - } - } - if (ndvEstimator != null) { - // if all the ColumnStatisticsObjs contain bitvectors, we do not need to - // use uniform distribution assumption because we can merge bitvectors - // to get a good estimation. - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - } else { - long estimation; - if (useDensityFunctionForNDVEstimation) { - // We have estimation, lowerbound and higherbound. We use estimation - // if it is between lowerbound and higherbound. - double densityAvg = densityAvgSum / partNames.size(); - estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg); - if (estimation < lowerBound) { - estimation = lowerBound; - } else if (estimation > higherBound) { - estimation = higherBound; - } - } else { - estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); - } - aggregateData.setNumDVs(estimation); - } - columnStatisticsData.setLongStats(aggregateData); - } else { - // we need extrapolation - LOG.debug("start extrapolation for " + colName); - - Map<String, Integer> indexMap = new HashMap<>(); - for (int index = 0; index < partNames.size(); index++) { - indexMap.put(partNames.get(index), index); - } - Map<String, Double> adjustedIndexMap = new HashMap<>(); - Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<>(); - // while we scan the css, we also get the densityAvg, lowerbound and - // higerbound when useDensityFunctionForNDVEstimation is true. - double densityAvgSum = 0.0; - if (ndvEstimator == null) { - // if not every partition uses bitvector for ndv, we just fall back to - // the traditional extrapolation methods. - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - String partName = csp.getPartName(); - LongColumnStatsData newData = cso.getStatsData().getLongStats(); - if (useDensityFunctionForNDVEstimation) { - densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); - } - adjustedIndexMap.put(partName, (double) indexMap.get(partName)); - adjustedStatsMap.put(partName, cso.getStatsData()); - } - } else { - // we first merge all the adjacent bitvectors that we could merge and - // derive new partition names and index. - StringBuilder pseudoPartName = new StringBuilder(); - double pseudoIndexSum = 0; - int length = 0; - int curIndex = -1; - LongColumnStatsDataInspector aggregateData = null; - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - String partName = csp.getPartName(); - LongColumnStatsDataInspector newData = - (LongColumnStatsDataInspector) cso.getStatsData().getLongStats(); - // newData.isSetBitVectors() should be true for sure because we - // already checked it before. - if (indexMap.get(partName) != curIndex) { - // There is bitvector, but it is not adjacent to the previous ones. 
- if (length > 0) { - // we have to set ndv - adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - ColumnStatisticsData csd = new ColumnStatisticsData(); - csd.setLongStats(aggregateData); - adjustedStatsMap.put(pseudoPartName.toString(), csd); - if (useDensityFunctionForNDVEstimation) { - densityAvgSum += (aggregateData.getHighValue() - aggregateData.getLowValue()) / aggregateData.getNumDVs(); - } - // reset everything - pseudoPartName = new StringBuilder(); - pseudoIndexSum = 0; - length = 0; - ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator); - } - aggregateData = null; - } - curIndex = indexMap.get(partName); - pseudoPartName.append(partName); - pseudoIndexSum += curIndex; - length++; - curIndex++; - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue())); - aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), - newData.getHighValue())); - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - } - ndvEstimator.mergeEstimators(newData.getNdvEstimator()); - } - if (length > 0) { - // we have to set ndv - adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - ColumnStatisticsData csd = new ColumnStatisticsData(); - csd.setLongStats(aggregateData); - adjustedStatsMap.put(pseudoPartName.toString(), csd); - if (useDensityFunctionForNDVEstimation) { - densityAvgSum += (aggregateData.getHighValue() - aggregateData.getLowValue()) / aggregateData.getNumDVs(); - } - } - } - extrapolate(columnStatisticsData, partNames.size(), colStatsWithSourceInfo.size(), - adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size()); - } - LOG.debug( - "Ndv estimatation for {} is {} # of partitions requested: {} # of partitions found: {}", - colName, columnStatisticsData.getLongStats().getNumDVs(), partNames.size(), - colStatsWithSourceInfo.size()); - statsObj.setStatsData(columnStatisticsData); - return statsObj; - } - - @Override - public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, - int numPartsWithStats, Map<String, Double> adjustedIndexMap, - Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) { - int rightBorderInd = numParts; - LongColumnStatsDataInspector extrapolateLongData = new LongColumnStatsDataInspector(); - Map<String, LongColumnStatsData> extractedAdjustedStatsMap = new HashMap<>(); - for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) { - extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getLongStats()); - } - List<Map.Entry<String, LongColumnStatsData>> list = new LinkedList<>( - extractedAdjustedStatsMap.entrySet()); - // get the lowValue - Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() { - @Override - public int compare(Map.Entry<String, LongColumnStatsData> o1, - Map.Entry<String, LongColumnStatsData> o2) { - return Long.compare(o1.getValue().getLowValue(), o2.getValue().getLowValue()); - } - }); - double minInd = adjustedIndexMap.get(list.get(0).getKey()); - double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - long lowValue = 0; - long min = list.get(0).getValue().getLowValue(); - long max = list.get(list.size() - 1).getValue().getLowValue(); - if (minInd == 
maxInd) { - lowValue = min; - } else if (minInd < maxInd) { - // left border is the min - lowValue = (long) (max - (max - min) * maxInd / (maxInd - minInd)); - } else { - // right border is the min - lowValue = (long) (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd)); - } - - // get the highValue - Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() { - @Override - public int compare(Map.Entry<String, LongColumnStatsData> o1, - Map.Entry<String, LongColumnStatsData> o2) { - return Long.compare(o1.getValue().getHighValue(), o2.getValue().getHighValue()); - } - }); - minInd = adjustedIndexMap.get(list.get(0).getKey()); - maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - long highValue = 0; - min = list.get(0).getValue().getHighValue(); - max = list.get(list.size() - 1).getValue().getHighValue(); - if (minInd == maxInd) { - highValue = min; - } else if (minInd < maxInd) { - // right border is the max - highValue = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); - } else { - // left border is the max - highValue = (long) (min + (max - min) * minInd / (minInd - maxInd)); - } - - // get the #nulls - long numNulls = 0; - for (Map.Entry<String, LongColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) { - numNulls += entry.getValue().getNumNulls(); - } - // we scale up sumNulls based on the number of partitions - numNulls = numNulls * numParts / numPartsWithStats; - - // get the ndv - long ndv = 0; - Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() { - @Override - public int compare(Map.Entry<String, LongColumnStatsData> o1, - Map.Entry<String, LongColumnStatsData> o2) { - return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs()); - } - }); - long lowerBound = list.get(list.size() - 1).getValue().getNumDVs(); - long higherBound = 0; - for (Map.Entry<String, LongColumnStatsData> entry : list) { - higherBound += entry.getValue().getNumDVs(); - } - if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) { - ndv = (long) ((highValue - lowValue) / densityAvg); - if (ndv < lowerBound) { - ndv = lowerBound; - } else if (ndv > higherBound) { - ndv = higherBound; - } - } else { - minInd = adjustedIndexMap.get(list.get(0).getKey()); - maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - min = list.get(0).getValue().getNumDVs(); - max = list.get(list.size() - 1).getValue().getNumDVs(); - if (minInd == maxInd) { - ndv = min; - } else if (minInd < maxInd) { - // right border is the max - ndv = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); - } else { - // left border is the max - ndv = (long) (min + (max - min) * minInd / (minInd - maxInd)); - } - } - extrapolateLongData.setLowValue(lowValue); - extrapolateLongData.setHighValue(highValue); - extrapolateLongData.setNumNulls(numNulls); - extrapolateLongData.setNumDVs(ndv); - extrapolateData.setLongStats(extrapolateLongData); - } - -} http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java 
b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java
deleted file mode 100644
index 9537647..0000000
--- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java
+++ /dev/null
@@ -1,304 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.hadoop.hive.metastore.columnstats.aggr;
-
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.Map;
-
-import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
-import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
-import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
-import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
-import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
-import org.apache.hadoop.hive.metastore.api.MetaException;
-import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;
-import org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector;
-import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.ColStatsObjWithSourceInfo;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class StringColumnStatsAggregator extends ColumnStatsAggregator implements
-    IExtrapolatePartStatus {
-
-  private static final Logger LOG = LoggerFactory.getLogger(StringColumnStatsAggregator.class);
-
-  @Override
-  public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo,
-      List<String> partNames, boolean areAllPartsFound) throws MetaException {
-    ColumnStatisticsObj statsObj = null;
-    String colType = null;
-    String colName = null;
-    // check if all the ColumnStatisticsObjs contain stats and all the ndv are
-    // bitvectors
-    boolean doAllPartitionContainStats = partNames.size() == colStatsWithSourceInfo.size();
-    NumDistinctValueEstimator ndvEstimator = null;
-    for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
-      ColumnStatisticsObj cso = csp.getColStatsObj();
-      if (statsObj == null) {
-        colName = cso.getColName();
-        colType = cso.getColType();
-        statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType,
-            cso.getStatsData().getSetField());
-        LOG.trace("doAllPartitionContainStats for column: {} is: {}", colName,
-            doAllPartitionContainStats);
-      }
-      StringColumnStatsDataInspector stringColumnStatsData =
-          (StringColumnStatsDataInspector) cso.getStatsData().getStringStats();
-      if (stringColumnStatsData.getNdvEstimator() == null) {
-        ndvEstimator = null;
-        break;
-      } else {
-        // check if all of the bit vectors can merge -
NumDistinctValueEstimator estimator = stringColumnStatsData.getNdvEstimator(); - if (ndvEstimator == null) { - ndvEstimator = estimator; - } else { - if (ndvEstimator.canMerge(estimator)) { - continue; - } else { - ndvEstimator = null; - break; - } - } - } - } - if (ndvEstimator != null) { - ndvEstimator = NumDistinctValueEstimatorFactory - .getEmptyNumDistinctValueEstimator(ndvEstimator); - } - LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); - ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); - if (doAllPartitionContainStats || colStatsWithSourceInfo.size() < 2) { - StringColumnStatsDataInspector aggregateData = null; - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - StringColumnStatsDataInspector newData = - (StringColumnStatsDataInspector) cso.getStatsData().getStringStats(); - if (ndvEstimator != null) { - ndvEstimator.mergeEstimators(newData.getNdvEstimator()); - } - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - aggregateData - .setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen())); - aggregateData - .setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen())); - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs())); - } - } - if (ndvEstimator != null) { - // if all the ColumnStatisticsObjs contain bitvectors, we do not need to - // use uniform distribution assumption because we can merge bitvectors - // to get a good estimation. - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - } else { - // aggregateData already has the ndv of the max of all - } - columnStatisticsData.setStringStats(aggregateData); - } else { - // we need extrapolation - LOG.debug("start extrapolation for " + colName); - - Map<String, Integer> indexMap = new HashMap<>(); - for (int index = 0; index < partNames.size(); index++) { - indexMap.put(partNames.get(index), index); - } - Map<String, Double> adjustedIndexMap = new HashMap<>(); - Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<>(); - if (ndvEstimator == null) { - // if not every partition uses bitvector for ndv, we just fall back to - // the traditional extrapolation methods. - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - String partName = csp.getPartName(); - adjustedIndexMap.put(partName, (double) indexMap.get(partName)); - adjustedStatsMap.put(partName, cso.getStatsData()); - } - } else { - // we first merge all the adjacent bitvectors that we could merge and - // derive new partition names and index. - StringBuilder pseudoPartName = new StringBuilder(); - double pseudoIndexSum = 0; - int length = 0; - int curIndex = -1; - StringColumnStatsDataInspector aggregateData = null; - for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) { - ColumnStatisticsObj cso = csp.getColStatsObj(); - String partName = csp.getPartName(); - StringColumnStatsDataInspector newData = - (StringColumnStatsDataInspector) cso.getStatsData().getStringStats(); - // newData.isSetBitVectors() should be true for sure because we - // already checked it before. - if (indexMap.get(partName) != curIndex) { - // There is bitvector, but it is not adjacent to the previous ones. 
- if (length > 0) { - // we have to set ndv - adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - ColumnStatisticsData csd = new ColumnStatisticsData(); - csd.setStringStats(aggregateData); - adjustedStatsMap.put(pseudoPartName.toString(), csd); - // reset everything - pseudoPartName = new StringBuilder(); - pseudoIndexSum = 0; - length = 0; - ndvEstimator = NumDistinctValueEstimatorFactory - .getEmptyNumDistinctValueEstimator(ndvEstimator); - } - aggregateData = null; - } - curIndex = indexMap.get(partName); - pseudoPartName.append(partName); - pseudoIndexSum += curIndex; - length++; - curIndex++; - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), - newData.getAvgColLen())); - aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), - newData.getMaxColLen())); - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - } - ndvEstimator.mergeEstimators(newData.getNdvEstimator()); - } - if (length > 0) { - // we have to set ndv - adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length); - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - ColumnStatisticsData csd = new ColumnStatisticsData(); - csd.setStringStats(aggregateData); - adjustedStatsMap.put(pseudoPartName.toString(), csd); - } - } - extrapolate(columnStatisticsData, partNames.size(), colStatsWithSourceInfo.size(), - adjustedIndexMap, adjustedStatsMap, -1); - } - LOG.debug( - "Ndv estimation for {} is {} # of partitions requested: {} # of partitions found: {}", - colName, columnStatisticsData.getStringStats().getNumDVs(), partNames.size(), - colStatsWithSourceInfo.size()); - statsObj.setStatsData(columnStatisticsData); - return statsObj; - } - - @Override - public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, - int numPartsWithStats, Map<String, Double> adjustedIndexMap, - Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) { - int rightBorderInd = numParts; - StringColumnStatsDataInspector extrapolateStringData = new StringColumnStatsDataInspector(); - Map<String, StringColumnStatsData> extractedAdjustedStatsMap = new HashMap<>(); - for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) { - extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getStringStats()); - } - List<Map.Entry<String, StringColumnStatsData>> list = new LinkedList<>( - extractedAdjustedStatsMap.entrySet()); - // get the avgLen - Collections.sort(list, new Comparator<Map.Entry<String, StringColumnStatsData>>() { - @Override - public int compare(Map.Entry<String, StringColumnStatsData> o1, - Map.Entry<String, StringColumnStatsData> o2) { - return Double.compare(o1.getValue().getAvgColLen(), o2.getValue().getAvgColLen()); - } - }); - double minInd = adjustedIndexMap.get(list.get(0).getKey()); - double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - double avgColLen = 0; - double min = list.get(0).getValue().getAvgColLen(); - double max = list.get(list.size() - 1).getValue().getAvgColLen(); - if (minInd == maxInd) { - avgColLen = min; - } else if (minInd < maxInd) { - // right border is the max - avgColLen = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); - } else { - // left border is the max - avgColLen = (min + (max - min) * minInd / (minInd - maxInd)); - } - - // get the maxLen -
Collections.sort(list, new Comparator<Map.Entry<String, StringColumnStatsData>>() { - @Override - public int compare(Map.Entry<String, StringColumnStatsData> o1, - Map.Entry<String, StringColumnStatsData> o2) { - return Long.compare(o1.getValue().getMaxColLen(), o2.getValue().getMaxColLen()); - } - }); - minInd = adjustedIndexMap.get(list.get(0).getKey()); - maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - double maxColLen = 0; - min = list.get(0).getValue().getMaxColLen(); - max = list.get(list.size() - 1).getValue().getMaxColLen(); - if (minInd == maxInd) { - maxColLen = min; - } else if (minInd < maxInd) { - // right border is the max - maxColLen = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); - } else { - // left border is the max - maxColLen = (min + (max - min) * minInd / (minInd - maxInd)); - } - - // get the #nulls - long numNulls = 0; - for (Map.Entry<String, StringColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) { - numNulls += entry.getValue().getNumNulls(); - } - // we scale up numNulls based on the number of partitions - numNulls = numNulls * numParts / numPartsWithStats; - - // get the ndv - long ndv = 0; - Collections.sort(list, new Comparator<Map.Entry<String, StringColumnStatsData>>() { - @Override - public int compare(Map.Entry<String, StringColumnStatsData> o1, - Map.Entry<String, StringColumnStatsData> o2) { - return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs()); - } - }); - minInd = adjustedIndexMap.get(list.get(0).getKey()); - maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey()); - min = list.get(0).getValue().getNumDVs(); - max = list.get(list.size() - 1).getValue().getNumDVs(); - if (minInd == maxInd) { - ndv = (long) min; - } else if (minInd < maxInd) { - // right border is the max - ndv = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd)); - } else { - // left border is the max - ndv = (long) (min + (max - min) * minInd / (minInd - maxInd)); - } - extrapolateStringData.setAvgColLen(avgColLen); - extrapolateStringData.setMaxColLen((long) maxColLen); - extrapolateStringData.setNumNulls(numNulls); - extrapolateStringData.setNumDVs(ndv); - extrapolateData.setStringStats(extrapolateStringData); - } - -} http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DateColumnStatsDataInspector.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DateColumnStatsDataInspector.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DateColumnStatsDataInspector.java deleted file mode 100644 index f6eacbc..0000000 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DateColumnStatsDataInspector.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License.
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.metastore.columnstats.cache; - -import java.nio.ByteBuffer; - -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; -import org.apache.hadoop.hive.metastore.api.DateColumnStatsData; - -@SuppressWarnings("serial") -public class DateColumnStatsDataInspector extends DateColumnStatsData { - - private NumDistinctValueEstimator ndvEstimator; - - public DateColumnStatsDataInspector() { - super(); - } - - public DateColumnStatsDataInspector(long numNulls, long numDVs) { - super(numNulls, numDVs); - } - - public DateColumnStatsDataInspector(DateColumnStatsDataInspector other) { - super(other); - if (other.ndvEstimator != null) { - super.setBitVectors(other.ndvEstimator.serialize()); - } - } - - @Override - public DateColumnStatsDataInspector deepCopy() { - return new DateColumnStatsDataInspector(this); - } - - @Override - public byte[] getBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.getBitVectors(); - } - - @Override - public ByteBuffer bufferForBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.bufferForBitVectors(); - } - - @Override - public void setBitVectors(byte[] bitVectors) { - super.setBitVectors(bitVectors); - this.ndvEstimator = null; - } - - @Override - public void setBitVectors(ByteBuffer bitVectors) { - super.setBitVectors(bitVectors); - this.ndvEstimator = null; - } - - @Override - public void unsetBitVectors() { - super.unsetBitVectors(); - this.ndvEstimator = null; - } - - @Override - public boolean isSetBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.isSetBitVectors(); - } - - @Override - public void setBitVectorsIsSet(boolean value) { - if (ndvEstimator != null) { - updateBitVectors(); - } - super.setBitVectorsIsSet(value); - } - - public NumDistinctValueEstimator getNdvEstimator() { - if (isSetBitVectors() && getBitVectors().length != 0) { - updateNdvEstimator(); - } - return ndvEstimator; - } - - public void setNdvEstimator(NumDistinctValueEstimator ndvEstimator) { - super.unsetBitVectors(); - this.ndvEstimator = ndvEstimator; - } - - private void updateBitVectors() { - super.setBitVectors(ndvEstimator.serialize()); - this.ndvEstimator = null; - } - - private void updateNdvEstimator() { - this.ndvEstimator = NumDistinctValueEstimatorFactory - .getNumDistinctValueEstimator(super.getBitVectors()); - super.unsetBitVectors(); - } - -} http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DecimalColumnStatsDataInspector.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DecimalColumnStatsDataInspector.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DecimalColumnStatsDataInspector.java deleted file mode 100644 index e2427f3..0000000
--- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DecimalColumnStatsDataInspector.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.metastore.columnstats.cache; - -import java.nio.ByteBuffer; - -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; -import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData; - -@SuppressWarnings("serial") -public class DecimalColumnStatsDataInspector extends DecimalColumnStatsData { - - private NumDistinctValueEstimator ndvEstimator; - - public DecimalColumnStatsDataInspector() { - super(); - } - - public DecimalColumnStatsDataInspector(long numNulls, long numDVs) { - super(numNulls, numDVs); - } - - public DecimalColumnStatsDataInspector(DecimalColumnStatsDataInspector other) { - super(other); - if (other.ndvEstimator != null) { - super.setBitVectors(other.ndvEstimator.serialize()); - } - } - - @Override - public DecimalColumnStatsDataInspector deepCopy() { - return new DecimalColumnStatsDataInspector(this); - } - - @Override - public byte[] getBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.getBitVectors(); - } - - @Override - public ByteBuffer bufferForBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.bufferForBitVectors(); - } - - @Override - public void setBitVectors(byte[] bitVectors) { - super.setBitVectors(bitVectors); - this.ndvEstimator = null; - } - - @Override - public void setBitVectors(ByteBuffer bitVectors) { - super.setBitVectors(bitVectors); - this.ndvEstimator = null; - } - - @Override - public void unsetBitVectors() { - super.unsetBitVectors(); - this.ndvEstimator = null; - } - - @Override - public boolean isSetBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.isSetBitVectors(); - } - - @Override - public void setBitVectorsIsSet(boolean value) { - if (ndvEstimator != null) { - updateBitVectors(); - } - super.setBitVectorsIsSet(value); - } - - public NumDistinctValueEstimator getNdvEstimator() { - if (isSetBitVectors() && getBitVectors().length != 0) { - updateNdvEstimator(); - } - return ndvEstimator; - } - - public void setNdvEstimator(NumDistinctValueEstimator ndvEstimator) { - super.unsetBitVectors(); - this.ndvEstimator = ndvEstimator; - } - - private void updateBitVectors() { - super.setBitVectors(ndvEstimator.serialize()); - this.ndvEstimator = null; - } - - private void updateNdvEstimator() { - this.ndvEstimator = NumDistinctValueEstimatorFactory - .getNumDistinctValueEstimator(super.getBitVectors()); - super.unsetBitVectors(); - } - -}
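Each of the *ColumnStatsDataInspector classes deleted in this commit (Date, Decimal, Double, Long, and the String variant used by StringColumnStatsAggregator above) wraps a Thrift-generated *ColumnStatsData struct and keeps at most one live representation of the NDV sketch: either the serialized bitVectors bytes held by the superclass, or a deserialized in-memory NumDistinctValueEstimator, converting lazily whenever the other form is requested. A minimal sketch of that exchange, with a hypothetical Estimator interface standing in for NumDistinctValueEstimator and a plain field standing in for the Thrift superclass:

public class LazyNdvHolder {
  /** Hypothetical stand-in for NumDistinctValueEstimator. */
  interface Estimator {
    byte[] serialize();
  }

  private byte[] bitVectors;      // serialized form (what Thrift would persist)
  private Estimator ndvEstimator; // in-memory form; at most one of the two is live

  // Reader path: a live estimator is flushed to bytes on demand, then dropped,
  // mirroring updateBitVectors() in the inspector classes above.
  public byte[] getBitVectors() {
    if (ndvEstimator != null) {
      bitVectors = ndvEstimator.serialize();
      ndvEstimator = null;
    }
    return bitVectors;
  }

  // Writer path: installing raw bytes invalidates any live estimator.
  public void setBitVectors(byte[] bytes) {
    this.bitVectors = bytes;
    this.ndvEstimator = null;
  }

  // Installing an estimator discards the serialized form until the next read,
  // mirroring setNdvEstimator(), which calls unsetBitVectors().
  public void setNdvEstimator(Estimator estimator) {
    this.bitVectors = null;
    this.ndvEstimator = estimator;
  }

  public static void main(String[] args) {
    LazyNdvHolder holder = new LazyNdvHolder();
    holder.setNdvEstimator(() -> new byte[] {1, 2, 3}); // lambda implements Estimator
    System.out.println(holder.getBitVectors().length);  // prints 3; estimator now flushed
  }
}

The invariant that only one representation is live at a time is also what makes the copy-constructor guard in these classes subtle: when the source object holds an in-memory estimator, its serialized form is unset, so the copy must serialize other.ndvEstimator explicitly.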
http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DoubleColumnStatsDataInspector.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DoubleColumnStatsDataInspector.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DoubleColumnStatsDataInspector.java deleted file mode 100644 index 7ce7127..0000000 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/DoubleColumnStatsDataInspector.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.hadoop.hive.metastore.columnstats.cache; - -import java.nio.ByteBuffer; - -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; -import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; - -@SuppressWarnings("serial") -public class DoubleColumnStatsDataInspector extends DoubleColumnStatsData { - - private NumDistinctValueEstimator ndvEstimator; - - public DoubleColumnStatsDataInspector() { - super(); - } - - public DoubleColumnStatsDataInspector(long numNulls, long numDVs) { - super(numNulls, numDVs); - } - - public DoubleColumnStatsDataInspector(DoubleColumnStatsDataInspector other) { - super(other); - if (other.ndvEstimator != null) { - super.setBitVectors(other.ndvEstimator.serialize()); - } - } - - @Override - public DoubleColumnStatsDataInspector deepCopy() { - return new DoubleColumnStatsDataInspector(this); - } - - @Override - public byte[] getBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.getBitVectors(); - } - - @Override - public ByteBuffer bufferForBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.bufferForBitVectors(); - } - - @Override - public void setBitVectors(byte[] bitVectors) { - super.setBitVectors(bitVectors); - this.ndvEstimator = null; - } - - @Override - public void setBitVectors(ByteBuffer bitVectors) { - super.setBitVectors(bitVectors); - this.ndvEstimator = null; - } - - @Override - public void unsetBitVectors() { - super.unsetBitVectors(); - this.ndvEstimator = null; - } - - @Override - public boolean isSetBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.isSetBitVectors(); - } - - @Override - public void setBitVectorsIsSet(boolean value) { - if (ndvEstimator != null) { - updateBitVectors(); - } - super.setBitVectorsIsSet(value); - } - - public NumDistinctValueEstimator
getNdvEstimator() { - if (isSetBitVectors() && getBitVectors().length != 0) { - updateNdvEstimator(); - } - return ndvEstimator; - } - - public void setNdvEstimator(NumDistinctValueEstimator ndvEstimator) { - super.unsetBitVectors(); - this.ndvEstimator = ndvEstimator; - } - - private void updateBitVectors() { - super.setBitVectors(ndvEstimator.serialize()); - this.ndvEstimator = null; - } - - private void updateNdvEstimator() { - this.ndvEstimator = NumDistinctValueEstimatorFactory - .getNumDistinctValueEstimator(super.getBitVectors()); - super.unsetBitVectors(); - } - -} http://git-wip-us.apache.org/repos/asf/hive/blob/081fa368/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/LongColumnStatsDataInspector.java ---------------------------------------------------------------------- diff --git a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/LongColumnStatsDataInspector.java b/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/LongColumnStatsDataInspector.java deleted file mode 100644 index faf314b..0000000 --- a/standalone-metastore/metastore-common/src/main/java/org/apache/hadoop/hive/metastore/columnstats/cache/LongColumnStatsDataInspector.java +++ /dev/null @@ -1,124 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.hadoop.hive.metastore.columnstats.cache; - -import java.nio.ByteBuffer; - -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; -import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; - -@SuppressWarnings("serial") -public class LongColumnStatsDataInspector extends LongColumnStatsData { - - private NumDistinctValueEstimator ndvEstimator; - - public LongColumnStatsDataInspector() { - super(); - } - - public LongColumnStatsDataInspector(long numNulls, long numDVs) { - super(numNulls, numDVs); - } - - public LongColumnStatsDataInspector(LongColumnStatsDataInspector other) { - super(other); - if (other.ndvEstimator != null) { - super.setBitVectors(other.ndvEstimator.serialize()); - } - } - - @Override - public LongColumnStatsDataInspector deepCopy() { - return new LongColumnStatsDataInspector(this); - } - - @Override - public byte[] getBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.getBitVectors(); - } - - @Override - public ByteBuffer bufferForBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.bufferForBitVectors(); - } - - @Override - public void setBitVectors(byte[] bitVectors) { - super.setBitVectors(bitVectors); - this.ndvEstimator = null; - } - - @Override - public void setBitVectors(ByteBuffer bitVectors) { - super.setBitVectors(bitVectors); - this.ndvEstimator = null; - } - - @Override - public void unsetBitVectors() { - super.unsetBitVectors(); - this.ndvEstimator = null; - } - - @Override - public boolean isSetBitVectors() { - if (ndvEstimator != null) { - updateBitVectors(); - } - return super.isSetBitVectors(); - } - - @Override - public void setBitVectorsIsSet(boolean value) { - if (ndvEstimator != null) { - updateBitVectors(); - } - super.setBitVectorsIsSet(value); - } - - public NumDistinctValueEstimator getNdvEstimator() { - if (isSetBitVectors() && getBitVectors().length != 0) { - updateNdvEstimator(); - } - return ndvEstimator; - } - - public void setNdvEstimator(NumDistinctValueEstimator ndvEstimator) { - super.unsetBitVectors(); - this.ndvEstimator = ndvEstimator; - } - - private void updateBitVectors() { - super.setBitVectors(ndvEstimator.serialize()); - this.ndvEstimator = null; - } - - private void updateNdvEstimator() { - this.ndvEstimator = NumDistinctValueEstimatorFactory - .getNumDistinctValueEstimator(super.getBitVectors()); - super.unsetBitVectors(); - } - -}
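The extrapolate() method removed above illustrates the projection rule these aggregators share: partitions with stats are sorted by the statistic of interest, the smallest and largest observed values are paired with the adjusted partition indices where they occur, and the min-to-max trend is projected linearly out to the border of the full partition range (numNulls is handled differently, scaled by numParts / numPartsWithStats). A standalone sketch of that rule, with illustrative names that are not part of the Hive API:

public final class ExtrapolationSketch {

  // min/max: smallest and largest observed values of the statistic;
  // minInd/maxInd: adjusted partition indices where they were observed;
  // numParts: total number of partitions, i.e. the right border of the range.
  static double extrapolateStat(double min, double max,
                                double minInd, double maxInd, int numParts) {
    if (minInd == maxInd) {
      return min; // statistic is constant across partitions, nothing to project
    } else if (minInd < maxInd) {
      // the statistic grows toward the right border: project to index numParts
      return min + (max - min) * (numParts - minInd) / (maxInd - minInd);
    } else {
      // the statistic grows toward the left border: project to index 0
      return min + (max - min) * minInd / (minInd - maxInd);
    }
  }

  public static void main(String[] args) {
    // e.g. avgColLen 4.0 observed at partition 0 and 10.0 at partition 5,
    // extrapolated to 10 partitions total: 4 + 6 * 10 / 5 = 16.0
    System.out.println(extrapolateStat(4.0, 10.0, 0.0, 5.0, 10));
  }
}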