asolimando commented on code in PR #3137:
URL: https://github.com/apache/hive/pull/3137#discussion_r1045899791
##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/stats/annotation/StatsRulesProcFactory.java:
##########
@@ -1234,17 +1285,70 @@ private long evaluateComparator(Statistics stats,
AnnotateStatsProcCtx aspCtx, E
// new estimate for the number of rows
return Math.round(
((maxValue.subtract(value)).divide(maxValue.subtract(minValue),
RoundingMode.UP))
- .multiply(BigDecimal.valueOf(numRows))
+ .multiply(BigDecimal.valueOf(currNumRows))
.doubleValue());
}
}
}
} catch (NumberFormatException nfe) {
- return numRows / 3;
+ return currNumRows / 3;
}
}
// default
- return numRows / 3;
+ return currNumRows / 3;
+ }
+
+ private long evaluateComparatorWithHistogram(ColStatistics cs, long
currNumRows, String colTypeLowerCase,
+ String boundValue, boolean upperBound, boolean closedBound) {
+ final KllFloatsSketch kll =
KllFloatsSketch.heapify(Memory.wrap(cs.getHistogram()));
+
+ if (kll.getN() == 0) {
+ return 0;
+ }
+
+ try {
+ final float value = extractFloatFromLiteralValue(colTypeLowerCase,
boundValue);
+
+ // kll ignores null values (i.e., kll.getN() + numNulls =
currNumRows), we therefore need to use kll.getN()
+ // instead of currNumRows since the CDF is expressed as a fraction of
kll.getN(), not currNumRows
+ if (upperBound) {
+ return Math.round(kll.getN() * (closedBound ?
+ lessThanOrEqualSelectivity(kll, value) :
lessThanSelectivity(kll, value)));
+ } else {
+ return Math.round(kll.getN() * (closedBound ?
+ greaterThanOrEqualSelectivity(kll, value) :
greaterThanSelectivity(kll, value)));
+ }
+ } catch (RuntimeException e) {
+ LOG.debug("Selectivity computation using histogram failed to parse the
boundary value ({}), "
+ + ", using the generic computation strategy", boundValue, e);
+ return currNumRows / 3;
+ }
+ }
+
+ @VisibleForTesting
+ protected static float extractFloatFromLiteralValue(String
colTypeLowerCase, String value) {
+ if (colTypeLowerCase.equals(serdeConstants.TINYINT_TYPE_NAME)) {
+ return Byte.parseByte(value);
+ } else if (colTypeLowerCase.equals(serdeConstants.SMALLINT_TYPE_NAME)) {
+ return Short.parseShort(value);
+ } else if (colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME)) {
+ return Integer.parseInt(value);
+ } else if (colTypeLowerCase.equals(serdeConstants.BIGINT_TYPE_NAME)) {
+ return Long.parseLong(value);
+ } else if (colTypeLowerCase.equals(serdeConstants.FLOAT_TYPE_NAME)) {
+ return Float.parseFloat(value);
+ } else if (colTypeLowerCase.equals(serdeConstants.DOUBLE_TYPE_NAME)) {
+ return (float) Double.parseDouble(value);
+ } else if
(colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
+ return new BigDecimal(value).floatValue();
Review Comment:
Unfortunately KLL is based on Float values for storing (`kll.update()`) and
querying (including `kll.cdf()`), so for data types with higher capacity than
`float` like `double` or `long`, we need to take a hit, so I'd rather make this
explicit here rather than carrying around a higher precision information which
will need to be casted to `float` later on.
For the same reason, I have named the method `extractFloatFromLiteralValue`,
to state clearly that we are extracting a `float`, no matter the data type of
the literal.
This is unfortunate, but we are dealing with statistics and the approximated
result coming from KLL will be hopefully an improvement over the existing
hard-coded estimation anyway.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]