amansinha100 commented on a change in pull request #1785: DRILL-7242: Handle
additional boundary cases and compute better estim…
URL: https://github.com/apache/drill/pull/1785#discussion_r283596709
##########
File path:
exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java
##########
@@ -178,101 +185,136 @@ public Double estimatedSelectivity(final RexNode
columnFilter, final long totalR
return currentRange;
}
- private long getSelectedRows(final Range range) {
- final int numBuckets = buckets.length - 1;
+ @VisibleForTesting
+ protected long getSelectedRows(final Range range) {
double startBucketFraction = 1.0;
double endBucketFraction = 1.0;
long numRows = 0;
int result;
Double lowValue = null;
Double highValue = null;
- final int first = 0;
- final int last = buckets.length - 1;
- int startBucket = first;
- int endBucket = last;
+ final int firstStartPointIndex = 0;
+ final int lastEndPointIndex = buckets.length - 1;
+ int startBucket = firstStartPointIndex;
+ int endBucket = lastEndPointIndex - 1;
if (range.hasLowerBound()) {
lowValue = (Double) range.lowerEndpoint();
- // if low value is greater than the end point of the last bucket then
none of the rows qualify
- if (lowValue.compareTo(buckets[last]) > 0) {
+ // if low value is greater than the end point of the last bucket or if
it is equal but the range is open (i.e
+ // predicate is of type > 5 where 5 is the end point of last bucket)
then none of the rows qualify
+ result = lowValue.compareTo(buckets[lastEndPointIndex]);
+ if (result > 0 || result == 0 && range.lowerBoundType() ==
BoundType.OPEN) {
return 0;
}
-
- result = lowValue.compareTo(buckets[first]);
+ result = lowValue.compareTo(buckets[firstStartPointIndex]);
// if low value is less than or equal to the first bucket's start point
then start with the first bucket and all
// rows in first bucket are included
if (result <= 0) {
- startBucket = first;
+ startBucket = firstStartPointIndex;
startBucketFraction = 1.0;
} else {
// Use a simplified logic where we treat > and >= the same when
computing selectivity since the
// difference is going to be very small for reasonable sized data sets
- startBucket = getContainingBucket(lowValue, numBuckets);
+ startBucket = getContainingBucket(lowValue, lastEndPointIndex, true);
+
// expecting start bucket to be >= 0 since other conditions have been
handled previously
Preconditions.checkArgument(startBucket >= 0, "Expected start bucket
id >= 0");
- startBucketFraction = ((double) (buckets[startBucket + 1] - lowValue))
/ (buckets[startBucket + 1] - buckets[startBucket]);
+
+ if (buckets[startBucket + 1].doubleValue() ==
buckets[startBucket].doubleValue()) {
+ // if start and end points of the bucket are the same, consider
entire bucket
+ startBucketFraction = 1.0;
+ } else {
+ startBucketFraction = ((double) (buckets[startBucket + 1] -
lowValue)) / (buckets[startBucket + 1] - buckets[startBucket]);
+ }
}
}
if (range.hasUpperBound()) {
highValue = (Double) range.upperEndpoint();
- // if the high value is less than the start point of the first bucket
then none of the rows qualify
- if (highValue.compareTo(buckets[first]) < 0) {
+ // if the high value is less than the start point of the first bucket or
if it is equal but the range is open (i.e
+ // predicate is of type < 1 where 1 is the start point of the first
bucket) then none of the rows qualify
+ result = highValue.compareTo(buckets[firstStartPointIndex]);
+ if (result < 0 || (result == 0 && range.upperBoundType() ==
BoundType.OPEN)) {
return 0;
}
- result = highValue.compareTo(buckets[last]);
+ result = highValue.compareTo(buckets[lastEndPointIndex]);
// if high value is greater than or equal to the last bucket's end point
then include the last bucket and all rows in
// last bucket qualify
if (result >= 0) {
- endBucket = last;
+ endBucket = lastEndPointIndex - 1;
endBucketFraction = 1.0;
} else {
// Use a simplified logic where we treat < and <= the same when
computing selectivity since the
// difference is going to be very small for reasonable sized data sets
- endBucket = getContainingBucket(highValue, numBuckets);
+ endBucket = getContainingBucket(highValue, lastEndPointIndex, false);
+
// expecting end bucket to be >= 0 since other conditions have been
handled previously
Preconditions.checkArgument(endBucket >= 0, "Expected end bucket id >=
0");
- endBucketFraction = ((double)(highValue - buckets[endBucket])) /
(buckets[endBucket + 1] - buckets[endBucket]);
+
+ if (buckets[endBucket + 1].doubleValue() ==
buckets[endBucket].doubleValue()) {
+ // if start and end points of the bucket are the same, consider
entire bucket
+ endBucketFraction = 1.0;
+ } else {
+ endBucketFraction = ((double) (highValue - buckets[endBucket])) /
(buckets[endBucket + 1] - buckets[endBucket]);
+ }
}
}
- Preconditions.checkArgument(startBucket <= endBucket);
+ Preconditions.checkArgument(startBucket >= 0 && startBucket + 1 <=
lastEndPointIndex, "Invalid startBucket: " + startBucket);
+ Preconditions.checkArgument(endBucket >= 0 && endBucket + 1 <=
lastEndPointIndex, "Invalid endBucket: " + endBucket);
+ Preconditions.checkArgument(startBucket <= endBucket,
+ "Start bucket: " + startBucket + " should be less than or equal to end
bucket: " + endBucket);
- // if the endBucketId corresponds to the last endpoint, then adjust it to
be one less
- if (endBucket == last) {
- endBucket = last - 1;
- }
- if (startBucket == endBucket && highValue != null && lowValue != null) {
+ if (startBucket == endBucket) {
// if the start and end buckets are the same, interpolate based on the
difference between the high and low value
- numRows = (long) ((highValue - lowValue) / (buckets[endBucket + 1] -
buckets[startBucket]) * numRowsPerBucket);
+ if (highValue != null && lowValue != null) {
+ numRows = (long) ((highValue - lowValue) / (buckets[startBucket + 1] -
buckets[startBucket]) * numRowsPerBucket);
+ } else if (highValue != null) {
+ numRows = (long) (endBucketFraction * numRowsPerBucket);
+ } else {
+ numRows = (long) (startBucketFraction * numRowsPerBucket);
+ }
} else {
- numRows = (long) ((startBucketFraction + endBucketFraction) *
numRowsPerBucket + (endBucket - startBucket - 1) * numRowsPerBucket);
+ int numIntermediateBuckets = (endBucket > startBucket + 1) ? (endBucket
- startBucket - 1) : 0;
+ numRows = (long) ((startBucketFraction + endBucketFraction) *
numRowsPerBucket + numIntermediateBuckets * numRowsPerBucket);
}
return numRows;
}
- private int getContainingBucket(final Double value, final int numBuckets) {
+ /**
+ * Get the start point of the containing bucket for the supplied value. If
there are multiple buckets with the
+ * same start point, return either the first matching or last matching
depending on firstMatching flag
+ * @param value the input double value
+ * @param lastEndPointIndex
+ * @param firstMatching If true, return the first bucket that matches the
specified criteria otherwise return the last one
+ * @return index of either the first or last matching bucket if a match was
found, otherwise return -1
+ */
+ private int getContainingBucket(final Double value, final int
lastEndPointIndex, final boolean firstMatching) {
int i = 0;
int containing_bucket = -1;
+
// check which bucket this value falls in
- for (; i <= numBuckets; i++) {
+ for (; i <= lastEndPointIndex; i++) {
int result = buckets[i].compareTo(value);
if (result > 0) {
containing_bucket = i - 1;
break;
} else if (result == 0) {
- containing_bucket = i;
- break;
+ containing_bucket = (i == lastEndPointIndex) ? i - 1 : i;
Review comment:
Done
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services