[GitHub] [drill] amansinha100 commented on a change in pull request #1785: DRILL-7242: Handle additional boundary cases and compute better estim…

GitBox Mon, 13 May 2019 18:37:10 -0700

amansinha100 commented on a change in pull request #1785: DRILL-7242: Handle 
additional boundary cases and compute better estim…
URL: https://github.com/apache/drill/pull/1785#discussion_r283596709


 ##########
 File path: 
exec/java-exec/src/main/java/org/apache/drill/exec/planner/common/NumericEquiDepthHistogram.java
 ##########
 @@ -178,101 +185,136 @@ public Double estimatedSelectivity(final RexNode 
columnFilter, final long totalR
     return currentRange;
   }
 
-  private long getSelectedRows(final Range range) {
-    final int numBuckets = buckets.length - 1;
+  @VisibleForTesting
+  protected long getSelectedRows(final Range range) {
     double startBucketFraction = 1.0;
     double endBucketFraction = 1.0;
     long numRows = 0;
     int result;
     Double lowValue = null;
     Double highValue = null;
-    final int first = 0;
-    final int last = buckets.length - 1;
-    int startBucket = first;
-    int endBucket = last;
+    final int firstStartPointIndex = 0;
+    final int lastEndPointIndex = buckets.length - 1;
+    int startBucket = firstStartPointIndex;
+    int endBucket = lastEndPointIndex - 1;
 
     if (range.hasLowerBound()) {
       lowValue = (Double) range.lowerEndpoint();
 
-      // if low value is greater than the end point of the last bucket then 
none of the rows qualify
-      if (lowValue.compareTo(buckets[last]) > 0) {
+      // if low value is greater than the end point of the last bucket or if 
it is equal but the range is open (i.e
+      // predicate is of type > 5 where 5 is the end point of last bucket) 
then none of the rows qualify
+      result = lowValue.compareTo(buckets[lastEndPointIndex]);
+      if (result > 0 || result == 0 && range.lowerBoundType() == 
BoundType.OPEN)  {
         return 0;
       }
-
-      result = lowValue.compareTo(buckets[first]);
+      result = lowValue.compareTo(buckets[firstStartPointIndex]);
 
       // if low value is less than or equal to the first bucket's start point 
then start with the first bucket and all
       // rows in first bucket are included
       if (result <= 0) {
-        startBucket = first;
+        startBucket = firstStartPointIndex;
         startBucketFraction = 1.0;
       } else {
         // Use a simplified logic where we treat > and >= the same when 
computing selectivity since the
         // difference is going to be very small for reasonable sized data sets
-        startBucket = getContainingBucket(lowValue, numBuckets);
+        startBucket = getContainingBucket(lowValue, lastEndPointIndex, true);
+
         // expecting start bucket to be >= 0 since other conditions have been 
handled previously
         Preconditions.checkArgument(startBucket >= 0, "Expected start bucket 
id >= 0");
-        startBucketFraction = ((double) (buckets[startBucket + 1] - lowValue)) 
/ (buckets[startBucket + 1] - buckets[startBucket]);
+
+       if (buckets[startBucket + 1].doubleValue() == 
buckets[startBucket].doubleValue()) {
+         // if start and end points of the bucket are the same, consider 
entire bucket
+         startBucketFraction = 1.0;
+       } else {
+          startBucketFraction = ((double) (buckets[startBucket + 1] - 
lowValue)) / (buckets[startBucket + 1] - buckets[startBucket]);
+        }
       }
     }
 
     if (range.hasUpperBound()) {
       highValue = (Double) range.upperEndpoint();
 
-      // if the high value is less than the start point of the first bucket 
then none of the rows qualify
-      if (highValue.compareTo(buckets[first]) < 0) {
+      // if the high value is less than the start point of the first bucket or 
if it is equal but the range is open (i.e
+      // predicate is of type < 1 where 1 is the start point of the first 
bucket) then none of the rows qualify
+      result = highValue.compareTo(buckets[firstStartPointIndex]);
+      if (result < 0 || (result == 0 && range.upperBoundType() == 
BoundType.OPEN)) {
         return 0;
       }
 
-      result = highValue.compareTo(buckets[last]);
+      result = highValue.compareTo(buckets[lastEndPointIndex]);
 
       // if high value is greater than or equal to the last bucket's end point 
then include the last bucket and all rows in
       // last bucket qualify
       if (result >= 0) {
-        endBucket = last;
+        endBucket = lastEndPointIndex - 1;
         endBucketFraction = 1.0;
       } else {
         // Use a simplified logic where we treat < and <= the same when 
computing selectivity since the
         // difference is going to be very small for reasonable sized data sets
-        endBucket = getContainingBucket(highValue, numBuckets);
+        endBucket = getContainingBucket(highValue, lastEndPointIndex, false);
+
         // expecting end bucket to be >= 0 since other conditions have been 
handled previously
         Preconditions.checkArgument(endBucket >= 0, "Expected end bucket id >= 
0");
-        endBucketFraction = ((double)(highValue - buckets[endBucket])) / 
(buckets[endBucket + 1] - buckets[endBucket]);
+
+        if (buckets[endBucket + 1].doubleValue() == 
buckets[endBucket].doubleValue()) {
+          // if start and end points of the bucket are the same, consider 
entire bucket
+          endBucketFraction = 1.0;
+        } else {
+          endBucketFraction = ((double) (highValue - buckets[endBucket])) / 
(buckets[endBucket + 1] - buckets[endBucket]);
+        }
       }
     }
 
-    Preconditions.checkArgument(startBucket <= endBucket);
+    Preconditions.checkArgument(startBucket >= 0 && startBucket + 1 <= 
lastEndPointIndex, "Invalid startBucket: " + startBucket);
+    Preconditions.checkArgument(endBucket >= 0 && endBucket + 1 <= 
lastEndPointIndex, "Invalid endBucket: " +  endBucket);
+    Preconditions.checkArgument(startBucket <= endBucket,
+      "Start bucket: " + startBucket + " should be less than or equal to end 
bucket: " + endBucket);
 
-    // if the endBucketId corresponds to the last endpoint, then adjust it to 
be one less
-    if (endBucket == last) {
-      endBucket = last - 1;
-    }
-    if (startBucket == endBucket && highValue != null && lowValue != null) {
+    if (startBucket == endBucket) {
       // if the start and end buckets are the same, interpolate based on the 
difference between the high and low value
-      numRows = (long) ((highValue - lowValue) / (buckets[endBucket + 1] - 
buckets[startBucket]) * numRowsPerBucket);
+      if (highValue != null && lowValue != null) {
+        numRows = (long) ((highValue - lowValue) / (buckets[startBucket + 1] - 
buckets[startBucket]) * numRowsPerBucket);
+      } else if (highValue != null) {
+        numRows = (long) (endBucketFraction * numRowsPerBucket);
+      } else {
+        numRows = (long) (startBucketFraction * numRowsPerBucket);
+      }
     } else {
-      numRows = (long) ((startBucketFraction + endBucketFraction) * 
numRowsPerBucket + (endBucket - startBucket - 1) * numRowsPerBucket);
+      int numIntermediateBuckets = (endBucket > startBucket + 1) ? (endBucket 
- startBucket - 1) : 0;
+      numRows = (long) ((startBucketFraction + endBucketFraction) * 
numRowsPerBucket + numIntermediateBuckets * numRowsPerBucket);
     }
 
     return numRows;
   }
 
-  private int getContainingBucket(final Double value, final int numBuckets) {
+  /**
+   * Get the start point of the containing bucket for the supplied value. If 
there are multiple buckets with the
+   * same start point, return either the first matching or last matching 
depending on firstMatching flag
+   * @param value the input double value
+   * @param lastEndPointIndex
+   * @param firstMatching If true, return the first bucket that matches the 
specified criteria otherwise return the last one
+   * @return index of either the first or last matching bucket if a match was 
found, otherwise return -1
+   */
+  private int getContainingBucket(final Double value, final int 
lastEndPointIndex, final boolean firstMatching) {
     int i = 0;
     int containing_bucket = -1;
+
     // check which bucket this value falls in
-    for (; i <= numBuckets; i++) {
+    for (; i <= lastEndPointIndex; i++) {
       int result = buckets[i].compareTo(value);
       if (result > 0) {
         containing_bucket = i - 1;
         break;
       } else if (result == 0) {
-        containing_bucket = i;
-        break;
+        containing_bucket = (i == lastEndPointIndex) ? i - 1 : i;
 
 Review comment:
   Done

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

[GitHub] [drill] amansinha100 commented on a change in pull request #1785: DRILL-7242: Handle additional boundary cases and compute better estim…

Reply via email to