[GitHub] [pinot] richardstartin commented on a diff in pull request #8891: Optimize dictionary lookup for IN clause

GitBox Thu, 16 Jun 2022 11:21:43 -0700


richardstartin commented on code in PR #8891:
URL: https://github.com/apache/pinot/pull/8891#discussion_r899399714



##########
pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/NotInPredicateEvaluatorFactory.java:
##########
@@ -70,67 +68,80 @@ public static BaseDictionaryBasedPredicateEvaluator 
newDictionaryBasedEvaluator(
    */
   public static BaseRawValueBasedPredicateEvaluator 
newRawValueBasedEvaluator(NotInPredicate notInPredicate,
       DataType dataType) {
-    List<String> values = notInPredicate.getValues();
-    int hashSetSize = HashUtil.getMinHashSetSize(values.size());
     switch (dataType) {
       case INT: {
-        IntSet nonMatchingValues = new IntOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Integer.parseInt(value));
+        int[] intValues = notInPredicate.getIntValues();
+        IntSet nonMatchingValues = new 
IntOpenHashSet(HashUtil.getMinHashSetSize(intValues.length));
+        for (int value : intValues) {
+          nonMatchingValues.add(value);
         }
         return new IntRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case LONG: {
-        LongSet nonMatchingValues = new LongOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Long.parseLong(value));
+        long[] longValues = notInPredicate.getLongValues();
+        LongSet nonMatchingValues = new 
LongOpenHashSet(HashUtil.getMinHashSetSize(longValues.length));
+        for (long value : longValues) {
+          nonMatchingValues.add(value);
         }
         return new LongRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case FLOAT: {
-        FloatSet nonMatchingValues = new FloatOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Float.parseFloat(value));
+        float[] floatValues = notInPredicate.getFloatValues();
+        FloatSet nonMatchingValues = new 
FloatOpenHashSet(HashUtil.getMinHashSetSize(floatValues.length));
+        for (float value : floatValues) {
+          nonMatchingValues.add(value);
         }
         return new FloatRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case DOUBLE: {
-        DoubleSet nonMatchingValues = new DoubleOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Double.parseDouble(value));
+        double[] doubleValues = notInPredicate.getDoubleValues();
+        DoubleSet nonMatchingValues = new 
DoubleOpenHashSet(HashUtil.getMinHashSetSize(doubleValues.length));
+        for (double value : doubleValues) {
+          nonMatchingValues.add(value);
         }
         return new DoubleRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case BIG_DECIMAL: {
-        TreeSet<BigDecimal> nonMatchingValues = new TreeSet<>();
-        for (String value : values) {
-          nonMatchingValues.add(new BigDecimal(value));
-        }
+        BigDecimal[] bigDecimalValues = notInPredicate.getBigDecimalValues();
+        // NOTE: Use TreeSet because BigDecimal's compareTo() is not 
consistent with equals()
+        //       E.g. compareTo(3.0, 3) returns 0 but equals(3.0, 3) returns 
false
+        TreeSet<BigDecimal> nonMatchingValues = new 
TreeSet<>(Arrays.asList(bigDecimalValues));
         return new 
BigDecimalRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case BOOLEAN: {
-        IntSet nonMatchingValues = new IntOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(BooleanUtils.toInt(value));
+        int[] booleanValues = notInPredicate.getBooleanValues();
+        IntSet nonMatchingValues = new 
IntOpenHashSet(HashUtil.getMinHashSetSize(booleanValues.length));
+        for (int value : booleanValues) {
+          nonMatchingValues.add(value);
         }
         return new IntRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case TIMESTAMP: {
-        LongSet nonMatchingValues = new LongOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(TimestampUtils.toMillisSinceEpoch(value));
+        long[] timestampValues = notInPredicate.getTimestampValues();
+        LongSet nonMatchingValues = new 
LongOpenHashSet(HashUtil.getMinHashSetSize(timestampValues.length));
+        for (long value : timestampValues) {
+          nonMatchingValues.add(value);
         }
         return new LongRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case STRING: {
-        Set<String> nonMatchingValues = new ObjectOpenHashSet<>(hashSetSize);
-        nonMatchingValues.addAll(values);
+        List<String> stringValues = notInPredicate.getValues();
+        Set<String> nonMatchingValues = new 
ObjectOpenHashSet<>(HashUtil.getMinHashSetSize(stringValues.size()));
+        // NOTE: Add value-by-value to avoid overhead
+        for (String value : stringValues) {
+          //noinspection UseBulkOperation
+          nonMatchingValues.add(value);
+        }
         return new StringRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case BYTES: {
-        Set<ByteArray> nonMatchingValues = new 
ObjectOpenHashSet<>(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(BytesUtils.toByteArray(value));
+        ByteArray[] bytesValues = notInPredicate.getBytesValues();
+        Set<ByteArray> nonMatchingValues = new 
ObjectOpenHashSet<>(HashUtil.getMinHashSetSize(bytesValues.length));
+        // NOTE: Add value-by-value to avoid overhead
+        //noinspection ManualArrayToCollectionCopy

Review Comment:
   I’m sure there hash sets were too small by default but this could have been 
resolved via the load factor, and would probably have made this even faster (it 
would have been nice if profiles before and after were captured for posterity).



##########
pinot-core/src/main/java/org/apache/pinot/core/operator/filter/predicate/NotInPredicateEvaluatorFactory.java:
##########
@@ -70,67 +68,80 @@ public static BaseDictionaryBasedPredicateEvaluator 
newDictionaryBasedEvaluator(
    */
   public static BaseRawValueBasedPredicateEvaluator 
newRawValueBasedEvaluator(NotInPredicate notInPredicate,
       DataType dataType) {
-    List<String> values = notInPredicate.getValues();
-    int hashSetSize = HashUtil.getMinHashSetSize(values.size());
     switch (dataType) {
       case INT: {
-        IntSet nonMatchingValues = new IntOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Integer.parseInt(value));
+        int[] intValues = notInPredicate.getIntValues();
+        IntSet nonMatchingValues = new 
IntOpenHashSet(HashUtil.getMinHashSetSize(intValues.length));
+        for (int value : intValues) {
+          nonMatchingValues.add(value);
         }
         return new IntRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case LONG: {
-        LongSet nonMatchingValues = new LongOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Long.parseLong(value));
+        long[] longValues = notInPredicate.getLongValues();
+        LongSet nonMatchingValues = new 
LongOpenHashSet(HashUtil.getMinHashSetSize(longValues.length));
+        for (long value : longValues) {
+          nonMatchingValues.add(value);
         }
         return new LongRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case FLOAT: {
-        FloatSet nonMatchingValues = new FloatOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Float.parseFloat(value));
+        float[] floatValues = notInPredicate.getFloatValues();
+        FloatSet nonMatchingValues = new 
FloatOpenHashSet(HashUtil.getMinHashSetSize(floatValues.length));
+        for (float value : floatValues) {
+          nonMatchingValues.add(value);
         }
         return new FloatRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case DOUBLE: {
-        DoubleSet nonMatchingValues = new DoubleOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(Double.parseDouble(value));
+        double[] doubleValues = notInPredicate.getDoubleValues();
+        DoubleSet nonMatchingValues = new 
DoubleOpenHashSet(HashUtil.getMinHashSetSize(doubleValues.length));
+        for (double value : doubleValues) {
+          nonMatchingValues.add(value);
         }
         return new DoubleRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case BIG_DECIMAL: {
-        TreeSet<BigDecimal> nonMatchingValues = new TreeSet<>();
-        for (String value : values) {
-          nonMatchingValues.add(new BigDecimal(value));
-        }
+        BigDecimal[] bigDecimalValues = notInPredicate.getBigDecimalValues();
+        // NOTE: Use TreeSet because BigDecimal's compareTo() is not 
consistent with equals()
+        //       E.g. compareTo(3.0, 3) returns 0 but equals(3.0, 3) returns 
false
+        TreeSet<BigDecimal> nonMatchingValues = new 
TreeSet<>(Arrays.asList(bigDecimalValues));
         return new 
BigDecimalRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case BOOLEAN: {
-        IntSet nonMatchingValues = new IntOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(BooleanUtils.toInt(value));
+        int[] booleanValues = notInPredicate.getBooleanValues();
+        IntSet nonMatchingValues = new 
IntOpenHashSet(HashUtil.getMinHashSetSize(booleanValues.length));
+        for (int value : booleanValues) {
+          nonMatchingValues.add(value);
         }
         return new IntRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case TIMESTAMP: {
-        LongSet nonMatchingValues = new LongOpenHashSet(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(TimestampUtils.toMillisSinceEpoch(value));
+        long[] timestampValues = notInPredicate.getTimestampValues();
+        LongSet nonMatchingValues = new 
LongOpenHashSet(HashUtil.getMinHashSetSize(timestampValues.length));
+        for (long value : timestampValues) {
+          nonMatchingValues.add(value);
         }
         return new LongRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case STRING: {
-        Set<String> nonMatchingValues = new ObjectOpenHashSet<>(hashSetSize);
-        nonMatchingValues.addAll(values);
+        List<String> stringValues = notInPredicate.getValues();
+        Set<String> nonMatchingValues = new 
ObjectOpenHashSet<>(HashUtil.getMinHashSetSize(stringValues.size()));
+        // NOTE: Add value-by-value to avoid overhead
+        for (String value : stringValues) {
+          //noinspection UseBulkOperation
+          nonMatchingValues.add(value);
+        }
         return new StringRawValueBasedNotInPredicateEvaluator(notInPredicate, 
nonMatchingValues);
       }
       case BYTES: {
-        Set<ByteArray> nonMatchingValues = new 
ObjectOpenHashSet<>(hashSetSize);
-        for (String value : values) {
-          nonMatchingValues.add(BytesUtils.toByteArray(value));
+        ByteArray[] bytesValues = notInPredicate.getBytesValues();
+        Set<ByteArray> nonMatchingValues = new 
ObjectOpenHashSet<>(HashUtil.getMinHashSetSize(bytesValues.length));
+        // NOTE: Add value-by-value to avoid overhead
+        //noinspection ManualArrayToCollectionCopy

Review Comment:
   I’m sure the hash sets were too small by default but this could have been 
resolved via the load factor, and would probably have made this even faster (it 
would have been nice if profiles before and after were captured for posterity).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [pinot] richardstartin commented on a diff in pull request #8891: Optimize dictionary lookup for IN clause

Reply via email to