Jackie-Jiang commented on PR #8846:
URL: https://github.com/apache/pinot/pull/8846#issuecomment-1158566486

   Performed a benchmark with 5M lookups on different data types with different 
cardinality:
   
   ```
   Benchmark                                          (_cardinality)  Mode  Cnt 
    Score      Error  Units
   BenchmarkBinarySearch.benchmarkDoubleBinarySearch             100  avgt    3 
  216.168 ±   17.112  ms/op
   BenchmarkBinarySearch.benchmarkDoubleBinarySearch           10000  avgt    3 
  402.108 ±   37.544  ms/op
   BenchmarkBinarySearch.benchmarkDoubleBinarySearch         1000000  avgt    3 
  831.683 ±  417.802  ms/op
   BenchmarkBinarySearch.benchmarkDoubleMap                      100  avgt    3 
   65.926 ±   21.608  ms/op
   BenchmarkBinarySearch.benchmarkDoubleMap                    10000  avgt    3 
   87.987 ±    7.291  ms/op
   BenchmarkBinarySearch.benchmarkDoubleMap                  1000000  avgt    3 
  568.169 ±   87.005  ms/op
   BenchmarkBinarySearch.benchmarkFloatBinarySearch              100  avgt    3 
  203.972 ±    8.662  ms/op
   BenchmarkBinarySearch.benchmarkFloatBinarySearch            10000  avgt    3 
  393.657 ±   49.358  ms/op
   BenchmarkBinarySearch.benchmarkFloatBinarySearch          1000000  avgt    3 
  751.277 ±   13.846  ms/op
   BenchmarkBinarySearch.benchmarkFloatMap                       100  avgt    3 
   89.837 ±   24.091  ms/op
   BenchmarkBinarySearch.benchmarkFloatMap                     10000  avgt    3 
   87.869 ±   17.819  ms/op
   BenchmarkBinarySearch.benchmarkFloatMap                   1000000  avgt    3 
  404.022 ±   56.542  ms/op
   BenchmarkBinarySearch.benchmarkIntBinarySearch                100  avgt    3 
  191.057 ±   24.173  ms/op
   BenchmarkBinarySearch.benchmarkIntBinarySearch              10000  avgt    3 
  345.814 ±   78.147  ms/op
   BenchmarkBinarySearch.benchmarkIntBinarySearch            1000000  avgt    3 
  693.074 ±   23.619  ms/op
   BenchmarkBinarySearch.benchmarkIntMap                         100  avgt    3 
   81.485 ±    1.535  ms/op
   BenchmarkBinarySearch.benchmarkIntMap                       10000  avgt    3 
   85.403 ±    7.045  ms/op
   BenchmarkBinarySearch.benchmarkIntMap                     1000000  avgt    3 
  219.425 ±   59.368  ms/op
   BenchmarkBinarySearch.benchmarkLongBinarySearch               100  avgt    3 
  182.610 ±   13.995  ms/op
   BenchmarkBinarySearch.benchmarkLongBinarySearch             10000  avgt    3 
  365.881 ±   15.217  ms/op
   BenchmarkBinarySearch.benchmarkLongBinarySearch           1000000  avgt    3 
  774.414 ±  402.259  ms/op
   BenchmarkBinarySearch.benchmarkLongMap                        100  avgt    3 
   65.400 ±    5.087  ms/op
   BenchmarkBinarySearch.benchmarkLongMap                      10000  avgt    3 
   98.427 ±   11.991  ms/op
   BenchmarkBinarySearch.benchmarkLongMap                    1000000  avgt    3 
  302.917 ±    6.348  ms/op
   BenchmarkBinarySearch.benchmarkStringBinarySearch             100  avgt    3 
  306.576 ±   39.766  ms/op
   BenchmarkBinarySearch.benchmarkStringBinarySearch           10000  avgt    3 
  870.486 ±   52.621  ms/op
   BenchmarkBinarySearch.benchmarkStringBinarySearch         1000000  avgt    3 
 3173.214 ± 1122.761  ms/op
   BenchmarkBinarySearch.benchmarkStringMap                      100  avgt    3 
  115.433 ±    4.070  ms/op
   BenchmarkBinarySearch.benchmarkStringMap                    10000  avgt    3 
  254.615 ±   15.829  ms/op
   BenchmarkBinarySearch.benchmarkStringMap                  1000000  avgt    3 
 1372.186 ±  197.025  ms/op
   ```
   
   Note that all the strings have the same prefix, so it is actually adding 
some comparison cost for the binary search. With 1M cardinality, binary search 
is about 2-3x slower than hash lookup, but it is still fast enough (adding less 
than 2 seconds for string), and shouldn't become an issue. Considering the 
memory saving (especially preventing running out of memory when generating 
segments for wide tables), and the extra GC overhead of creating a map, this 
change should be beneficial. 
   
   Here is the benchmark code:
   ```
   package org.apache.pinot.perf;
   
   import it.unimi.dsi.fastutil.doubles.Double2IntOpenHashMap;
   import it.unimi.dsi.fastutil.floats.Float2IntOpenHashMap;
   import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
   import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
   import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
   import java.io.IOException;
   import java.util.Arrays;
   import java.util.Random;
   import java.util.concurrent.TimeUnit;
   import org.openjdk.jmh.annotations.Benchmark;
   import org.openjdk.jmh.annotations.BenchmarkMode;
   import org.openjdk.jmh.annotations.Mode;
   import org.openjdk.jmh.annotations.OutputTimeUnit;
   import org.openjdk.jmh.annotations.Param;
   import org.openjdk.jmh.annotations.Scope;
   import org.openjdk.jmh.annotations.Setup;
   import org.openjdk.jmh.annotations.State;
   import org.openjdk.jmh.runner.Runner;
   import org.openjdk.jmh.runner.options.Options;
   import org.openjdk.jmh.runner.options.OptionsBuilder;
   import org.openjdk.jmh.runner.options.TimeValue;
   
   
   @State(Scope.Benchmark)
   public class BenchmarkBinarySearch {
     private static final int NUM_LOOK_UPS = 5000000;
     private static final Random RANDOM = new Random();
   
     @Param({"100", "10000", "1000000"})
     private int _cardinality;
   
     private int[] _sortedInts;
     private long[] _sortedLongs;
     private float[] _sortedFloats;
     private double[] _sortedDoubles;
     private String[] _sortedStrings;
   
     @Setup
     public void setUp()
         throws IOException {
       _sortedInts = new int[_cardinality];
       _sortedLongs = new long[_cardinality];
       _sortedFloats = new float[_cardinality];
       _sortedDoubles = new double[_cardinality];
       _sortedStrings = new String[_cardinality];
       for (int i = 0; i < _cardinality; i++) {
         _sortedInts[i] = i;
         _sortedLongs[i] = i;
         _sortedFloats[i] = i;
         _sortedDoubles[i] = i;
         String string = Integer.toString(i + _cardinality);
         _sortedStrings[i] = string;
       }
     }
   
     @Benchmark
     @BenchmarkMode(Mode.AverageTime)
     @OutputTimeUnit(TimeUnit.MILLISECONDS)
     public int benchmarkIntMap() {
       Int2IntOpenHashMap intMap = new Int2IntOpenHashMap(_cardinality);
       for (int i = 0; i < _cardinality; i++) {
         intMap.put(i, i);
       }
       int result = 0;
       for (int i = 0; i < NUM_LOOK_UPS; i++) {
         result += intMap.get(RANDOM.nextInt(_cardinality));
       }
       return result;
     }
   
     @Benchmark
     @BenchmarkMode(Mode.AverageTime)
     @OutputTimeUnit(TimeUnit.MILLISECONDS)
     public int benchmarkIntBinarySearch() {
       int result = 0;
       for (int i = 0; i < NUM_LOOK_UPS; i++) {
         result += Arrays.binarySearch(_sortedInts, 
RANDOM.nextInt(_cardinality));
       }
       return result;
     }
   
     @Benchmark
     @BenchmarkMode(Mode.AverageTime)
     @OutputTimeUnit(TimeUnit.MILLISECONDS)
     public int benchmarkLongMap() {
       Long2IntOpenHashMap longMap = new Long2IntOpenHashMap(_cardinality);
       for (int i = 0; i < _cardinality; i++) {
         longMap.put(i, i);
       }
       int result = 0;
       for (int i = 0; i < NUM_LOOK_UPS; i++) {
         result += longMap.get(RANDOM.nextInt(_cardinality));
       }
       return result;
     }
   
     @Benchmark
     @BenchmarkMode(Mode.AverageTime)
     @OutputTimeUnit(TimeUnit.MILLISECONDS)
     public int benchmarkLongBinarySearch() {
       int result = 0;
       for (int i = 0; i < NUM_LOOK_UPS; i++) {
         result += Arrays.binarySearch(_sortedLongs, 
RANDOM.nextInt(_cardinality));
       }
       return result;
     }
   
     @Benchmark
     @BenchmarkMode(Mode.AverageTime)
     @OutputTimeUnit(TimeUnit.MILLISECONDS)
     public int benchmarkFloatMap() {
       Float2IntOpenHashMap floatMap = new Float2IntOpenHashMap(_cardinality);
       for (int i = 0; i < _cardinality; i++) {
         floatMap.put(i, i);
       }
       int result = 0;
       for (int i = 0; i < NUM_LOOK_UPS; i++) {
         result += floatMap.get(RANDOM.nextInt(_cardinality));
       }
       return result;
     }
   
     @Benchmark
     @BenchmarkMode(Mode.AverageTime)
     @OutputTimeUnit(TimeUnit.MILLISECONDS)
     public int benchmarkFloatBinarySearch() {
       int result = 0;
       for (int i = 0; i < NUM_LOOK_UPS; i++) {
         result += Arrays.binarySearch(_sortedFloats, 
RANDOM.nextInt(_cardinality));
       }
       return result;
     }
   
     @Benchmark
     @BenchmarkMode(Mode.AverageTime)
     @OutputTimeUnit(TimeUnit.MILLISECONDS)
     public int benchmarkDoubleMap() {
       Double2IntOpenHashMap doubleMap = new 
Double2IntOpenHashMap(_cardinality);
       for (int i = 0; i < _cardinality; i++) {
         doubleMap.put(i, i);
       }
       int result = 0;
       for (int i = 0; i < NUM_LOOK_UPS; i++) {
         result += doubleMap.get(RANDOM.nextInt(_cardinality));
       }
       return result;
     }
   
     @Benchmark
     @BenchmarkMode(Mode.AverageTime)
     @OutputTimeUnit(TimeUnit.MILLISECONDS)
     public int benchmarkDoubleBinarySearch() {
       int result = 0;
       for (int i = 0; i < NUM_LOOK_UPS; i++) {
         result += Arrays.binarySearch(_sortedDoubles, 
RANDOM.nextInt(_cardinality));
       }
       return result;
     }
   
     @Benchmark
     @BenchmarkMode(Mode.AverageTime)
     @OutputTimeUnit(TimeUnit.MILLISECONDS)
     public int benchmarkStringMap() {
       Object2IntOpenHashMap<String> stringMap = new 
Object2IntOpenHashMap<>(_cardinality);
       for (int i = 0; i < _cardinality; i++) {
         stringMap.put(Integer.toString(i + _cardinality), i);
       }
       int result = 0;
       for (int i = 0; i < NUM_LOOK_UPS; i++) {
         result += 
stringMap.getInt(Integer.toString(RANDOM.nextInt(_cardinality) + _cardinality));
       }
       return result;
     }
   
     @Benchmark
     @BenchmarkMode(Mode.AverageTime)
     @OutputTimeUnit(TimeUnit.MILLISECONDS)
     public int benchmarkStringBinarySearch() {
       int result = 0;
       for (int i = 0; i < NUM_LOOK_UPS; i++) {
         result += Arrays.binarySearch(_sortedStrings, 
Integer.toString(RANDOM.nextInt(_cardinality) + _cardinality));
       }
       return result;
     }
   
     public static void main(String[] args)
         throws Exception {
       Options opt =
           new 
OptionsBuilder().include(BenchmarkBinarySearch.class.getSimpleName()).warmupTime(TimeValue.seconds(5))
               
.warmupIterations(2).measurementTime(TimeValue.seconds(10)).measurementIterations(3).forks(1).build();
   
       new Runner(opt).run();
     }
   }
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to