Jackie-Jiang commented on PR #8846:
URL: https://github.com/apache/pinot/pull/8846#issuecomment-1158566486
Performed a benchmark with 5M lookups on different data types with different
cardinality:
```
Benchmark (_cardinality) Mode Cnt
Score Error Units
BenchmarkBinarySearch.benchmarkDoubleBinarySearch 100 avgt 3
216.168 ± 17.112 ms/op
BenchmarkBinarySearch.benchmarkDoubleBinarySearch 10000 avgt 3
402.108 ± 37.544 ms/op
BenchmarkBinarySearch.benchmarkDoubleBinarySearch 1000000 avgt 3
831.683 ± 417.802 ms/op
BenchmarkBinarySearch.benchmarkDoubleMap 100 avgt 3
65.926 ± 21.608 ms/op
BenchmarkBinarySearch.benchmarkDoubleMap 10000 avgt 3
87.987 ± 7.291 ms/op
BenchmarkBinarySearch.benchmarkDoubleMap 1000000 avgt 3
568.169 ± 87.005 ms/op
BenchmarkBinarySearch.benchmarkFloatBinarySearch 100 avgt 3
203.972 ± 8.662 ms/op
BenchmarkBinarySearch.benchmarkFloatBinarySearch 10000 avgt 3
393.657 ± 49.358 ms/op
BenchmarkBinarySearch.benchmarkFloatBinarySearch 1000000 avgt 3
751.277 ± 13.846 ms/op
BenchmarkBinarySearch.benchmarkFloatMap 100 avgt 3
89.837 ± 24.091 ms/op
BenchmarkBinarySearch.benchmarkFloatMap 10000 avgt 3
87.869 ± 17.819 ms/op
BenchmarkBinarySearch.benchmarkFloatMap 1000000 avgt 3
404.022 ± 56.542 ms/op
BenchmarkBinarySearch.benchmarkIntBinarySearch 100 avgt 3
191.057 ± 24.173 ms/op
BenchmarkBinarySearch.benchmarkIntBinarySearch 10000 avgt 3
345.814 ± 78.147 ms/op
BenchmarkBinarySearch.benchmarkIntBinarySearch 1000000 avgt 3
693.074 ± 23.619 ms/op
BenchmarkBinarySearch.benchmarkIntMap 100 avgt 3
81.485 ± 1.535 ms/op
BenchmarkBinarySearch.benchmarkIntMap 10000 avgt 3
85.403 ± 7.045 ms/op
BenchmarkBinarySearch.benchmarkIntMap 1000000 avgt 3
219.425 ± 59.368 ms/op
BenchmarkBinarySearch.benchmarkLongBinarySearch 100 avgt 3
182.610 ± 13.995 ms/op
BenchmarkBinarySearch.benchmarkLongBinarySearch 10000 avgt 3
365.881 ± 15.217 ms/op
BenchmarkBinarySearch.benchmarkLongBinarySearch 1000000 avgt 3
774.414 ± 402.259 ms/op
BenchmarkBinarySearch.benchmarkLongMap 100 avgt 3
65.400 ± 5.087 ms/op
BenchmarkBinarySearch.benchmarkLongMap 10000 avgt 3
98.427 ± 11.991 ms/op
BenchmarkBinarySearch.benchmarkLongMap 1000000 avgt 3
302.917 ± 6.348 ms/op
BenchmarkBinarySearch.benchmarkStringBinarySearch 100 avgt 3
306.576 ± 39.766 ms/op
BenchmarkBinarySearch.benchmarkStringBinarySearch 10000 avgt 3
870.486 ± 52.621 ms/op
BenchmarkBinarySearch.benchmarkStringBinarySearch 1000000 avgt 3
3173.214 ± 1122.761 ms/op
BenchmarkBinarySearch.benchmarkStringMap 100 avgt 3
115.433 ± 4.070 ms/op
BenchmarkBinarySearch.benchmarkStringMap 10000 avgt 3
254.615 ± 15.829 ms/op
BenchmarkBinarySearch.benchmarkStringMap 1000000 avgt 3
1372.186 ± 197.025 ms/op
```
Note that all the strings have the same prefix, so it is actually adding
some comparison cost for the binary search. With 1M cardinality, binary search
is about 2-3x slower than hash lookup, but it is still fast enough (adding less
than 2 seconds for string), and shouldn't become an issue. Considering the
memory saving (especially preventing running out of memory when generating
segments for wide tables), and the extra GC overhead of creating a map, this
change should be beneficial.
Here is the benchmark code:
```
package org.apache.pinot.perf;
import it.unimi.dsi.fastutil.doubles.Double2IntOpenHashMap;
import it.unimi.dsi.fastutil.floats.Float2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;
import java.io.IOException;
import java.util.Arrays;
import java.util.Random;
import java.util.concurrent.TimeUnit;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.runner.Runner;
import org.openjdk.jmh.runner.options.Options;
import org.openjdk.jmh.runner.options.OptionsBuilder;
import org.openjdk.jmh.runner.options.TimeValue;
@State(Scope.Benchmark)
public class BenchmarkBinarySearch {
private static final int NUM_LOOK_UPS = 5000000;
private static final Random RANDOM = new Random();
@Param({"100", "10000", "1000000"})
private int _cardinality;
private int[] _sortedInts;
private long[] _sortedLongs;
private float[] _sortedFloats;
private double[] _sortedDoubles;
private String[] _sortedStrings;
@Setup
public void setUp()
throws IOException {
_sortedInts = new int[_cardinality];
_sortedLongs = new long[_cardinality];
_sortedFloats = new float[_cardinality];
_sortedDoubles = new double[_cardinality];
_sortedStrings = new String[_cardinality];
for (int i = 0; i < _cardinality; i++) {
_sortedInts[i] = i;
_sortedLongs[i] = i;
_sortedFloats[i] = i;
_sortedDoubles[i] = i;
String string = Integer.toString(i + _cardinality);
_sortedStrings[i] = string;
}
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public int benchmarkIntMap() {
Int2IntOpenHashMap intMap = new Int2IntOpenHashMap(_cardinality);
for (int i = 0; i < _cardinality; i++) {
intMap.put(i, i);
}
int result = 0;
for (int i = 0; i < NUM_LOOK_UPS; i++) {
result += intMap.get(RANDOM.nextInt(_cardinality));
}
return result;
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public int benchmarkIntBinarySearch() {
int result = 0;
for (int i = 0; i < NUM_LOOK_UPS; i++) {
result += Arrays.binarySearch(_sortedInts,
RANDOM.nextInt(_cardinality));
}
return result;
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public int benchmarkLongMap() {
Long2IntOpenHashMap longMap = new Long2IntOpenHashMap(_cardinality);
for (int i = 0; i < _cardinality; i++) {
longMap.put(i, i);
}
int result = 0;
for (int i = 0; i < NUM_LOOK_UPS; i++) {
result += longMap.get(RANDOM.nextInt(_cardinality));
}
return result;
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public int benchmarkLongBinarySearch() {
int result = 0;
for (int i = 0; i < NUM_LOOK_UPS; i++) {
result += Arrays.binarySearch(_sortedLongs,
RANDOM.nextInt(_cardinality));
}
return result;
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public int benchmarkFloatMap() {
Float2IntOpenHashMap floatMap = new Float2IntOpenHashMap(_cardinality);
for (int i = 0; i < _cardinality; i++) {
floatMap.put(i, i);
}
int result = 0;
for (int i = 0; i < NUM_LOOK_UPS; i++) {
result += floatMap.get(RANDOM.nextInt(_cardinality));
}
return result;
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public int benchmarkFloatBinarySearch() {
int result = 0;
for (int i = 0; i < NUM_LOOK_UPS; i++) {
result += Arrays.binarySearch(_sortedFloats,
RANDOM.nextInt(_cardinality));
}
return result;
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public int benchmarkDoubleMap() {
Double2IntOpenHashMap doubleMap = new
Double2IntOpenHashMap(_cardinality);
for (int i = 0; i < _cardinality; i++) {
doubleMap.put(i, i);
}
int result = 0;
for (int i = 0; i < NUM_LOOK_UPS; i++) {
result += doubleMap.get(RANDOM.nextInt(_cardinality));
}
return result;
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public int benchmarkDoubleBinarySearch() {
int result = 0;
for (int i = 0; i < NUM_LOOK_UPS; i++) {
result += Arrays.binarySearch(_sortedDoubles,
RANDOM.nextInt(_cardinality));
}
return result;
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public int benchmarkStringMap() {
Object2IntOpenHashMap<String> stringMap = new
Object2IntOpenHashMap<>(_cardinality);
for (int i = 0; i < _cardinality; i++) {
stringMap.put(Integer.toString(i + _cardinality), i);
}
int result = 0;
for (int i = 0; i < NUM_LOOK_UPS; i++) {
result +=
stringMap.getInt(Integer.toString(RANDOM.nextInt(_cardinality) + _cardinality));
}
return result;
}
@Benchmark
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.MILLISECONDS)
public int benchmarkStringBinarySearch() {
int result = 0;
for (int i = 0; i < NUM_LOOK_UPS; i++) {
result += Arrays.binarySearch(_sortedStrings,
Integer.toString(RANDOM.nextInt(_cardinality) + _cardinality));
}
return result;
}
public static void main(String[] args)
throws Exception {
Options opt =
new
OptionsBuilder().include(BenchmarkBinarySearch.class.getSimpleName()).warmupTime(TimeValue.seconds(5))
.warmupIterations(2).measurementTime(TimeValue.seconds(10)).measurementIterations(3).forks(1).build();
new Runner(opt).run();
}
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]