Github user StephanEwen commented on the pull request:
https://github.com/apache/incubator-flink/pull/4#issuecomment-54073097
Here is a micro benchmark of String Serialization I did recently. It is
extremely simple and performs well. Do you have any feeling how it compares to
The basic trick is not to use the varlength encodings, but represent the
binary variant the same way as object variant.
```java
public class StringSortingBenchmark {
private static final long SEED = 0xa2e2223fc6643bbl;
private static final int MIN_LEN = 4;
private static final int MAX_LEN = 213;
private static final sun.misc.Unsafe UNSAFE = MemoryUtils.UNSAFE;
private static final long BASE_OFFSET =
UNSAFE.arrayBaseOffset(byte[].class);
public static long datasink = 0; // side effect variable to prevent
optimizing loops away
public static void main(String[] args) {
// we run comparisons between elements in arrays in the pattern
quickstart does is: pointers coming from both ends
final int NUM = 2000000; // 2 million
final String[] elements = new String[NUM];
final byte[] bytes = new byte[Integer.MAX_VALUE / 2];
final int[] pointers = new int[NUM];
System.out.println("Creating the strings");
{
Random rnd = new Random(SEED);
int ptr = 0;
for (int i = 0; i < NUM; i++) {
String str = getRandomString(rnd);
elements[i] = str;
pointers[i] = ptr;
ptr += serializeString(bytes, ptr, str);
}
}
System.out.println("Verifying serialization and comparison
methods");
{
int a = 0;
int b = NUM - 1;
while (a < NUM) {
String deser = deserializeString(bytes,
pointers[a]);
if (!elements[a].equals(deser)) {
throw new RuntimeException("Wrong
serialization: " + elements[a] + " versus " + deser);
}
int cmp1 = elements[a].compareTo(elements[b]);
int cmp2 = compareDeserializing(bytes,
pointers[a], pointers[b]);
int cmp3 = compareBinaryStrings(bytes,
pointers[a], pointers[b]);
if (Math.signum(cmp1) != Math.signum(cmp2)) {
throw new RuntimeException("Wrong
comparison result between deserialized and original");
}
if (Math.signum(cmp1) != Math.signum(cmp3)) {
throw new RuntimeException("Wrong
comparison result between binary and original");
}
a++;
b--;
}
}
long objectsTime;
long deserializingTime;
long binaryTime;
System.out.println("Running experiments with string objects.");
{
int a = 0;
int b = NUM - 1;
long acc = 0; // use this so no compiler optimizes the
loop away
long start = System.nanoTime();
while (a < NUM) {
long cmp =
elements[a++].compareTo(elements[b--]);
acc += cmp;
}
objectsTime = System.nanoTime() - start;
datasink += acc;
}
System.out.println("Running experiments deserializing string
objects.");
{
int a = 0;
int b = NUM - 1;
long acc = 0; // use this so no compiler optimizes the
loop away
long start = System.nanoTime();
while (a < NUM) {
long cmp = compareDeserializing(bytes,
pointers[a++], pointers[b--]);
acc += cmp;
}
deserializingTime = System.nanoTime() - start;
datasink += acc;
}
System.out.println("Running experiments with binary
comparisons.");
{
int a = 0;
int b = NUM - 1;
long acc = 0; // use this so no compiler optimizes the
loop away
long start = System.nanoTime();
while (a < NUM) {
long cmp = compareBinaryStrings(bytes,
pointers[a++], pointers[b--]);
acc += cmp;
}
binaryTime = System.nanoTime() - start;
datasink += acc;
}
System.out.println(String.format("Time taken\n - String
objects: %d µsecs\n - String deserialization: %d µsecs\n - Binary
comparison: %d µsecs", objectsTime / 1000, deserializingTime / 1000,
binaryTime / 1000));
}
private static String getRandomString(Random rnd) {
final int len = rnd.nextInt(MAX_LEN - MIN_LEN + 1) + MIN_LEN;
final StringBuilder bld = new StringBuilder(len);
for (int i = 0; i < len; i++) {
bld.append((char) (rnd.nextInt(100) + 33));
}
return bld.toString();
}
private static final int serializeString(byte[] target, int offset,
String str) {
final int len = str.length();
long pos = offset + BASE_OFFSET;
UNSAFE.putInt(target, pos, len);
pos += 4;
for (int i = 0; i < len; i++, pos += 2) {
char towrite = Character.reverseBytes(str.charAt(i));
UNSAFE.putChar(target, pos, towrite);
}
return 4 + 2*len;
}
private static final String deserializeString(byte[] data, int offset) {
long pos = offset + BASE_OFFSET;
final int len = UNSAFE.getInt(data, pos);
pos += 4;
StringBuilder bld = new StringBuilder(len);
for (int i = 0; i < len; i++, pos += 2) {
char read = UNSAFE.getChar(data, pos);
read = Character.reverseBytes(read);
bld.append(read);
}
return bld.toString();
}
private static final int compareBinaryStrings(byte[] data, int off1,
int off2) {
int len1 = UNSAFE.getInt(data, off1 + BASE_OFFSET);
int len2 = UNSAFE.getInt(data, off2 + BASE_OFFSET);
off1 += 4;
off2 += 4;
final int binCompLen = 2 * Math.min(len1, len2);
int val = 0;
// binary comparison
for (int pos = 0; pos < binCompLen && (val = (data[off1 + pos]
& 0xff) - (data[off2 + pos] & 0xff)) == 0; pos++);
return val != 0 ? val : len1 - len2;
}
private static final int compareDeserializing(byte[] data, int off1,
int off2) {
String str1 = deserializeString(data, off1);
String str2 = deserializeString(data, off2);
return str1.compareTo(str2);
}
}
```
---
If your project is set up for it, you can reply to this email and have your
reply appear on GitHub as well. If your project does not have this feature
enabled and wishes so, or if the feature is enabled but not working, please
contact infrastructure at [email protected] or file a JIRA ticket
with INFRA.
---