AHeise commented on a change in pull request #10358: [FLINK-14346]
[serialization] faster implementation of StringValue writeString and readString
URL: https://github.com/apache/flink/pull/10358#discussion_r353372023
##########
File path: flink-core/src/main/java/org/apache/flink/types/StringValue.java
##########
@@ -759,56 +761,142 @@ public static String readString(DataInput in) throws
IOException {
}
len |= curr << shift;
}
-
+
// subtract one for the null length
len -= 1;
-
- final char[] data = new char[len];
- for (int i = 0; i < len; i++) {
- int c = in.readUnsignedByte();
- if (c < HIGH_BIT) {
- data[i] = (char) c;
- } else {
+ /* as we have no idea about byte-length of the serialized
string, we cannot fully
+ * read it into memory buffer. But we can do it in an
optimistic way:
+ * 1. In a happy case when the string is an us-ascii one, then
byte_len == char_len
+ * 2. If we spot at least one character with code >= 127, then
we reallocate the buffer
+ * to accommodate for the next characters.
+ */
+
+ // happily assume that the string is an 7 bit us-ascii one
+ byte[] buf = new byte[len];
+ in.readFully(buf);
+
+ final char[] data = new char[len];
+ int charPosition = 0;
+ int bufSize = len;
+ int bytePosition = 0;
+
+ while (charPosition < len) {
+ // there is at least `char count - char position` bytes
left in case if all the
+ // remaining characters are 7 bit.
+ int remainingBytesEstimation = len - charPosition;
+ int c;
+ if (bytePosition == bufSize) {
+ // need to expand the buffer as we are already
reached the end.
+ // we also reuse the old buffer, as it's
capacity must be enough in any case.
+ in.readFully(buf, 0, remainingBytesEstimation);
+ bytePosition = 0;
+ bufSize = remainingBytesEstimation;
+ }
+ c = buf[bytePosition++] & 255;
+ // non 7-bit path
+ if (c >= HIGH_BIT) {
int shift = 7;
int curr;
c = c & 0x7f;
- while ((curr = in.readUnsignedByte()) >=
HIGH_BIT) {
+ if (bytePosition == bufSize) {
+ // need to expand the buffer, also
reusing the buffer again.
+ in.readFully(buf, 0,
remainingBytesEstimation);
+ bytePosition = 0;
+ bufSize = remainingBytesEstimation;
+ }
+
+ while ((curr = buf[bytePosition++] & 255) >=
HIGH_BIT) {
c |= (curr & 0x7f) << shift;
shift += 7;
+ if (bytePosition == bufSize) {
+ // may need to expand the
buffer if char bytes are split between the buffers.
+ in.readFully(buf, 0,
remainingBytesEstimation);
+ bytePosition = 0;
+ bufSize =
remainingBytesEstimation;
+ }
}
c |= curr << shift;
- data[i] = (char) c;
}
+ data[charPosition++] = (char) c;
}
-
return new String(data, 0, len);
}
public static final void writeString(CharSequence cs, DataOutput out)
throws IOException {
if (cs != null) {
// the length we write is offset by one, because a
length of zero indicates a null value
- int lenToWrite = cs.length()+1;
+ int position = 0;
+ int strlen = cs.length();
+
+ int lenToWrite = strlen + 1;
if (lenToWrite < 0) {
throw new
IllegalArgumentException("CharSequence is too long.");
}
-
- // write the length, variable-length encoded
- while (lenToWrite >= HIGH_BIT) {
- out.write(lenToWrite | HIGH_BIT);
- lenToWrite >>>= 7;
- }
- out.write(lenToWrite);
-
- // write the char data, variable length encoded
- for (int i = 0; i < cs.length(); i++) {
- int c = cs.charAt(i);
-
- while (c >= HIGH_BIT) {
- out.write(c | HIGH_BIT);
- c >>>= 7;
+
+ // on average, for strings shorter than 6 characters
the cost of allocating the buffer
+ // is higher than writing it directly. See benchmarks
in https://github.com/apache/flink/pull/10358 for
+ // examples.
+ if (strlen < 6) {
+ // write the length, variable-length encoded
+ while (lenToWrite >= HIGH_BIT) {
Review comment:
Would loop unrolling also help here? Could be extracted as a private method
which JIT can inline.
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services