Author: bryanduxbury
Date: Fri Apr 30 21:37:03 2010
New Revision: 939823
URL: http://svn.apache.org/viewvc?rev=939823&view=rev
Log:
THRIFT-765. java: Improved string encoding and decoding performance
This patch fixes a regression introduced by the previous 'fast' implementation,
specifically in its handling of Unicode characters that need to be encoded as
surrogate pairs. Performance stays about the same.
Modified:
incubator/thrift/tags/0.3.0/configure.ac
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/Utf8Helper.java
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
incubator/thrift/tags/0.3.0/lib/java/test/org/apache/thrift/TestUtf8Helper.java
Modified: incubator/thrift/tags/0.3.0/configure.ac
URL:
http://svn.apache.org/viewvc/incubator/thrift/tags/0.3.0/configure.ac?rev=939823&r1=939822&r2=939823&view=diff
==============================================================================
--- incubator/thrift/tags/0.3.0/configure.ac (original)
+++ incubator/thrift/tags/0.3.0/configure.ac Fri Apr 30 21:37:03 2010
@@ -19,7 +19,7 @@
AC_PREREQ(2.59)
-AC_INIT([thrift], [20080411])
+AC_INIT([thrift], [0.3.0])
AC_CONFIG_AUX_DIR([.])
Modified:
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/Utf8Helper.java
URL:
http://svn.apache.org/viewvc/incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/Utf8Helper.java?rev=939823&r1=939822&r2=939823&view=diff
==============================================================================
--- incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/Utf8Helper.java
(original)
+++ incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/Utf8Helper.java
Fri Apr 30 21:37:03 2010
@@ -5,15 +5,26 @@ public final class Utf8Helper {
public static final int getByteLength(final String s) {
int byteLength = 0;
- int c;
+ int codePoint;
for (int i = 0; i < s.length(); i++) {
- c = s.charAt(i);
- if (c <= 0x007F) {
+ codePoint = s.charAt(i);
+ if (codePoint >= 0x07FF) {
+ codePoint = s.codePointAt(i);
+ if (Character.isSupplementaryCodePoint(codePoint)) {
+ i++;
+ }
+ }
+ if (codePoint >= 0 && codePoint <= 0x007F) {
byteLength++;
- } else if (c > 0x07FF) {
+ } else if (codePoint >= 0x80 && codePoint <= 0x07FF) {
+ byteLength += 2;
+ } else if ((codePoint >= 0x0800 && codePoint < 0xD800) || (codePoint >
0xDFFF && codePoint <= 0xFFFD)) {
byteLength+=3;
+ } else if (codePoint >= 0x10000 && codePoint <= 0x10FFFF) {
+ byteLength+=4;
} else {
- byteLength+=2;
+ throw new RuntimeException("Unknown unicode codepoint in string! "
+ + Integer.toHexString(codePoint));
}
}
return byteLength;
@@ -25,62 +36,89 @@ public final class Utf8Helper {
return buf;
}
- public static void encode(String s, byte[] buf, int offset) {
+ public static void encode(final String s, final byte[] buf, final int
offset) {
int nextByte = 0;
- int c;
- for (int i = 0; i < s.length(); i++) {
- c = s.charAt(i);
- if (c <= 0x007F) {
- buf[offset + nextByte] = (byte)c;
+ int codePoint;
+ final int strLen = s.length();
+ for (int i = 0; i < strLen; i++) {
+ codePoint = s.charAt(i);
+ if (codePoint >= 0x07FF) {
+ codePoint = s.codePointAt(i);
+ if (Character.isSupplementaryCodePoint(codePoint)) {
+ i++;
+ }
+ }
+ if (codePoint <= 0x007F) {
+ buf[offset + nextByte] = (byte)codePoint;
nextByte++;
- } else if (c > 0x07FF) {
- buf[offset + nextByte ] = (byte)(0xE0 | c >> 12 & 0x0F);
- buf[offset + nextByte + 1] = (byte)(0x80 | c >> 6 & 0x3F);
- buf[offset + nextByte + 2] = (byte)(0x80 | c & 0x3F);
+ } else if (codePoint <= 0x7FF) {
+ buf[offset + nextByte ] = (byte)(0xC0 | ((codePoint >> 6) & 0x1F));
+ buf[offset + nextByte + 1] = (byte)(0x80 | ((codePoint >> 0) & 0x3F));
+ nextByte+=2;
+ } else if ((codePoint < 0xD800) || (codePoint > 0xDFFF && codePoint <=
0xFFFD)) {
+ buf[offset + nextByte ] = (byte)(0xE0 | ((codePoint >> 12) & 0x0F));
+ buf[offset + nextByte + 1] = (byte)(0x80 | ((codePoint >> 6) & 0x3F));
+ buf[offset + nextByte + 2] = (byte)(0x80 | ((codePoint >> 0) & 0x3F));
nextByte+=3;
+ } else if (codePoint >= 0x10000 && codePoint <= 0x10FFFF) {
+ buf[offset + nextByte ] = (byte)(0xF0 | ((codePoint >> 18) & 0x07));
+ buf[offset + nextByte + 1] = (byte)(0x80 | ((codePoint >> 12) & 0x3F));
+ buf[offset + nextByte + 2] = (byte)(0x80 | ((codePoint >> 6) & 0x3F));
+ buf[offset + nextByte + 3] = (byte)(0x80 | ((codePoint >> 0) & 0x3F));
+ nextByte+=4;
} else {
- buf[offset + nextByte ] = (byte)(0xC0 | c >> 6 & 0x1F);
- buf[offset + nextByte + 1] = (byte)(0x80 | c & 0x3F);
- nextByte+=2;
+ throw new RuntimeException("Unknown unicode codepoint in string! "
+ + Integer.toHexString(codePoint));
}
}
}
public static String decode(byte[] buf) {
- return decode(buf, 0, buf.length);
+ char[] charBuf = new char[buf.length];
+ int charsDecoded = decode(buf, 0, buf.length, charBuf);
+ return new String(charBuf, 0, charsDecoded);
}
- public static String decode(byte[] buf, int offset, int byteLength) {
- int charCount = 0;
- char[] chars = new char[byteLength];
- int c;
- int byteIndex = offset;
- int charIndex = 0;
- while (byteIndex < offset + byteLength) {
- c = buf[byteIndex++] & 0xFF;
- switch (c >> 4) {
- case 0:
- case 1:
- case 2:
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
- chars[charIndex++] = (char) c;
- break;
- case 12:
- case 13:
- chars[charIndex++] = (char) ((c & 0x1F) << 6 | (buf[byteIndex++] &
0x3F));
- break;
- case 14:
- chars[charIndex++] = (char) ((c & 0x0F) << 12 | (buf[byteIndex++] &
0x3F) << 6 | (buf[byteIndex++] & 0x3F) << 0);
- break;
+ public static final int UNI_SUR_HIGH_START = 0xD800;
+ public static final int UNI_SUR_HIGH_END = 0xDBFF;
+ public static final int UNI_SUR_LOW_START = 0xDC00;
+ public static final int UNI_SUR_LOW_END = 0xDFFF;
+ public static final int UNI_REPLACEMENT_CHAR = 0xFFFD;
+
+ private static final int HALF_BASE = 0x0010000;
+ private static final long HALF_SHIFT = 10;
+ private static final long HALF_MASK = 0x3FFL;
+
+ public static int decode(final byte[] buf, final int offset, final int
byteLength, final char[] charBuf) {
+ int curByteIdx = offset;
+ int endByteIdx = offset + byteLength;
+
+ int curCharIdx = 0;
+
+ while (curByteIdx < endByteIdx) {
+ final int b = buf[curByteIdx++]&0xff;
+ final int ch;
+
+ if (b < 0xC0) {
+ ch = b;
+ } else if (b < 0xE0) {
+ ch = ((b & 0x1F) << 6) + (buf[curByteIdx++] & 0x3F);
+ } else if (b < 0xf0) {
+ ch = ((b & 0xF) << 12) + ((buf[curByteIdx++] & 0x3F) << 6) +
(buf[curByteIdx++] & 0x3F);
+ } else {
+ ch = ((b & 0x7) << 18) + ((buf[curByteIdx++]& 0x3F) << 12) +
((buf[curByteIdx++] & 0x3F) << 6) + (buf[curByteIdx++] & 0x3F);
}
- charCount++;
- }
- return new String(chars, 0, charCount);
+ if (ch <= 0xFFFF) {
+ // target is a character <= 0xFFFF
+ charBuf[curCharIdx++] = (char) ch;
+ } else {
+ // target is a character in range 0x10000 - 0x10FFFF
+ final int chHalf = ch - HALF_BASE;
+ charBuf[curCharIdx++] = (char) ((chHalf >> HALF_SHIFT) +
UNI_SUR_HIGH_START);
+ charBuf[curCharIdx++] = (char) ((chHalf & HALF_MASK) +
UNI_SUR_LOW_START);
+ }
+ }
+ return curCharIdx;
}
-
}
Modified:
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
URL:
http://svn.apache.org/viewvc/incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java?rev=939823&r1=939822&r2=939823&view=diff
==============================================================================
---
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
(original)
+++
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
Fri Apr 30 21:37:03 2010
@@ -328,9 +328,10 @@ public class TBinaryProtocol extends TPr
int size = readI32();
if (trans_.getBytesRemainingInBuffer() >= size) {
- String s = Utf8Helper.decode(trans_.getBuffer(),
trans_.getBufferPosition(), size);
+ char[] charBuf = new char[size];
+ int charsDecoded = Utf8Helper.decode(trans_.getBuffer(),
trans_.getBufferPosition(), size, charBuf);
trans_.consumeBuffer(size);
- return s;
+ return new String(charBuf, 0, charsDecoded);
}
return readStringBody(size);
Modified:
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
URL:
http://svn.apache.org/viewvc/incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java?rev=939823&r1=939822&r2=939823&view=diff
==============================================================================
---
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
(original)
+++
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
Fri Apr 30 21:37:03 2010
@@ -606,9 +606,10 @@ public final class TCompactProtocol exte
}
if (trans_.getBytesRemainingInBuffer() >= length) {
- String str = Utf8Helper.decode(trans_.getBuffer(),
trans_.getBufferPosition(), length);
+ char[] charBuf = new char[length];
+ int charsDecoded = Utf8Helper.decode(trans_.getBuffer(),
trans_.getBufferPosition(), length, charBuf);
trans_.consumeBuffer(length);
- return str;
+ return new String(charBuf, 0, charsDecoded);
} else {
return Utf8Helper.decode(readBinary(length));
}
Modified:
incubator/thrift/tags/0.3.0/lib/java/test/org/apache/thrift/TestUtf8Helper.java
URL:
http://svn.apache.org/viewvc/incubator/thrift/tags/0.3.0/lib/java/test/org/apache/thrift/TestUtf8Helper.java?rev=939823&r1=939822&r2=939823&view=diff
==============================================================================
---
incubator/thrift/tags/0.3.0/lib/java/test/org/apache/thrift/TestUtf8Helper.java
(original)
+++
incubator/thrift/tags/0.3.0/lib/java/test/org/apache/thrift/TestUtf8Helper.java
Fri Apr 30 21:37:03 2010
@@ -25,15 +25,19 @@ public class TestUtf8Helper extends Test
private static final String UNICODE_STRING_2;
private static final byte[] UNICODE_STRING_BYTES_2;
- private static final String REALLY_WHACKY_ONE = "\u20491";
+ private static final String REALLY_WHACKY_ONE = "\uD841\uDC91";
private static final byte[] REALLY_WHACKY_ONE_BYTES;
+ private static final String TWO_CHAR_CHAR = "\uD801\uDC00";
+ private static final byte[] TWO_CHAR_CHAR_BYTES;
+
static {
try {
UNICODE_STRING_BYTES = UNICODE_STRING.getBytes("UTF-8");
UNICODE_STRING_2 = new String(kUnicodeBytes, "UTF-8");
UNICODE_STRING_BYTES_2 = UNICODE_STRING_2.getBytes("UTF-8");
REALLY_WHACKY_ONE_BYTES = REALLY_WHACKY_ONE.getBytes("UTF-8");
+ TWO_CHAR_CHAR_BYTES = TWO_CHAR_CHAR.getBytes("UTF-8");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException(e);
}
@@ -53,6 +57,9 @@ public class TestUtf8Helper extends Test
otherBytes = Utf8Helper.encode(REALLY_WHACKY_ONE);
assertTrue(Arrays.equals(REALLY_WHACKY_ONE_BYTES, otherBytes));
+
+ otherBytes = Utf8Helper.encode(TWO_CHAR_CHAR);
+ assertTrue(Arrays.equals(TWO_CHAR_CHAR_BYTES, otherBytes));
}
public void testDecode() throws Exception {
@@ -62,5 +69,6 @@ public class TestUtf8Helper extends Test
assertEquals(UNICODE_STRING, Utf8Helper.decode(UNICODE_STRING_BYTES));
assertEquals(UNICODE_STRING_2, Utf8Helper.decode(UNICODE_STRING_BYTES_2));
assertEquals(REALLY_WHACKY_ONE,
Utf8Helper.decode(REALLY_WHACKY_ONE_BYTES));
+ assertEquals(TWO_CHAR_CHAR, Utf8Helper.decode(TWO_CHAR_CHAR_BYTES));
}
}