Author: bryanduxbury
Date: Fri Apr 30 21:37:03 2010
New Revision: 939823

URL: http://svn.apache.org/viewvc?rev=939823&view=rev
Log:
THRIFT-765. java:  Improved string encoding and decoding performance

This patch fixes a regression caused by the previous 'fast' implementation, 
in particular when dealing with Unicode characters that need to be encoded 
as surrogate pairs. The performance stays about the same.

Modified:
    incubator/thrift/tags/0.3.0/configure.ac
    incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/Utf8Helper.java
    
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
    
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
    
incubator/thrift/tags/0.3.0/lib/java/test/org/apache/thrift/TestUtf8Helper.java

Modified: incubator/thrift/tags/0.3.0/configure.ac
URL: 
http://svn.apache.org/viewvc/incubator/thrift/tags/0.3.0/configure.ac?rev=939823&r1=939822&r2=939823&view=diff
==============================================================================
--- incubator/thrift/tags/0.3.0/configure.ac (original)
+++ incubator/thrift/tags/0.3.0/configure.ac Fri Apr 30 21:37:03 2010
@@ -19,7 +19,7 @@
 
 AC_PREREQ(2.59)
 
-AC_INIT([thrift], [20080411])
+AC_INIT([thrift], [0.3.0])
 
 AC_CONFIG_AUX_DIR([.])
 

Modified: 
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/Utf8Helper.java
URL: 
http://svn.apache.org/viewvc/incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/Utf8Helper.java?rev=939823&r1=939822&r2=939823&view=diff
==============================================================================
--- incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/Utf8Helper.java 
(original)
+++ incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/Utf8Helper.java 
Fri Apr 30 21:37:03 2010
@@ -5,15 +5,26 @@ public final class Utf8Helper {
 
   public static final int getByteLength(final String s) {
     int byteLength = 0;
-    int c;
+    int codePoint;
     for (int i = 0; i < s.length(); i++) {
-      c = s.charAt(i);
-      if (c <= 0x007F) {
+      codePoint = s.charAt(i);
+      if (codePoint >= 0x07FF) {
+        codePoint = s.codePointAt(i);
+        if (Character.isSupplementaryCodePoint(codePoint)) {
+          i++;
+        }
+      }
+      if (codePoint >= 0 && codePoint <= 0x007F) {
         byteLength++;
-      } else if (c > 0x07FF) {
+      } else if (codePoint >= 0x80 && codePoint <= 0x07FF) {
+        byteLength += 2;
+      } else if ((codePoint >= 0x0800 && codePoint < 0xD800) || (codePoint > 
0xDFFF && codePoint <= 0xFFFD)) {
         byteLength+=3;
+      } else if (codePoint >= 0x10000 && codePoint <= 0x10FFFF) {
+        byteLength+=4;
       } else {
-        byteLength+=2;
+        throw new RuntimeException("Unknown unicode codepoint in string! "
+            + Integer.toHexString(codePoint));
       }
     }
     return byteLength;
@@ -25,62 +36,89 @@ public final class Utf8Helper {
     return buf;
   }
 
-  public static void encode(String s, byte[] buf, int offset) {
+  public static void encode(final String s, final byte[] buf, final int 
offset) {
     int nextByte = 0;
-    int c;
-    for (int i = 0; i < s.length(); i++) {
-      c = s.charAt(i);
-      if (c <= 0x007F) {
-        buf[offset + nextByte] = (byte)c;
+    int codePoint;
+    final int strLen = s.length();
+    for (int i = 0; i < strLen; i++) {
+      codePoint = s.charAt(i);
+      if (codePoint >= 0x07FF) {
+        codePoint = s.codePointAt(i);
+        if (Character.isSupplementaryCodePoint(codePoint)) {
+          i++;
+        }
+      }
+      if (codePoint <= 0x007F) {
+        buf[offset + nextByte] = (byte)codePoint;
         nextByte++;
-      } else if (c > 0x07FF) {
-        buf[offset + nextByte    ] = (byte)(0xE0 | c >> 12 & 0x0F);
-        buf[offset + nextByte + 1] = (byte)(0x80 | c >>  6 & 0x3F);
-        buf[offset + nextByte + 2] = (byte)(0x80 | c       & 0x3F);
+      } else if (codePoint <= 0x7FF) {
+        buf[offset + nextByte    ] = (byte)(0xC0 | ((codePoint >> 6) & 0x1F));
+        buf[offset + nextByte + 1] = (byte)(0x80 | ((codePoint >> 0) & 0x3F));
+        nextByte+=2;
+      } else if ((codePoint < 0xD800) || (codePoint > 0xDFFF && codePoint <= 
0xFFFD)) {
+        buf[offset + nextByte    ] = (byte)(0xE0 | ((codePoint >> 12) & 0x0F));
+        buf[offset + nextByte + 1] = (byte)(0x80 | ((codePoint >>  6) & 0x3F));
+        buf[offset + nextByte + 2] = (byte)(0x80 | ((codePoint >>  0) & 0x3F));
         nextByte+=3;
+      } else if (codePoint >= 0x10000 && codePoint <= 0x10FFFF) {
+        buf[offset + nextByte    ] = (byte)(0xF0 | ((codePoint >> 18) & 0x07));
+        buf[offset + nextByte + 1] = (byte)(0x80 | ((codePoint >> 12) & 0x3F));
+        buf[offset + nextByte + 2] = (byte)(0x80 | ((codePoint >>  6) & 0x3F));
+        buf[offset + nextByte + 3] = (byte)(0x80 | ((codePoint >>  0) & 0x3F));
+        nextByte+=4;
       } else {
-        buf[offset + nextByte    ] = (byte)(0xC0 | c >> 6 & 0x1F);
-        buf[offset + nextByte + 1] = (byte)(0x80 | c      & 0x3F);
-        nextByte+=2;
+        throw new RuntimeException("Unknown unicode codepoint in string! "
+            + Integer.toHexString(codePoint));
       }
     }
   }
 
   public static String decode(byte[] buf) {
-    return decode(buf, 0, buf.length);
+    char[] charBuf = new char[buf.length];
+    int charsDecoded = decode(buf, 0, buf.length, charBuf);
+    return new String(charBuf, 0, charsDecoded);
   }
 
-  public static String decode(byte[] buf, int offset, int byteLength) {
-    int charCount = 0;
-    char[] chars = new char[byteLength];
-    int c;
-    int byteIndex = offset;
-    int charIndex = 0;
-    while (byteIndex < offset + byteLength) {
-      c = buf[byteIndex++] & 0xFF;
-      switch (c >> 4) {
-        case 0:
-        case 1:
-        case 2:
-        case 3:
-        case 4:
-        case 5:
-        case 6:
-        case 7:
-          chars[charIndex++] = (char) c;
-          break;
-        case 12:
-        case 13:
-          chars[charIndex++] = (char) ((c & 0x1F) << 6 | (buf[byteIndex++] & 
0x3F));
-          break;
-        case 14:
-          chars[charIndex++] = (char) ((c & 0x0F) << 12 | (buf[byteIndex++] & 
0x3F) << 6 | (buf[byteIndex++] & 0x3F) << 0);
-          break;
+  public static final int UNI_SUR_HIGH_START = 0xD800;
+  public static final int UNI_SUR_HIGH_END = 0xDBFF;
+  public static final int UNI_SUR_LOW_START = 0xDC00;
+  public static final int UNI_SUR_LOW_END = 0xDFFF;
+  public static final int UNI_REPLACEMENT_CHAR = 0xFFFD;
+
+  private static final int HALF_BASE = 0x0010000;
+  private static final long HALF_SHIFT = 10;
+  private static final long HALF_MASK = 0x3FFL;
+
+  public static int decode(final byte[] buf, final int offset, final int 
byteLength, final char[] charBuf) {
+    int curByteIdx = offset;
+    int endByteIdx = offset + byteLength;
+
+    int curCharIdx = 0;
+
+    while (curByteIdx < endByteIdx) {
+      final int b = buf[curByteIdx++]&0xff;
+      final int ch;
+
+      if (b < 0xC0) {
+        ch = b;
+      } else if (b < 0xE0) {
+        ch = ((b & 0x1F) << 6) + (buf[curByteIdx++] & 0x3F);
+      } else if (b < 0xf0) {
+        ch = ((b & 0xF) << 12) + ((buf[curByteIdx++] & 0x3F) << 6) + 
(buf[curByteIdx++] & 0x3F);
+      } else {
+        ch = ((b & 0x7) << 18) + ((buf[curByteIdx++]& 0x3F) << 12) + 
((buf[curByteIdx++] & 0x3F) << 6) + (buf[curByteIdx++] & 0x3F);
       }
-      charCount++;
-    }
-    return new String(chars, 0, charCount);
 
+      if (ch <= 0xFFFF) {
+        // target is a character <= 0xFFFF
+        charBuf[curCharIdx++] = (char) ch;
+      } else {
+        // target is a character in range 0x10000 - 0x10FFFF
+        final int chHalf = ch - HALF_BASE;
+        charBuf[curCharIdx++] = (char) ((chHalf >> HALF_SHIFT) + 
UNI_SUR_HIGH_START);
+        charBuf[curCharIdx++] = (char) ((chHalf & HALF_MASK) + 
UNI_SUR_LOW_START);
+      }
+    }
+    return curCharIdx;
   }
-  
 }

Modified: 
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
URL: 
http://svn.apache.org/viewvc/incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java?rev=939823&r1=939822&r2=939823&view=diff
==============================================================================
--- 
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
 (original)
+++ 
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TBinaryProtocol.java
 Fri Apr 30 21:37:03 2010
@@ -328,9 +328,10 @@ public class TBinaryProtocol extends TPr
     int size = readI32();
 
     if (trans_.getBytesRemainingInBuffer() >= size) {
-      String s = Utf8Helper.decode(trans_.getBuffer(), 
trans_.getBufferPosition(), size);
+      char[] charBuf = new char[size];
+      int charsDecoded = Utf8Helper.decode(trans_.getBuffer(), 
trans_.getBufferPosition(), size, charBuf);
       trans_.consumeBuffer(size);
-      return s;
+      return new String(charBuf, 0, charsDecoded);
     }
 
     return readStringBody(size);

Modified: 
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
URL: 
http://svn.apache.org/viewvc/incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java?rev=939823&r1=939822&r2=939823&view=diff
==============================================================================
--- 
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
 (original)
+++ 
incubator/thrift/tags/0.3.0/lib/java/src/org/apache/thrift/protocol/TCompactProtocol.java
 Fri Apr 30 21:37:03 2010
@@ -606,9 +606,10 @@ public final class TCompactProtocol exte
     }
 
     if (trans_.getBytesRemainingInBuffer() >= length) {
-      String str = Utf8Helper.decode(trans_.getBuffer(), 
trans_.getBufferPosition(), length);
+      char[] charBuf = new char[length];
+      int charsDecoded = Utf8Helper.decode(trans_.getBuffer(), 
trans_.getBufferPosition(), length, charBuf);
       trans_.consumeBuffer(length);
-      return str;
+      return new String(charBuf, 0, charsDecoded);
     } else {
       return Utf8Helper.decode(readBinary(length));
     }

Modified: 
incubator/thrift/tags/0.3.0/lib/java/test/org/apache/thrift/TestUtf8Helper.java
URL: 
http://svn.apache.org/viewvc/incubator/thrift/tags/0.3.0/lib/java/test/org/apache/thrift/TestUtf8Helper.java?rev=939823&r1=939822&r2=939823&view=diff
==============================================================================
--- 
incubator/thrift/tags/0.3.0/lib/java/test/org/apache/thrift/TestUtf8Helper.java 
(original)
+++ 
incubator/thrift/tags/0.3.0/lib/java/test/org/apache/thrift/TestUtf8Helper.java 
Fri Apr 30 21:37:03 2010
@@ -25,15 +25,19 @@ public class TestUtf8Helper extends Test
   private static final String UNICODE_STRING_2;
   private static final byte[] UNICODE_STRING_BYTES_2;
 
-  private static final String REALLY_WHACKY_ONE = "\u20491";
+  private static final String REALLY_WHACKY_ONE = "\uD841\uDC91";
   private static final byte[] REALLY_WHACKY_ONE_BYTES;
 
+  private static final String TWO_CHAR_CHAR = "\uD801\uDC00";
+  private static final byte[] TWO_CHAR_CHAR_BYTES;
+
   static {
     try {
       UNICODE_STRING_BYTES = UNICODE_STRING.getBytes("UTF-8");
       UNICODE_STRING_2 = new String(kUnicodeBytes, "UTF-8");
       UNICODE_STRING_BYTES_2 = UNICODE_STRING_2.getBytes("UTF-8");
       REALLY_WHACKY_ONE_BYTES = REALLY_WHACKY_ONE.getBytes("UTF-8");
+      TWO_CHAR_CHAR_BYTES = TWO_CHAR_CHAR.getBytes("UTF-8");
     } catch (UnsupportedEncodingException e) {
       throw new RuntimeException(e);
     }
@@ -53,6 +57,9 @@ public class TestUtf8Helper extends Test
 
     otherBytes = Utf8Helper.encode(REALLY_WHACKY_ONE);
     assertTrue(Arrays.equals(REALLY_WHACKY_ONE_BYTES, otherBytes));
+
+    otherBytes = Utf8Helper.encode(TWO_CHAR_CHAR);
+    assertTrue(Arrays.equals(TWO_CHAR_CHAR_BYTES, otherBytes));
   }
 
   public void testDecode() throws Exception {
@@ -62,5 +69,6 @@ public class TestUtf8Helper extends Test
     assertEquals(UNICODE_STRING, Utf8Helper.decode(UNICODE_STRING_BYTES));
     assertEquals(UNICODE_STRING_2, Utf8Helper.decode(UNICODE_STRING_BYTES_2));
     assertEquals(REALLY_WHACKY_ONE, 
Utf8Helper.decode(REALLY_WHACKY_ONE_BYTES));
+    assertEquals(TWO_CHAR_CHAR, Utf8Helper.decode(TWO_CHAR_CHAR_BYTES));
   }
 }


Reply via email to