Author: gsim Date: Fri Oct 10 12:55:29 2014 New Revision: 1630841 URL: http://svn.apache.org/r1630841 Log: PROTON-576: update String UTF-8 encoding to handle high range unicode characters / surrogate pairs
Applied patch from Dominic Evans with modifications by Rob Godfrey Added: qpid/proton/branches/examples/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java Modified: qpid/proton/branches/examples/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java qpid/proton/branches/examples/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java Modified: qpid/proton/branches/examples/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java URL: http://svn.apache.org/viewvc/qpid/proton/branches/examples/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java?rev=1630841&r1=1630840&r2=1630841&view=diff ============================================================================== --- qpid/proton/branches/examples/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java (original) +++ qpid/proton/branches/examples/proton-j/src/main/java/org/apache/qpid/proton/codec/EncoderImpl.java Fri Oct 10 12:55:29 2014 @@ -21,7 +21,11 @@ package org.apache.qpid.proton.codec; import java.nio.ByteBuffer; -import java.util.*; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; import org.apache.qpid.proton.amqp.Binary; import org.apache.qpid.proton.amqp.Decimal128; @@ -770,31 +774,60 @@ public final class EncoderImpl implement void writeRaw(String string) { final int length = string.length(); - char c; + int c; for (int i = 0; i < length; i++) { c = string.charAt(i); - if ((c >= 0x0001) && (c <= 0x007F)) + if ((c & 0xFF80) == 0) /* U+0000..U+007F */ { _buffer.put((byte) c); - } - else if (c > 0x07FF) + else if ((c & 0xF800) == 0) /* U+0080..U+07FF */ { - _buffer.put((byte) (0xE0 | ((c >> 12) & 0x0F))); - _buffer.put((byte) (0x80 | ((c >> 6) & 0x3F))); - _buffer.put((byte) (0x80 | (c & 0x3F))); + _buffer.put((byte)(0xC0 | ((c >> 6) & 0x1F))); + _buffer.put((byte)(0x80 | (c & 0x3F))); } - else + else if ((c & 0xD800) != 0xD800) /* U+0800..U+FFFF 
- excluding surrogate pairs */ { - _buffer.put((byte) (0xC0 | ((c >> 6) & 0x1F))); - _buffer.put((byte) (0x80 | (c & 0x3F))); + _buffer.put((byte)(0xE0 | ((c >> 12) & 0x0F))); + _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F))); + _buffer.put((byte)(0x80 | (c & 0x3F))); } - } + else + { + int low; - } + if(((c & 0xDC00) == 0xDC00) || (++i == length) || ((low = string.charAt(i)) & 0xDC00) != 0xDC00) + { + throw new IllegalArgumentException("String contains invalid Unicode code points"); + } + c = 0x010000 + ((c & 0x03FF) << 10) + (low & 0x03FF); + if (c <= 0x3FFFF) /* U+10000..U+3FFFF */ + { + _buffer.put((byte) 0xF0); + _buffer.put((byte)(0x90 | ((c >> 12) & 0x2F))); + _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F))); + _buffer.put((byte)(0x80 | (c & 0x3F))); + } + else if (c <= 0xFFFFF) /* U+40000..U+FFFFF */ + { + _buffer.put((byte)(0xF0 | ((c >> 18) & 0x03))); + _buffer.put((byte)(0x80 | ((c >> 12) & 0x3F))); + _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F))); + _buffer.put((byte)(0x80 | (c & 0x3F))); + } + else /* U+100000..U+10FFFF */ + { + _buffer.put((byte)(0xF4)); + _buffer.put((byte)(0x80 | ((c >> 12) & 0x3F))); + _buffer.put((byte)(0x80 | ((c >> 6) & 0x3F))); + _buffer.put((byte)(0x80 | (c & 0x3F))); + } + } + } + } } Modified: qpid/proton/branches/examples/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java URL: http://svn.apache.org/viewvc/qpid/proton/branches/examples/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java?rev=1630841&r1=1630840&r2=1630841&view=diff ============================================================================== --- qpid/proton/branches/examples/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java (original) +++ qpid/proton/branches/examples/proton-j/src/main/java/org/apache/qpid/proton/codec/StringType.java Fri Oct 10 12:55:29 2014 @@ -83,29 +83,22 @@ public class StringType extends Abstract return encoding; } - private static int calculateUTF8Length(final String s) + static int 
calculateUTF8Length(final String s) { int len = s.length(); - int i = 0; - final int length = s.length(); - while(i < length) + final int length = len; + for (int i = 0; i < length; i++) { - char c = s.charAt(i); - if(c > 127) + int c = s.charAt(i); + if ((c & 0xFF80) != 0) /* U+0080.. */ { len++; - if(c > 0x07ff) + // surrogate pairs should always combine to create a code point with a 4 octet representation + if(((c & 0xF800) != 0) && ((c & 0xD800) != 0xD800)) /* U+0800.. excluding surrogate pairs */ { len++; - if(c >= 0xD800 && c <= 0xDBFF) - { - i++; - len++; - } } } - i++; - } return len; } Added: qpid/proton/branches/examples/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java URL: http://svn.apache.org/viewvc/qpid/proton/branches/examples/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java?rev=1630841&view=auto ============================================================================== --- qpid/proton/branches/examples/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java (added) +++ qpid/proton/branches/examples/proton-j/src/test/java/org/apache/qpid/proton/codec/StringTypeTest.java Fri Oct 10 12:55:29 2014 @@ -0,0 +1,140 @@ +/* + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + * + */ +package org.apache.qpid.proton.codec; + +import static org.junit.Assert.assertEquals; + +import java.lang.Character.UnicodeBlock; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.HashSet; +import java.util.Set; + +import org.junit.Test; + +import org.apache.qpid.proton.amqp.messaging.AmqpValue; + +/** + * Test the encoding and decoding of {@link StringType} values. + */ +public class StringTypeTest +{ + /** + * Loop over all the chars in a given {@link UnicodeBlock} and return a + * {@link Set <String>} containing all the possible values as their + * {@link String} values. + * + * @param block the {@link UnicodeBlock} to loop over + * @return a {@link Set <String>} containing all the possible values as + * {@link String} values + */ + private static Set<String> getAllStringsFromUnicodeBlock(final UnicodeBlock block) + { + final Set<String> strings = new HashSet<String>(); + for (int codePoint = 0; codePoint <= Character.MAX_CODE_POINT; codePoint++) + { + if (UnicodeBlock.of(codePoint) == block) + { + final int charCount = Character.charCount(codePoint); + final StringBuilder sb = new StringBuilder( + charCount); + if (charCount == 1) + { + sb.append(String.valueOf((char) codePoint)); + } + else if (charCount == 2) + { + sb.append(Character.highSurrogate(codePoint)); + sb.append(Character.lowSurrogate(codePoint)); + } + else + { + throw new IllegalArgumentException("Character.charCount of " + + charCount + " not supported."); + } + strings.add(sb.toString()); + } + } + return strings; + } + + + /** + * Test the encoding and decoding of various complicated Unicode characters + * which will end up as "surrogate pairs" when encoded to UTF-8 + */ + @Test + public void calculateUTF8Length() + { + for (final String input : generateTestData()) + { + assertEquals("Incorrect string length calculated for string 
'"+input+"'",input.getBytes(StandardCharsets.UTF_8).length, StringType.calculateUTF8Length(input)); + } + } + + /** + * Test the encoding and decoding of various Unicode characters + */ + @Test + public void encodeDecodeStrings() + { + final DecoderImpl decoder = new DecoderImpl(); + final EncoderImpl encoder = new EncoderImpl(decoder); + AMQPDefinedTypes.registerAllTypes(decoder, encoder); + final ByteBuffer bb = ByteBuffer.allocate(16); + + for (final String input : generateTestData()) + { + bb.clear(); + final AmqpValue inputValue = new AmqpValue(input); + encoder.setByteBuffer(bb); + encoder.writeObject(inputValue); + bb.clear(); + decoder.setByteBuffer(bb); + final AmqpValue outputValue = (AmqpValue) decoder.readObject(); + assertEquals("Failed to round trip String correctly: ", input, outputValue.getValue()); + } + } + + // build up some test data with a set of suitable Unicode characters + private Set<String> generateTestData() + { + return new HashSet<String>() + { + private static final long serialVersionUID = 7331717267070233454L; + + { + // non-surrogate pair blocks + addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.BASIC_LATIN)); + addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.LATIN_1_SUPPLEMENT)); + addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.GREEK)); + addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.LETTERLIKE_SYMBOLS)); + // blocks with surrogate pairs + addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.MISCELLANEOUS_SYMBOLS_AND_PICTOGRAPHS)); + addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.MUSICAL_SYMBOLS)); + addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.EMOTICONS)); + addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.PLAYING_CARDS)); + addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_A)); + addAll(getAllStringsFromUnicodeBlock(UnicodeBlock.SUPPLEMENTARY_PRIVATE_USE_AREA_B)); + } + }; + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: 
commits-unsubscribe@qpid.apache.org For additional commands, e-mail: commits-help@qpid.apache.org