Till Westmann has uploaded a new change for review. https://asterix-gerrit.ics.uci.edu/1115
Change subject: WIP UTF-8 encoding/decoding ...................................................................... WIP UTF-8 encoding/decoding Change-Id: Ibe6cd240094eaccfe4a954f9b76fd3345d5a0c06 --- M asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java M hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java 2 files changed, 90 insertions(+), 70 deletions(-) git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/15/1115/1 diff --git a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java index 5b5f53f..2906a85 100644 --- a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java +++ b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java @@ -21,6 +21,9 @@ import java.io.IOException; import java.io.OutputStream; import java.io.PrintStream; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; import org.apache.asterix.dataflow.data.nontagged.serde.AInt32SerializerDeserializer; import org.apache.asterix.dataflow.data.nontagged.serde.AInt64SerializerDeserializer; @@ -242,7 +245,8 @@ int position = s + UTF8StringUtil.getNumBytesToStoreLength(utfLength); // skip 2 bytes containing string size int maxPosition = position + utfLength; os.write('"'); - while (position < maxPosition) { + os.write(b, position, utfLength); + while (false && position < maxPosition) { char c = UTF8StringUtil.charAt(b, position); int sz = UTF8StringUtil.charSize(b, position); switch (c) { diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java index e867ecc..a6c84da 100644 --- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java +++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java @@ -25,6 +25,9 @@ import java.io.IOException; import java.io.OutputStream; import java.io.UTFDataFormatException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.Charset; import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder; @@ -450,85 +453,92 @@ public static String readUTF8(DataInput in, UTF8StringReader reader) throws IOException { int utflen = VarLenIntEncoderDecoder.decode(in); - byte[] bytearr; - char[] chararr; - - if (reader == null) { - bytearr = new byte[utflen * 2]; - chararr = new char[utflen * 2]; + if (true) { + ByteBuffer buf = ByteBuffer.allocate(utflen); + in.readFully(buf.array(), buf.position(), buf.limit()); + CharBuffer cb = Charset.forName("UTF-8").decode(buf); + return cb.toString(); } else { - if (reader.bytearr == null || reader.bytearr.length < utflen) { - reader.bytearr = new byte[utflen * 2]; - reader.chararr = new char[utflen * 2]; + byte[] bytearr; + char[] chararr; + + if (reader == null) { + bytearr = new byte[utflen * 2]; + chararr = new char[utflen * 2]; + } else { + if (reader.bytearr == null || reader.bytearr.length < utflen) { + reader.bytearr = new byte[utflen * 2]; + reader.chararr = new char[utflen * 2]; + } + bytearr = reader.bytearr; + chararr = reader.chararr; } - bytearr = reader.bytearr; - chararr = reader.chararr; - } - int c, char2, char3; - int count = 0; - int chararr_count = 0; + int c, char2, char3; + int count = 0; + int chararr_count = 0; - in.readFully(bytearr, 0, utflen); + in.readFully(bytearr, 0, utflen); - while (count < utflen) { - c = bytearr[count] & 0xff; - if (c > 127) { - break; + while (count < utflen) { + c = bytearr[count] & 0xff; + if (c > 127) { + break; + } + count++; + chararr[chararr_count++] = (char) c; } - count++; - chararr[chararr_count++] = (char) c; - } - while (count < utflen) { - c = bytearr[count] & 0xff; - switch (c >> 4) { - case 0: - case 1: - case 2: - case 3: - case 4: - case 5: - case 6: - case 7: + while (count < utflen) { + c = bytearr[count] & 0xff; + switch (c >> 4) { + case 0: + case 1: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: /* 0xxxxxxx*/ - count++; - chararr[chararr_count++] = (char) c; - break; - case 12: - case 13: + count++; + chararr[chararr_count++] = (char) c; + break; + case 12: + case 13: /* 110x xxxx 10xx xxxx*/ - count += 2; - if (count > utflen) { - throw new UTFDataFormatException("malformed input: partial character at end"); - } - char2 = bytearr[count - 1]; - if ((char2 & 0xC0) != 0x80) { - throw new UTFDataFormatException("malformed input around byte " + count); - } - chararr[chararr_count++] = (char) (((c & 0x1F) << 6) | (char2 & 0x3F)); - break; - case 14: + count += 2; + if (count > utflen) { + throw new UTFDataFormatException("malformed input: partial character at end"); + } + char2 = bytearr[count - 1]; + if ((char2 & 0xC0) != 0x80) { + throw new UTFDataFormatException("malformed input around byte " + count); + } + chararr[chararr_count++] = (char) (((c & 0x1F) << 6) | (char2 & 0x3F)); + break; + case 14: /* 1110 xxxx 10xx xxxx 10xx xxxx */ - count += 3; - if (count > utflen) { - throw new UTFDataFormatException("malformed input: partial character at end"); - } - char2 = bytearr[count - 2]; - char3 = bytearr[count - 1]; - if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) { - throw new UTFDataFormatException("malformed input around byte " + (count - 1)); - } - chararr[chararr_count++] = (char) (((c & 0x0F) << 12) | ((char2 & 0x3F) << 6) - | ((char3 & 0x3F) << 0)); - break; - default: + count += 3; + if (count > utflen) { + throw new UTFDataFormatException("malformed input: partial character at end"); + } + char2 = bytearr[count - 2]; + char3 = bytearr[count - 1]; + if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) { + throw new UTFDataFormatException("malformed input around byte " + (count - 1)); + } + chararr[chararr_count++] = (char) (((c & 0x0F) << 12) | ((char2 & 0x3F) << 6) | ((char3 & 0x3F) + << 0)); + break; + default: /* 10xx xxxx, 1111 xxxx */ - throw new UTFDataFormatException("malformed input around byte " + count); + throw new UTFDataFormatException("malformed input around byte " + count); + } } + // The number of chars produced may be less than utflen + return new String(chararr, 0, chararr_count); } - // The number of chars produced may be less than utflen - return new String(chararr, 0, chararr_count); } /** @@ -548,7 +558,6 @@ int strlen = str.length(); int utflen = 0; char c; - int count = 0; for (int i = 0; i < strlen; i++) { c = str.charAt(i); @@ -556,7 +565,7 @@ } byte[] tempBytes = getTempBytes(writer, utflen); - count += VarLenIntEncoderDecoder.encode(utflen, tempBytes, count); + int count = VarLenIntEncoderDecoder.encode(utflen, tempBytes, 0); int i = 0; for (; i < strlen; i++) { c = str.charAt(i); @@ -570,7 +579,14 @@ c = str.charAt(i); count += writeToBytes(tempBytes, count, c); } - out.write(tempBytes, 0, count); + + ByteBuffer buffer = Charset.forName("UTF-8").encode(str.toString()); + final int len = buffer.limit() - buffer.position(); + int cnt = VarLenIntEncoderDecoder.encode(len, tempBytes, 0); + out.write(tempBytes, 0, cnt); + out.write(buffer.array(), buffer.position(), len); + + // out.write(tempBytes, 0, count); } static void writeUTF8(char[] buffer, int start, int length, DataOutput out, UTF8StringWriter writer) -- To view, visit https://asterix-gerrit.ics.uci.edu/1115 To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ibe6cd240094eaccfe4a954f9b76fd3345d5a0c06 Gerrit-PatchSet: 1 Gerrit-Project: asterixdb Gerrit-Branch: master Gerrit-Owner: Till Westmann <ti...@apache.org>