Till Westmann has uploaded a new change for review.
https://asterix-gerrit.ics.uci.edu/1115
Change subject: WIP UTF-8 encoding/decoding
......................................................................
WIP UTF-8 encoding/decoding
Change-Id: Ibe6cd240094eaccfe4a954f9b76fd3345d5a0c06
---
M asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
M hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
2 files changed, 90 insertions(+), 70 deletions(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/15/1115/1
diff --git
a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
index 5b5f53f..2906a85 100644
---
a/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
+++
b/asterixdb/asterix-om/src/main/java/org/apache/asterix/dataflow/data/nontagged/printers/PrintTools.java
@@ -21,6 +21,9 @@
import java.io.IOException;
import java.io.OutputStream;
import java.io.PrintStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
import
org.apache.asterix.dataflow.data.nontagged.serde.AInt32SerializerDeserializer;
import
org.apache.asterix.dataflow.data.nontagged.serde.AInt64SerializerDeserializer;
@@ -242,7 +245,8 @@
int position = s + UTF8StringUtil.getNumBytesToStoreLength(utfLength);
// skip 2 bytes containing string size
int maxPosition = position + utfLength;
os.write('"');
- while (position < maxPosition) {
+ os.write(b, position, utfLength);
+ while (false && position < maxPosition) {
char c = UTF8StringUtil.charAt(b, position);
int sz = UTF8StringUtil.charSize(b, position);
switch (c) {
diff --git
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index e867ecc..a6c84da 100644
---
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -25,6 +25,9 @@
import java.io.IOException;
import java.io.OutputStream;
import java.io.UTFDataFormatException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
@@ -450,85 +453,92 @@
public static String readUTF8(DataInput in, UTF8StringReader reader)
throws IOException {
int utflen = VarLenIntEncoderDecoder.decode(in);
- byte[] bytearr;
- char[] chararr;
-
- if (reader == null) {
- bytearr = new byte[utflen * 2];
- chararr = new char[utflen * 2];
+ if (true) {
+ ByteBuffer buf = ByteBuffer.allocate(utflen);
+ in.readFully(buf.array(), buf.position(), buf.limit());
+ CharBuffer cb = Charset.forName("UTF-8").decode(buf);
+ return cb.toString();
} else {
- if (reader.bytearr == null || reader.bytearr.length < utflen) {
- reader.bytearr = new byte[utflen * 2];
- reader.chararr = new char[utflen * 2];
+ byte[] bytearr;
+ char[] chararr;
+
+ if (reader == null) {
+ bytearr = new byte[utflen * 2];
+ chararr = new char[utflen * 2];
+ } else {
+ if (reader.bytearr == null || reader.bytearr.length < utflen) {
+ reader.bytearr = new byte[utflen * 2];
+ reader.chararr = new char[utflen * 2];
+ }
+ bytearr = reader.bytearr;
+ chararr = reader.chararr;
}
- bytearr = reader.bytearr;
- chararr = reader.chararr;
- }
- int c, char2, char3;
- int count = 0;
- int chararr_count = 0;
+ int c, char2, char3;
+ int count = 0;
+ int chararr_count = 0;
- in.readFully(bytearr, 0, utflen);
+ in.readFully(bytearr, 0, utflen);
- while (count < utflen) {
- c = bytearr[count] & 0xff;
- if (c > 127) {
- break;
+ while (count < utflen) {
+ c = bytearr[count] & 0xff;
+ if (c > 127) {
+ break;
+ }
+ count++;
+ chararr[chararr_count++] = (char) c;
}
- count++;
- chararr[chararr_count++] = (char) c;
- }
- while (count < utflen) {
- c = bytearr[count] & 0xff;
- switch (c >> 4) {
- case 0:
- case 1:
- case 2:
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
+ while (count < utflen) {
+ c = bytearr[count] & 0xff;
+ switch (c >> 4) {
+ case 0:
+ case 1:
+ case 2:
+ case 3:
+ case 4:
+ case 5:
+ case 6:
+ case 7:
/* 0xxxxxxx*/
- count++;
- chararr[chararr_count++] = (char) c;
- break;
- case 12:
- case 13:
+ count++;
+ chararr[chararr_count++] = (char) c;
+ break;
+ case 12:
+ case 13:
/* 110x xxxx 10xx xxxx*/
- count += 2;
- if (count > utflen) {
- throw new UTFDataFormatException("malformed input:
partial character at end");
- }
- char2 = bytearr[count - 1];
- if ((char2 & 0xC0) != 0x80) {
- throw new UTFDataFormatException("malformed input
around byte " + count);
- }
- chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
(char2 & 0x3F));
- break;
- case 14:
+ count += 2;
+ if (count > utflen) {
+ throw new UTFDataFormatException("malformed input:
partial character at end");
+ }
+ char2 = bytearr[count - 1];
+ if ((char2 & 0xC0) != 0x80) {
+ throw new UTFDataFormatException("malformed input
around byte " + count);
+ }
+ chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
(char2 & 0x3F));
+ break;
+ case 14:
/* 1110 xxxx 10xx xxxx 10xx xxxx */
- count += 3;
- if (count > utflen) {
- throw new UTFDataFormatException("malformed input:
partial character at end");
- }
- char2 = bytearr[count - 2];
- char3 = bytearr[count - 1];
- if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
- throw new UTFDataFormatException("malformed input
around byte " + (count - 1));
- }
- chararr[chararr_count++] = (char) (((c & 0x0F) << 12) |
((char2 & 0x3F) << 6)
- | ((char3 & 0x3F) << 0));
- break;
- default:
+ count += 3;
+ if (count > utflen) {
+ throw new UTFDataFormatException("malformed input:
partial character at end");
+ }
+ char2 = bytearr[count - 2];
+ char3 = bytearr[count - 1];
+ if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) !=
0x80)) {
+ throw new UTFDataFormatException("malformed input
around byte " + (count - 1));
+ }
+ chararr[chararr_count++] = (char) (((c & 0x0F) << 12)
| ((char2 & 0x3F) << 6) | ((char3 & 0x3F)
+ << 0));
+ break;
+ default:
/* 10xx xxxx, 1111 xxxx */
- throw new UTFDataFormatException("malformed input around
byte " + count);
+ throw new UTFDataFormatException("malformed input
around byte " + count);
+ }
}
+ // The number of chars produced may be less than utflen
+ return new String(chararr, 0, chararr_count);
}
- // The number of chars produced may be less than utflen
- return new String(chararr, 0, chararr_count);
}
/**
@@ -548,7 +558,6 @@
int strlen = str.length();
int utflen = 0;
char c;
- int count = 0;
for (int i = 0; i < strlen; i++) {
c = str.charAt(i);
@@ -556,7 +565,7 @@
}
byte[] tempBytes = getTempBytes(writer, utflen);
- count += VarLenIntEncoderDecoder.encode(utflen, tempBytes, count);
+ int count = VarLenIntEncoderDecoder.encode(utflen, tempBytes, 0);
int i = 0;
for (; i < strlen; i++) {
c = str.charAt(i);
@@ -570,7 +579,14 @@
c = str.charAt(i);
count += writeToBytes(tempBytes, count, c);
}
- out.write(tempBytes, 0, count);
+
+ ByteBuffer buffer = Charset.forName("UTF-8").encode(str.toString());
+ final int len = buffer.limit() - buffer.position();
+ int cnt = VarLenIntEncoderDecoder.encode(len, tempBytes, 0);
+ out.write(tempBytes, 0, cnt);
+ out.write(buffer.array(), buffer.position(), len);
+
+ // out.write(tempBytes, 0, count);
}
static void writeUTF8(char[] buffer, int start, int length, DataOutput
out, UTF8StringWriter writer)
--
To view, visit https://asterix-gerrit.ics.uci.edu/1115
To unsubscribe, visit https://asterix-gerrit.ics.uci.edu/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibe6cd240094eaccfe4a954f9b76fd3345d5a0c06
Gerrit-PatchSet: 1
Gerrit-Project: asterixdb
Gerrit-Branch: master
Gerrit-Owner: Till Westmann <[email protected]>