>From Michael Blow <[email protected]>:
Michael Blow has submitted this change. (
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17991 )
Change subject: Merge branch 'gerrit/neo' into 'gerrit/trinity'
......................................................................
Merge branch 'gerrit/neo' into 'gerrit/trinity'
Change-Id: Ia94fc0878d6468495233cb06268132fdee71b7f1
---
M
hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
M
hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
D
hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
D
hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
5 files changed, 34 insertions(+), 920 deletions(-)
Approvals:
Michael Blow: Looks good to me, approved; Verified
diff --git
a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 4fc503d..a50cc31 100644
---
a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++
b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -248,12 +248,13 @@
* consistent with the comparison result.
*/
public static int normalize(byte[] bytes, int start) {
- int len = getUTFLength(bytes, start);
long nk = 0;
+ int len = getUTFLength(bytes, start);
int offset = start + getNumBytesToStoreLength(len);
+ int end = offset + len;
for (int i = 0; i < 2; ++i) {
nk <<= 16;
- if (i < len) {
+ if (offset < end) {
nk += (charAt(bytes, offset)) & 0xffff;
offset += charSize(bytes, offset);
}
@@ -502,19 +503,15 @@
* are exactly the same as for the <code>readUTF</code>
* method of <code>DataInput</code>.
*
- * @param in
- * a data input stream.
+ * @param in a data input stream.
* @return a Unicode string.
- * @throws EOFException
- * if the input stream reaches the end
- * before all the bytes.
- * @throws IOException
- * the stream has been closed and the contained
- * input stream does not support reading after close, or
- * another I/O error occurs.
- * @throws UTFDataFormatException
- * if the bytes do not represent a
- * valid modified UTF-8 encoding of a Unicode string.
+ * @throws EOFException if the input stream reaches the end
+ * before all the bytes.
+ * @throws IOException the stream has been closed and the
contained
+ * input stream does not support reading
after close, or
+ * another I/O error occurs.
+ * @throws UTFDataFormatException if the bytes do not represent a
+ * valid modified UTF-8 encoding of a
Unicode string.
* @see java.io.DataInputStream#readUnsignedShort()
*/
public static String readUTF8(DataInput in) throws IOException {
@@ -606,10 +603,8 @@
/**
* Write a UTF8 String <code>str</code> into the DataOutput
<code>out</code>
*
- * @param str,
- * a Unicode string;
- * @param out,
- * a Data output stream.
+ * @param str, a Unicode string;
+ * @param out, a Data output stream.
* @throws IOException
*/
public static void writeUTF8(CharSequence str, DataOutput out) throws
IOException {
diff --git
a/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
index 6f3782b..37ab002 100644
---
a/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
+++
b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
@@ -25,6 +25,7 @@
import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
import static
org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
+import static
org.apache.hyracks.util.string.UTF8StringSample.THREE_BYTES_UTF8_CHAR;
import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
@@ -79,13 +80,14 @@
}
@Test
- public void testCompareToAndNormolize() throws Exception {
+ public void testCompareToAndNormalize() throws Exception {
testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
+ testCompare(THREE_BYTES_UTF8_CHAR, THREE_BYTES_UTF8_CHAR,
OPTION.STANDARD);
}
- public boolean isSameSign(int r1, int r2) {
+ private static boolean isSameSign(int r1, int r2) {
if (r1 > 0) {
return r2 > 0;
}
@@ -101,7 +103,7 @@
LOWERCASE
}
- public void testCompare(String str1, String str2, OPTION option) throws
IOException {
+ private static void testCompare(String str1, String str2, OPTION option) {
byte[] buffer1 = writeStringToBytes(str1);
byte[] buffer2 = writeStringToBytes(str2);
@@ -119,7 +121,6 @@
assertEquals(str1.compareToIgnoreCase(str2),
lowerCaseCompareTo(buffer1, 0, buffer2, 0));
break;
}
-
}
@Test
diff --git
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
deleted file mode 100644
index 3dbe4ac..0000000
---
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ /dev/null
@@ -1,705 +0,0 @@
-<<<<<<< HEAD (78ebed [NO ISSUE] Use getClass().getName() instead of
getClass().ge)
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- * http://www.apache.org/licenses/LICENSE-2.0
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.hyracks.util.string;
-
-import java.io.ByteArrayOutputStream;
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.io.UTFDataFormatException;
-import java.lang.ref.SoftReference;
-
-import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
-
-/**
- * A helper package to operate the UTF8String in Hyracks.
- * Most of the codes were migrated from asterix-fuzzyjoin and
hyracks-storage-am-invertedindex
- */
-public class UTF8StringUtil {
-
- public static final String LOW_SURROGATE_WITHOUT_HIGH_SURROGATE =
- "Decoding error: got a low surrogate without a leading high
surrogate";
- public static final String HIGH_SURROGATE_WITHOUT_LOW_SURROGATE =
- "Decoding error: got a high surrogate without a following low
surrogate";
-
- private UTF8StringUtil() {
- }
-
- public static char charAt(byte[] b, int s) {
- if (s >= b.length) {
- throw new ArrayIndexOutOfBoundsException(s);
- }
- int c = b[s] & 0xff;
- switch (c >> 4) {
- case 0:
- case 1:
- case 2:
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
- return (char) c;
-
- case 12:
- case 13:
- return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
-
- case 14:
- return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6)
| (b[s + 2] & 0x3F));
-
- default:
- throw new IllegalArgumentException();
- }
- }
-
- public static int charSize(byte[] b, int s) {
- int c = b[s] & 0xff;
- switch (c >> 4) {
- case 0:
- case 1:
- case 2:
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
- return 1;
-
- case 12:
- case 13:
- return 2;
-
- case 14:
- return 3;
-
- default:
- throw new IllegalStateException();
- }
- }
-
- public static int codePointAt(byte[] b, int s) {
- char c1 = charAt(b, s);
-
- if (Character.isLowSurrogate(c1)) {
- // In this case, the index s doesn't point to a correct position
- throw new
IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
- }
-
- if (Character.isHighSurrogate(c1)) {
- // If c1 is the a high surrogate and also the last char in the
byte array (that means the byte array is somehow illegal),
- // then an exception will be thrown because there is no low
surrogate (c2) available in the byte array
- s += charSize(b, s);
- char c2 = charAt(b, s);
- if (Character.isLowSurrogate(c2)) {
- return Character.toCodePoint(c1, c2);
- } else {
- throw new
IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
- }
- }
-
- return c1;
- }
-
- public static int codePointSize(byte[] b, int s) {
- char c1 = charAt(b, s);
- int size1 = charSize(b, s);
-
- if (Character.isLowSurrogate(c1)) {
- throw new
IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
- }
-
- if (Character.isHighSurrogate(c1)) {
- // Similar to the above codePointAt(),
- // if c1 is the a high surrogate and also the last char in the
byte array (that means the byte array is somehow illegal),
- // then an exception will be thrown because there is no low
surrogate available in the byte array
- s += size1;
- int size2 = charSize(b, s);
- return size1 + size2;
- }
-
- return size1;
- }
-
- public static boolean isCharStart(byte[] b, int s) {
- int c = b[s] & 0xff;
- return (c >> 6) != 2;
- }
-
- public static int getModifiedUTF8Len(char c) {
- if (c >= 0x0001 && c <= 0x007F) {
- return 1;
- } else if (c <= 0x07FF) {
- return 2;
- } else {
- return 3;
- }
- }
-
- public static int writeCharAsModifiedUTF8(char c, DataOutput dos) throws
IOException {
- if (c >= 0x0001 && c <= 0x007F) {
- dos.writeByte(c);
- return 1;
- } else if (c <= 0x07FF) {
- dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
- dos.writeByte((byte) (0x80 | (c & 0x3F)));
- return 2;
- } else {
- dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
- dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
- dos.writeByte((byte) (0x80 | (c & 0x3F)));
- return 3;
- }
- }
-
- public static int writeCharAsModifiedUTF8(char c, OutputStream dos) throws
IOException {
- if (c >= 0x0001 && c <= 0x007F) {
- dos.write(c);
- return 1;
- } else if (c <= 0x07FF) {
- dos.write((byte) (0xC0 | ((c >> 6) & 0x3F)));
- dos.write((byte) (0x80 | (c & 0x3F)));
- return 2;
- } else {
- dos.write((byte) (0xE0 | ((c >> 12) & 0x0F)));
- dos.write((byte) (0x80 | ((c >> 6) & 0x3F)));
- dos.write((byte) (0x80 | (c & 0x3F)));
- return 3;
- }
- }
-
- // The result is the number of Java Chars (8 bytes) in the string
- public static int getStringLength(byte[] b, int s) {
- int len = getUTFLength(b, s);
- int pos = s + getNumBytesToStoreLength(len);
- return getStringLength(b, pos, len);
- }
-
- public static int getStringLength(byte[] b, int offs, int len) {
- int pos = offs;
- int end = pos + len;
- int charCount = 0;
- while (pos < end) {
- charCount++;
- pos += charSize(b, pos);
- }
- return charCount;
- }
-
- public static int getNumCodePoint(byte[] b, int s) {
- int len = getUTFLength(b, s);
- int pos = s + getNumBytesToStoreLength(len);
- int end = pos + len;
- int codePointCount = 0;
- while (pos < end) {
- codePointCount++;
- pos += codePointSize(b, pos);
- }
-
- return codePointCount;
- }
-
- public static int getUTFLength(byte[] b, int s) {
- return VarLenIntEncoderDecoder.decode(b, s);
- }
-
- public static int getNumBytesToStoreLength(int strlen) {
- return VarLenIntEncoderDecoder.getBytesRequired(strlen);
- }
-
- public static int codePointToUTF8(int codePoint, char[] tempChars, byte[]
outputUTF8) {
- int len = 0;
- int numChars = Character.toChars(codePoint, tempChars, 0);
- for (int i = 0; i < numChars; i++) {
- len += writeToBytes(outputUTF8, len, tempChars[i]);
- }
-
- return len;
- }
-
- /**
- * Compute the normalized key of the UTF8 string.
- * The normalized key in Hyracks is mainly used to speedup the comparison
between pointable data.
- * In the UTF8StringPTR case, we compute the integer value by using the
first 2 chars.
- * The comparator will first use this integer to get the result ( <,>, or
=), it will check
- * the actual bytes only if the normalized key is equal. Thus this
normalized key must be
- * consistent with the comparison result.
- */
- public static int normalize(byte[] bytes, int start) {
- long nk = 0;
- int len = getUTFLength(bytes, start);
- int offset = start + getNumBytesToStoreLength(len);
- int end = offset + len;
- for (int i = 0; i < 2; ++i) {
- nk <<= 16;
- if (offset < end) {
- nk += (charAt(bytes, offset)) & 0xffff;
- offset += charSize(bytes, offset);
- }
- }
- return (int) (nk >> 1); // make it always positive.
- }
-
- public static int compareTo(byte[] thisBytes, int thisStart, byte[]
thatBytes, int thatStart) {
- return compareTo(thisBytes, thisStart, thatBytes, thatStart, false,
false);
- }
-
- // the start and length of each are the ones calculated by
UTF8StringPointable. caller should provide proper values
- public static int compareTo(byte[] thisBytes, int thisStart, int
thisLength, byte[] thatBytes, int thatStart,
- int thatLength) {
- return compareTo(thisBytes, thisStart, thisLength, thatBytes,
thatStart, thatLength, false, false);
- }
-
- /**
- * This function provides the raw bytes-based comparison for UTF8 strings.
- * Note that the comparison may not deliver the correct ordering for
certain languages that include 2 or 3 bytes characters.
- * But it works for single-byte character languages.
- */
- public static int rawByteCompareTo(byte[] thisBytes, int thisStart, byte[]
thatBytes, int thatStart) {
- return compareTo(thisBytes, thisStart, thatBytes, thatStart, false,
true);
- }
-
- public static int lowerCaseCompareTo(byte[] thisBytes, int thisStart,
byte[] thatBytes, int thatStart) {
- return compareTo(thisBytes, thisStart, thatBytes, thatStart, true,
false);
- }
-
- // Certain type of string does not include lengthByte in the beginning and
- // the length of the given string is given explicitly as a parameter.
(e.g., token in a string)
- public static int lowerCaseCompareTo(byte[] thisBytes, int thisStart, int
thisLength, byte[] thatBytes,
- int thatStart, int thatLength) {
- return compareTo(thisBytes, thisStart, thisLength, thatBytes,
thatStart, thatLength, true, false);
- }
-
- public static int hash(byte[] bytes, int start, int coefficient, int r) {
- return hash(bytes, start, false, false, coefficient, r);
- }
-
- public static int hash(byte[] bytes, int start) {
- return hash(bytes, start, false, false, 31, Integer.MAX_VALUE);
- }
-
- private static int hash(byte[] bytes, int start, boolean useLowerCase,
boolean useRawByte, int coefficient, int r) {
- int utflen = getUTFLength(bytes, start);
- int sStart = start + getNumBytesToStoreLength(utflen);
- return hash(bytes, sStart, utflen, useLowerCase, useRawByte,
coefficient, r);
- }
-
- /**
- * This function provides the raw bytes-based hash function for UTF8
strings.
- * Note that the hash values may not deliver the correct ordering for
certain languages that include 2 or 3 bytes characters.
- * But it works for single-byte character languages.
- */
- public static int rawBytehash(byte[] bytes, int start) {
- return hash(bytes, start, false, true, 31, Integer.MAX_VALUE);
- }
-
- public static int lowerCaseHash(byte[] bytes, int start) {
- return hash(bytes, start, true, false, 31, Integer.MAX_VALUE);
- }
-
- // Certain type of string does not include lengthByte in the beginning and
- // the length of the given string is given explicitly as a parameter.
- public static int lowerCaseHash(byte[] bytes, int start, int length) {
- return hash(bytes, start, length, true, false, 31, Integer.MAX_VALUE);
- }
-
- public static String toString(byte[] bytes, int start) {
- StringBuilder builder = new StringBuilder();
- return toString(builder, bytes, start).toString();
- }
-
- public static StringBuilder toString(StringBuilder builder, byte[] bytes,
int start) {
- int utfLen = getUTFLength(bytes, start);
- int offset = getNumBytesToStoreLength(utfLen);
- while (utfLen > 0) {
- char c = charAt(bytes, start + offset);
- builder.append(c);
- int cLen = getModifiedUTF8Len(c);
- offset += cLen;
- utfLen -= cLen;
- }
- return builder;
- }
-
- // Different from the above toString() methods, here we assume the byte[]
doesn't contain NumBytesToStoreLength
- // In fact, this is used for string tokenizer: get "hello" and "world"
from the bytes of "hello world"
- public static String getUTF8StringInArray(byte[] b, int start, int len) {
- StringBuilder builder = new StringBuilder();
-
- for (int i = start; i < start + len;) {
- char c = UTF8StringUtil.charAt(b, i);
- builder.append(c);
- i += UTF8StringUtil.charSize(b, i);
- }
-
- return builder.toString();
- }
-
- public static void printUTF8StringWithQuotes(byte[] b, int s, int l,
OutputStream os) throws IOException {
- printUTF8String(b, s, l, os, true);
- }
-
- public static void printUTF8StringNoQuotes(byte[] b, int s, int l,
OutputStream os) throws IOException {
- printUTF8String(b, s, l, os, false);
- }
-
- public static void printUTF8StringWithQuotes(String str, OutputStream os)
throws IOException {
- printUTF8String(str, os, true);
- }
-
- public static void printUTF8StringNoQuotes(String str, OutputStream os)
throws IOException {
- printUTF8String(str, os, false);
- }
-
- public static int encodeUTF8Length(int length, byte[] bytes, int start) {
- return VarLenIntEncoderDecoder.encode(length, bytes, start);
- }
-
- public static int writeUTF8Length(int length, byte[] bytes, DataOutput
out) throws IOException {
- int nbytes = encodeUTF8Length(length, bytes, 0);
- out.write(bytes, 0, nbytes);
- return nbytes;
- }
-
- private static void printUTF8String(byte[] b, int s, int l, OutputStream
os, boolean useQuotes) throws IOException {
- int stringLength = getUTFLength(b, s);
- int position = s + getNumBytesToStoreLength(stringLength);
- int maxPosition = position + stringLength;
- if (useQuotes) {
- os.write('\"');
- }
- while (position < maxPosition) {
- char c = charAt(b, position);
- if (c == '\\' || c == '"') {
- // escape
- os.write('\\');
- }
- int sz = charSize(b, position);
- while (sz > 0) {
- os.write(b[position]);
- position++;
- sz--;
- }
- }
- if (useQuotes) {
- os.write('\"');
- }
- }
-
- private static void printUTF8String(String string, OutputStream os,
boolean useQuotes) throws IOException {
- if (useQuotes) {
- os.write('\"');
- }
- for (int i = 0; i < string.length(); i++) {
- char ch = string.charAt(i);
- writeCharAsModifiedUTF8(ch, os);
- }
- if (useQuotes) {
- os.write('\"');
- }
- }
-
- private static int compareTo(byte[] thisBytes, int thisStart, byte[]
thatBytes, int thatStart, boolean useLowerCase,
- boolean useRawByte) {
- int thisLength = getUTFLength(thisBytes, thisStart);
- int thatLength = getUTFLength(thatBytes, thatStart);
- int thisActualStart = thisStart + getNumBytesToStoreLength(thisLength);
- int thatActualStart = thatStart + getNumBytesToStoreLength(thatLength);
- return compareTo(thisBytes, thisActualStart, thisLength, thatBytes,
thatActualStart, thatLength, useLowerCase,
- useRawByte);
- }
-
- private static int compareTo(byte[] thisBytes, int thisActualStart, int
thisLength, byte[] thatBytes,
- int thatActualStart, int thatLength, boolean useLowerCase, boolean
useRawByte) {
- int c1 = 0;
- int c2 = 0;
-
- while (c1 < thisLength && c2 < thatLength) {
- char ch1, ch2;
- if (useRawByte) {
- ch1 = (char) thisBytes[thisActualStart + c1];
- ch2 = (char) thatBytes[thatActualStart + c2];
- } else {
- ch1 = charAt(thisBytes, thisActualStart + c1);
- ch2 = charAt(thatBytes, thatActualStart + c2);
-
- if (useLowerCase) {
- ch1 = Character.toLowerCase(ch1);
- ch2 = Character.toLowerCase(ch2);
- }
- }
-
- if (ch1 != ch2) {
- return ch1 - ch2;
- }
- c1 += charSize(thisBytes, thisActualStart + c1);
- c2 += charSize(thatBytes, thatActualStart + c2);
- }
- return thisLength - thatLength;
- }
-
- private static int hash(byte[] bytes, int start, int length, boolean
useLowerCase, boolean useRawByte,
- int coefficient, int r) {
- int h = 0;
- int c = 0;
-
- while (c < length) {
- char ch;
- if (useRawByte) {
- ch = (char) bytes[start + c];
- } else {
- ch = charAt(bytes, start + c);
- if (useLowerCase) {
- ch = Character.toLowerCase(ch);
- }
- }
- h = (coefficient * h + ch) % r;
- c += charSize(bytes, start + c);
- }
- return h;
- }
-
- public static byte[] writeStringToBytes(String string) {
- UTF8StringWriter writer = new UTF8StringWriter();
- ByteArrayOutputStream bos = new ByteArrayOutputStream();
- DataOutputStream dos = new DataOutputStream(bos);
- try {
- writer.writeUTF8(string, dos);
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- return bos.toByteArray();
- }
-
- /**
- * Reads from the
- * stream <code>in</code> a representation
- * of a Unicode character string encoded in
- * <a href="DataInput.html#modified-utf-8">modified UTF-8</a> format;
- * this string of characters is then returned as a <code>String</code>.
- * The details of the modified UTF-8 representation
- * are exactly the same as for the <code>readUTF</code>
- * method of <code>DataInput</code>.
- *
- * @param in a data input stream.
- * @return a Unicode string.
- * @throws EOFException if the input stream reaches the end
- * before all the bytes.
- * @throws IOException the stream has been closed and the
contained
- * input stream does not support reading
after close, or
- * another I/O error occurs.
- * @throws UTFDataFormatException if the bytes do not represent a
- * valid modified UTF-8 encoding of a
Unicode string.
- * @see java.io.DataInputStream#readUnsignedShort()
- */
- public static String readUTF8(DataInput in) throws IOException {
- return readUTF8(in, null);
- }
-
- public static String readUTF8(DataInput in, UTF8StringReader reader)
throws IOException {
- int utflen = VarLenIntEncoderDecoder.decode(in);
- byte[] bytearr;
- char[] chararr;
-
- if (reader == null) {
- bytearr = new byte[utflen * 2];
- chararr = new char[utflen * 2];
- } else {
- if (reader.bytearr == null || reader.bytearr.length < utflen) {
- reader.bytearr = new byte[utflen * 2];
- reader.chararr = new char[utflen * 2];
- }
- bytearr = reader.bytearr;
- chararr = reader.chararr;
- }
-
- int c, char2, char3;
- int count = 0;
- int chararr_count = 0;
-
- in.readFully(bytearr, 0, utflen);
-
- while (count < utflen) {
- c = bytearr[count] & 0xff;
- if (c > 127) {
- break;
- }
- count++;
- chararr[chararr_count++] = (char) c;
- }
-
- while (count < utflen) {
- c = bytearr[count] & 0xff;
- switch (c >> 4) {
- case 0:
- case 1:
- case 2:
- case 3:
- case 4:
- case 5:
- case 6:
- case 7:
- /* 0xxxxxxx*/
- count++;
- chararr[chararr_count++] = (char) c;
- break;
- case 12:
- case 13:
- /* 110x xxxx 10xx xxxx*/
- count += 2;
- if (count > utflen) {
- throw new UTFDataFormatException("malformed input:
partial character at end");
- }
- char2 = bytearr[count - 1];
- if ((char2 & 0xC0) != 0x80) {
- throw new UTFDataFormatException("malformed input
around byte " + count);
- }
- chararr[chararr_count++] = (char) (((c & 0x1F) << 6) |
(char2 & 0x3F));
- break;
- case 14:
- /* 1110 xxxx 10xx xxxx 10xx xxxx */
- count += 3;
- if (count > utflen) {
- throw new UTFDataFormatException("malformed input:
partial character at end");
- }
- char2 = bytearr[count - 2];
- char3 = bytearr[count - 1];
- if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
- throw new UTFDataFormatException("malformed input
around byte " + (count - 1));
- }
- chararr[chararr_count++] = (char) (((c & 0x0F) << 12) |
((char2 & 0x3F) << 6) | (char3 & 0x3F));
- break;
- default:
- /* 10xx xxxx, 1111 xxxx */
- throw new UTFDataFormatException("malformed input around
byte " + count);
- }
- }
- // The number of chars produced may be less than utflen
- return new String(chararr, 0, chararr_count);
- }
-
- /**
- * Write a UTF8 String <code>str</code> into the DataOutput
<code>out</code>
- *
- * @param str, a Unicode string;
- * @param out, a Data output stream.
- * @throws IOException
- */
- public static void writeUTF8(CharSequence str, DataOutput out) throws
IOException {
- writeUTF8(str, out, null);
- }
-
- public static void writeUTF8(CharSequence str, DataOutput out,
UTF8StringWriter writer) throws IOException {
- int strlen = str.length();
- int utflen = 0;
- char c;
- int count = 0;
-
- for (int i = 0; i < strlen; i++) {
- // ToDo: we shouldn't use str.charAt(i) to convert raw byte array
to UTF-8 chars
- // one UTF-8 char has at most four bytes, and one Java char we get
via str.charAt(i) has 2 bytes
- // In this case, a UTF-8 char may be consistent of 2 Java chars,
and 1 Java char can be converted into 3 UTF-8 bytes
- // For the emoji, it can be 6 bytes after encoded to UTF-8
- c = str.charAt(i);
- utflen += UTF8StringUtil.getModifiedUTF8Len(c);
- }
-
- byte[] tempBytes = getTempBytes(writer, utflen);
- count += VarLenIntEncoderDecoder.encode(utflen, tempBytes, count);
- int i = 0;
- for (; i < strlen; i++) {
- c = str.charAt(i);
- if (!((c >= 0x0001) && (c <= 0x007F))) {
- break;
- }
- tempBytes[count++] = (byte) c;
- }
-
- for (; i < strlen; i++) {
- c = str.charAt(i);
- count += writeToBytes(tempBytes, count, c);
- }
- out.write(tempBytes, 0, count);
- }
-
- public static void writeUTF8(char[] buffer, int start, int length,
DataOutput out, UTF8StringWriter writer)
- throws IOException {
- int utflen = 0;
- int count = 0;
- char c;
-
- for (int i = 0; i < length; i++) {
- c = buffer[i + start];
- utflen += UTF8StringUtil.getModifiedUTF8Len(c);
- }
-
- byte[] tempBytes = getTempBytes(writer, utflen);
- count += VarLenIntEncoderDecoder.encode(utflen, tempBytes, count);
-
- int i = 0;
- for (; i < length; i++) {
- c = buffer[i + start];
- if (!((c >= 0x0001) && (c <= 0x007F))) {
- break;
- }
- tempBytes[count++] = (byte) c;
- }
-
- for (; i < length; i++) {
- c = buffer[i + start];
- count += writeToBytes(tempBytes, count, c);
- }
- out.write(tempBytes, 0, count);
- }
-
- private static int writeToBytes(byte[] tempBytes, int count, char c) {
- int orig = count;
- if ((c >= 0x0001) && (c <= 0x007F)) {
- tempBytes[count++] = (byte) c;
- } else if (c > 0x07FF) {
- tempBytes[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
- tempBytes[count++] = (byte) (0x80 | ((c >> 6) & 0x3F));
- tempBytes[count++] = (byte) (0x80 | (c & 0x3F));
- } else {
- tempBytes[count++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
- tempBytes[count++] = (byte) (0x80 | (c & 0x3F));
- }
- return count - orig;
- }
-
- private static byte[] getTempBytes(UTF8StringWriter writer, int utflen) {
- byte[] tempBytes;
- if (writer == null) {
- tempBytes = new byte[utflen + 5];
- } else {
- byte[] writerTempBytes = writer.tempBytesRef != null ?
writer.tempBytesRef.get() : null;
- if (writerTempBytes == null || writerTempBytes.length < utflen +
5) {
- writerTempBytes = new byte[utflen + 5];
- writer.tempBytesRef = new SoftReference<>(writerTempBytes);
- }
- tempBytes = writerTempBytes;
- }
- return tempBytes;
- }
-}
-=======
->>>>>>> BRANCH (b4a7d8 [NO ISSUE]: Move StringUtils to hyracks-api module)
diff --git
a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
deleted file mode 100644
index 2ad0b62..0000000
---
a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
+++ /dev/null
@@ -1,193 +0,0 @@
-<<<<<<< HEAD (78ebed [NO ISSUE] Use getClass().getName() instead of
getClass().ge)
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.hyracks.util.string;
-
-import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_127;
-import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_128;
-import static
org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM;
-import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
-import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
-import static
org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
-import static
org.apache.hyracks.util.string.UTF8StringSample.THREE_BYTES_UTF8_CHAR;
-import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
-import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
-import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
-import static org.apache.hyracks.util.string.UTF8StringUtil.getModifiedUTF8Len;
-import static
org.apache.hyracks.util.string.UTF8StringUtil.getNumBytesToStoreLength;
-import static org.apache.hyracks.util.string.UTF8StringUtil.getNumCodePoint;
-import static org.apache.hyracks.util.string.UTF8StringUtil.getStringLength;
-import static
org.apache.hyracks.util.string.UTF8StringUtil.getUTF8StringInArray;
-import static org.apache.hyracks.util.string.UTF8StringUtil.getUTFLength;
-import static org.apache.hyracks.util.string.UTF8StringUtil.hash;
-import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseCompareTo;
-import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseHash;
-import static org.apache.hyracks.util.string.UTF8StringUtil.normalize;
-import static org.apache.hyracks.util.string.UTF8StringUtil.rawByteCompareTo;
-import static org.apache.hyracks.util.string.UTF8StringUtil.writeStringToBytes;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.util.List;
-
-import org.junit.Test;
-
-public class UTF8StringUtilTest {
-
- @Test
- public void testCharAtCharSizeGetLen() throws Exception {
- char[] utf8Mix = STRING_UTF8_MIX.toCharArray();
- byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
- int pos = getNumBytesToStoreLength(getUTFLength(buffer, 0));
- for (char c : utf8Mix) {
- assertEquals(c, charAt(buffer, pos));
- assertEquals(getModifiedUTF8Len(c), charSize(buffer, pos));
- pos += charSize(buffer, pos);
- }
- }
-
- @Test
- public void testGetStringLength() throws Exception {
- byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
- assertEquals(STRING_UTF8_MIX.length(), getStringLength(buffer, 0));
- }
-
- @Test
- public void testChinese() {
- byte[] bufferDe = writeStringToBytes("的");
- byte[] bufferLi = writeStringToBytes("离");
- int ret = compareTo(bufferDe, 0, bufferLi, 0);
- assertTrue(ret != 0);
- }
-
- @Test
- public void testCompareToAndNormalize() throws Exception {
- testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
- testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
- testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
- testCompare(THREE_BYTES_UTF8_CHAR, THREE_BYTES_UTF8_CHAR,
OPTION.STANDARD);
- }
-
- private static boolean isSameSign(int r1, int r2) {
- if (r1 > 0) {
- return r2 > 0;
- }
- if (r1 < 0) {
- return r2 < 0;
- }
- return r2 == 0;
- }
-
- enum OPTION {
- STANDARD,
- RAW_BYTE,
- LOWERCASE
- }
-
- private static void testCompare(String str1, String str2, OPTION option) {
- byte[] buffer1 = writeStringToBytes(str1);
- byte[] buffer2 = writeStringToBytes(str2);
-
- switch (option) {
- case STANDARD:
- assertEquals(str1.compareTo(str2), compareTo(buffer1, 0,
buffer2, 0));
- int n1 = normalize(buffer1, 0);
- int n2 = normalize(buffer2, 0);
- assertTrue(isSameSign(str1.compareTo(str2), n1 - n2));
- break;
- case RAW_BYTE:
- assertEquals(str1.compareTo(str2), rawByteCompareTo(buffer1,
0, buffer2, 0));
- break;
- case LOWERCASE:
- assertEquals(str1.compareToIgnoreCase(str2),
lowerCaseCompareTo(buffer1, 0, buffer2, 0));
- break;
- }
- }
-
- @Test
- public void testRawByteCompareTo() throws Exception {
- testCompare(STRING_LEN_MEDIUM, STRING_LEN_MEDIUM, OPTION.RAW_BYTE);
- testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.RAW_BYTE);
- }
-
- @Test
- public void testLowerCaseCompareTo() throws Exception {
- testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.LOWERCASE);
- testCompare(STRING_LEN_127, STRING_UTF8_MIX, OPTION.LOWERCASE);
- testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX_LOWERCASE,
OPTION.LOWERCASE);
- testCompare(STRING_UTF8_MIX_LOWERCASE, STRING_UTF8_MIX,
OPTION.LOWERCASE);
- }
-
- @Test
- public void testToString() throws Exception {
-
- StringBuilder sb = new StringBuilder();
- byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
- assertEquals(STRING_UTF8_MIX, UTF8StringUtil.toString(sb, buffer,
0).toString());
- }
-
- @Test
- public void testHash() throws IOException {
- byte[] buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
- int lowerHash = hash(buffer, 0);
-
- buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
- int upperHash = lowerCaseHash(buffer, 0);
- assertEquals(lowerHash, upperHash);
-
- int familyOne = hash(buffer, 0, 7, 297);
- int familyTwo = hash(buffer, 0, 8, 297);
- assertTrue(familyOne != familyTwo);
- }
-
- @Test
- public void testGetUTF8StringInArray() {
- String str = null;
- byte[] bytes = null;
- List<String> answer = null;
-
- str = "database group at university of California, Irvine 23333";
- bytes = writeStringToBytes(str);
- // First byte in bytes is for the number of bytes of the entire string,
- // and it should be skipped in getUTF8StringInArray
- assertEquals("database", getUTF8StringInArray(bytes, 1, 8));
- assertEquals("at", getUTF8StringInArray(bytes, 16, 2));
- // test upper case
- assertEquals("California", getUTF8StringInArray(bytes, 33, 10));
- // test non-english char
- assertEquals(",", getUTF8StringInArray(bytes, 43, 1));
- assertEquals("Irvine", getUTF8StringInArray(bytes, 45, 6));
- // test number
- assertEquals("23333", getUTF8StringInArray(bytes, 52, 5));
- }
-
- @Test
- public void testGetNumCodePoint() {
- String str =
"\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66";
- assertEquals(getNumCodePoint(writeStringToBytes(str), 0), 7);
-
- str =
"\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66\uD83C\uDDE8\uD83C\uDDF3";
- assertEquals(getNumCodePoint(writeStringToBytes(str), 0), 9);
- }
-
-}
-=======
->>>>>>> BRANCH (b4a7d8 [NO ISSUE]: Move StringUtils to hyracks-api module)
--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17991
To unsubscribe, or for help writing mail filters, visit
https://asterix-gerrit.ics.uci.edu/settings
Gerrit-Project: asterixdb
Gerrit-Branch: trinity
Gerrit-Change-Id: Ia94fc0878d6468495233cb06268132fdee71b7f1
Gerrit-Change-Number: 17991
Gerrit-PatchSet: 1
Gerrit-Owner: Michael Blow <[email protected]>
Gerrit-Reviewer: Michael Blow <[email protected]>
Gerrit-CC: Jenkins <[email protected]>
Gerrit-MessageType: merged