Merge branch 'gerrit/neo' into 'gerrit/trinity'

AsterixDB Code Review Fri, 01 Dec 2023 08:25:26 -0800

From Michael Blow <[email protected]>:

Michael Blow has uploaded this change for review. (https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17991)



Change subject: Merge branch 'gerrit/neo' into 'gerrit/trinity'
......................................................................

Merge branch 'gerrit/neo' into 'gerrit/trinity'

Change-Id: Ia94fc0878d6468495233cb06268132fdee71b7f1
---
M 
hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
M 
hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
D 
hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
D 
hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
5 files changed, 34 insertions(+), 920 deletions(-)



  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb refs/changes/91/17991/1

diff --git 
a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
 
b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
index 4fc503d..a50cc31 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-api/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
@@ -248,12 +248,13 @@
      * consistent with the comparison result.
      */
     public static int normalize(byte[] bytes, int start) {
-        int len = getUTFLength(bytes, start);
         long nk = 0;
+        int len = getUTFLength(bytes, start);
         int offset = start + getNumBytesToStoreLength(len);
+        int end = offset + len;
         for (int i = 0; i < 2; ++i) {
             nk <<= 16;
-            if (i < len) {
+            if (offset < end) {
                 nk += (charAt(bytes, offset)) & 0xffff;
                 offset += charSize(bytes, offset);
             }
@@ -502,19 +503,15 @@
      * are exactly the same as for the <code>readUTF</code>
      * method of <code>DataInput</code>.
      *
-     * @param in
-     *            a data input stream.
+     * @param in a data input stream.
      * @return a Unicode string.
-     * @throws EOFException
-     *             if the input stream reaches the end
-     *             before all the bytes.
-     * @throws IOException
-     *             the stream has been closed and the contained
-     *             input stream does not support reading after close, or
-     *             another I/O error occurs.
-     * @throws UTFDataFormatException
-     *             if the bytes do not represent a
-     *             valid modified UTF-8 encoding of a Unicode string.
+     * @throws EOFException           if the input stream reaches the end
+     *                                before all the bytes.
+     * @throws IOException            the stream has been closed and the 
contained
+     *                                input stream does not support reading 
after close, or
+     *                                another I/O error occurs.
+     * @throws UTFDataFormatException if the bytes do not represent a
+     *                                valid modified UTF-8 encoding of a 
Unicode string.
      * @see java.io.DataInputStream#readUnsignedShort()
      */
     public static String readUTF8(DataInput in) throws IOException {
@@ -606,10 +603,8 @@
     /**
      * Write a UTF8 String <code>str</code> into the DataOutput 
<code>out</code>
      *
-     * @param str,
-     *            a Unicode string;
-     * @param out,
-     *            a Data output stream.
+     * @param str, a Unicode string;
+     * @param out, a Data output stream.
      * @throws IOException
      */
     public static void writeUTF8(CharSequence str, DataOutput out) throws 
IOException {
diff --git 
a/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
 
b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
index 6f3782b..37ab002 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-api/src/test/java/org/apache/hyracks/api/string/UTF8StringUtilTest.java
@@ -25,6 +25,7 @@
 import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
 import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
 import static 
org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
+import static 
org.apache.hyracks.util.string.UTF8StringSample.THREE_BYTES_UTF8_CHAR;
 import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
 import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
 import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
@@ -79,13 +80,14 @@
     }

     @Test
-    public void testCompareToAndNormolize() throws Exception {
+    public void testCompareToAndNormalize() throws Exception {
         testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
         testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
         testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
+        testCompare(THREE_BYTES_UTF8_CHAR, THREE_BYTES_UTF8_CHAR, 
OPTION.STANDARD);
     }

-    public boolean isSameSign(int r1, int r2) {
+    private static boolean isSameSign(int r1, int r2) {
         if (r1 > 0) {
             return r2 > 0;
         }
@@ -101,7 +103,7 @@
         LOWERCASE
     }

-    public void testCompare(String str1, String str2, OPTION option) throws 
IOException {
+    private static void testCompare(String str1, String str2, OPTION option) {
         byte[] buffer1 = writeStringToBytes(str1);
         byte[] buffer2 = writeStringToBytes(str2);

@@ -119,7 +121,6 @@
                 assertEquals(str1.compareToIgnoreCase(str2), 
lowerCaseCompareTo(buffer1, 0, buffer2, 0));
                 break;
         }
-
     }

     @Test
diff --git 
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
 
b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
deleted file mode 100644
index 3dbe4ac..0000000
--- 
a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java
+++ /dev/null
@@ -1,705 +0,0 @@
-<<<<<<< HEAD   (78ebed [NO ISSUE] Use getClass().getName() instead of 
getClass().ge)
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- * http://www.apache.org/licenses/LICENSE-2.0
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.hyracks.util.string;
-
-import java.io.ByteArrayOutputStream;
-import java.io.DataInput;
-import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.io.UTFDataFormatException;
-import java.lang.ref.SoftReference;
-
-import org.apache.hyracks.util.encoding.VarLenIntEncoderDecoder;
-
-/**
- * A helper package to operate the UTF8String in Hyracks.
- * Most of the codes were migrated from asterix-fuzzyjoin and 
hyracks-storage-am-invertedindex
- */
-public class UTF8StringUtil {
-
-    public static final String LOW_SURROGATE_WITHOUT_HIGH_SURROGATE =
-            "Decoding error: got a low surrogate without a leading high 
surrogate";
-    public static final String HIGH_SURROGATE_WITHOUT_LOW_SURROGATE =
-            "Decoding error: got a high surrogate without a following low 
surrogate";
-
-    private UTF8StringUtil() {
-    }
-
-    public static char charAt(byte[] b, int s) {
-        if (s >= b.length) {
-            throw new ArrayIndexOutOfBoundsException(s);
-        }
-        int c = b[s] & 0xff;
-        switch (c >> 4) {
-            case 0:
-            case 1:
-            case 2:
-            case 3:
-            case 4:
-            case 5:
-            case 6:
-            case 7:
-                return (char) c;
-
-            case 12:
-            case 13:
-                return (char) (((c & 0x1F) << 6) | ((b[s + 1]) & 0x3F));
-
-            case 14:
-                return (char) (((c & 0x0F) << 12) | (((b[s + 1]) & 0x3F) << 6) 
| (b[s + 2] & 0x3F));
-
-            default:
-                throw new IllegalArgumentException();
-        }
-    }
-
-    public static int charSize(byte[] b, int s) {
-        int c = b[s] & 0xff;
-        switch (c >> 4) {
-            case 0:
-            case 1:
-            case 2:
-            case 3:
-            case 4:
-            case 5:
-            case 6:
-            case 7:
-                return 1;
-
-            case 12:
-            case 13:
-                return 2;
-
-            case 14:
-                return 3;
-
-            default:
-                throw new IllegalStateException();
-        }
-    }
-
-    public static int codePointAt(byte[] b, int s) {
-        char c1 = charAt(b, s);
-
-        if (Character.isLowSurrogate(c1)) {
-            // In this case, the index s doesn't point to a correct position
-            throw new 
IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
-        }
-
-        if (Character.isHighSurrogate(c1)) {
-            // If c1 is the a high surrogate and also the last char in the 
byte array (that means the byte array is somehow illegal),
-            // then an exception will be thrown because there is no low 
surrogate (c2) available in the byte array
-            s += charSize(b, s);
-            char c2 = charAt(b, s);
-            if (Character.isLowSurrogate(c2)) {
-                return Character.toCodePoint(c1, c2);
-            } else {
-                throw new 
IllegalArgumentException(HIGH_SURROGATE_WITHOUT_LOW_SURROGATE);
-            }
-        }
-
-        return c1;
-    }
-
-    public static int codePointSize(byte[] b, int s) {
-        char c1 = charAt(b, s);
-        int size1 = charSize(b, s);
-
-        if (Character.isLowSurrogate(c1)) {
-            throw new 
IllegalArgumentException(LOW_SURROGATE_WITHOUT_HIGH_SURROGATE);
-        }
-
-        if (Character.isHighSurrogate(c1)) {
-            // Similar to the above codePointAt(),
-            // if c1 is the a high surrogate and also the last char in the 
byte array (that means the byte array is somehow illegal),
-            // then an exception will be thrown because there is no low 
surrogate available in the byte array
-            s += size1;
-            int size2 = charSize(b, s);
-            return size1 + size2;
-        }
-
-        return size1;
-    }
-
-    public static boolean isCharStart(byte[] b, int s) {
-        int c = b[s] & 0xff;
-        return (c >> 6) != 2;
-    }
-
-    public static int getModifiedUTF8Len(char c) {
-        if (c >= 0x0001 && c <= 0x007F) {
-            return 1;
-        } else if (c <= 0x07FF) {
-            return 2;
-        } else {
-            return 3;
-        }
-    }
-
-    public static int writeCharAsModifiedUTF8(char c, DataOutput dos) throws 
IOException {
-        if (c >= 0x0001 && c <= 0x007F) {
-            dos.writeByte(c);
-            return 1;
-        } else if (c <= 0x07FF) {
-            dos.writeByte((byte) (0xC0 | ((c >> 6) & 0x3F)));
-            dos.writeByte((byte) (0x80 | (c & 0x3F)));
-            return 2;
-        } else {
-            dos.writeByte((byte) (0xE0 | ((c >> 12) & 0x0F)));
-            dos.writeByte((byte) (0x80 | ((c >> 6) & 0x3F)));
-            dos.writeByte((byte) (0x80 | (c & 0x3F)));
-            return 3;
-        }
-    }
-
-    public static int writeCharAsModifiedUTF8(char c, OutputStream dos) throws 
IOException {
-        if (c >= 0x0001 && c <= 0x007F) {
-            dos.write(c);
-            return 1;
-        } else if (c <= 0x07FF) {
-            dos.write((byte) (0xC0 | ((c >> 6) & 0x3F)));
-            dos.write((byte) (0x80 | (c & 0x3F)));
-            return 2;
-        } else {
-            dos.write((byte) (0xE0 | ((c >> 12) & 0x0F)));
-            dos.write((byte) (0x80 | ((c >> 6) & 0x3F)));
-            dos.write((byte) (0x80 | (c & 0x3F)));
-            return 3;
-        }
-    }
-
-    // The result is the number of Java Chars (8 bytes) in the string
-    public static int getStringLength(byte[] b, int s) {
-        int len = getUTFLength(b, s);
-        int pos = s + getNumBytesToStoreLength(len);
-        return getStringLength(b, pos, len);
-    }
-
-    public static int getStringLength(byte[] b, int offs, int len) {
-        int pos = offs;
-        int end = pos + len;
-        int charCount = 0;
-        while (pos < end) {
-            charCount++;
-            pos += charSize(b, pos);
-        }
-        return charCount;
-    }
-
-    public static int getNumCodePoint(byte[] b, int s) {
-        int len = getUTFLength(b, s);
-        int pos = s + getNumBytesToStoreLength(len);
-        int end = pos + len;
-        int codePointCount = 0;
-        while (pos < end) {
-            codePointCount++;
-            pos += codePointSize(b, pos);
-        }
-
-        return codePointCount;
-    }
-
-    public static int getUTFLength(byte[] b, int s) {
-        return VarLenIntEncoderDecoder.decode(b, s);
-    }
-
-    public static int getNumBytesToStoreLength(int strlen) {
-        return VarLenIntEncoderDecoder.getBytesRequired(strlen);
-    }
-
-    public static int codePointToUTF8(int codePoint, char[] tempChars, byte[] 
outputUTF8) {
-        int len = 0;
-        int numChars = Character.toChars(codePoint, tempChars, 0);
-        for (int i = 0; i < numChars; i++) {
-            len += writeToBytes(outputUTF8, len, tempChars[i]);
-        }
-
-        return len;
-    }
-
-    /**
-     * Compute the normalized key of the UTF8 string.
-     * The normalized key in Hyracks is mainly used to speedup the comparison 
between pointable data.
-     * In the UTF8StringPTR case, we compute the integer value by using the 
first 2 chars.
-     * The comparator will first use this integer to get the result ( <,>, or 
=), it will check
-     * the actual bytes only if the normalized key is equal. Thus this 
normalized key must be
-     * consistent with the comparison result.
-     */
-    public static int normalize(byte[] bytes, int start) {
-        long nk = 0;
-        int len = getUTFLength(bytes, start);
-        int offset = start + getNumBytesToStoreLength(len);
-        int end = offset + len;
-        for (int i = 0; i < 2; ++i) {
-            nk <<= 16;
-            if (offset < end) {
-                nk += (charAt(bytes, offset)) & 0xffff;
-                offset += charSize(bytes, offset);
-            }
-        }
-        return (int) (nk >> 1); // make it always positive.
-    }
-
-    public static int compareTo(byte[] thisBytes, int thisStart, byte[] 
thatBytes, int thatStart) {
-        return compareTo(thisBytes, thisStart, thatBytes, thatStart, false, 
false);
-    }
-
-    // the start and length of each are the ones calculated by 
UTF8StringPointable. caller should provide proper values
-    public static int compareTo(byte[] thisBytes, int thisStart, int 
thisLength, byte[] thatBytes, int thatStart,
-            int thatLength) {
-        return compareTo(thisBytes, thisStart, thisLength, thatBytes, 
thatStart, thatLength, false, false);
-    }
-
-    /**
-     * This function provides the raw bytes-based comparison for UTF8 strings.
-     * Note that the comparison may not deliver the correct ordering for 
certain languages that include 2 or 3 bytes characters.
-     * But it works for single-byte character languages.
-     */
-    public static int rawByteCompareTo(byte[] thisBytes, int thisStart, byte[] 
thatBytes, int thatStart) {
-        return compareTo(thisBytes, thisStart, thatBytes, thatStart, false, 
true);
-    }
-
-    public static int lowerCaseCompareTo(byte[] thisBytes, int thisStart, 
byte[] thatBytes, int thatStart) {
-        return compareTo(thisBytes, thisStart, thatBytes, thatStart, true, 
false);
-    }
-
-    // Certain type of string does not include lengthByte in the beginning and
-    // the length of the given string is given explicitly as a parameter. 
(e.g., token in a string)
-    public static int lowerCaseCompareTo(byte[] thisBytes, int thisStart, int 
thisLength, byte[] thatBytes,
-            int thatStart, int thatLength) {
-        return compareTo(thisBytes, thisStart, thisLength, thatBytes, 
thatStart, thatLength, true, false);
-    }
-
-    public static int hash(byte[] bytes, int start, int coefficient, int r) {
-        return hash(bytes, start, false, false, coefficient, r);
-    }
-
-    public static int hash(byte[] bytes, int start) {
-        return hash(bytes, start, false, false, 31, Integer.MAX_VALUE);
-    }
-
-    private static int hash(byte[] bytes, int start, boolean useLowerCase, 
boolean useRawByte, int coefficient, int r) {
-        int utflen = getUTFLength(bytes, start);
-        int sStart = start + getNumBytesToStoreLength(utflen);
-        return hash(bytes, sStart, utflen, useLowerCase, useRawByte, 
coefficient, r);
-    }
-
-    /**
-     * This function provides the raw bytes-based hash function for UTF8 
strings.
-     * Note that the hash values may not deliver the correct ordering for 
certain languages that include 2 or 3 bytes characters.
-     * But it works for single-byte character languages.
-     */
-    public static int rawBytehash(byte[] bytes, int start) {
-        return hash(bytes, start, false, true, 31, Integer.MAX_VALUE);
-    }
-
-    public static int lowerCaseHash(byte[] bytes, int start) {
-        return hash(bytes, start, true, false, 31, Integer.MAX_VALUE);
-    }
-
-    // Certain type of string does not include lengthByte in the beginning and
-    // the length of the given string is given explicitly as a parameter.
-    public static int lowerCaseHash(byte[] bytes, int start, int length) {
-        return hash(bytes, start, length, true, false, 31, Integer.MAX_VALUE);
-    }
-
-    public static String toString(byte[] bytes, int start) {
-        StringBuilder builder = new StringBuilder();
-        return toString(builder, bytes, start).toString();
-    }
-
-    public static StringBuilder toString(StringBuilder builder, byte[] bytes, 
int start) {
-        int utfLen = getUTFLength(bytes, start);
-        int offset = getNumBytesToStoreLength(utfLen);
-        while (utfLen > 0) {
-            char c = charAt(bytes, start + offset);
-            builder.append(c);
-            int cLen = getModifiedUTF8Len(c);
-            offset += cLen;
-            utfLen -= cLen;
-        }
-        return builder;
-    }
-
-    // Different from the above toString() methods, here we assume the byte[] 
doesn't contain NumBytesToStoreLength
-    // In fact, this is used for string tokenizer: get "hello" and "world" 
from the bytes of "hello world"
-    public static String getUTF8StringInArray(byte[] b, int start, int len) {
-        StringBuilder builder = new StringBuilder();
-
-        for (int i = start; i < start + len;) {
-            char c = UTF8StringUtil.charAt(b, i);
-            builder.append(c);
-            i += UTF8StringUtil.charSize(b, i);
-        }
-
-        return builder.toString();
-    }
-
-    public static void printUTF8StringWithQuotes(byte[] b, int s, int l, 
OutputStream os) throws IOException {
-        printUTF8String(b, s, l, os, true);
-    }
-
-    public static void printUTF8StringNoQuotes(byte[] b, int s, int l, 
OutputStream os) throws IOException {
-        printUTF8String(b, s, l, os, false);
-    }
-
-    public static void printUTF8StringWithQuotes(String str, OutputStream os) 
throws IOException {
-        printUTF8String(str, os, true);
-    }
-
-    public static void printUTF8StringNoQuotes(String str, OutputStream os) 
throws IOException {
-        printUTF8String(str, os, false);
-    }
-
-    public static int encodeUTF8Length(int length, byte[] bytes, int start) {
-        return VarLenIntEncoderDecoder.encode(length, bytes, start);
-    }
-
-    public static int writeUTF8Length(int length, byte[] bytes, DataOutput 
out) throws IOException {
-        int nbytes = encodeUTF8Length(length, bytes, 0);
-        out.write(bytes, 0, nbytes);
-        return nbytes;
-    }
-
-    private static void printUTF8String(byte[] b, int s, int l, OutputStream 
os, boolean useQuotes) throws IOException {
-        int stringLength = getUTFLength(b, s);
-        int position = s + getNumBytesToStoreLength(stringLength);
-        int maxPosition = position + stringLength;
-        if (useQuotes) {
-            os.write('\"');
-        }
-        while (position < maxPosition) {
-            char c = charAt(b, position);
-            if (c == '\\' || c == '"') {
-                // escape
-                os.write('\\');
-            }
-            int sz = charSize(b, position);
-            while (sz > 0) {
-                os.write(b[position]);
-                position++;
-                sz--;
-            }
-        }
-        if (useQuotes) {
-            os.write('\"');
-        }
-    }
-
-    private static void printUTF8String(String string, OutputStream os, 
boolean useQuotes) throws IOException {
-        if (useQuotes) {
-            os.write('\"');
-        }
-        for (int i = 0; i < string.length(); i++) {
-            char ch = string.charAt(i);
-            writeCharAsModifiedUTF8(ch, os);
-        }
-        if (useQuotes) {
-            os.write('\"');
-        }
-    }
-
-    private static int compareTo(byte[] thisBytes, int thisStart, byte[] 
thatBytes, int thatStart, boolean useLowerCase,
-            boolean useRawByte) {
-        int thisLength = getUTFLength(thisBytes, thisStart);
-        int thatLength = getUTFLength(thatBytes, thatStart);
-        int thisActualStart = thisStart + getNumBytesToStoreLength(thisLength);
-        int thatActualStart = thatStart + getNumBytesToStoreLength(thatLength);
-        return compareTo(thisBytes, thisActualStart, thisLength, thatBytes, 
thatActualStart, thatLength, useLowerCase,
-                useRawByte);
-    }
-
-    private static int compareTo(byte[] thisBytes, int thisActualStart, int 
thisLength, byte[] thatBytes,
-            int thatActualStart, int thatLength, boolean useLowerCase, boolean 
useRawByte) {
-        int c1 = 0;
-        int c2 = 0;
-
-        while (c1 < thisLength && c2 < thatLength) {
-            char ch1, ch2;
-            if (useRawByte) {
-                ch1 = (char) thisBytes[thisActualStart + c1];
-                ch2 = (char) thatBytes[thatActualStart + c2];
-            } else {
-                ch1 = charAt(thisBytes, thisActualStart + c1);
-                ch2 = charAt(thatBytes, thatActualStart + c2);
-
-                if (useLowerCase) {
-                    ch1 = Character.toLowerCase(ch1);
-                    ch2 = Character.toLowerCase(ch2);
-                }
-            }
-
-            if (ch1 != ch2) {
-                return ch1 - ch2;
-            }
-            c1 += charSize(thisBytes, thisActualStart + c1);
-            c2 += charSize(thatBytes, thatActualStart + c2);
-        }
-        return thisLength - thatLength;
-    }
-
-    private static int hash(byte[] bytes, int start, int length, boolean 
useLowerCase, boolean useRawByte,
-            int coefficient, int r) {
-        int h = 0;
-        int c = 0;
-
-        while (c < length) {
-            char ch;
-            if (useRawByte) {
-                ch = (char) bytes[start + c];
-            } else {
-                ch = charAt(bytes, start + c);
-                if (useLowerCase) {
-                    ch = Character.toLowerCase(ch);
-                }
-            }
-            h = (coefficient * h + ch) % r;
-            c += charSize(bytes, start + c);
-        }
-        return h;
-    }
-
-    public static byte[] writeStringToBytes(String string) {
-        UTF8StringWriter writer = new UTF8StringWriter();
-        ByteArrayOutputStream bos = new ByteArrayOutputStream();
-        DataOutputStream dos = new DataOutputStream(bos);
-        try {
-            writer.writeUTF8(string, dos);
-        } catch (IOException e) {
-            throw new RuntimeException(e);
-        }
-        return bos.toByteArray();
-    }
-
-    /**
-     * Reads from the
-     * stream <code>in</code> a representation
-     * of a Unicode character string encoded in
-     * <a href="DataInput.html#modified-utf-8">modified UTF-8</a> format;
-     * this string of characters is then returned as a <code>String</code>.
-     * The details of the modified UTF-8 representation
-     * are exactly the same as for the <code>readUTF</code>
-     * method of <code>DataInput</code>.
-     *
-     * @param in a data input stream.
-     * @return a Unicode string.
-     * @throws EOFException           if the input stream reaches the end
-     *                                before all the bytes.
-     * @throws IOException            the stream has been closed and the 
contained
-     *                                input stream does not support reading 
after close, or
-     *                                another I/O error occurs.
-     * @throws UTFDataFormatException if the bytes do not represent a
-     *                                valid modified UTF-8 encoding of a 
Unicode string.
-     * @see java.io.DataInputStream#readUnsignedShort()
-     */
-    public static String readUTF8(DataInput in) throws IOException {
-        return readUTF8(in, null);
-    }
-
-    public static String readUTF8(DataInput in, UTF8StringReader reader) 
throws IOException {
-        int utflen = VarLenIntEncoderDecoder.decode(in);
-        byte[] bytearr;
-        char[] chararr;
-
-        if (reader == null) {
-            bytearr = new byte[utflen * 2];
-            chararr = new char[utflen * 2];
-        } else {
-            if (reader.bytearr == null || reader.bytearr.length < utflen) {
-                reader.bytearr = new byte[utflen * 2];
-                reader.chararr = new char[utflen * 2];
-            }
-            bytearr = reader.bytearr;
-            chararr = reader.chararr;
-        }
-
-        int c, char2, char3;
-        int count = 0;
-        int chararr_count = 0;
-
-        in.readFully(bytearr, 0, utflen);
-
-        while (count < utflen) {
-            c = bytearr[count] & 0xff;
-            if (c > 127) {
-                break;
-            }
-            count++;
-            chararr[chararr_count++] = (char) c;
-        }
-
-        while (count < utflen) {
-            c = bytearr[count] & 0xff;
-            switch (c >> 4) {
-                case 0:
-                case 1:
-                case 2:
-                case 3:
-                case 4:
-                case 5:
-                case 6:
-                case 7:
-                    /* 0xxxxxxx*/
-                    count++;
-                    chararr[chararr_count++] = (char) c;
-                    break;
-                case 12:
-                case 13:
-                    /* 110x xxxx   10xx xxxx*/
-                    count += 2;
-                    if (count > utflen) {
-                        throw new UTFDataFormatException("malformed input: 
partial character at end");
-                    }
-                    char2 = bytearr[count - 1];
-                    if ((char2 & 0xC0) != 0x80) {
-                        throw new UTFDataFormatException("malformed input 
around byte " + count);
-                    }
-                    chararr[chararr_count++] = (char) (((c & 0x1F) << 6) | 
(char2 & 0x3F));
-                    break;
-                case 14:
-                    /* 1110 xxxx  10xx xxxx  10xx xxxx */
-                    count += 3;
-                    if (count > utflen) {
-                        throw new UTFDataFormatException("malformed input: 
partial character at end");
-                    }
-                    char2 = bytearr[count - 2];
-                    char3 = bytearr[count - 1];
-                    if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) {
-                        throw new UTFDataFormatException("malformed input 
around byte " + (count - 1));
-                    }
-                    chararr[chararr_count++] = (char) (((c & 0x0F) << 12) | 
((char2 & 0x3F) << 6) | (char3 & 0x3F));
-                    break;
-                default:
-                    /* 10xx xxxx,  1111 xxxx */
-                    throw new UTFDataFormatException("malformed input around 
byte " + count);
-            }
-        }
-        // The number of chars produced may be less than utflen
-        return new String(chararr, 0, chararr_count);
-    }
-
-    /**
-     * Write a UTF8 String <code>str</code> into the DataOutput 
<code>out</code>
-     *
-     * @param str, a Unicode string;
-     * @param out, a Data output stream.
-     * @throws IOException
-     */
-    public static void writeUTF8(CharSequence str, DataOutput out) throws 
IOException {
-        writeUTF8(str, out, null);
-    }
-
-    public static void writeUTF8(CharSequence str, DataOutput out, 
UTF8StringWriter writer) throws IOException {
-        int strlen = str.length();
-        int utflen = 0;
-        char c;
-        int count = 0;
-
-        for (int i = 0; i < strlen; i++) {
-            // ToDo: we shouldn't use str.charAt(i) to convert raw byte array 
to UTF-8 chars
-            // one UTF-8 char has at most four bytes, and one Java char we get 
via str.charAt(i) has 2 bytes
-            // In this case, a UTF-8 char may be consistent of 2 Java chars, 
and 1 Java char can be converted into 3 UTF-8 bytes
-            // For the emoji, it can be 6 bytes after encoded to UTF-8
-            c = str.charAt(i);
-            utflen += UTF8StringUtil.getModifiedUTF8Len(c);
-        }
-
-        byte[] tempBytes = getTempBytes(writer, utflen);
-        count += VarLenIntEncoderDecoder.encode(utflen, tempBytes, count);
-        int i = 0;
-        for (; i < strlen; i++) {
-            c = str.charAt(i);
-            if (!((c >= 0x0001) && (c <= 0x007F))) {
-                break;
-            }
-            tempBytes[count++] = (byte) c;
-        }
-
-        for (; i < strlen; i++) {
-            c = str.charAt(i);
-            count += writeToBytes(tempBytes, count, c);
-        }
-        out.write(tempBytes, 0, count);
-    }
-
-    public static void writeUTF8(char[] buffer, int start, int length, 
DataOutput out, UTF8StringWriter writer)
-            throws IOException {
-        int utflen = 0;
-        int count = 0;
-        char c;
-
-        for (int i = 0; i < length; i++) {
-            c = buffer[i + start];
-            utflen += UTF8StringUtil.getModifiedUTF8Len(c);
-        }
-
-        byte[] tempBytes = getTempBytes(writer, utflen);
-        count += VarLenIntEncoderDecoder.encode(utflen, tempBytes, count);
-
-        int i = 0;
-        for (; i < length; i++) {
-            c = buffer[i + start];
-            if (!((c >= 0x0001) && (c <= 0x007F))) {
-                break;
-            }
-            tempBytes[count++] = (byte) c;
-        }
-
-        for (; i < length; i++) {
-            c = buffer[i + start];
-            count += writeToBytes(tempBytes, count, c);
-        }
-        out.write(tempBytes, 0, count);
-    }
-
-    private static int writeToBytes(byte[] tempBytes, int count, char c) {
-        int orig = count;
-        if ((c >= 0x0001) && (c <= 0x007F)) {
-            tempBytes[count++] = (byte) c;
-        } else if (c > 0x07FF) {
-            tempBytes[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
-            tempBytes[count++] = (byte) (0x80 | ((c >> 6) & 0x3F));
-            tempBytes[count++] = (byte) (0x80 | (c & 0x3F));
-        } else {
-            tempBytes[count++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
-            tempBytes[count++] = (byte) (0x80 | (c & 0x3F));
-        }
-        return count - orig;
-    }
-
-    private static byte[] getTempBytes(UTF8StringWriter writer, int utflen) {
-        byte[] tempBytes;
-        if (writer == null) {
-            tempBytes = new byte[utflen + 5];
-        } else {
-            byte[] writerTempBytes = writer.tempBytesRef != null ? 
writer.tempBytesRef.get() : null;
-            if (writerTempBytes == null || writerTempBytes.length < utflen + 
5) {
-                writerTempBytes = new byte[utflen + 5];
-                writer.tempBytesRef = new SoftReference<>(writerTempBytes);
-            }
-            tempBytes = writerTempBytes;
-        }
-        return tempBytes;
-    }
-}
-=======
->>>>>>> BRANCH (b4a7d8 [NO ISSUE]: Move StringUtils to hyracks-api module)
diff --git 
a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
 
b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
deleted file mode 100644
index 2ad0b62..0000000
--- 
a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java
+++ /dev/null
@@ -1,193 +0,0 @@
-<<<<<<< HEAD   (78ebed [NO ISSUE] Use getClass().getName() instead of 
getClass().ge)
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.apache.hyracks.util.string;
-
-import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_127;
-import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_128;
-import static org.apache.hyracks.util.string.UTF8StringSample.STRING_LEN_MEDIUM;
-import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3;
-import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX;
-import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE;
-import static org.apache.hyracks.util.string.UTF8StringSample.THREE_BYTES_UTF8_CHAR;
-import static org.apache.hyracks.util.string.UTF8StringUtil.charAt;
-import static org.apache.hyracks.util.string.UTF8StringUtil.charSize;
-import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo;
-import static org.apache.hyracks.util.string.UTF8StringUtil.getModifiedUTF8Len;
-import static org.apache.hyracks.util.string.UTF8StringUtil.getNumBytesToStoreLength;
-import static org.apache.hyracks.util.string.UTF8StringUtil.getNumCodePoint;
-import static org.apache.hyracks.util.string.UTF8StringUtil.getStringLength;
-import static org.apache.hyracks.util.string.UTF8StringUtil.getUTF8StringInArray;
-import static org.apache.hyracks.util.string.UTF8StringUtil.getUTFLength;
-import static org.apache.hyracks.util.string.UTF8StringUtil.hash;
-import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseCompareTo;
-import static org.apache.hyracks.util.string.UTF8StringUtil.lowerCaseHash;
-import static org.apache.hyracks.util.string.UTF8StringUtil.normalize;
-import static org.apache.hyracks.util.string.UTF8StringUtil.rawByteCompareTo;
-import static org.apache.hyracks.util.string.UTF8StringUtil.writeStringToBytes;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-
-import java.io.IOException;
-import java.util.List;
-
-import org.junit.Test;
-
-public class UTF8StringUtilTest {
-
-    @Test
-    public void testCharAtCharSizeGetLen() throws Exception {
-        char[] utf8Mix = STRING_UTF8_MIX.toCharArray();
-        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
-        int pos = getNumBytesToStoreLength(getUTFLength(buffer, 0));
-        for (char c : utf8Mix) {
-            assertEquals(c, charAt(buffer, pos));
-            assertEquals(getModifiedUTF8Len(c), charSize(buffer, pos));
-            pos += charSize(buffer, pos);
-        }
-    }
-
-    @Test
-    public void testGetStringLength() throws Exception {
-        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
-        assertEquals(STRING_UTF8_MIX.length(), getStringLength(buffer, 0));
-    }
-
-    @Test
-    public void testChinese() {
-        byte[] bufferDe = writeStringToBytes("的");
-        byte[] bufferLi = writeStringToBytes("离");
-        int ret = compareTo(bufferDe, 0, bufferLi, 0);
-        assertTrue(ret != 0);
-    }
-
-    @Test
-    public void testCompareToAndNormalize() throws Exception {
-        testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD);
-        testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD);
-        testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD);
-        testCompare(THREE_BYTES_UTF8_CHAR, THREE_BYTES_UTF8_CHAR, OPTION.STANDARD);
-    }
-
-    private static boolean isSameSign(int r1, int r2) {
-        if (r1 > 0) {
-            return r2 > 0;
-        }
-        if (r1 < 0) {
-            return r2 < 0;
-        }
-        return r2 == 0;
-    }
-
-    enum OPTION {
-        STANDARD,
-        RAW_BYTE,
-        LOWERCASE
-    }
-
-    private static void testCompare(String str1, String str2, OPTION option) {
-        byte[] buffer1 = writeStringToBytes(str1);
-        byte[] buffer2 = writeStringToBytes(str2);
-
-        switch (option) {
-            case STANDARD:
-                assertEquals(str1.compareTo(str2), compareTo(buffer1, 0, buffer2, 0));
-                int n1 = normalize(buffer1, 0);
-                int n2 = normalize(buffer2, 0);
-                assertTrue(isSameSign(str1.compareTo(str2), n1 - n2));
-                break;
-            case RAW_BYTE:
-                assertEquals(str1.compareTo(str2), rawByteCompareTo(buffer1, 0, buffer2, 0));
-                break;
-            case LOWERCASE:
-                assertEquals(str1.compareToIgnoreCase(str2), lowerCaseCompareTo(buffer1, 0, buffer2, 0));
-                break;
-        }
-    }
-
-    @Test
-    public void testRawByteCompareTo() throws Exception {
-        testCompare(STRING_LEN_MEDIUM, STRING_LEN_MEDIUM, OPTION.RAW_BYTE);
-        testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.RAW_BYTE);
-    }
-
-    @Test
-    public void testLowerCaseCompareTo() throws Exception {
-        testCompare(STRING_LEN_127, STRING_LEN_128, OPTION.LOWERCASE);
-        testCompare(STRING_LEN_127, STRING_UTF8_MIX, OPTION.LOWERCASE);
-        testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX_LOWERCASE, OPTION.LOWERCASE);
-        testCompare(STRING_UTF8_MIX_LOWERCASE, STRING_UTF8_MIX, OPTION.LOWERCASE);
-    }
-
-    @Test
-    public void testToString() throws Exception {
-
-        StringBuilder sb = new StringBuilder();
-        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX);
-        assertEquals(STRING_UTF8_MIX, UTF8StringUtil.toString(sb, buffer, 0).toString());
-    }
-
-    @Test
-    public void testHash() throws IOException {
-        byte[] buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
-        int lowerHash = hash(buffer, 0);
-
-        buffer = writeStringToBytes(STRING_UTF8_MIX_LOWERCASE);
-        int upperHash = lowerCaseHash(buffer, 0);
-        assertEquals(lowerHash, upperHash);
-
-        int familyOne = hash(buffer, 0, 7, 297);
-        int familyTwo = hash(buffer, 0, 8, 297);
-        assertTrue(familyOne != familyTwo);
-    }
-
-    @Test
-    public void testGetUTF8StringInArray() {
-        String str = null;
-        byte[] bytes = null;
-        List<String> answer = null;
-
-        str = "database group at university of California, Irvine 23333";
-        bytes = writeStringToBytes(str);
-        // First byte in bytes is for the number of bytes of the entire string,
-        // and it should be skipped in getUTF8StringInArray
-        assertEquals("database", getUTF8StringInArray(bytes, 1, 8));
-        assertEquals("at", getUTF8StringInArray(bytes, 16, 2));
-        // test upper case
-        assertEquals("California", getUTF8StringInArray(bytes, 33, 10));
-        // test non-english char
-        assertEquals(",", getUTF8StringInArray(bytes, 43, 1));
-        assertEquals("Irvine", getUTF8StringInArray(bytes, 45, 6));
-        // test number
-        assertEquals("23333", getUTF8StringInArray(bytes, 52, 5));
-    }
-
-    @Test
-    public void testGetNumCodePoint() {
-        String str = "\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66";
-        assertEquals(getNumCodePoint(writeStringToBytes(str), 0), 7);
-
-        str = "\uD83D\uDC69\u200D\uD83D\uDC69\u200D\uD83D\uDC67\u200D\uD83D\uDC66\uD83C\uDDE8\uD83C\uDDF3";
-        assertEquals(getNumCodePoint(writeStringToBytes(str), 0), 9);
-    }
-
-}
-=======
->>>>>>> BRANCH (b4a7d8 [NO ISSUE]: Move StringUtils to hyracks-api module)

--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17991
To unsubscribe, or for help writing mail filters, visit 
https://asterix-gerrit.ics.uci.edu/settings

Gerrit-Project: asterixdb
Gerrit-Branch: trinity
Gerrit-Change-Id: Ia94fc0878d6468495233cb06268132fdee71b7f1
Gerrit-Change-Number: 17991
Gerrit-PatchSet: 1
Gerrit-Owner: Michael Blow <[email protected]>
Gerrit-MessageType: newchange

Change in asterixdb[trinity]: Merge branch 'gerrit/neo' into 'gerrit/trinity'

Reply via email to