From Wail Alkowaileet <[email protected]>: Wail Alkowaileet has submitted this change. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17230 )
Change subject: [ASTERIXDB-2129][RT] Fix normalizing non-ascii strings ...................................................................... [ASTERIXDB-2129][RT] Fix normalizing non-ascii strings - user model changes: no - storage format changes: no - interface changes: no Details: For example, single char strings with a 3-byte char can go out of the string's buffer boundary Change-Id: Ic169d5ff20f9bf5ce2ca36bab4ebd241bbc50dca Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17230 Tested-by: Jenkins <[email protected]> Reviewed-by: Ali Alsuliman <[email protected]> --- M hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java M hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java M hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java 3 files changed, 42 insertions(+), 23 deletions(-) Approvals: Ali Alsuliman: Looks good to me, approved Jenkins: Verified Objections: Anon. E. Moose #1000171: Violations found diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java index 3eb8687..c0475b1 100644 --- a/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java +++ b/hyracks-fullstack/hyracks/hyracks-util/src/main/java/org/apache/hyracks/util/string/UTF8StringUtil.java @@ -244,12 +244,13 @@ * consistent with the comparison result. 
*/ public static int normalize(byte[] bytes, int start) { - int len = getUTFLength(bytes, start); long nk = 0; + int len = getUTFLength(bytes, start); int offset = start + getNumBytesToStoreLength(len); + int end = offset + len; for (int i = 0; i < 2; ++i) { nk <<= 16; - if (i < len) { + if (offset < end) { nk += (charAt(bytes, offset)) & 0xffff; offset += charSize(bytes, offset); } @@ -498,19 +499,15 @@ * are exactly the same as for the <code>readUTF</code> * method of <code>DataInput</code>. * - * @param in - * a data input stream. + * @param in a data input stream. * @return a Unicode string. - * @throws EOFException - * if the input stream reaches the end - * before all the bytes. - * @throws IOException - * the stream has been closed and the contained - * input stream does not support reading after close, or - * another I/O error occurs. - * @throws UTFDataFormatException - * if the bytes do not represent a - * valid modified UTF-8 encoding of a Unicode string. + * @throws EOFException if the input stream reaches the end + * before all the bytes. + * @throws IOException the stream has been closed and the contained + * input stream does not support reading after close, or + * another I/O error occurs. + * @throws UTFDataFormatException if the bytes do not represent a + * valid modified UTF-8 encoding of a Unicode string. * @see java.io.DataInputStream#readUnsignedShort() */ public static String readUTF8(DataInput in) throws IOException { @@ -602,10 +599,8 @@ /** * Write a UTF8 String <code>str</code> into the DataOutput <code>out</code> * - * @param str, - * a Unicode string; - * @param out, - * a Data output stream. + * @param str, a Unicode string; + * @param out, a Data output stream. 
* @throws IOException */ public static void writeUTF8(CharSequence str, DataOutput out) throws IOException { diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java index b114351..eb3a5b6 100644 --- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java +++ b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringSample.java @@ -35,7 +35,8 @@ public static final String STRING_LEN_3 = "xyz"; public static final String STRING_UTF8_3 = "锟斤拷"; - public static final String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà"; // one, two, three, and four bytes + // one, two, three, and four bytes + public static final String STRING_UTF8_MIX = "\uD841\uDF0E\uD841\uDF31锟X斤Y拷Zà"; public static final String STRING_UTF8_MIX_LOWERCASE = "\uD841\uDF0E\uD841\uDF31锟x斤y拷zà"; public static final String STRING_NEEDS_2_JAVA_CHARS_1 = "\uD83D\uDE22\uD83D\uDE22\uD83D\uDC89\uD83D\uDC89"; public static final String STRING_NEEDS_2_JAVA_CHARS_2 = "😢😢💉💉"; @@ -44,6 +45,8 @@ public static final String STRING_EMOJI_FAMILY_OF_2 = "\uD83D\uDC68\u200D\uD83D\uDC66"; public static final String EMOJI_BASKETBALL = "\uD83C\uDFC0"; + public static final String THREE_BYTES_UTF8_CHAR = "ह"; + public static final String STRING_LEN_127 = generateStringRepeatBy(ONE_ASCII_CHAR, 127); public static final String STRING_LEN_128 = generateStringRepeatBy(ONE_ASCII_CHAR, 128); diff --git a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java index c7468d2..4eb1fc3 100644 --- a/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java +++ 
b/hyracks-fullstack/hyracks/hyracks-util/src/test/java/org/apache/hyracks/util/string/UTF8StringUtilTest.java @@ -25,6 +25,7 @@ import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_3; import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX; import static org.apache.hyracks.util.string.UTF8StringSample.STRING_UTF8_MIX_LOWERCASE; +import static org.apache.hyracks.util.string.UTF8StringSample.THREE_BYTES_UTF8_CHAR; import static org.apache.hyracks.util.string.UTF8StringUtil.charAt; import static org.apache.hyracks.util.string.UTF8StringUtil.charSize; import static org.apache.hyracks.util.string.UTF8StringUtil.compareTo; @@ -77,13 +78,14 @@ } @Test - public void testCompareToAndNormolize() throws Exception { + public void testCompareToAndNormalize() throws Exception { testCompare(STRING_UTF8_MIX, STRING_UTF8_MIX, OPTION.STANDARD); testCompare(STRING_UTF8_3, STRING_UTF8_MIX, OPTION.STANDARD); testCompare(STRING_LEN_MEDIUM, STRING_UTF8_MIX, OPTION.STANDARD); + testCompare(THREE_BYTES_UTF8_CHAR, THREE_BYTES_UTF8_CHAR, OPTION.STANDARD); } - public boolean isSameSign(int r1, int r2) { + private static boolean isSameSign(int r1, int r2) { if (r1 > 0) { return r2 > 0; } @@ -99,7 +101,7 @@ LOWERCASE } - public void testCompare(String str1, String str2, OPTION option) throws IOException { + private static void testCompare(String str1, String str2, OPTION option) { byte[] buffer1 = writeStringToBytes(str1); byte[] buffer2 = writeStringToBytes(str2); @@ -117,7 +119,6 @@ assertEquals(str1.compareToIgnoreCase(str2), lowerCaseCompareTo(buffer1, 0, buffer2, 0)); break; } - } @Test -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/17230 To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings Gerrit-Project: asterixdb Gerrit-Branch: master Gerrit-Change-Id: Ic169d5ff20f9bf5ce2ca36bab4ebd241bbc50dca Gerrit-Change-Number: 17230 Gerrit-PatchSet: 5 Gerrit-Owner: Wail Alkowaileet <[email 
protected]> Gerrit-Reviewer: Ali Alsuliman <[email protected]> Gerrit-Reviewer: Anon. E. Moose #1000171 Gerrit-Reviewer: Glenn Galvizo <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Wail Alkowaileet <[email protected]> Gerrit-MessageType: merged
