>From Ritik Raj <[email protected]>: Ritik Raj has submitted this change. ( https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20984?usp=email )
Change subject: [ASTERIXDB-3715][RT] Upgrade UTF8StringPointable string search to KMP ...................................................................... [ASTERIXDB-3715][RT] Upgrade UTF8StringPointable string search to KMP - user model changes: no - storage format changes: no - interface changes: no Details: Replace the naive O(N*M) string matching algorithm in UTF8StringPointable.findInByteOrCodePoint with the Knuth-Morris-Pratt (KMP) algorithm to achieve O(N+M) complexity. Co-authored-by: Gemini 3.1 Pro <[email protected]> Ext-ref: MB-68178 Change-Id: Ia00fbce6499a5258127c91d3ce62270722b89112 Reviewed-on: https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20984 Tested-by: Jenkins <[email protected]> Reviewed-by: Ritik Raj <[email protected]> Reviewed-by: Ian Maxon <[email protected]> Reviewed-by: Michael Blow <[email protected]> --- M hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java M hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java 2 files changed, 197 insertions(+), 45 deletions(-) Approvals: Ian Maxon: Looks good to me, approved Jenkins: Verified Ritik Raj: Looks good to me, but someone else must approve Michael Blow: Looks good to me, approved diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java index 3744350..4932e2fe 100644 --- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java +++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java @@ -278,65 +278,152 @@ int startMatchPos = startMatch; final int srcUtfLen = src.getUTF8Length(); final int pttnUtfLen = pattern.getUTF8Length(); + if (pttnUtfLen == 0) { + return resultInByte ? startMatchPos : 0; + } + + char[] patternChars = getPatternChars(pattern, ignoreCase); + int[] lps = computeLPS(patternChars); + return kmpMatch(src, patternChars, lps, ignoreCase, startMatchPos, resultInByte); + } + + private static char[] getPatternChars(UTF8StringPointable pattern, boolean ignoreCase) { + int pttnUtfLen = pattern.getUTF8Length(); + int pttnStart = pattern.getMetaDataLength(); + char[] chars = new char[pttnUtfLen]; + int byteIdx = 0; + int charIdx = 0; + while (byteIdx < pttnUtfLen) { + char ch = pattern.charAt(pttnStart + byteIdx); + if (ignoreCase && !Character.isHighSurrogate(ch) && !Character.isLowSurrogate(ch)) { + ch = Character.toLowerCase(ch); + } + chars[charIdx++] = ch; + byteIdx += pattern.charSize(pttnStart + byteIdx); + } + char[] exactChars = new char[charIdx]; + System.arraycopy(chars, 0, exactChars, 0, charIdx); + return exactChars; + } + + private static int[] computeLPS(char[] patternChars) { + int[] lps = new int[patternChars.length]; + int len = 0; + int i = 1; + lps[0] = 0; + while (i < patternChars.length) { + if (patternChars[i] == patternChars[len]) { + len++; + lps[i] = len; + i++; + } else { + if (len != 0) { + len = lps[len - 1]; + } else { + lps[i] = 0; + i++; + } + } + } + return lps; + } + + private static int kmpMatch(UTF8StringPointable src, char[] patternChars, int[] lps, boolean ignoreCase, + int startMatchPos, boolean resultInByte) throws HyracksDataException { + final int srcUtfLen = src.getUTF8Length(); final int srcStart = src.getMetaDataLength(); - final int pttnStart = pattern.getMetaDataLength(); int codePointCount = 0; + int c1 = 0; // index in bytes for src + int j = 0; // index for patternChars - int maxStart = srcUtfLen - pttnUtfLen; - boolean prevHighSurrogate = false; - while (startMatchPos <= maxStart) { - int c1 = startMatchPos; - int c2 = 0; - while (c1 < srcUtfLen && c2 < pttnUtfLen) { - char ch1 = src.charAt(srcStart + c1); - char ch2 = pattern.charAt(pttnStart + c2); + boolean prevHigh = false; - if (ch1 != ch2) { - // Currently, the ignoreCase is only valid for one-surrogate characters - // (e.g. characters whose UTF-16 encoding is 2-byte (1 Java char) instead of 4-byte (2 Java chars). - // We may need to support the two-surrogate characters in the future - // - // Another edge case is that one letter may have different forms of lower cases in different languages - // For example, the letter I may have "i" as the lower case in English but "ı" in Turkish. - // We may need to use methods such as String.toLowerCase(Locale locale) to support other languages in the future - // Reference: https://stackoverflow.com/questions/11063102/using-locales-with-javas-tolowercase-and-touppercase - if (!ignoreCase || Character.toLowerCase(ch1) != Character.toLowerCase(ch2)) { - break; + int[] ringBytes = new int[patternChars.length + 1]; + int[] ringCPs = new int[patternChars.length + 1]; + int ringIdx = 0; + + while (c1 < srcUtfLen) { + if (c1 < startMatchPos) { + char ch = src.charAt(srcStart + c1); + c1 += src.charSize(srcStart + c1); + if (!resultInByte) { + if (Character.isHighSurrogate(ch)) { + prevHigh = true; + } else if (Character.isLowSurrogate(ch)) { + if (prevHigh) { + codePointCount++; + prevHigh = false; + } else { + throw HyracksDataException.create(INVALID_STRING_UNICODE, + LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); + } + } else { + codePointCount++; } } - c1 += src.charSize(srcStart + c1); - c2 += pattern.charSize(pttnStart + c2); + continue; } - if (c2 == pttnUtfLen) { - if (resultInByte) { - return startMatchPos; - } else { - if (prevHighSurrogate) { + char ch1 = src.charAt(srcStart + c1); + char matchCh1 = (ignoreCase && !Character.isHighSurrogate(ch1) && !Character.isLowSurrogate(ch1)) + ? Character.toLowerCase(ch1) : ch1; + + if (j == 0) { + ringBytes[ringIdx % ringBytes.length] = c1; + ringCPs[ringIdx % ringCPs.length] = codePointCount; + } + + if (matchCh1 == patternChars[j]) { + j++; + int size = src.charSize(srcStart + c1); + c1 += size; + + if (!resultInByte) { + if (Character.isHighSurrogate(ch1)) { + prevHigh = true; + } else if (Character.isLowSurrogate(ch1)) { + if (prevHigh) { + codePointCount++; + prevHigh = false; + } else { + throw HyracksDataException.create(INVALID_STRING_UNICODE, + LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); + } + } else { + codePointCount++; + } + } + + if (j == patternChars.length) { + if (!resultInByte && prevHigh) { throw HyracksDataException.create(INVALID_STRING_UNICODE, HIGH_SURROGATE_WITHOUT_LOW_SURROGATE); } - return codePointCount; + return resultInByte ? ringBytes[ringIdx % ringBytes.length] : ringCPs[ringIdx % ringCPs.length]; } - } - - // The result is counted in code point instead of bytes - if (!resultInByte) { - char ch = src.charAt(srcStart + startMatchPos); - if (Character.isHighSurrogate(ch)) { - prevHighSurrogate = true; - } else if (Character.isLowSurrogate(ch)) { - if (prevHighSurrogate) { - codePointCount++; - prevHighSurrogate = false; - } else { - throw HyracksDataException.create(INVALID_STRING_UNICODE, LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); - } + } else { + if (j != 0) { + j = lps[j - 1]; + ringIdx++; } else { - codePointCount++; + int size = src.charSize(srcStart + c1); + c1 += size; + if (!resultInByte) { + if (Character.isHighSurrogate(ch1)) { + prevHigh = true; + } else if (Character.isLowSurrogate(ch1)) { + if (prevHigh) { + codePointCount++; + prevHigh = false; + } else { + throw HyracksDataException.create(INVALID_STRING_UNICODE, + LOW_SURROGATE_WITHOUT_HIGH_SURROGATE); + } + } else { + codePointCount++; + } + } } } - - startMatchPos += src.charSize(srcStart + startMatchPos); } return -1; } diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java index 45a5ba3..fbba46f 100644 --- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java +++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableTest.java @@ -41,6 +41,71 @@ import it.unimi.dsi.fastutil.ints.IntCollection; public class UTF8StringPointableTest { + + @Test + public void testFindWithKMP() throws Exception { + // Standard matching (returns byte pos) + assertTrue(findReturnPos("hello world", "world") >= 0); + assertEquals(0, findReturnPos("hello world", "hello")); + assertTrue(findReturnPos("hello world", "o w") >= 0); + + // Exact same pattern boundary overlaps (KMP jump testing) + assertEquals(0, findReturnPos("abacababc", "abacab")); + assertTrue(findReturnPos("abacababc", "ababc") >= 0); + + // Match not found + assertEquals(-1, findReturnPos("hello world", "xyz")); + + // Multi-byte Unicode Characters (length checking bytes vs codepoints) + // 'é' (U+00E9) is 2 bytes in UTF-8, 1 code point + // '€' (U+20AC) is 3 bytes in UTF-8, 1 code point + // '😀' (U+1F600) is a surrogate pair in Java, encoded as 6 bytes in Modified UTF-8! + + // Testing 2-byte character: expected w byte pos = 5(hello) + 2(é) = 7 + assertEquals(7, findReturnPos("helloéworld", "w")); + assertEquals(6, findReturnCodePointPos("helloéworld", "w")); // 5(hello) + 1(é) = 6 + + // Testing 3-byte character: expected w byte pos = 5(hello) + 3(€) = 8 + assertEquals(8, findReturnPos("hello€world", "w")); + assertEquals(6, findReturnCodePointPos("hello€world", "w")); + + // Testing Surrogate Pair (Modified UTF-8 encodes as 2x 3-byte characters = 6 bytes) + assertEquals(11, findReturnPos("hello😀world", "w")); // 5 + 6 = 11 + assertEquals(6, findReturnCodePointPos("hello😀world", "w")); // 5 + 1 = 6 + + // Find the emoji itself + assertTrue(findReturnPos("hello😀world", "😀") > 0); + assertEquals(0, findReturnCodePointPos("😀😀😀", "😀😀")); + + // Case insensitivity + assertTrue(findReturnPosIgnoreCase("hello WORLD", "world") >= 0); + assertTrue(findReturnPosIgnoreCase("hello WORLD", "WOrLd") >= 0); + assertEquals(-1, findReturnPosIgnoreCase("hello WORLD", "XYZ")); + } + + private int findReturnPos(String srcStr, String patternStr) throws Exception { + return findReturnPosInternal(srcStr, patternStr, false, true); + } + + private int findReturnCodePointPos(String srcStr, String patternStr) throws Exception { + return findReturnPosInternal(srcStr, patternStr, false, false); + } + + private int findReturnPosIgnoreCase(String srcStr, String patternStr) throws Exception { + return findReturnPosInternal(srcStr, patternStr, true, true); + } + + private int findReturnPosInternal(String srcStr, String patternStr, boolean ignoreCase, boolean resultInByte) + throws Exception { + UTF8StringPointable src = generateUTF8Pointable(srcStr); + UTF8StringPointable pattern = generateUTF8Pointable(patternStr); + if (resultInByte) { + return UTF8StringPointable.find(src, pattern, ignoreCase, 0); + } else { + return UTF8StringPointable.findInCodePoint(src, pattern, ignoreCase, 0); + } + } + public static UTF8StringPointable STRING_EMPTY = generateUTF8Pointable(UTF8StringSample.EMPTY_STRING); public static UTF8StringPointable STRING_UTF8_MIX = generateUTF8Pointable(UTF8StringSample.STRING_UTF8_MIX); public static UTF8StringPointable STRING_UTF8_MIX_LOWERCASE = -- To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20984?usp=email To unsubscribe, or for help writing mail filters, visit https://asterix-gerrit.ics.uci.edu/settings?usp=email Gerrit-MessageType: merged Gerrit-Project: asterixdb Gerrit-Branch: lumina Gerrit-Change-Id: Ia00fbce6499a5258127c91d3ce62270722b89112 Gerrit-Change-Number: 20984 Gerrit-PatchSet: 3 Gerrit-Owner: Ritik Raj <[email protected]> Gerrit-Reviewer: Ian Maxon <[email protected]> Gerrit-Reviewer: Jenkins <[email protected]> Gerrit-Reviewer: Michael Blow <[email protected]> Gerrit-Reviewer: Peeyush Gupta <[email protected]> Gerrit-Reviewer: Ritik Raj <[email protected]> Gerrit-CC: Anon. E. Moose #1000171
