mkaravel commented on code in PR #46761:
URL: https://github.com/apache/spark/pull/46761#discussion_r1669268493
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -424,27 +430,56 @@ private static UTF8String toLowerCaseSlow(final UTF8String target, final int col
    * @param codePoint The code point to convert to lowercase.
    * @param sb The StringBuilder to append the lowercase character to.
    */
-  private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) {
-    if (codePoint == 0x0130) {
+  private static void appendLowercaseCodePoint(final int codePoint, final StringBuilder sb) {
+    int lowercaseCodePoint = getLowercaseCodePoint(codePoint);
+    if (lowercaseCodePoint == CODE_POINT_COMBINED_LOWERCASE_I_DOT) {
       // Latin capital letter I with dot above is mapped to 2 lowercase characters.
       sb.appendCodePoint(0x0069);
       sb.appendCodePoint(0x0307);
+    } else {
+      // All other characters should follow context-unaware ICU single-code point case mapping.
+      sb.appendCodePoint(lowercaseCodePoint);
+    }
+  }
+
+  /**
+   * `CODE_POINT_COMBINED_LOWERCASE_I_DOT` is an internal representation of the combined lowercase
+   * code point for ASCII lowercase letter i with an additional combining dot character (U+0307).
+   * This integer value is not a valid code point itself, but rather an artificial code point
+   * marker used to represent the two lowercase characters that are the result of converting the
+   * uppercase Turkish dotted letter I with a combining dot character (U+0130) to lowercase.
+   */
+  private static final int CODE_POINT_LOWERCASE_I = 0x69;
+  private static final int CODE_POINT_COMBINING_DOT = 0x307;
+  private static final int CODE_POINT_COMBINED_LOWERCASE_I_DOT =
+    CODE_POINT_LOWERCASE_I << 16 | CODE_POINT_COMBINING_DOT;
+
+  /**
+   * Returns the lowercase version of the provided code point, with special handling for
+   * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase) and
+   * context-insensitive case mappings (i.e. characters that map to different characters based on
+   * the position in the string relative to other characters in lowercase).
+   */
+  private static int getLowercaseCodePoint(final int codePoint) {
+    if (codePoint == 0x0130) {
+      // Latin capital letter I with dot above is mapped to 2 lowercase characters.
+      return CODE_POINT_COMBINED_LOWERCASE_I_DOT;
     }
     else if (codePoint == 0x03C2) {
       // Greek final and non-final capital letter sigma should be mapped the same.
Review Comment:
(I know this is not part of this PR.) I think it would make sense to extend the comment here to explain that we achieve this by mapping the Greek final sigma to the Greek small sigma. The Greek capital sigma will be mapped to the Greek small sigma by the code in the `else` part of this if statement.
##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -1378,19 +1381,138 @@ public void testStringTrim() throws SparkException {
assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
}
- // TODO: Test more collation-aware string expressions.
-
- /**
- * Collation-aware regexp expressions.
- */
-
- // TODO: Test more collation-aware regexp expressions.
+ private void assertStringTranslate(
+ String inputString,
+ String matchingString,
+ String replaceString,
+ String collationName,
+ String expectedResultString) throws SparkException {
+ int collationId = CollationFactory.collationNameToId(collationName);
+ Map<String, String> dict = buildDict(matchingString, replaceString);
+ UTF8String source = UTF8String.fromString(inputString);
+ UTF8String result = CollationSupport.StringTranslate.exec(source, dict,
collationId);
+ assertEquals(expectedResultString, result.toString());
+ }
- /**
- * Other collation-aware expressions.
- */
+ @Test
+ public void testStringTranslate() throws SparkException {
+ // Basic tests - UTF8_BINARY.
+ assertStringTranslate("Translate", "Rnlt", "12", "UTF8_BINARY", "Tra2sae");
+ assertStringTranslate("Translate", "Rn", "1234", "UTF8_BINARY",
"Tra2slate");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_BINARY",
"Tra2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_BINARY",
"TRaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_BINARY",
"TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_BINARY",
"TXaxsXaxex");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_BINARY",
"test大千世AX大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_BINARY",
"大千世界test大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_BINARY",
"Oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_BINARY",
"大千世界大千世界oesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_BINARY",
"世世世界世世世界tesT");
+ // Basic tests - UTF8_LCASE.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_LCASE",
"xXaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_LCASE",
"xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_LCASE",
"xXaxsXaxeX");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_LCASE",
"test大千世AB大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_LCASE",
"大千世界abca大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_LCASE",
"oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_LCASE",
"大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_LCASE",
"世世世界世世世界tesT");
+ // Basic tests - UNICODE.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE", "Tra2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE", "TRaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE",
"TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE",
"TXaxsXaxex");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE",
"test大千世AX大千世A");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE",
"Oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE",
"大千世界大千世界oesO");
+ // Basic tests - UNICODE_CI.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE_CI",
"41a2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE_CI",
"xXaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE_CI",
"xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE_CI",
"xXaxsXaxeX");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE_CI",
"test大千世AB大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UNICODE_CI",
"大千世界abca大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE_CI",
"oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE_CI",
"大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UNICODE_CI",
"世世世界世世世界tesT");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_LCASE",
"14234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE_CI",
"14234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE",
"Tr4234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_BINARY",
"Tr4234e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE",
"Tra2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE_CI",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_BINARY",
"Tra2s3a4e");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_BINARY", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_LCASE", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UNICODE", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UNICODE_CI", "123f");
Review Comment:
Wouldn't it be easier to group those with the corresponding collations?
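For example (just illustrating the grouping, reusing the existing cases):

```java
    // Basic tests - UTF8_BINARY.
    assertStringTranslate("abcdef", "abcde", "123", "UTF8_BINARY", "123f");
    // ...
    // Basic tests - UTF8_LCASE.
    assertStringTranslate("abcdef", "abcde", "123", "UTF8_LCASE", "123f");
    // ... and similarly for UNICODE and UNICODE_CI.
```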
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
+ CharacterIterator target = new StringCharacterIterator(inputString);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ StringBuilder sb = new StringBuilder();
+ int charIndex = 0;
Review Comment:
```suggestion
    // Index of the current character in the (validated) input string. This is the character
    // we are currently deciding whether to replace.
    int charIndex = 0;
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
+ CharacterIterator target = new StringCharacterIterator(inputString);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ StringBuilder sb = new StringBuilder();
+ int charIndex = 0;
+ while (charIndex < inputString.length()) {
+ int longestMatchLen = 0;
Review Comment:
```suggestion
      // Search the replacement dictionary for a match. If there is more than one match (which
      // is possible for collated strings), choose the match with the longest matched length.
      int longestMatchLen = 0;
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java:
##########
@@ -818,7 +818,7 @@ public static UTF8String getCollationKey(UTF8String input,
int collationId) {
if (collation.supportsBinaryEquality) {
return input;
} else if (collation.supportsLowercaseEquality) {
- return input.toLowerCase();
+ return CollationAwareUTF8String.toLowerCase(input);
Review Comment:
Is this needed for this PR? Also, does the `toLowerCase` method produce a key where all Greek sigmas map to the same character?
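For context, this is the kind of check I have in mind (a sketch only; it assumes it runs from a test in this module where `CollationFactory` and `UTF8String` are available):

```java
    int lcaseId = CollationFactory.collationNameToId("UTF8_LCASE");
    UTF8String capitalSigma = UTF8String.fromString("Σ");  // U+03A3
    UTF8String smallSigma = UTF8String.fromString("σ");    // U+03C3
    UTF8String finalSigma = UTF8String.fromString("ς");    // U+03C2
    // If toLowerCase folds all sigmas to the same character, these three keys should be equal.
    UTF8String k1 = CollationFactory.getCollationKey(capitalSigma, lcaseId);
    UTF8String k2 = CollationFactory.getCollationKey(smallSigma, lcaseId);
    UTF8String k3 = CollationFactory.getCollationKey(finalSigma, lcaseId);
```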
##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -1378,19 +1381,138 @@ public void testStringTrim() throws SparkException {
assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
}
- // TODO: Test more collation-aware string expressions.
-
- /**
- * Collation-aware regexp expressions.
- */
-
- // TODO: Test more collation-aware regexp expressions.
+ private void assertStringTranslate(
+ String inputString,
+ String matchingString,
+ String replaceString,
+ String collationName,
+ String expectedResultString) throws SparkException {
+ int collationId = CollationFactory.collationNameToId(collationName);
+ Map<String, String> dict = buildDict(matchingString, replaceString);
+ UTF8String source = UTF8String.fromString(inputString);
+ UTF8String result = CollationSupport.StringTranslate.exec(source, dict,
collationId);
+ assertEquals(expectedResultString, result.toString());
+ }
- /**
- * Other collation-aware expressions.
- */
+ @Test
+ public void testStringTranslate() throws SparkException {
+ // Basic tests - UTF8_BINARY.
+ assertStringTranslate("Translate", "Rnlt", "12", "UTF8_BINARY", "Tra2sae");
+ assertStringTranslate("Translate", "Rn", "1234", "UTF8_BINARY",
"Tra2slate");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_BINARY",
"Tra2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_BINARY",
"TRaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_BINARY",
"TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_BINARY",
"TXaxsXaxex");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_BINARY",
"test大千世AX大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_BINARY",
"大千世界test大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_BINARY",
"Oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_BINARY",
"大千世界大千世界oesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_BINARY",
"世世世界世世世界tesT");
+ // Basic tests - UTF8_LCASE.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
Review Comment:
These two tests are identical. Wouldn't it make more sense to add tests where the second and third strings are of different sizes (like we do for UTF8_BINARY)?
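Something like this, for instance (a sketch; the expected outputs reflect my reading of the UTF8_LCASE semantics and would need to be double-checked):

```java
    assertStringTranslate("Translate", "Rnlt", "12", "UTF8_LCASE", "1a2sae");
    assertStringTranslate("Translate", "Rn", "1234", "UTF8_LCASE", "T1a2slate");
```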
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
+ CharacterIterator target = new StringCharacterIterator(inputString);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ StringBuilder sb = new StringBuilder();
+ int charIndex = 0;
+ while (charIndex < inputString.length()) {
+ int longestMatchLen = 0;
+ String longestMatch = "";
+ for (String key : dict.keySet()) {
+ StringSearch stringSearch = new StringSearch(key, target,
(RuleBasedCollator) collator);
+ stringSearch.setIndex(charIndex);
+ int matchIndex = stringSearch.next();
+ if (matchIndex == charIndex) {
+ int matchLen = stringSearch.getMatchLength();
+ if (matchLen > longestMatchLen) {
+ longestMatchLen = matchLen;
+ longestMatch = key;
}
}
-
- if (!exists) {
- collationAwareDict.put(newKey, dict.get(key));
+ }
+ if (longestMatchLen == 0) {
+ sb.append(inputString.charAt(charIndex));
+ charIndex++;
Review Comment:
```suggestion
++charIndex;
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
Review Comment:
```suggestion
    // Replace invalid UTF8 sequences with the Unicode replacement character U+FFFD.
    String inputString = input.toValidString();
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
+ CharacterIterator target = new StringCharacterIterator(inputString);
Review Comment:
```suggestion
    // Create a character iterator over the validated input string. It is used for searching
    // inside the string with ICU's `StringSearch` class, and only needs to be created once,
    // before the main loop of the translate algorithm.
    CharacterIterator target = new StringCharacterIterator(inputString);
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
+ CharacterIterator target = new StringCharacterIterator(inputString);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ StringBuilder sb = new StringBuilder();
+ int charIndex = 0;
+ while (charIndex < inputString.length()) {
+ int longestMatchLen = 0;
+ String longestMatch = "";
+ for (String key : dict.keySet()) {
+ StringSearch stringSearch = new StringSearch(key, target,
(RuleBasedCollator) collator);
+ stringSearch.setIndex(charIndex);
+ int matchIndex = stringSearch.next();
+ if (matchIndex == charIndex) {
+ int matchLen = stringSearch.getMatchLength();
+ if (matchLen > longestMatchLen) {
+ longestMatchLen = matchLen;
+ longestMatch = key;
}
}
-
- if (!exists) {
- collationAwareDict.put(newKey, dict.get(key));
+ }
+ if (longestMatchLen == 0) {
+ sb.append(inputString.charAt(charIndex));
Review Comment:
```suggestion
        // No match was found, so output the current character and move on to the next
        // character in the input string.
        sb.append(inputString.charAt(charIndex));
```
##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -1378,19 +1381,138 @@ public void testStringTrim() throws SparkException {
assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
}
- // TODO: Test more collation-aware string expressions.
-
- /**
- * Collation-aware regexp expressions.
- */
-
- // TODO: Test more collation-aware regexp expressions.
+ private void assertStringTranslate(
+ String inputString,
+ String matchingString,
+ String replaceString,
+ String collationName,
+ String expectedResultString) throws SparkException {
+ int collationId = CollationFactory.collationNameToId(collationName);
+ Map<String, String> dict = buildDict(matchingString, replaceString);
+ UTF8String source = UTF8String.fromString(inputString);
+ UTF8String result = CollationSupport.StringTranslate.exec(source, dict,
collationId);
+ assertEquals(expectedResultString, result.toString());
+ }
- /**
- * Other collation-aware expressions.
- */
+ @Test
+ public void testStringTranslate() throws SparkException {
+ // Basic tests - UTF8_BINARY.
+ assertStringTranslate("Translate", "Rnlt", "12", "UTF8_BINARY", "Tra2sae");
+ assertStringTranslate("Translate", "Rn", "1234", "UTF8_BINARY",
"Tra2slate");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_BINARY",
"Tra2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_BINARY",
"TRaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_BINARY",
"TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_BINARY",
"TXaxsXaxex");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_BINARY",
"test大千世AX大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_BINARY",
"大千世界test大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_BINARY",
"Oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_BINARY",
"大千世界大千世界oesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_BINARY",
"世世世界世世世界tesT");
+ // Basic tests - UTF8_LCASE.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_LCASE",
"xXaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_LCASE",
"xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_LCASE",
"xXaxsXaxeX");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_LCASE",
"test大千世AB大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_LCASE",
"大千世界abca大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_LCASE",
"oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_LCASE",
"大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_LCASE",
"世世世界世世世界tesT");
+ // Basic tests - UNICODE.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE", "Tra2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE", "TRaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE",
"TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE",
"TXaxsXaxex");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE",
"test大千世AX大千世A");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE",
"Oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE",
"大千世界大千世界oesO");
+ // Basic tests - UNICODE_CI.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE_CI",
"41a2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE_CI",
"xXaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE_CI",
"xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE_CI",
"xXaxsXaxeX");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE_CI",
"test大千世AB大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UNICODE_CI",
"大千世界abca大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE_CI",
"oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE_CI",
"大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UNICODE_CI",
"世世世界世世世界tesT");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_LCASE",
"14234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE_CI",
"14234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE",
"Tr4234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_BINARY",
"Tr4234e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE",
"Tra2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE_CI",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_BINARY",
"Tra2s3a4e");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_BINARY", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_LCASE", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UNICODE", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UNICODE_CI", "123f");
+
+ // One-to-many case mapping - UTF8_BINARY.
+ assertStringTranslate("İ", "i\u0307", "xy", "UTF8_BINARY", "İ");
+ assertStringTranslate("i\u0307", "İ", "xy", "UTF8_BINARY", "i\u0307");
+ assertStringTranslate("i\u030A", "İ", "x", "UTF8_BINARY", "i\u030A");
+ assertStringTranslate("i\u030A", "İi", "xy", "UTF8_BINARY", "y\u030A");
+ assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_BINARY", "123");
+ assertStringTranslate("İi\u0307", "İyz", "123", "UTF8_BINARY", "1i\u0307");
+ assertStringTranslate("İi\u0307", "xi\u0307", "123", "UTF8_BINARY", "İ23");
+ assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UTF8_BINARY",
"12bc3");
+ assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UTF8_BINARY",
"a2bcå");
+ assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", "UTF8_BINARY",
"3\u030Aβφδ1\u0307");
+ // One-to-many case mapping - UTF8_LCASE.
+ assertStringTranslate("İ", "i\u0307", "xy", "UTF8_LCASE", "İ");
+ assertStringTranslate("i\u0307", "İ", "xy", "UTF8_LCASE", "x");
+ assertStringTranslate("i\u030A", "İ", "x", "UTF8_LCASE", "i\u030A");
+ assertStringTranslate("i\u030A", "İi", "xy", "UTF8_LCASE", "y\u030A");
+ assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_LCASE", "11");
Review Comment:
I think this example deserves a comment as to why the result is "11".
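As I read the new code, `buildDict("İi\u0307", "123")` produces `İ -> "1"`, `i -> "2"`, `\u0307 -> "3"`; `getLowercaseDict` then rewrites the `İ` key as the combined `i + U+0307` code point, and the special handling in `lowercaseTranslate` folds the input pair `i\u0307` into that same combined code point. So both the leading `İ` and the trailing `i\u0307` hit the entry for `İ`, and the result is `"11"`. A comment along those lines would make the expected value much easier to follow.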
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
Review Comment:
I would add comments inside this method to explain what every step does.
As it is, I would argue it is quite hard to follow.
I will try to add some example comments below to explain what I have in mind.
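 For example, roughly the same code as in the diff, only with comments added
 (the wording is just a sketch, and it assumes the same imports as the rest of
 the file):
```java
public static UTF8String translate(final UTF8String input,
    final Map<String, String> dict, final int collationId) {
  // Work on a validated String copy of the input, and wrap it in a character
  // iterator that ICU's StringSearch can consume.
  String inputString = input.toValidString();
  CharacterIterator target = new StringCharacterIterator(inputString);
  Collator collator = CollationFactory.fetchCollation(collationId).collator;
  StringBuilder sb = new StringBuilder();
  // Scan the input from left to right, translating one match at a time.
  int charIndex = 0;
  while (charIndex < inputString.length()) {
    // Among all dictionary keys that match at the current position, find the
    // one whose match in the input is the longest.
    int longestMatchLen = 0;
    String longestMatch = "";
    for (String key : dict.keySet()) {
      StringSearch stringSearch = new StringSearch(key, target, (RuleBasedCollator) collator);
      // Point `stringSearch` to start at the current character.
      stringSearch.setIndex(charIndex);
      int matchIndex = stringSearch.next();
      if (matchIndex == charIndex) {
        // `key` matches at the current position; keep it if it is the longest
        // match found so far.
        int matchLen = stringSearch.getMatchLength();
        if (matchLen > longestMatchLen) {
          longestMatchLen = matchLen;
          longestMatch = key;
        }
      }
    }
    if (longestMatchLen == 0) {
      // No dictionary key matches here: copy the character and move on.
      sb.append(inputString.charAt(charIndex));
      charIndex++;
    } else {
      // Append the translation of the longest match, unless it maps to the
      // null character (in which case the matched characters are dropped).
      if (!"\0".equals(dict.get(longestMatch))) {
        sb.append(dict.get(longestMatch));
      }
      // Skip as many characters as the longest match.
      charIndex += longestMatchLen;
    }
  }
  return UTF8String.fromString(sb.toString());
}
```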
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
+ CharacterIterator target = new StringCharacterIterator(inputString);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ StringBuilder sb = new StringBuilder();
+ int charIndex = 0;
+ while (charIndex < inputString.length()) {
+ int longestMatchLen = 0;
+ String longestMatch = "";
+ for (String key : dict.keySet()) {
+ StringSearch stringSearch = new StringSearch(key, target,
(RuleBasedCollator) collator);
+ stringSearch.setIndex(charIndex);
Review Comment:
```suggestion
// Point `stringSearch` to start at the current character.
stringSearch.setIndex(charIndex);
```
##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##########
@@ -1083,6 +1103,7 @@ object StringTranslate {
}
dict
}
+
Review Comment:
Could we please remove this empty line?
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
Review Comment:
 I think it would be helpful to explain the logic here in more detail: we need
 to accommodate "i\u0307" in the (validated) input string and `\u0130` in the
 dictionary. This is the only case for the UTF8_LCASE collation where we need
 to match more than one code point in the input string against a single code
 point in the dictionary. The way we do this is by keeping track of two code
 points at a time. If the current code point and the next one are `i` and
 `\u0307`, and the dictionary contains `\u0130`, then we have found a
 max-length match. Otherwise, we try to match the current code point against a
 dictionary entry and buffer the second code point that was read for the next
 iteration of the translate algorithm.
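 For example, something like this above the buffered iteration (the wording is
 just a sketch):
```java
// Buffered code point iteration: we need to accommodate "i\u0307" in the
// (validated) input string matching `\u0130` in the dictionary. This is the
// only case for UTF8_LCASE where more than one code point in the input has to
// be matched against a single code point in the dictionary, so we keep track
// of two code points at a time. If the current code point is `i`, the next one
// is `\u0307`, and the dictionary contains `\u0130`, we have found a
// max-length match; otherwise we match the current code point on its own and
// buffer the second code point for the next iteration.
int codePointBuffer = -1, codePoint;
```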
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
+ CharacterIterator target = new StringCharacterIterator(inputString);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ StringBuilder sb = new StringBuilder();
+ int charIndex = 0;
+ while (charIndex < inputString.length()) {
+ int longestMatchLen = 0;
+ String longestMatch = "";
+ for (String key : dict.keySet()) {
+ StringSearch stringSearch = new StringSearch(key, target,
(RuleBasedCollator) collator);
+ stringSearch.setIndex(charIndex);
+ int matchIndex = stringSearch.next();
+ if (matchIndex == charIndex) {
+ int matchLen = stringSearch.getMatchLength();
Review Comment:
```suggestion
          // We have found a match: the current position matches one of the
          // characters in the dictionary. However, there might be matches of
          // greater length, so we keep searching against the dictionary
          // characters and track the longest match found.
int matchLen = stringSearch.getMatchLength();
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
+ CharacterIterator target = new StringCharacterIterator(inputString);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ StringBuilder sb = new StringBuilder();
+ int charIndex = 0;
+ while (charIndex < inputString.length()) {
+ int longestMatchLen = 0;
+ String longestMatch = "";
+ for (String key : dict.keySet()) {
+ StringSearch stringSearch = new StringSearch(key, target,
(RuleBasedCollator) collator);
+ stringSearch.setIndex(charIndex);
+ int matchIndex = stringSearch.next();
+ if (matchIndex == charIndex) {
+ int matchLen = stringSearch.getMatchLength();
Review Comment:
 BTW, I expect `stringSearch.getMatchLength()` to return the number of
 characters. Is this understanding correct?
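 One way to sanity-check whether the returned length is in UTF-16 code units
 (Java chars) or in code points (it matters here, since `charIndex` is a char
 index into `inputString`) is a throwaway snippet like the one below; it only
 assumes ICU4J on the classpath, and the class name is just for illustration:
```java
import java.text.StringCharacterIterator;
import java.util.Locale;

import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.text.SearchIterator;
import com.ibm.icu.text.StringSearch;

public class MatchLengthCheck {
  public static void main(String[] args) {
    RuleBasedCollator collator = (RuleBasedCollator) Collator.getInstance(Locale.ROOT);
    // U+1D54A is a single code point but two UTF-16 code units (a surrogate pair).
    String target = "a\uD835\uDD4Ab";
    StringSearch search = new StringSearch("\uD835\uDD4A",
        new StringCharacterIterator(target), collator);
    int index = search.first();
    if (index != SearchIterator.DONE) {
      // If this prints 2, getMatchLength() is measured in UTF-16 code units of
      // the target text rather than in code points.
      System.out.println("match length = " + search.getMatchLength());
    }
  }
}
```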
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
+ CharacterIterator target = new StringCharacterIterator(inputString);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ StringBuilder sb = new StringBuilder();
+ int charIndex = 0;
+ while (charIndex < inputString.length()) {
+ int longestMatchLen = 0;
+ String longestMatch = "";
+ for (String key : dict.keySet()) {
+ StringSearch stringSearch = new StringSearch(key, target,
(RuleBasedCollator) collator);
+ stringSearch.setIndex(charIndex);
+ int matchIndex = stringSearch.next();
+ if (matchIndex == charIndex) {
+ int matchLen = stringSearch.getMatchLength();
+ if (matchLen > longestMatchLen) {
+ longestMatchLen = matchLen;
+ longestMatch = key;
}
}
-
- if (!exists) {
- collationAwareDict.put(newKey, dict.get(key));
+ }
+ if (longestMatchLen == 0) {
+ sb.append(inputString.charAt(charIndex));
+ charIndex++;
+ } else {
+ if (!"\0".equals(dict.get(longestMatch))) {
+ sb.append(dict.get(longestMatch));
}
+ charIndex += longestMatchLen;
Review Comment:
```suggestion
// Skip as many characters as the longest match.
charIndex += longestMatchLen;
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
+ CharacterIterator target = new StringCharacterIterator(inputString);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ StringBuilder sb = new StringBuilder();
+ int charIndex = 0;
+ while (charIndex < inputString.length()) {
+ int longestMatchLen = 0;
+ String longestMatch = "";
+ for (String key : dict.keySet()) {
+ StringSearch stringSearch = new StringSearch(key, target,
(RuleBasedCollator) collator);
+ stringSearch.setIndex(charIndex);
+ int matchIndex = stringSearch.next();
+ if (matchIndex == charIndex) {
+ int matchLen = stringSearch.getMatchLength();
+ if (matchLen > longestMatchLen) {
+ longestMatchLen = matchLen;
+ longestMatch = key;
}
}
-
- if (!exists) {
- collationAwareDict.put(newKey, dict.get(key));
+ }
+ if (longestMatchLen == 0) {
+ sb.append(inputString.charAt(charIndex));
+ charIndex++;
+ } else {
+ if (!"\0".equals(dict.get(longestMatch))) {
Review Comment:
```suggestion
          // We have found at least one match. Append the translation of the
          // longest match to the output.
if (!"\0".equals(dict.get(longestMatch))) {
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -655,38 +691,130 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- // TODO(SPARK-48715): All UTF8String -> String conversions should use
`makeValid`
- String srcStr = string.toString();
+ /**
+ * Converts the original translation dictionary (`dict`) to a dictionary
with lowercased keys.
+ * This method is used to create a dictionary that can be used for the
UTF8_LCASE collation.
+ * Note that `StringTranslate.buildDict` will ensure that all strings are
validated properly.
+ *
+ * The method returns a map with lowercased code points as keys, while the
values remain
+ * unchanged. Note that `dict` is constructed on a character by character
basis, and the
+ * original keys are stored as strings. Keys in the resulting lowercase
dictionary are stored
+ * as integers, which correspond only to single characters from the original
`dict`. Also,
+ * there is special handling for the Turkish dotted uppercase letter I
(U+0130).
+ */
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+
+ /**
+ * Translates the `input` string using the translation map `dict`, for
UTF8_LCASE collation.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For UTF8_LCASE, the method uses the lowercased substring to perform the
lookup in the
+ * lowercased version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the lowercase translation dictionary
+ * @return the translated string
+ */
+ public static UTF8String lowercaseTranslate(final UTF8String input,
+ final Map<String, String> dict) {
+ // Iterator for the input string.
+ Iterator<Integer> inputIter = input.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ // Lowercased translation dictionary.
+ Map<Integer, String> lowercaseDict = getLowercaseDict(dict);
+ // StringBuilder to store the translated string.
+ StringBuilder sb = new StringBuilder();
- Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
+ // Buffered code point iteration to handle one-to-many case mappings.
+ int codePointBuffer = -1, codePoint;
+ while (inputIter.hasNext()) {
+ if (codePointBuffer != -1) {
+ codePoint = codePointBuffer;
+ codePointBuffer = -1;
+ } else {
+ codePoint = inputIter.next();
+ }
+ // Special handling for letter i (U+0069) followed by a combining dot
(U+0307).
+ if (lowercaseDict.containsKey(CODE_POINT_COMBINED_LOWERCASE_I_DOT) &&
+ codePoint == CODE_POINT_LOWERCASE_I && inputIter.hasNext()) {
+ int nextCodePoint = inputIter.next();
+ if (nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ codePoint = CODE_POINT_COMBINED_LOWERCASE_I_DOT;
+ } else {
+ codePointBuffer = nextCodePoint;
+ }
+ }
+ // Translate the code point using the lowercased dictionary.
+ String translated = lowercaseDict.get(getLowercaseCodePoint(codePoint));
+ if (translated == null) {
+ // Append the original code point if no translation is found.
+ sb.appendCodePoint(codePoint);
+ } else if (!"\0".equals(translated)) {
+ // Append the translated code point if the translation is not the null
character.
+ sb.append(translated);
+ }
+ // Skip the code point if it maps to the null character.
+ }
+ // Append the last code point if it was buffered.
+ if (codePointBuffer != -1) sb.appendCodePoint(codePointBuffer);
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
+ // Return the translated string.
+ return UTF8String.fromString(sb.toString());
+ }
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
+ /**
+ * Translates the `input` string using the translation map `dict`, for all
ICU collations.
+ * String translation is performed by iterating over the input string, from
left to right, and
+ * repeatedly translating the longest possible substring that matches a key
in the dictionary.
+ * For ICU collations, the method uses the collation key of the substring to
perform the lookup
+ * in the collation aware version of the translation map.
+ *
+ * @param input the string to be translated
+ * @param dict the collation aware translation dictionary
+ * @param collationId the collation ID to use for string translation
+ * @return the translated string
+ */
+ public static UTF8String translate(final UTF8String input,
+ final Map<String, String> dict, final int collationId) {
+ String inputString = input.toValidString();
+ CharacterIterator target = new StringCharacterIterator(inputString);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ StringBuilder sb = new StringBuilder();
+ int charIndex = 0;
+ while (charIndex < inputString.length()) {
+ int longestMatchLen = 0;
+ String longestMatch = "";
+ for (String key : dict.keySet()) {
+ StringSearch stringSearch = new StringSearch(key, target,
(RuleBasedCollator) collator);
+ stringSearch.setIndex(charIndex);
+ int matchIndex = stringSearch.next();
+ if (matchIndex == charIndex) {
+ int matchLen = stringSearch.getMatchLength();
+ if (matchLen > longestMatchLen) {
+ longestMatchLen = matchLen;
+ longestMatch = key;
}
}
-
- if (!exists) {
- collationAwareDict.put(newKey, dict.get(key));
+ }
+ if (longestMatchLen == 0) {
+ sb.append(inputString.charAt(charIndex));
+ charIndex++;
Review Comment:
This is a nit.
##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -1378,19 +1381,138 @@ public void testStringTrim() throws SparkException {
assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
}
- // TODO: Test more collation-aware string expressions.
-
- /**
- * Collation-aware regexp expressions.
- */
-
- // TODO: Test more collation-aware regexp expressions.
+ private void assertStringTranslate(
+ String inputString,
+ String matchingString,
+ String replaceString,
+ String collationName,
+ String expectedResultString) throws SparkException {
+ int collationId = CollationFactory.collationNameToId(collationName);
+ Map<String, String> dict = buildDict(matchingString, replaceString);
+ UTF8String source = UTF8String.fromString(inputString);
+ UTF8String result = CollationSupport.StringTranslate.exec(source, dict,
collationId);
+ assertEquals(expectedResultString, result.toString());
+ }
- /**
- * Other collation-aware expressions.
- */
+ @Test
+ public void testStringTranslate() throws SparkException {
+ // Basic tests - UTF8_BINARY.
+ assertStringTranslate("Translate", "Rnlt", "12", "UTF8_BINARY", "Tra2sae");
+ assertStringTranslate("Translate", "Rn", "1234", "UTF8_BINARY",
"Tra2slate");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_BINARY",
"Tra2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_BINARY",
"TRaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_BINARY",
"TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_BINARY",
"TXaxsXaxex");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_BINARY",
"test大千世AX大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_BINARY",
"大千世界test大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_BINARY",
"Oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_BINARY",
"大千世界大千世界oesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_BINARY",
"世世世界世世世界tesT");
+ // Basic tests - UTF8_LCASE.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_LCASE",
"xXaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_LCASE",
"xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_LCASE",
"xXaxsXaxeX");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_LCASE",
"test大千世AB大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_LCASE",
"大千世界abca大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_LCASE",
"oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_LCASE",
"大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_LCASE",
"世世世界世世世界tesT");
+ // Basic tests - UNICODE.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE", "Tra2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE", "TRaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE",
"TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE",
"TXaxsXaxex");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE",
"test大千世AX大千世A");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE",
"Oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE",
"大千世界大千世界oesO");
+ // Basic tests - UNICODE_CI.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE_CI",
"41a2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE_CI",
"xXaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE_CI",
"xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE_CI",
"xXaxsXaxeX");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE_CI",
"test大千世AB大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UNICODE_CI",
"大千世界abca大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE_CI",
"oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE_CI",
"大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UNICODE_CI",
"世世世界世世世界tesT");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_LCASE",
"14234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE_CI",
"14234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE",
"Tr4234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_BINARY",
"Tr4234e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE",
"Tra2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE_CI",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_BINARY",
"Tra2s3a4e");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_BINARY", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_LCASE", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UNICODE", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UNICODE_CI", "123f");
+
+ // One-to-many case mapping - UTF8_BINARY.
+ assertStringTranslate("İ", "i\u0307", "xy", "UTF8_BINARY", "İ");
+ assertStringTranslate("i\u0307", "İ", "xy", "UTF8_BINARY", "i\u0307");
+ assertStringTranslate("i\u030A", "İ", "x", "UTF8_BINARY", "i\u030A");
+ assertStringTranslate("i\u030A", "İi", "xy", "UTF8_BINARY", "y\u030A");
+ assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_BINARY", "123");
+ assertStringTranslate("İi\u0307", "İyz", "123", "UTF8_BINARY", "1i\u0307");
+ assertStringTranslate("İi\u0307", "xi\u0307", "123", "UTF8_BINARY", "İ23");
+ assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UTF8_BINARY",
"12bc3");
+ assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UTF8_BINARY",
"a2bcå");
+ assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", "UTF8_BINARY",
"3\u030Aβφδ1\u0307");
+ // One-to-many case mapping - UTF8_LCASE.
+ assertStringTranslate("İ", "i\u0307", "xy", "UTF8_LCASE", "İ");
+ assertStringTranslate("i\u0307", "İ", "xy", "UTF8_LCASE", "x");
+ assertStringTranslate("i\u030A", "İ", "x", "UTF8_LCASE", "i\u030A");
+ assertStringTranslate("i\u030A", "İi", "xy", "UTF8_LCASE", "y\u030A");
+ assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_LCASE", "11");
+ assertStringTranslate("İi\u0307", "İyz", "123", "UTF8_LCASE", "11");
+ assertStringTranslate("İi\u0307", "xi\u0307", "123", "UTF8_LCASE", "İ23");
+ assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UTF8_LCASE",
"12bc3");
+ assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UTF8_LCASE",
"12bc3");
+ assertStringTranslate("A\u030Aβφδi\u0307", "Iİaå", "1234", "UTF8_LCASE",
"3\u030Aβφδ2");
Review Comment:
 Can we please add a few more test cases (basically aimed at making sure that
 Greek sigmas are treated correctly)?
```java
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "σιι", "UTF8_LCASE",
"σιστιματικοσ");
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "σιι", "UTF8_LCASE",
"σιστιματικοσ");
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "σιι", "UTF8_LCASE",
"σιστιματικοσ");
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "συη", "ςιι", "UTF8_LCASE",
"ςιςτιματικος");
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "Συη", "ςιι", "UTF8_LCASE",
"ςιςτιματικος");
assertStringTranslate("ΣΥΣΤΗΜΑΤΙΚΟΣ", "ςυη", "ςιι", "UTF8_LCASE",
"ςιςτιματικος");
assertStringTranslate("συστηματικος", "Συη", "σιι", "UTF8_LCASE",
"σιστιματικοσ");
assertStringTranslate("συστηματικος", "συη", "σιι", "UTF8_LCASE",
"σιστιματικοσ");
assertStringTranslate("συστηματικος", "ςυη", "σιι", "UTF8_LCASE",
"σιστιματικοσ");
```