uros-db commented on code in PR #46761:
URL: https://github.com/apache/spark/pull/46761#discussion_r1665813246
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -621,37 +638,69 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
}
}
- public static Map<String, String> getCollationAwareDict(UTF8String string,
- Map<String, String> dict, int collationId) {
- String srcStr = string.toString();
-
+ private static Map<Integer, String> getLowercaseDict(final Map<String,
String> dict) {
+ // Replace all the keys in the dict with lowercased code points.
+ Map<Integer, String> lowercaseDict = new HashMap<>();
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ int codePoint = entry.getKey().codePointAt(0);
+ lowercaseDict.putIfAbsent(getLowercaseCodePoint(codePoint),
entry.getValue());
+ }
+ return lowercaseDict;
+ }
+ private static Map<String, String> getCollationAwareDict(final Map<String,
String> dict,
+ int collationId) {
+ // Replace all the keys in the dict with collation keys.
Map<String, String> collationAwareDict = new HashMap<>();
- for (String key : dict.keySet()) {
- StringSearch stringSearch =
- CollationFactory.getStringSearch(string, UTF8String.fromString(key),
collationId);
-
- int pos = 0;
- while ((pos = stringSearch.next()) != StringSearch.DONE) {
- int codePoint = srcStr.codePointAt(pos);
- int charCount = Character.charCount(codePoint);
- String newKey = srcStr.substring(pos, pos + charCount);
-
- boolean exists = false;
- for (String existingKey : collationAwareDict.keySet()) {
- if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
- collationAwareDict.put(newKey,
collationAwareDict.get(existingKey));
- exists = true;
- break;
- }
- }
+ for (Map.Entry<String, String> entry : dict.entrySet()) {
+ String collationKey = CollationFactory.getCollationKey(entry.getKey(),
collationId);
+ collationAwareDict.putIfAbsent(collationKey, entry.getValue());
+ }
+ return collationAwareDict;
+ }
- if (!exists) {
- collationAwareDict.put(newKey, dict.get(key));
- }
+ private static String lowercaseTranslate(final String input, final
Map<Integer, String> dict) {
+ StringBuilder sb = new StringBuilder();
+ int charCount = 0;
+ for (int k = 0; k < input.length(); k += charCount) {
+ int codePoint = input.codePointAt(k);
+ charCount = Character.charCount(codePoint);
+ String translated = dict.get(getLowercaseCodePoint(codePoint));
+ if (null == translated) {
Review Comment:
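Some would say that the original form (`null == translated`) is more error-proof ;)
In Java, though, it adds no safety, since an accidental `translated = null` in an
`if` condition would not compile - I'll replace it with `translated == null`.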
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]