stefankandic commented on code in PR #45820:
URL: https://github.com/apache/spark/pull/45820#discussion_r1548115179
##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -1155,6 +1156,71 @@ public UTF8String translate(Map<String, String> dict) {
return fromString(sb.toString());
}
+ public UTF8String translate(Map<String, String> dict, int collationId) {
+ if(CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+ return translate(dict);
+ }
+ return collationAwareTranslate(dict, collationId);
+ }
+
+ public UTF8String collationAwareTranslate(Map<String, String> dict, int
collationId) {
+ if (numBytes == 0) {
+ return this;
+ }
+
+ Map<String, String> collationAwareDict = getCollationAwareDict(dict,
collationId);
+
+ String srcStr = this.toString();
+
+ StringBuilder sb = new StringBuilder();
+ int charCount = 0;
+ for (int k = 0; k < srcStr.length(); k += charCount) {
+ int codePoint = srcStr.codePointAt(k);
+ charCount = Character.charCount(codePoint);
+ String subStr = srcStr.substring(k, k + charCount);
+
+ String translated = collationAwareDict.get(subStr);
+
+ if (null == translated) {
+ sb.append(subStr);
+ } else if (!"\0".equals(translated)) {
+ sb.append(translated);
+ }
+ }
+ return fromString(sb.toString());
+ }
+
+ private Map<String, String> getCollationAwareDict(Map<String, String> dict,
int collationId) {
+ String srcStr = this.toString();
Review Comment:
Do we need to do this twice? Maybe we can just pass `srcStr` to the method;
creating a string is a pretty expensive operation in both time and memory.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]