HyukjinKwon commented on code in PR #45820:
URL: https://github.com/apache/spark/pull/45820#discussion_r1548765578


##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -1155,6 +1156,45 @@ public UTF8String translate(Map<String, String> dict) {
     return fromString(sb.toString());
   }
 
+  public UTF8String translate(Map<String, String> dict, int collationId) {
+    if(CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+      return translate(dict);
+    }
+    return translate(getCollationAwareDict(dict, collationId));
+  }
+
+  private Map<String, String> getCollationAwareDict(Map<String, String> dict, int collationId) {
+    String srcStr = this.toString();
+
+    Map<String, String> collationAwareDict = new HashMap<>();
+    for(String key : dict.keySet()) {
+      StringSearch stringSearch =
+        CollationFactory.getStringSearch(this, UTF8String.fromString(key), collationId);
+
+      int pos = 0;
+      while((pos = stringSearch.next()) != StringSearch.DONE) {

Review Comment:
   ```suggestion
         while ((pos = stringSearch.next()) != StringSearch.DONE) {
   ```



##########
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##########
@@ -73,6 +74,57 @@ class CollationStringExpressionsSuite extends QueryTest
     })
   }
 
+  test("TRANSLATE check result on explicitly collated string") {
+    def testTranslate(input: String,
+                      matchExpression: String,
+                      replaceExpression: String,
+                      collationId: Int,
+                      expected: String): Unit = {

Review Comment:
   ```suggestion
       def testTranslate(
           input: String,
           matchExpression: String,
           replaceExpression: String,
           collationId: Int,
           expected: String): Unit = {
   ```



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -1155,6 +1156,45 @@ public UTF8String translate(Map<String, String> dict) {
     return fromString(sb.toString());
   }
 
+  public UTF8String translate(Map<String, String> dict, int collationId) {
+    if(CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+      return translate(dict);
+    }
+    return translate(getCollationAwareDict(dict, collationId));
+  }
+
+  private Map<String, String> getCollationAwareDict(Map<String, String> dict, int collationId) {
+    String srcStr = this.toString();
+
+    Map<String, String> collationAwareDict = new HashMap<>();
+    for(String key : dict.keySet()) {
+      StringSearch stringSearch =
+        CollationFactory.getStringSearch(this, UTF8String.fromString(key), collationId);
+
+      int pos = 0;
+      while((pos = stringSearch.next()) != StringSearch.DONE) {
+        int codePoint = srcStr.codePointAt(pos);
+        int charCount = Character.charCount(codePoint);
+        String newKey = srcStr.substring(pos, pos + charCount);
+
+        boolean exists = false;
+        for(String existingKey : collationAwareDict.keySet()) {
+          if(stringSearch.getCollator().compare(existingKey, newKey) == 0) {
+            collationAwareDict.put(newKey, collationAwareDict.get(existingKey));
+            exists = true;
+            break;
+          }
+        }
+
+        if(!exists) {

Review Comment:
   ```suggestion
           if (!exists) {
   ```



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -1155,6 +1156,45 @@ public UTF8String translate(Map<String, String> dict) {
     return fromString(sb.toString());
   }
 
+  public UTF8String translate(Map<String, String> dict, int collationId) {
+    if(CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+      return translate(dict);
+    }
+    return translate(getCollationAwareDict(dict, collationId));
+  }
+
+  private Map<String, String> getCollationAwareDict(Map<String, String> dict, int collationId) {
+    String srcStr = this.toString();
+
+    Map<String, String> collationAwareDict = new HashMap<>();
+    for(String key : dict.keySet()) {

Review Comment:
   ```suggestion
       for (String key : dict.keySet()) {
   ```



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -1155,6 +1156,45 @@ public UTF8String translate(Map<String, String> dict) {
     return fromString(sb.toString());
   }
 
+  public UTF8String translate(Map<String, String> dict, int collationId) {
+    if(CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {

Review Comment:
   ```suggestion
       if (CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
   ```



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -1155,6 +1156,45 @@ public UTF8String translate(Map<String, String> dict) {
     return fromString(sb.toString());
   }
 
+  public UTF8String translate(Map<String, String> dict, int collationId) {
+    if(CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+      return translate(dict);
+    }
+    return translate(getCollationAwareDict(dict, collationId));
+  }
+
+  private Map<String, String> getCollationAwareDict(Map<String, String> dict, int collationId) {
+    String srcStr = this.toString();
+
+    Map<String, String> collationAwareDict = new HashMap<>();
+    for(String key : dict.keySet()) {
+      StringSearch stringSearch =
+        CollationFactory.getStringSearch(this, UTF8String.fromString(key), collationId);
+
+      int pos = 0;
+      while((pos = stringSearch.next()) != StringSearch.DONE) {
+        int codePoint = srcStr.codePointAt(pos);
+        int charCount = Character.charCount(codePoint);
+        String newKey = srcStr.substring(pos, pos + charCount);
+
+        boolean exists = false;
+        for(String existingKey : collationAwareDict.keySet()) {

Review Comment:
   ```suggestion
           for (String existingKey : collationAwareDict.keySet()) {
   ```



##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -1155,6 +1156,45 @@ public UTF8String translate(Map<String, String> dict) {
     return fromString(sb.toString());
   }
 
+  public UTF8String translate(Map<String, String> dict, int collationId) {
+    if(CollationFactory.fetchCollation(collationId).supportsBinaryEquality) {
+      return translate(dict);
+    }
+    return translate(getCollationAwareDict(dict, collationId));
+  }
+
+  private Map<String, String> getCollationAwareDict(Map<String, String> dict, int collationId) {
+    String srcStr = this.toString();
+
+    Map<String, String> collationAwareDict = new HashMap<>();
+    for(String key : dict.keySet()) {
+      StringSearch stringSearch =
+        CollationFactory.getStringSearch(this, UTF8String.fromString(key), collationId);
+
+      int pos = 0;
+      while((pos = stringSearch.next()) != StringSearch.DONE) {
+        int codePoint = srcStr.codePointAt(pos);
+        int charCount = Character.charCount(codePoint);
+        String newKey = srcStr.substring(pos, pos + charCount);
+
+        boolean exists = false;
+        for(String existingKey : collationAwareDict.keySet()) {
+          if(stringSearch.getCollator().compare(existingKey, newKey) == 0) {

Review Comment:
   ```suggestion
             if (stringSearch.getCollator().compare(existingKey, newKey) == 0) {
   ```



##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java:
##########
@@ -179,12 +179,26 @@ public static StringSearch getStringSearch(
       final UTF8String left,
       final UTF8String right,
       final int collationId) {
+
+    if(collationId == UTF8_BINARY_LCASE_COLLATION_ID) {

Review Comment:
   ```suggestion
       if (collationId == UTF8_BINARY_LCASE_COLLATION_ID) {
   ```



##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java:
##########
@@ -179,12 +179,26 @@ public static StringSearch getStringSearch(
       final UTF8String left,
       final UTF8String right,
       final int collationId) {
+
+    if(collationId == UTF8_BINARY_LCASE_COLLATION_ID) {
+      return getStringSearch(left, right);
+    }
+
     String pattern = right.toString();
     CharacterIterator target = new StringCharacterIterator(left.toString());
     Collator collator = CollationFactory.fetchCollation(collationId).collator;
     return new StringSearch(pattern, target, (RuleBasedCollator) collator);
   }
 
+  private static StringSearch getStringSearch(
+          final UTF8String left,
+          final UTF8String right) {

Review Comment:
   ```suggestion
         final UTF8String left,
         final UTF8String right) {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to