uros-db commented on code in PR #46761:
URL: https://github.com/apache/spark/pull/46761#discussion_r1670933332
##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -1378,19 +1381,138 @@ public void testStringTrim() throws SparkException {
assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
}
- // TODO: Test more collation-aware string expressions.
-
- /**
- * Collation-aware regexp expressions.
- */
-
- // TODO: Test more collation-aware regexp expressions.
+ private void assertStringTranslate(
+ String inputString,
+ String matchingString,
+ String replaceString,
+ String collationName,
+ String expectedResultString) throws SparkException {
+ int collationId = CollationFactory.collationNameToId(collationName);
+ Map<String, String> dict = buildDict(matchingString, replaceString);
+ UTF8String source = UTF8String.fromString(inputString);
+ UTF8String result = CollationSupport.StringTranslate.exec(source, dict,
collationId);
+ assertEquals(expectedResultString, result.toString());
+ }
- /**
- * Other collation-aware expressions.
- */
+ @Test
+ public void testStringTranslate() throws SparkException {
+ // Basic tests - UTF8_BINARY.
+ assertStringTranslate("Translate", "Rnlt", "12", "UTF8_BINARY", "Tra2sae");
+ assertStringTranslate("Translate", "Rn", "1234", "UTF8_BINARY",
"Tra2slate");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_BINARY",
"Tra2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_BINARY",
"TRaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_BINARY",
"TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_BINARY",
"TXaxsXaxex");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_BINARY",
"test大千世AX大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_BINARY",
"大千世界test大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_BINARY",
"Oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_BINARY",
"大千世界大千世界oesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_BINARY",
"世世世界世世世界tesT");
+ // Basic tests - UTF8_LCASE.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_LCASE",
"xXaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_LCASE",
"xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_LCASE",
"xXaxsXaxeX");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_LCASE",
"test大千世AB大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_LCASE",
"大千世界abca大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_LCASE",
"oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_LCASE",
"大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_LCASE",
"世世世界世世世界tesT");
+ // Basic tests - UNICODE.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE", "Tra2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE", "TRaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE",
"TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE",
"TXaxsXaxex");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE",
"test大千世AX大千世A");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE",
"Oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE",
"大千世界大千世界oesO");
+ // Basic tests - UNICODE_CI.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE_CI",
"41a2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE_CI",
"xXaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE_CI",
"xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE_CI",
"xXaxsXaxeX");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE_CI",
"test大千世AB大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UNICODE_CI",
"大千世界abca大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE_CI",
"oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE_CI",
"大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UNICODE_CI",
"世世世界世世世界tesT");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_LCASE",
"14234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE_CI",
"14234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE",
"Tr4234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_BINARY",
"Tr4234e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE",
"Tra2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE_CI",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_BINARY",
"Tra2s3a4e");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_BINARY", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_LCASE", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UNICODE", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UNICODE_CI", "123f");
+
+ // One-to-many case mapping - UTF8_BINARY.
+ assertStringTranslate("İ", "i\u0307", "xy", "UTF8_BINARY", "İ");
+ assertStringTranslate("i\u0307", "İ", "xy", "UTF8_BINARY", "i\u0307");
+ assertStringTranslate("i\u030A", "İ", "x", "UTF8_BINARY", "i\u030A");
+ assertStringTranslate("i\u030A", "İi", "xy", "UTF8_BINARY", "y\u030A");
+ assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_BINARY", "123");
+ assertStringTranslate("İi\u0307", "İyz", "123", "UTF8_BINARY", "1i\u0307");
+ assertStringTranslate("İi\u0307", "xi\u0307", "123", "UTF8_BINARY", "İ23");
+ assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UTF8_BINARY",
"12bc3");
+ assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UTF8_BINARY",
"a2bcå");
+ assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", "UTF8_BINARY",
"3\u030Aβφδ1\u0307");
+ // One-to-many case mapping - UTF8_LCASE.
+ assertStringTranslate("İ", "i\u0307", "xy", "UTF8_LCASE", "İ");
+ assertStringTranslate("i\u0307", "İ", "xy", "UTF8_LCASE", "x");
+ assertStringTranslate("i\u030A", "İ", "x", "UTF8_LCASE", "i\u030A");
+ assertStringTranslate("i\u030A", "İi", "xy", "UTF8_LCASE", "y\u030A");
+ assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_LCASE", "11");
Review Comment:
There are plenty of these `edgy` examples. Could we instead agree that the
implementation comments and doc comments are enough to describe the expected
behaviour?
##########
common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java:
##########
@@ -1378,19 +1381,138 @@ public void testStringTrim() throws SparkException {
assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
}
- // TODO: Test more collation-aware string expressions.
-
- /**
- * Collation-aware regexp expressions.
- */
-
- // TODO: Test more collation-aware regexp expressions.
+ private void assertStringTranslate(
+ String inputString,
+ String matchingString,
+ String replaceString,
+ String collationName,
+ String expectedResultString) throws SparkException {
+ int collationId = CollationFactory.collationNameToId(collationName);
+ Map<String, String> dict = buildDict(matchingString, replaceString);
+ UTF8String source = UTF8String.fromString(inputString);
+ UTF8String result = CollationSupport.StringTranslate.exec(source, dict,
collationId);
+ assertEquals(expectedResultString, result.toString());
+ }
- /**
- * Other collation-aware expressions.
- */
+ @Test
+ public void testStringTranslate() throws SparkException {
+ // Basic tests - UTF8_BINARY.
+ assertStringTranslate("Translate", "Rnlt", "12", "UTF8_BINARY", "Tra2sae");
+ assertStringTranslate("Translate", "Rn", "1234", "UTF8_BINARY",
"Tra2slate");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_BINARY",
"Tra2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_BINARY",
"TRaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_BINARY",
"TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_BINARY",
"TXaxsXaxex");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_BINARY",
"test大千世AX大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_BINARY",
"大千世界test大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_BINARY",
"Oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_BINARY",
"大千世界大千世界oesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_BINARY",
"世世世界世世世界tesT");
+ // Basic tests - UTF8_LCASE.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "1234", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UTF8_LCASE",
"xXaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UTF8_LCASE",
"xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UTF8_LCASE",
"xXaxsXaxeX");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UTF8_LCASE",
"test大千世AB大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UTF8_LCASE",
"大千世界abca大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UTF8_LCASE",
"oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UTF8_LCASE",
"大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UTF8_LCASE",
"世世世界世世世界tesT");
+ // Basic tests - UNICODE.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE", "Tra2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE", "TRaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE",
"TxaxsXaxeX");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE",
"TXaxsXaxex");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE",
"test大千世AX大千世A");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE",
"Oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE",
"大千世界大千世界oesO");
+ // Basic tests - UNICODE_CI.
+ assertStringTranslate("Translate", "Rnlt", "1234", "UNICODE_CI",
"41a2s3a4e");
+ assertStringTranslate("TRanslate", "rnlt", "XxXx", "UNICODE_CI",
"xXaxsXaxe");
+ assertStringTranslate("TRanslater", "Rrnlt", "xXxXx", "UNICODE_CI",
"xxaxsXaxex");
+ assertStringTranslate("TRanslater", "Rrnlt", "XxxXx", "UNICODE_CI",
"xXaxsXaxeX");
+ assertStringTranslate("test大千世界X大千世界", "界x", "AB", "UNICODE_CI",
"test大千世AB大千世A");
+ assertStringTranslate("大千世界test大千世界", "TEST", "abcd", "UNICODE_CI",
"大千世界abca大千世界");
+ assertStringTranslate("Test大千世界大千世界", "tT", "oO", "UNICODE_CI",
"oeso大千世界大千世界");
+ assertStringTranslate("大千世界大千世界tesT", "Tt", "Oo", "UNICODE_CI",
"大千世界大千世界OesO");
+ assertStringTranslate("大千世界大千世界tesT", "大千", "世世", "UNICODE_CI",
"世世世界世世世界tesT");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_LCASE",
"14234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE_CI",
"14234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UNICODE",
"Tr4234e");
+ assertStringTranslate("Translate", "Rnlasdfjhgadt", "1234", "UTF8_BINARY",
"Tr4234e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_LCASE",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE",
"Tra2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UNICODE_CI",
"41a2s3a4e");
+ assertStringTranslate("Translate", "Rnlt", "123495834634", "UTF8_BINARY",
"Tra2s3a4e");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_BINARY", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UTF8_LCASE", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UNICODE", "123f");
+ assertStringTranslate("abcdef", "abcde", "123", "UNICODE_CI", "123f");
+
+ // One-to-many case mapping - UTF8_BINARY.
+ assertStringTranslate("İ", "i\u0307", "xy", "UTF8_BINARY", "İ");
+ assertStringTranslate("i\u0307", "İ", "xy", "UTF8_BINARY", "i\u0307");
+ assertStringTranslate("i\u030A", "İ", "x", "UTF8_BINARY", "i\u030A");
+ assertStringTranslate("i\u030A", "İi", "xy", "UTF8_BINARY", "y\u030A");
+ assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_BINARY", "123");
+ assertStringTranslate("İi\u0307", "İyz", "123", "UTF8_BINARY", "1i\u0307");
+ assertStringTranslate("İi\u0307", "xi\u0307", "123", "UTF8_BINARY", "İ23");
+ assertStringTranslate("a\u030Abcå", "a\u030Aå", "123", "UTF8_BINARY",
"12bc3");
+ assertStringTranslate("a\u030Abcå", "A\u030AÅ", "123", "UTF8_BINARY",
"a2bcå");
+ assertStringTranslate("a\u030AβφδI\u0307", "Iİaå", "1234", "UTF8_BINARY",
"3\u030Aβφδ1\u0307");
+ // One-to-many case mapping - UTF8_LCASE.
+ assertStringTranslate("İ", "i\u0307", "xy", "UTF8_LCASE", "İ");
+ assertStringTranslate("i\u0307", "İ", "xy", "UTF8_LCASE", "x");
+ assertStringTranslate("i\u030A", "İ", "x", "UTF8_LCASE", "i\u030A");
+ assertStringTranslate("i\u030A", "İi", "xy", "UTF8_LCASE", "y\u030A");
+ assertStringTranslate("İi\u0307", "İi\u0307", "123", "UTF8_LCASE", "11");
Review Comment:
There are plenty of these _edgy_ examples. Could we instead agree that the
implementation comments and doc comments are enough to describe the expected
behaviour?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]