This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 95de02e80dee [SPARK-48441][SQL] Fix StringTrim behaviour for
non-UTF8_BINARY collations
95de02e80dee is described below
commit 95de02e80deef19607a55b5913c674cc21521132
Author: Uros Bojanic <[email protected]>
AuthorDate: Mon Jul 15 16:33:53 2024 +0800
[SPARK-48441][SQL] Fix StringTrim behaviour for non-UTF8_BINARY collations
### What changes were proposed in this pull request?
String searching in UTF8_LCASE now works at the character level, rather than
at the byte level. For example: `ltrim("İ", "i")` now returns `"İ"`, because
there are **no characters** in `"İ"`, starting from the left, whose lowercased
version is equal to `"i"`. Note, however, that there is a byte subsequence of
`"İ"` whose lowercased version (as a UTF-8 byte sequence) is equal to `"i"`,
so the new behaviour differs from the old behaviour.
Also, trimming for ICU collations works by repeatedly removing the
longest possible substring that matches a character in the trim string,
starting from the left side of the input string, until trimming is done.
### Why are the changes needed?
Fix functions that give unusable results due to one-to-many case mapping
when performing string search under UTF8_LCASE (see example above).
### Does this PR introduce _any_ user-facing change?
Yes, behaviour of `trim*` expressions is changed for collated strings for
edge cases with one-to-many case mapping.
### How was this patch tested?
New unit tests in `CollationSupportSuite` and new e2e sql tests in
`CollationStringExpressionsSuite`.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #46762 from uros-db/alter-trim.
Authored-by: Uros Bojanic <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
---
.../catalyst/util/CollationAwareUTF8String.java | 306 +++++++---
.../spark/sql/catalyst/util/CollationSupport.java | 129 ++--
.../spark/unsafe/types/CollationSupportSuite.java | 663 +++++++++++++++++++--
.../catalyst/expressions/stringExpressions.scala | 16 +-
.../sql/CollationStringExpressionsSuite.scala | 45 +-
5 files changed, 922 insertions(+), 237 deletions(-)
diff --git
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index af152c87f88c..b9868ca665a6 100644
---
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -33,6 +33,7 @@ import static
org.apache.spark.unsafe.types.UTF8String.CodePointIteratorType;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
@@ -841,117 +842,268 @@ public class CollationAwareUTF8String {
return UTF8String.fromString(sb.toString());
}
+ /**
+ * Trims the `srcString` string from both ends of the string using the
specified `trimString`
+ * characters, with respect to the UTF8_LCASE collation. String trimming is
performed by
+ * first trimming the left side of the string, and then trimming the right
side of the string.
+ * The method returns the trimmed string. If the `trimString` is null, the
method returns null.
+ *
+ * @param srcString the input string to be trimmed from both ends of the
string
+ * @param trimString the trim string characters to trim
+ * @return the trimmed string (for UTF8_LCASE collation)
+ */
public static UTF8String lowercaseTrim(
final UTF8String srcString,
final UTF8String trimString) {
- // Matching UTF8String behavior for null `trimString`.
- if (trimString == null) {
- return null;
- }
+ return lowercaseTrimRight(lowercaseTrimLeft(srcString, trimString),
trimString);
+ }
- UTF8String leftTrimmed = lowercaseTrimLeft(srcString, trimString);
- return lowercaseTrimRight(leftTrimmed, trimString);
+ /**
+ * Trims the `srcString` string from both ends of the string using the
specified `trimString`
+ * characters, with respect to all ICU collations in Spark. String trimming
is performed by
+ * first trimming the left side of the string, and then trimming the right
side of the string.
+ * The method returns the trimmed string. If the `trimString` is null, the
method returns null.
+ *
+ * @param srcString the input string to be trimmed from both ends of the
string
+ * @param trimString the trim string characters to trim
+ * @param collationId the collation ID to use for string trimming
+ * @return the trimmed string (for ICU collations)
+ */
+ public static UTF8String trim(
+ final UTF8String srcString,
+ final UTF8String trimString,
+ final int collationId) {
+ return trimRight(trimLeft(srcString, trimString, collationId), trimString,
collationId);
}
+ /**
+ * Trims the `srcString` string from the left side using the specified
`trimString` characters,
+ * with respect to the UTF8_LCASE collation. For UTF8_LCASE, the method
first creates a hash
+ * set of lowercased code points in `trimString`, and then iterates over the
`srcString` from
+ * the left side, until reaching a character whose lowercased code point is
not in the hash set.
+ * Finally, the method returns the substring from that position to the end
of `srcString`.
+ * If `trimString` is null, null is returned. If `trimString` is empty,
`srcString` is returned.
+ *
+ * @param srcString the input string to be trimmed from the left end of the
string
+ * @param trimString the trim string characters to trim
+ * @return the trimmed string (for UTF8_LCASE collation)
+ */
public static UTF8String lowercaseTrimLeft(
final UTF8String srcString,
final UTF8String trimString) {
- // Matching UTF8String behavior for null `trimString`.
+ // Matching the default UTF8String behavior for null `trimString`.
if (trimString == null) {
return null;
}
- // The searching byte position in the srcString.
- int searchIdx = 0;
- // The byte position of a first non-matching character in the srcString.
- int trimByteIdx = 0;
- // Number of bytes in srcString.
- int numBytes = srcString.numBytes();
- // Convert trimString to lowercase, so it can be searched properly.
- UTF8String lowercaseTrimString = trimString.toLowerCase();
-
- while (searchIdx < numBytes) {
- UTF8String searchChar = srcString.copyUTF8String(
- searchIdx,
- searchIdx +
UTF8String.numBytesForFirstByte(srcString.getByte(searchIdx)) - 1);
- int searchCharBytes = searchChar.numBytes();
-
- // Try to find the matching for the searchChar in the trimString.
- if (lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) {
- trimByteIdx += searchCharBytes;
- searchIdx += searchCharBytes;
- } else {
- // No matching, exit the search.
+ // Create a hash set of lowercased code points for all characters of
`trimString`.
+ HashSet<Integer> trimChars = new HashSet<>();
+ Iterator<Integer> trimIter = trimString.codePointIterator();
+ while (trimIter.hasNext())
trimChars.add(getLowercaseCodePoint(trimIter.next()));
+
+ // Iterate over `srcString` from the left to find the first character that
is not in the set.
+ int searchIndex = 0, codePoint;
+ Iterator<Integer> srcIter = srcString.codePointIterator();
+ while (srcIter.hasNext()) {
+ codePoint = getLowercaseCodePoint(srcIter.next());
+ // Special handling for Turkish dotted uppercase letter I.
+ if (codePoint == CODE_POINT_LOWERCASE_I && srcIter.hasNext() &&
+ trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
+ int nextCodePoint = getLowercaseCodePoint(srcIter.next());
+ if ((trimChars.contains(codePoint) &&
trimChars.contains(nextCodePoint))
+ || nextCodePoint == CODE_POINT_COMBINING_DOT) {
+ searchIndex += 2;
+ }
+ else {
+ if (trimChars.contains(codePoint)) ++searchIndex;
+ break;
+ }
+ } else if (trimChars.contains(codePoint)) {
+ ++searchIndex;
+ }
+ else {
break;
}
}
- if (searchIdx == 0) {
- // Nothing trimmed - return original string (not converted to lowercase).
- return srcString;
+ // Return the substring from that position to the end of the string.
+ return searchIndex == 0 ? srcString : srcString.substring(searchIndex,
srcString.numChars());
+ }
+
+ /**
+ * Trims the `srcString` string from the left side using the specified
`trimString` characters,
+ * with respect to ICU collations. For these collations, the method iterates
over `srcString`
+ * from left to right, and repeatedly skips the longest possible substring
that matches any
+ * character in `trimString`, until reaching a character that is not found
in `trimString`.
+ * Finally, the method returns the substring from that position to the end
of `srcString`.
+ * If `trimString` is null, null is returned. If `trimString` is empty,
`srcString` is returned.
+ *
+ * @param srcString the input string to be trimmed from the left end of the
string
+ * @param trimString the trim string characters to trim
+ * @param collationId the collation ID to use for string trimming
+ * @return the trimmed string (for ICU collations)
+ */
+ public static UTF8String trimLeft(
+ final UTF8String srcString,
+ final UTF8String trimString,
+ final int collationId) {
+ // Short-circuit for base cases.
+ if (trimString == null) return null;
+ if (srcString.numBytes() == 0) return srcString;
+
+ // Create a map from each code point of `trimString` to its String form.
+ Map<Integer, String> trimChars = new HashMap<>();
+ Iterator<Integer> trimIter = trimString.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ while (trimIter.hasNext()) {
+ int codePoint = trimIter.next();
+ trimChars.putIfAbsent(codePoint, String.valueOf((char) codePoint));
}
- if (trimByteIdx >= numBytes) {
- // Everything trimmed.
- return UTF8String.EMPTY_UTF8;
+
+ // Iterate over srcString from the left and find the first character that
is not in trimChars.
+ String src = srcString.toValidString();
+ CharacterIterator target = new StringCharacterIterator(src);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ int charIndex = 0, longestMatchLen;
+ while (charIndex < src.length()) {
+ longestMatchLen = 0;
+ for (String trim : trimChars.values()) {
+ StringSearch stringSearch = new StringSearch(trim, target,
(RuleBasedCollator) collator);
+ stringSearch.setIndex(charIndex);
+ int matchIndex = stringSearch.next();
+ if (matchIndex == charIndex) {
+ int matchLen = stringSearch.getMatchLength();
+ if (matchLen > longestMatchLen) {
+ longestMatchLen = matchLen;
+ }
+ }
+ }
+ if (longestMatchLen == 0) break;
+ else charIndex += longestMatchLen;
}
- return srcString.copyUTF8String(trimByteIdx, numBytes - 1);
+
+ // Return the substring from the calculated position until the end of the
string.
+ return UTF8String.fromString(src.substring(charIndex));
}
+ /**
+ * Trims the `srcString` string from the right side using the specified
`trimString` characters,
+ * with respect to the UTF8_LCASE collation. For UTF8_LCASE, the method
first creates a hash
+ * set of lowercased code points in `trimString`, and then iterates over the
`srcString` from
+ * the right side, until reaching a character whose lowercased code point is
not in the hash set.
+ * Finally, the method returns the substring from the start of `srcString`
until that position.
+ * If `trimString` is null, null is returned. If `trimString` is empty,
`srcString` is returned.
+ *
+ * @param srcString the input string to be trimmed from the right end of the
string
+ * @param trimString the trim string characters to trim
+ * @return the trimmed string (for UTF8_LCASE collation)
+ */
public static UTF8String lowercaseTrimRight(
final UTF8String srcString,
final UTF8String trimString) {
- // Matching UTF8String behavior for null `trimString`.
+ // Matching the default UTF8String behavior for null `trimString`.
if (trimString == null) {
return null;
}
- // Number of bytes iterated from the srcString.
- int byteIdx = 0;
- // Number of characters iterated from the srcString.
- int numChars = 0;
- // Number of bytes in srcString.
- int numBytes = srcString.numBytes();
- // Array of character length for the srcString.
- int[] stringCharLen = new int[numBytes];
- // Array of the first byte position for each character in the srcString.
- int[] stringCharPos = new int[numBytes];
- // Convert trimString to lowercase, so it can be searched properly.
- UTF8String lowercaseTrimString = trimString.toLowerCase();
-
- // Build the position and length array.
- while (byteIdx < numBytes) {
- stringCharPos[numChars] = byteIdx;
- stringCharLen[numChars] =
UTF8String.numBytesForFirstByte(srcString.getByte(byteIdx));
- byteIdx += stringCharLen[numChars];
- numChars++;
- }
-
- // Index trimEnd points to the first no matching byte position from the
right side of
- // the source string.
- int trimByteIdx = numBytes - 1;
-
- while (numChars > 0) {
- UTF8String searchChar = srcString.copyUTF8String(
- stringCharPos[numChars - 1],
- stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1);
-
- if(lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) {
- trimByteIdx -= stringCharLen[numChars - 1];
- numChars--;
- } else {
+ // Create a hash set of lowercased code points for all characters of
`trimString`.
+ HashSet<Integer> trimChars = new HashSet<>();
+ Iterator<Integer> trimIter = trimString.codePointIterator();
+ while (trimIter.hasNext())
trimChars.add(getLowercaseCodePoint(trimIter.next()));
+
+ // Iterate over `srcString` from the right to find the first character
that is not in the set.
+ int searchIndex = srcString.numChars(), codePoint;
+ Iterator<Integer> srcIter = srcString.reverseCodePointIterator();
+ while (srcIter.hasNext()) {
+ codePoint = getLowercaseCodePoint(srcIter.next());
+ // Special handling for Turkish dotted uppercase letter I.
+ if (codePoint == CODE_POINT_COMBINING_DOT && srcIter.hasNext() &&
+ trimChars.contains(CODE_POINT_COMBINED_LOWERCASE_I_DOT)) {
+ int nextCodePoint = getLowercaseCodePoint(srcIter.next());
+ if ((trimChars.contains(codePoint) &&
trimChars.contains(nextCodePoint))
+ || nextCodePoint == CODE_POINT_LOWERCASE_I) {
+ searchIndex -= 2;
+ }
+ else {
+ if (trimChars.contains(codePoint)) --searchIndex;
+ break;
+ }
+ } else if (trimChars.contains(codePoint)) {
+ --searchIndex;
+ }
+ else {
break;
}
}
- if (trimByteIdx == numBytes - 1) {
- // Nothing trimmed.
- return srcString;
+ // Return the substring from the start of the string to the calculated
position.
+ return searchIndex == srcString.numChars() ? srcString :
srcString.substring(0, searchIndex);
+ }
+
+ /**
+ * Trims the `srcString` string from the right side using the specified
`trimString` characters,
+ * with respect to ICU collations. For these collations, the method iterates
over `srcString`
+ * from right to left, and repeatedly skips the longest possible substring
that matches any
+ * character in `trimString`, until reaching a character that is not found
in `trimString`.
+ * Finally, the method returns the substring from the start of `srcString`
until that position.
+ * If `trimString` is null, null is returned. If `trimString` is empty,
`srcString` is returned.
+ *
+ * @param srcString the input string to be trimmed from the right end of the
string
+ * @param trimString the trim string characters to trim
+ * @param collationId the collation ID to use for string trimming
+ * @return the trimmed string (for ICU collations)
+ */
+ public static UTF8String trimRight(
+ final UTF8String srcString,
+ final UTF8String trimString,
+ final int collationId) {
+ // Short-circuit for base cases.
+ if (trimString == null) return null;
+ if (srcString.numBytes() == 0) return srcString;
+
+ // Create a map from each code point of `trimString` to its String form.
+ Map<Integer, String> trimChars = new HashMap<>();
+ Iterator<Integer> trimIter = trimString.codePointIterator(
+ CodePointIteratorType.CODE_POINT_ITERATOR_MAKE_VALID);
+ while (trimIter.hasNext()) {
+ int codePoint = trimIter.next();
+ trimChars.putIfAbsent(codePoint, String.valueOf((char) codePoint));
}
- if (trimByteIdx < 0) {
- // Everything trimmed.
- return UTF8String.EMPTY_UTF8;
+
+ // Iterate over srcString from the right and find the first character that
is not in trimChars.
+ String src = srcString.toValidString();
+ CharacterIterator target = new StringCharacterIterator(src);
+ Collator collator = CollationFactory.fetchCollation(collationId).collator;
+ int charIndex = src.length(), longestMatchLen;
+ while (charIndex >= 0) {
+ longestMatchLen = 0;
+ for (String trim : trimChars.values()) {
+ StringSearch stringSearch = new StringSearch(trim, target,
(RuleBasedCollator) collator);
+ // Note: stringSearch.previous() is NOT consistent with
stringSearch.next()!
+ // Example: StringSearch("İ", "i\\u0307İi\\u0307İi\\u0307İ",
"UNICODE_CI")
+ // stringSearch.next() gives: [0, 2, 3, 5, 6, 8].
+ // stringSearch.previous() gives: [8, 6, 3, 0].
+ // Since 1 character can map to at most 3 characters in Unicode, we
can begin the search
+ // from character position: `charIndex` - 3, and use `next()` to find
the longest match.
+ stringSearch.setIndex(Math.max(charIndex - 3, 0));
+ int matchIndex = stringSearch.next();
+ int matchLen = stringSearch.getMatchLength();
+ while (matchIndex != StringSearch.DONE && matchIndex < charIndex -
matchLen) {
+ matchIndex = stringSearch.next();
+ matchLen = stringSearch.getMatchLength();
+ }
+ if (matchIndex == charIndex - matchLen) {
+ if (matchLen > longestMatchLen) {
+ longestMatchLen = matchLen;
+ }
+ }
+ }
+ if (longestMatchLen == 0) break;
+ else charIndex -= longestMatchLen;
}
- return srcString.copyUTF8String(0, trimByteIdx);
+
+ // Return the substring from the start of the string until that position.
+ return UTF8String.fromString(src.substring(0, charIndex));
}
// TODO: Add more collation-aware UTF8String operations here.
diff --git
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index f9ccd22f3f5c..453423ddbc33 100644
---
a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++
b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -531,15 +531,8 @@ public final class CollationSupport {
}
public static class StringTrim {
- public static UTF8String exec(
- final UTF8String srcString,
- final int collationId) {
- CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
- return execBinary(srcString);
- } else {
- return execLowercase(srcString);
- }
+ public static UTF8String exec(final UTF8String srcString) {
+ return execBinary(srcString);
}
public static UTF8String exec(
final UTF8String srcString,
@@ -548,20 +541,14 @@ public final class CollationSupport {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
return execBinary(srcString, trimString);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
return execLowercase(srcString, trimString);
+ } else {
+ return execICU(srcString, trimString, collationId);
}
}
- public static String genCode(
- final String srcString,
- final int collationId) {
- CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- String expr = "CollationSupport.StringTrim.exec";
- if (collation.supportsBinaryEquality) {
- return String.format(expr + "Binary(%s)", srcString);
- } {
- return String.format(expr + "Lowercase(%s)", srcString);
- }
+ public static String genCode(final String srcString) {
+ return String.format("CollationSupport.StringTrim.execBinary(%s)",
srcString);
}
public static String genCode(
final String srcString,
@@ -571,8 +558,10 @@ public final class CollationSupport {
String expr = "CollationSupport.StringTrim.exec";
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s, %s)", srcString, trimString);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s, %s)", srcString,
trimString);
+ } else {
+ return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString,
collationId);
}
}
public static UTF8String execBinary(
@@ -584,27 +573,22 @@ public final class CollationSupport {
final UTF8String trimString) {
return srcString.trim(trimString);
}
- public static UTF8String execLowercase(
- final UTF8String srcString) {
- return srcString.trim();
- }
public static UTF8String execLowercase(
final UTF8String srcString,
final UTF8String trimString) {
return CollationAwareUTF8String.lowercaseTrim(srcString, trimString);
}
+ public static UTF8String execICU(
+ final UTF8String srcString,
+ final UTF8String trimString,
+ final int collationId) {
+ return CollationAwareUTF8String.trim(srcString, trimString, collationId);
+ }
}
public static class StringTrimLeft {
- public static UTF8String exec(
- final UTF8String srcString,
- final int collationId) {
- CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
- return execBinary(srcString);
- } else {
- return execLowercase(srcString);
- }
+ public static UTF8String exec(final UTF8String srcString) {
+ return execBinary(srcString);
}
public static UTF8String exec(
final UTF8String srcString,
@@ -613,21 +597,15 @@ public final class CollationSupport {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
return execBinary(srcString, trimString);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
return execLowercase(srcString, trimString);
- }
- }
- public static String genCode(
- final String srcString,
- final int collationId) {
- CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- String expr = "CollationSupport.StringTrimLeft.exec";
- if (collation.supportsBinaryEquality) {
- return String.format(expr + "Binary(%s)", srcString);
} else {
- return String.format(expr + "Lowercase(%s)", srcString);
+ return execICU(srcString, trimString, collationId);
}
}
+ public static String genCode(final String srcString) {
+ return String.format("CollationSupport.StringTrimLeft.execBinary(%s)",
srcString);
+ }
public static String genCode(
final String srcString,
final String trimString,
@@ -636,12 +614,13 @@ public final class CollationSupport {
String expr = "CollationSupport.StringTrimLeft.exec";
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s, %s)", srcString, trimString);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s, %s)", srcString,
trimString);
+ } else {
+ return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString,
collationId);
}
}
- public static UTF8String execBinary(
- final UTF8String srcString) {
+ public static UTF8String execBinary(final UTF8String srcString) {
return srcString.trimLeft();
}
public static UTF8String execBinary(
@@ -649,27 +628,22 @@ public final class CollationSupport {
final UTF8String trimString) {
return srcString.trimLeft(trimString);
}
- public static UTF8String execLowercase(
- final UTF8String srcString) {
- return srcString.trimLeft();
- }
public static UTF8String execLowercase(
final UTF8String srcString,
final UTF8String trimString) {
return CollationAwareUTF8String.lowercaseTrimLeft(srcString, trimString);
}
+ public static UTF8String execICU(
+ final UTF8String srcString,
+ final UTF8String trimString,
+ final int collationId) {
+ return CollationAwareUTF8String.trimLeft(srcString, trimString,
collationId);
+ }
}
public static class StringTrimRight {
- public static UTF8String exec(
- final UTF8String srcString,
- final int collationId) {
- CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- if (collation.supportsBinaryEquality) {
- return execBinary(srcString);
- } else {
- return execLowercase(srcString);
- }
+ public static UTF8String exec(final UTF8String srcString) {
+ return execBinary(srcString);
}
public static UTF8String exec(
final UTF8String srcString,
@@ -678,21 +652,15 @@ public final class CollationSupport {
CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
if (collation.supportsBinaryEquality) {
return execBinary(srcString, trimString);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
return execLowercase(srcString, trimString);
- }
- }
- public static String genCode(
- final String srcString,
- final int collationId) {
- CollationFactory.Collation collation =
CollationFactory.fetchCollation(collationId);
- String expr = "CollationSupport.StringTrimRight.exec";
- if (collation.supportsBinaryEquality) {
- return String.format(expr + "Binary(%s)", srcString);
} else {
- return String.format(expr + "Lowercase(%s)", srcString);
+ return execICU(srcString, trimString, collationId);
}
}
+ public static String genCode(final String srcString) {
+ return String.format("CollationSupport.StringTrimRight.execBinary(%s)",
srcString);
+ }
public static String genCode(
final String srcString,
final String trimString,
@@ -701,12 +669,13 @@ public final class CollationSupport {
String expr = "CollationSupport.StringTrimRight.exec";
if (collation.supportsBinaryEquality) {
return String.format(expr + "Binary(%s, %s)", srcString, trimString);
- } else {
+ } else if (collation.supportsLowercaseEquality) {
return String.format(expr + "Lowercase(%s, %s)", srcString,
trimString);
+ } else {
+ return String.format(expr + "ICU(%s, %s, %d)", srcString, trimString,
collationId);
}
}
- public static UTF8String execBinary(
- final UTF8String srcString) {
+ public static UTF8String execBinary(final UTF8String srcString) {
return srcString.trimRight();
}
public static UTF8String execBinary(
@@ -714,15 +683,17 @@ public final class CollationSupport {
final UTF8String trimString) {
return srcString.trimRight(trimString);
}
- public static UTF8String execLowercase(
- final UTF8String srcString) {
- return srcString.trimRight();
- }
public static UTF8String execLowercase(
final UTF8String srcString,
final UTF8String trimString) {
return CollationAwareUTF8String.lowercaseTrimRight(srcString,
trimString);
}
+ public static UTF8String execICU(
+ final UTF8String srcString,
+ final UTF8String trimString,
+ final int collationId) {
+ return CollationAwareUTF8String.trimRight(srcString, trimString,
collationId);
+ }
}
// TODO: Add more collation-aware string expressions.
diff --git
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
index ce0cef3fef30..b082ab21944f 100644
---
a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
+++
b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java
@@ -1227,20 +1227,34 @@ public class CollationSupportSuite {
String sourceString,
String trimString,
String expectedResultString) throws SparkException {
+ // Prepare the input and expected result.
int collationId = CollationFactory.collationNameToId(collation);
- String result;
+ UTF8String src = UTF8String.fromString(sourceString);
+ UTF8String trim = UTF8String.fromString(trimString);
+ UTF8String resultTrimLeftRight, resultTrimRightLeft;
+ String resultTrim;
if (trimString == null) {
- result = CollationSupport.StringTrim.exec(
- UTF8String.fromString(sourceString), collationId).toString();
+ // Trim string is ASCII space.
+ resultTrim = CollationSupport.StringTrim.exec(src).toString();
+ UTF8String trimLeft = CollationSupport.StringTrimLeft.exec(src);
+ resultTrimLeftRight = CollationSupport.StringTrimRight.exec(trimLeft);
+ UTF8String trimRight = CollationSupport.StringTrimRight.exec(src);
+ resultTrimRightLeft = CollationSupport.StringTrimLeft.exec(trimRight);
} else {
- result = CollationSupport.StringTrim.exec(
- UTF8String
- .fromString(sourceString), UTF8String.fromString(trimString),
collationId)
- .toString();
+ // Trim string is specified.
+ resultTrim = CollationSupport.StringTrim.exec(src, trim,
collationId).toString();
+ UTF8String trimLeft = CollationSupport.StringTrimLeft.exec(src, trim,
collationId);
+ resultTrimLeftRight = CollationSupport.StringTrimRight.exec(trimLeft,
trim, collationId);
+ UTF8String trimRight = CollationSupport.StringTrimRight.exec(src, trim,
collationId);
+ resultTrimRightLeft = CollationSupport.StringTrimLeft.exec(trimRight,
trim, collationId);
}
- assertEquals(expectedResultString, result);
+ // Test that StringTrim result is as expected.
+ assertEquals(expectedResultString, resultTrim);
+ // Test that the order of the trims is not important.
+ assertEquals(resultTrimLeftRight.toString(), resultTrim);
+ assertEquals(resultTrimRightLeft.toString(), resultTrim);
}
private void assertStringTrimLeft(
@@ -1248,19 +1262,21 @@ public class CollationSupportSuite {
String sourceString,
String trimString,
String expectedResultString) throws SparkException {
+ // Prepare the input and expected result.
int collationId = CollationFactory.collationNameToId(collation);
+ UTF8String src = UTF8String.fromString(sourceString);
+ UTF8String trim = UTF8String.fromString(trimString);
String result;
if (trimString == null) {
- result = CollationSupport.StringTrimLeft.exec(
- UTF8String.fromString(sourceString), collationId).toString();
+ // Trim string is ASCII space.
+ result = CollationSupport.StringTrimLeft.exec(src).toString();
} else {
- result = CollationSupport.StringTrimLeft.exec(
- UTF8String
- .fromString(sourceString), UTF8String.fromString(trimString),
collationId)
- .toString();
+ // Trim string is specified.
+ result = CollationSupport.StringTrimLeft.exec(src, trim,
collationId).toString();
}
+ // Test that StringTrimLeft result is as expected.
assertEquals(expectedResultString, result);
}
@@ -1269,116 +1285,645 @@ public class CollationSupportSuite {
String sourceString,
String trimString,
String expectedResultString) throws SparkException {
+ // Prepare the input and expected result.
int collationId = CollationFactory.collationNameToId(collation);
+ UTF8String src = UTF8String.fromString(sourceString);
+ UTF8String trim = UTF8String.fromString(trimString);
String result;
if (trimString == null) {
- result = CollationSupport.StringTrimRight.exec(
- UTF8String.fromString(sourceString), collationId).toString();
+ // Trim string is ASCII space.
+ result = CollationSupport.StringTrimRight.exec(src).toString();
} else {
- result = CollationSupport.StringTrimRight.exec(
- UTF8String
- .fromString(sourceString), UTF8String.fromString(trimString),
collationId)
- .toString();
+ // Trim string is specified.
+ result = CollationSupport.StringTrimRight.exec(src, trim,
collationId).toString();
}
+ // Test that StringTrimRight result is as expected.
assertEquals(expectedResultString, result);
}
@Test
public void testStringTrim() throws SparkException {
+ // Basic tests - UTF8_BINARY.
+ assertStringTrim("UTF8_BINARY", "", "", "");
+ assertStringTrim("UTF8_BINARY", "", "xyz", "");
+ assertStringTrim("UTF8_BINARY", "asd", "", "asd");
assertStringTrim("UTF8_BINARY", "asd", null, "asd");
assertStringTrim("UTF8_BINARY", " asd ", null, "asd");
assertStringTrim("UTF8_BINARY", " a世a ", null, "a世a");
assertStringTrim("UTF8_BINARY", "asd", "x", "asd");
assertStringTrim("UTF8_BINARY", "xxasdxx", "x", "asd");
assertStringTrim("UTF8_BINARY", "xa世ax", "x", "a世a");
-
+ assertStringTrimLeft("UTF8_BINARY", "", "", "");
+ assertStringTrimLeft("UTF8_BINARY", "", "xyz", "");
+ assertStringTrimLeft("UTF8_BINARY", "asd", "", "asd");
assertStringTrimLeft("UTF8_BINARY", "asd", null, "asd");
assertStringTrimLeft("UTF8_BINARY", " asd ", null, "asd ");
assertStringTrimLeft("UTF8_BINARY", " a世a ", null, "a世a ");
assertStringTrimLeft("UTF8_BINARY", "asd", "x", "asd");
assertStringTrimLeft("UTF8_BINARY", "xxasdxx", "x", "asdxx");
assertStringTrimLeft("UTF8_BINARY", "xa世ax", "x", "a世ax");
-
+ assertStringTrimRight("UTF8_BINARY", "", "", "");
+ assertStringTrimRight("UTF8_BINARY", "", "xyz", "");
+ assertStringTrimRight("UTF8_BINARY", "asd", "", "asd");
assertStringTrimRight("UTF8_BINARY", "asd", null, "asd");
assertStringTrimRight("UTF8_BINARY", " asd ", null, " asd");
assertStringTrimRight("UTF8_BINARY", " a世a ", null, " a世a");
assertStringTrimRight("UTF8_BINARY", "asd", "x", "asd");
assertStringTrimRight("UTF8_BINARY", "xxasdxx", "x", "xxasd");
assertStringTrimRight("UTF8_BINARY", "xa世ax", "x", "xa世a");
-
+ // Basic tests - UTF8_LCASE.
+ assertStringTrim("UTF8_LCASE", "", "", "");
+ assertStringTrim("UTF8_LCASE", "", "xyz", "");
+ assertStringTrim("UTF8_LCASE", "asd", "", "asd");
assertStringTrim("UTF8_LCASE", "asd", null, "asd");
assertStringTrim("UTF8_LCASE", " asd ", null, "asd");
assertStringTrim("UTF8_LCASE", " a世a ", null, "a世a");
assertStringTrim("UTF8_LCASE", "asd", "x", "asd");
assertStringTrim("UTF8_LCASE", "xxasdxx", "x", "asd");
assertStringTrim("UTF8_LCASE", "xa世ax", "x", "a世a");
-
+ assertStringTrimLeft("UTF8_LCASE", "", "", "");
+ assertStringTrimLeft("UTF8_LCASE", "", "xyz", "");
+ assertStringTrimLeft("UTF8_LCASE", "asd", "", "asd");
assertStringTrimLeft("UTF8_LCASE", "asd", null, "asd");
assertStringTrimLeft("UTF8_LCASE", " asd ", null, "asd ");
assertStringTrimLeft("UTF8_LCASE", " a世a ", null, "a世a ");
assertStringTrimLeft("UTF8_LCASE", "asd", "x", "asd");
assertStringTrimLeft("UTF8_LCASE", "xxasdxx", "x", "asdxx");
assertStringTrimLeft("UTF8_LCASE", "xa世ax", "x", "a世ax");
-
+ assertStringTrimRight("UTF8_LCASE", "", "", "");
+ assertStringTrimRight("UTF8_LCASE", "", "xyz", "");
+ assertStringTrimRight("UTF8_LCASE", "asd", "", "asd");
assertStringTrimRight("UTF8_LCASE", "asd", null, "asd");
assertStringTrimRight("UTF8_LCASE", " asd ", null, " asd");
assertStringTrimRight("UTF8_LCASE", " a世a ", null, " a世a");
assertStringTrimRight("UTF8_LCASE", "asd", "x", "asd");
assertStringTrimRight("UTF8_LCASE", "xxasdxx", "x", "xxasd");
assertStringTrimRight("UTF8_LCASE", "xa世ax", "x", "xa世a");
-
- assertStringTrim("UTF8_LCASE", "asd", null, "asd");
- assertStringTrim("UTF8_LCASE", " asd ", null, "asd");
- assertStringTrim("UTF8_LCASE", " a世a ", null, "a世a");
- assertStringTrim("UTF8_LCASE", "asd", "x", "asd");
- assertStringTrim("UTF8_LCASE", "xxasdxx", "x", "asd");
- assertStringTrim("UTF8_LCASE", "xa世ax", "x", "a世a");
-
- // Test cases where trimString has more than one character
+ // Basic tests - UNICODE.
+ assertStringTrim("UNICODE", "", "", "");
+ assertStringTrim("UNICODE", "", "xyz", "");
+ assertStringTrim("UNICODE", "asd", "", "asd");
+ assertStringTrim("UNICODE", "asd", null, "asd");
+ assertStringTrim("UNICODE", " asd ", null, "asd");
+ assertStringTrim("UNICODE", " a世a ", null, "a世a");
+ assertStringTrim("UNICODE", "asd", "x", "asd");
+ assertStringTrim("UNICODE", "xxasdxx", "x", "asd");
+ assertStringTrim("UNICODE", "xa世ax", "x", "a世a");
+ assertStringTrimLeft("UNICODE", "", "", "");
+ assertStringTrimLeft("UNICODE", "", "xyz", "");
+ assertStringTrimLeft("UNICODE", "asd", "", "asd");
+ assertStringTrimLeft("UNICODE", "asd", null, "asd");
+ assertStringTrimLeft("UNICODE", " asd ", null, "asd ");
+ assertStringTrimLeft("UNICODE", " a世a ", null, "a世a ");
+ assertStringTrimLeft("UNICODE", "asd", "x", "asd");
+ assertStringTrimLeft("UNICODE", "xxasdxx", "x", "asdxx");
+ assertStringTrimLeft("UNICODE", "xa世ax", "x", "a世ax");
+ assertStringTrimRight("UNICODE", "", "", "");
+ assertStringTrimRight("UNICODE", "", "xyz", "");
+ assertStringTrimRight("UNICODE", "asd", "", "asd");
+ assertStringTrimRight("UNICODE", "asd", null, "asd");
+ assertStringTrimRight("UNICODE", " asd ", null, " asd");
+ assertStringTrimRight("UNICODE", " a世a ", null, " a世a");
+ assertStringTrimRight("UNICODE", "asd", "x", "asd");
+ assertStringTrimRight("UNICODE", "xxasdxx", "x", "xxasd");
+ assertStringTrimRight("UNICODE", "xa世ax", "x", "xa世a");
+ // Basic tests - UNICODE_CI.
+ assertStringTrim("UNICODE_CI", "", "", "");
+ assertStringTrim("UNICODE_CI", "", "xyz", "");
+ assertStringTrim("UNICODE_CI", "asd", "", "asd");
+ assertStringTrim("UNICODE_CI", "asd", null, "asd");
+ assertStringTrim("UNICODE_CI", " asd ", null, "asd");
+ assertStringTrim("UNICODE_CI", " a世a ", null, "a世a");
+ assertStringTrim("UNICODE_CI", "asd", "x", "asd");
+ assertStringTrim("UNICODE_CI", "xxasdxx", "x", "asd");
+ assertStringTrim("UNICODE_CI", "xa世ax", "x", "a世a");
+ assertStringTrimLeft("UNICODE_CI", "", "", "");
+ assertStringTrimLeft("UNICODE_CI", "", "xyz", "");
+ assertStringTrimLeft("UNICODE_CI", "asd", "", "asd");
+ assertStringTrimLeft("UNICODE_CI", "asd", null, "asd");
+ assertStringTrimLeft("UNICODE_CI", " asd ", null, "asd ");
+ assertStringTrimLeft("UNICODE_CI", " a世a ", null, "a世a ");
+ assertStringTrimLeft("UNICODE_CI", "asd", "x", "asd");
+ assertStringTrimLeft("UNICODE_CI", "xxasdxx", "x", "asdxx");
+ assertStringTrimLeft("UNICODE_CI", "xa世ax", "x", "a世ax");
+ assertStringTrimRight("UNICODE_CI", "", "", "");
+ assertStringTrimRight("UNICODE_CI", "", "xyz", "");
+ assertStringTrimRight("UNICODE_CI", "asd", "", "asd");
+ assertStringTrimRight("UNICODE_CI", "asd", null, "asd");
+ assertStringTrimRight("UNICODE_CI", " asd ", null, " asd");
+ assertStringTrimRight("UNICODE_CI", " a世a ", null, " a世a");
+ assertStringTrimRight("UNICODE_CI", "asd", "x", "asd");
+ assertStringTrimRight("UNICODE_CI", "xxasdxx", "x", "xxasd");
+ assertStringTrimRight("UNICODE_CI", "xa世ax", "x", "xa世a");
+
+ // Case variation - UTF8_BINARY.
+ assertStringTrim("UTF8_BINARY", "asd", "A", "asd");
assertStringTrim("UTF8_BINARY", "ddsXXXaa", "asd", "XXX");
+ assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD");
assertStringTrimLeft("UTF8_BINARY", "ddsXXXaa", "asd", "XXXaa");
assertStringTrimRight("UTF8_BINARY", "ddsXXXaa", "asd", "ddsXXX");
-
- assertStringTrim("UTF8_LCASE", "ddsXXXaa", "asd", "XXX");
- assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "asd", "XXXaa");
- assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "asd", "ddsXXX");
-
- // Test cases specific to collation type
- // uppercase trim, lowercase src
- assertStringTrim("UTF8_BINARY", "asd", "A", "asd");
+ // Case variation - UTF8_LCASE.
assertStringTrim("UTF8_LCASE", "asd", "A", "sd");
-
- // lowercase trim, uppercase src
- assertStringTrim("UTF8_BINARY", "ASD", "a", "ASD");
assertStringTrim("UTF8_LCASE", "ASD", "a", "SD");
-
- // uppercase and lowercase chars of different byte-length (utf8)
+ assertStringTrim("UTF8_LCASE", "ddsXXXaa", "ASD", "XXX");
+ assertStringTrimLeft("UTF8_LCASE", "ddsXXXaa", "aSd", "XXXaa");
+ assertStringTrimRight("UTF8_LCASE", "ddsXXXaa", "AsD", "ddsXXX");
+ // Case variation - UNICODE.
+ assertStringTrim("UNICODE", "asd", "A", "asd");
+ assertStringTrim("UNICODE", "ASD", "a", "ASD");
+ assertStringTrim("UNICODE", "ddsXXXaa", "asd", "XXX");
+ assertStringTrimLeft("UNICODE", "ddsXXXaa", "asd", "XXXaa");
+ assertStringTrimRight("UNICODE", "ddsXXXaa", "asd", "ddsXXX");
+ // Case variation - UNICODE_CI.
+ assertStringTrim("UNICODE_CI", "asd", "A", "sd");
+ assertStringTrim("UNICODE_CI", "ASD", "a", "SD");
+ assertStringTrim("UNICODE_CI", "ddsXXXaa", "ASD", "XXX");
+ assertStringTrimLeft("UNICODE_CI", "ddsXXXaa", "aSd", "XXXaa");
+ assertStringTrimRight("UNICODE_CI", "ddsXXXaa", "AsD", "ddsXXX");
+
+ // Case-variable character length - UTF8_BINARY.
assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
assertStringTrimLeft("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
assertStringTrimRight("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
-
- assertStringTrim("UTF8_LCASE", "ẞaaaẞ", "ß", "aaa");
- assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ");
- assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa");
-
assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
assertStringTrimLeft("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
assertStringTrimRight("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
-
- assertStringTrim("UTF8_LCASE", "ßaaaß", "ẞ", "aaa");
- assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß");
- assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa");
-
- // different byte-length (utf8) chars trimmed
assertStringTrim("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaa");
assertStringTrimLeft("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "aaaẞ");
assertStringTrimRight("UTF8_BINARY", "Ëaaaẞ", "Ëẞ", "Ëaaa");
-
+ // Case-variable character length - UTF8_LCASE.
+ assertStringTrim("UTF8_LCASE", "ẞaaaẞ", "ß", "aaa");
+ assertStringTrimLeft("UTF8_LCASE", "ẞaaaẞ", "ß", "aaaẞ");
+ assertStringTrimRight("UTF8_LCASE", "ẞaaaẞ", "ß", "ẞaaa");
+ assertStringTrim("UTF8_LCASE", "ßaaaß", "ẞ", "aaa");
+ assertStringTrimLeft("UTF8_LCASE", "ßaaaß", "ẞ", "aaaß");
+ assertStringTrimRight("UTF8_LCASE", "ßaaaß", "ẞ", "ßaaa");
assertStringTrim("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaa");
assertStringTrimLeft("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "aaaẞ");
assertStringTrimRight("UTF8_LCASE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
+ // Case-variable character length - UNICODE.
+ assertStringTrim("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
+ assertStringTrimLeft("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
+ assertStringTrimRight("UNICODE", "ẞaaaẞ", "ß", "ẞaaaẞ");
+ assertStringTrim("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
+ assertStringTrimLeft("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
+ assertStringTrimRight("UNICODE", "ßaaaß", "ẞ", "ßaaaß");
+ assertStringTrim("UNICODE", "Ëaaaẞ", "Ëẞ", "aaa");
+ assertStringTrimLeft("UNICODE", "Ëaaaẞ", "Ëẞ", "aaaẞ");
+ assertStringTrimRight("UNICODE", "Ëaaaẞ", "Ëẞ", "Ëaaa");
+ // Case-variable character length - UNICODE_CI.
+ assertStringTrim("UNICODE_CI", "ẞaaaẞ", "ß", "aaa");
+ assertStringTrimLeft("UNICODE_CI", "ẞaaaẞ", "ß", "aaaẞ");
+ assertStringTrimRight("UNICODE_CI", "ẞaaaẞ", "ß", "ẞaaa");
+ assertStringTrim("UNICODE_CI", "ßaaaß", "ẞ", "aaa");
+ assertStringTrimLeft("UNICODE_CI", "ßaaaß", "ẞ", "aaaß");
+ assertStringTrimRight("UNICODE_CI", "ßaaaß", "ẞ", "ßaaa");
+ assertStringTrim("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaa");
+ assertStringTrimLeft("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "aaaẞ");
+ assertStringTrimRight("UNICODE_CI", "Ëaaaẞ", "Ëẞ", "Ëaaa");
+
+ // One-to-many case mapping - UTF8_BINARY.
+ assertStringTrim("UTF8_BINARY", "i", "i", "");
+ assertStringTrim("UTF8_BINARY", "iii", "I", "iii");
+ assertStringTrim("UTF8_BINARY", "I", "iii", "I");
+ assertStringTrim("UTF8_BINARY", "ixi", "i", "x");
+ assertStringTrim("UTF8_BINARY", "i", "İ", "i");
+ assertStringTrim("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
+ assertStringTrim("UTF8_BINARY", "i\u0307", "i", "\u0307");
+ assertStringTrim("UTF8_BINARY", "i\u0307", "\u0307", "i");
+ assertStringTrim("UTF8_BINARY", "i\u0307", "i\u0307", "");
+ assertStringTrim("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", "");
+ assertStringTrim("UTF8_BINARY", "i\u0307\u0307", "i\u0307", "");
+ assertStringTrim("UTF8_BINARY", "i\u0307i", "i\u0307", "");
+ assertStringTrim("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrim("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ");
+ assertStringTrim("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307");
+ assertStringTrim("UTF8_BINARY", "İ", "İ", "");
+ assertStringTrim("UTF8_BINARY", "IXi", "İ", "IXi");
+ assertStringTrim("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrim("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x");
+ assertStringTrim("UTF8_BINARY", "i\u0307x", "ix\u0307İ", "");
+ assertStringTrim("UTF8_BINARY", "İ", "i", "İ");
+ assertStringTrim("UTF8_BINARY", "İ", "\u0307", "İ");
+ assertStringTrim("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrim("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrim("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi");
+ assertStringTrimLeft("UTF8_BINARY", "i", "i", "");
+ assertStringTrimLeft("UTF8_BINARY", "iii", "I", "iii");
+ assertStringTrimLeft("UTF8_BINARY", "I", "iii", "I");
+ assertStringTrimLeft("UTF8_BINARY", "ixi", "i", "xi");
+ assertStringTrimLeft("UTF8_BINARY", "i", "İ", "i");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i", "\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "i\u0307", "");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "i\u0307", "İ");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307İ");
+ assertStringTrimLeft("UTF8_BINARY", "İ", "İ", "");
+ assertStringTrimLeft("UTF8_BINARY", "IXi", "İ", "IXi");
+ assertStringTrimLeft("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x");
+ assertStringTrimLeft("UTF8_BINARY", "i\u0307x", "ix\u0307İ", "");
+ assertStringTrimLeft("UTF8_BINARY", "İ", "i", "İ");
+ assertStringTrimLeft("UTF8_BINARY", "İ", "\u0307", "İ");
+ assertStringTrimLeft("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimLeft("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimLeft("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi\u0307");
+ assertStringTrimRight("UTF8_BINARY", "i", "i", "");
+ assertStringTrimRight("UTF8_BINARY", "iii", "I", "iii");
+ assertStringTrimRight("UTF8_BINARY", "I", "iii", "I");
+ assertStringTrimRight("UTF8_BINARY", "ixi", "i", "ix");
+ assertStringTrimRight("UTF8_BINARY", "i", "İ", "i");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307", "İ", "i\u0307");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307", "i", "i\u0307");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307", "\u0307", "i");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307i\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307i", "i\u0307", "");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307İ", "İ", "i\u0307");
+ assertStringTrimRight("UTF8_BINARY", "İ", "İ", "");
+ assertStringTrimRight("UTF8_BINARY", "IXi", "İ", "IXi");
+ assertStringTrimRight("UTF8_BINARY", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307x", "IXİ", "i\u0307x");
+ assertStringTrimRight("UTF8_BINARY", "i\u0307x", "ix\u0307İ", "");
+ assertStringTrimRight("UTF8_BINARY", "İ", "i", "İ");
+ assertStringTrimRight("UTF8_BINARY", "İ", "\u0307", "İ");
+ assertStringTrimRight("UTF8_BINARY", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimRight("UTF8_BINARY", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimRight("UTF8_BINARY", "xi\u0307", "\u0307IX", "xi");
+ // One-to-many case mapping - UTF8_LCASE.
+ assertStringTrim("UTF8_LCASE", "i", "i", "");
+ assertStringTrim("UTF8_LCASE", "iii", "I", "");
+ assertStringTrim("UTF8_LCASE", "I", "iii", "");
+ assertStringTrim("UTF8_LCASE", "ixi", "i", "x");
+ assertStringTrim("UTF8_LCASE", "i", "İ", "i");
+ assertStringTrim("UTF8_LCASE", "i\u0307", "İ", "");
+ assertStringTrim("UTF8_LCASE", "i\u0307", "i", "\u0307");
+ assertStringTrim("UTF8_LCASE", "i\u0307", "\u0307", "i");
+ assertStringTrim("UTF8_LCASE", "i\u0307", "i\u0307", "");
+ assertStringTrim("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", "");
+ assertStringTrim("UTF8_LCASE", "i\u0307\u0307", "i\u0307", "");
+ assertStringTrim("UTF8_LCASE", "i\u0307i", "i\u0307", "");
+ assertStringTrim("UTF8_LCASE", "i\u0307i", "İ", "i");
+ assertStringTrim("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ");
+ assertStringTrim("UTF8_LCASE", "i\u0307İ", "İ", "");
+ assertStringTrim("UTF8_LCASE", "İ", "İ", "");
+ assertStringTrim("UTF8_LCASE", "IXi", "İ", "IXi");
+ assertStringTrim("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307");
+ assertStringTrim("UTF8_LCASE", "i\u0307x", "IXİ", "");
+ assertStringTrim("UTF8_LCASE", "i\u0307x", "I\u0307xİ", "");
+ assertStringTrim("UTF8_LCASE", "İ", "i", "İ");
+ assertStringTrim("UTF8_LCASE", "İ", "\u0307", "İ");
+ assertStringTrim("UTF8_LCASE", "Ixİ", "i\u0307", "xİ");
+ assertStringTrim("UTF8_LCASE", "IXİ", "ix\u0307", "İ");
+ assertStringTrim("UTF8_LCASE", "xi\u0307", "\u0307IX", "");
+ assertStringTrimLeft("UTF8_LCASE", "i", "i", "");
+ assertStringTrimLeft("UTF8_LCASE", "iii", "I", "");
+ assertStringTrimLeft("UTF8_LCASE", "I", "iii", "");
+ assertStringTrimLeft("UTF8_LCASE", "ixi", "i", "xi");
+ assertStringTrimLeft("UTF8_LCASE", "i", "İ", "i");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307", "İ", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i", "\u0307");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307\u0307", "i\u0307", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "i\u0307", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307i", "İ", "i");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "i\u0307", "İ");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307İ", "İ", "");
+ assertStringTrimLeft("UTF8_LCASE", "İ", "İ", "");
+ assertStringTrimLeft("UTF8_LCASE", "IXi", "İ", "IXi");
+ assertStringTrimLeft("UTF8_LCASE", "ix\u0307", "Ixİ", "\u0307");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "IXİ", "");
+ assertStringTrimLeft("UTF8_LCASE", "i\u0307x", "I\u0307xİ", "");
+ assertStringTrimLeft("UTF8_LCASE", "İ", "i", "İ");
+ assertStringTrimLeft("UTF8_LCASE", "İ", "\u0307", "İ");
+ assertStringTrimLeft("UTF8_LCASE", "Ixİ", "i\u0307", "xİ");
+ assertStringTrimLeft("UTF8_LCASE", "IXİ", "ix\u0307", "İ");
+ assertStringTrimLeft("UTF8_LCASE", "xi\u0307", "\u0307IX", "");
+ assertStringTrimRight("UTF8_LCASE", "i", "i", "");
+ assertStringTrimRight("UTF8_LCASE", "iii", "I", "");
+ assertStringTrimRight("UTF8_LCASE", "I", "iii", "");
+ assertStringTrimRight("UTF8_LCASE", "ixi", "i", "ix");
+ assertStringTrimRight("UTF8_LCASE", "i", "İ", "i");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307", "İ", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307", "i", "i\u0307");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307", "\u0307", "i");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307i\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307\u0307", "i\u0307", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307i", "i\u0307", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307İ", "İ", "");
+ assertStringTrimRight("UTF8_LCASE", "İ", "İ", "");
+ assertStringTrimRight("UTF8_LCASE", "IXi", "İ", "IXi");
+ assertStringTrimRight("UTF8_LCASE", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307x", "IXİ", "");
+ assertStringTrimRight("UTF8_LCASE", "i\u0307x", "I\u0307xİ", "");
+ assertStringTrimRight("UTF8_LCASE", "İ", "i", "İ");
+ assertStringTrimRight("UTF8_LCASE", "İ", "\u0307", "İ");
+ assertStringTrimRight("UTF8_LCASE", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimRight("UTF8_LCASE", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimRight("UTF8_LCASE", "xi\u0307", "\u0307IX", "");
+ // One-to-many case mapping - UNICODE.
+ assertStringTrim("UNICODE", "i", "i", "");
+ assertStringTrim("UNICODE", "iii", "I", "iii");
+ assertStringTrim("UNICODE", "I", "iii", "I");
+ assertStringTrim("UNICODE", "ixi", "i", "x");
+ assertStringTrim("UNICODE", "i", "İ", "i");
+ assertStringTrim("UNICODE", "i\u0307", "İ", "i\u0307");
+ assertStringTrim("UNICODE", "i\u0307", "i", "i\u0307");
+ assertStringTrim("UNICODE", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrim("UNICODE", "i\u0307i\u0307", "i\u0307", "i\u0307i\u0307");
+ assertStringTrim("UNICODE", "i\u0307\u0307", "i\u0307", "i\u0307\u0307");
+ assertStringTrim("UNICODE", "i\u0307i", "i\u0307", "i\u0307");
+ assertStringTrim("UNICODE", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrim("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrim("UNICODE", "i\u0307İ", "İ", "i\u0307");
+ assertStringTrim("UNICODE", "İ", "İ", "");
+ assertStringTrim("UNICODE", "IXi", "İ", "IXi");
+ assertStringTrim("UNICODE", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrim("UNICODE", "i\u0307x", "IXİ", "i\u0307x");
+ assertStringTrim("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307");
+ assertStringTrim("UNICODE", "İ", "i", "İ");
+ assertStringTrim("UNICODE", "İ", "\u0307", "İ");
+ assertStringTrim("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrim("UNICODE", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrim("UNICODE", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrim("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307");
+ assertStringTrimLeft("UNICODE", "i", "i", "");
+ assertStringTrimLeft("UNICODE", "iii", "I", "iii");
+ assertStringTrimLeft("UNICODE", "I", "iii", "I");
+ assertStringTrimLeft("UNICODE", "ixi", "i", "xi");
+ assertStringTrimLeft("UNICODE", "i", "İ", "i");
+ assertStringTrimLeft("UNICODE", "i\u0307", "İ", "i\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307", "i", "i\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307i\u0307", "i\u0307",
"i\u0307i\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307\u0307", "i\u0307",
"i\u0307\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307i", "i\u0307", "i\u0307i");
+ assertStringTrimLeft("UNICODE", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrimLeft("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrimLeft("UNICODE", "i\u0307İ", "İ", "i\u0307İ");
+ assertStringTrimLeft("UNICODE", "İ", "İ", "");
+ assertStringTrimLeft("UNICODE", "IXi", "İ", "IXi");
+ assertStringTrimLeft("UNICODE", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrimLeft("UNICODE", "i\u0307x", "IXİ", "i\u0307x");
+ assertStringTrimLeft("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307x");
+ assertStringTrimLeft("UNICODE", "İ", "i", "İ");
+ assertStringTrimLeft("UNICODE", "İ", "\u0307", "İ");
+ assertStringTrimLeft("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimLeft("UNICODE", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimLeft("UNICODE", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimLeft("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307");
+ assertStringTrimRight("UNICODE", "i", "i", "");
+ assertStringTrimRight("UNICODE", "iii", "I", "iii");
+ assertStringTrimRight("UNICODE", "I", "iii", "I");
+ assertStringTrimRight("UNICODE", "ixi", "i", "ix");
+ assertStringTrimRight("UNICODE", "i", "İ", "i");
+ assertStringTrimRight("UNICODE", "i\u0307", "İ", "i\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307", "i", "i\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307i\u0307", "i\u0307",
"i\u0307i\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307\u0307", "i\u0307",
"i\u0307\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307i", "i\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrimRight("UNICODE", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrimRight("UNICODE", "i\u0307İ", "İ", "i\u0307");
+ assertStringTrimRight("UNICODE", "İ", "İ", "");
+ assertStringTrimRight("UNICODE", "IXi", "İ", "IXi");
+ assertStringTrimRight("UNICODE", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrimRight("UNICODE", "i\u0307x", "IXİ", "i\u0307x");
+ assertStringTrimRight("UNICODE", "i\u0307x", "ix\u0307İ", "i\u0307");
+ assertStringTrimRight("UNICODE", "İ", "i", "İ");
+ assertStringTrimRight("UNICODE", "İ", "\u0307", "İ");
+ assertStringTrimRight("UNICODE", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimRight("UNICODE", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimRight("UNICODE", "xi\u0307", "\u0307IX", "xi\u0307");
+ // One-to-many case mapping - UNICODE_CI.
+ assertStringTrim("UNICODE_CI", "i", "i", "");
+ assertStringTrim("UNICODE_CI", "iii", "I", "");
+ assertStringTrim("UNICODE_CI", "I", "iii", "");
+ assertStringTrim("UNICODE_CI", "ixi", "i", "x");
+ assertStringTrim("UNICODE_CI", "i", "İ", "i");
+ assertStringTrim("UNICODE_CI", "i\u0307", "İ", "");
+ assertStringTrim("UNICODE_CI", "i\u0307", "i", "i\u0307");
+ assertStringTrim("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrim("UNICODE_CI", "i\u0307i\u0307", "i\u0307",
"i\u0307i\u0307");
+ assertStringTrim("UNICODE_CI", "i\u0307\u0307", "i\u0307",
"i\u0307\u0307");
+ assertStringTrim("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307");
+ assertStringTrim("UNICODE_CI", "i\u0307i", "İ", "i");
+ assertStringTrim("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrim("UNICODE_CI", "i\u0307İ", "İ", "");
+ assertStringTrim("UNICODE_CI", "İ", "İ", "");
+ assertStringTrim("UNICODE_CI", "IXi", "İ", "IXi");
+ assertStringTrim("UNICODE_CI", "ix\u0307", "Ixİ", "x\u0307");
+ assertStringTrim("UNICODE_CI", "i\u0307x", "IXİ", "");
+ assertStringTrim("UNICODE_CI", "i\u0307x", "I\u0307xİ", "");
+ assertStringTrim("UNICODE_CI", "İ", "i", "İ");
+ assertStringTrim("UNICODE_CI", "İ", "\u0307", "İ");
+ assertStringTrim("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrim("UNICODE_CI", "Ixİ", "i\u0307", "xİ");
+ assertStringTrim("UNICODE_CI", "IXİ", "ix\u0307", "İ");
+ assertStringTrim("UNICODE_CI", "xi\u0307", "\u0307IX", "i\u0307");
+ assertStringTrimLeft("UNICODE_CI", "i", "i", "");
+ assertStringTrimLeft("UNICODE_CI", "iii", "I", "");
+ assertStringTrimLeft("UNICODE_CI", "I", "iii", "");
+ assertStringTrimLeft("UNICODE_CI", "ixi", "i", "xi");
+ assertStringTrimLeft("UNICODE_CI", "i", "İ", "i");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307", "İ", "");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307", "i", "i\u0307");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307i\u0307", "i\u0307",
"i\u0307i\u0307");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307\u0307", "i\u0307",
"i\u0307\u0307");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307i");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307i", "İ", "i");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307İ", "İ", "");
+ assertStringTrimLeft("UNICODE_CI", "İ", "İ", "");
+ assertStringTrimLeft("UNICODE_CI", "IXi", "İ", "IXi");
+ assertStringTrimLeft("UNICODE_CI", "ix\u0307", "Ixİ", "x\u0307");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307x", "IXİ", "");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307x", "I\u0307xİ", "");
+ assertStringTrimLeft("UNICODE_CI", "İ", "i", "İ");
+ assertStringTrimLeft("UNICODE_CI", "İ", "\u0307", "İ");
+ assertStringTrimLeft("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimLeft("UNICODE_CI", "Ixİ", "i\u0307", "xİ");
+ assertStringTrimLeft("UNICODE_CI", "IXİ", "ix\u0307", "İ");
+ assertStringTrimLeft("UNICODE_CI", "xi\u0307", "\u0307IX", "i\u0307");
+ assertStringTrimRight("UNICODE_CI", "i", "i", "");
+ assertStringTrimRight("UNICODE_CI", "iii", "I", "");
+ assertStringTrimRight("UNICODE_CI", "I", "iii", "");
+ assertStringTrimRight("UNICODE_CI", "ixi", "i", "ix");
+ assertStringTrimRight("UNICODE_CI", "i", "İ", "i");
+ assertStringTrimRight("UNICODE_CI", "i\u0307", "İ", "");
+ assertStringTrimRight("UNICODE_CI", "i\u0307", "i", "i\u0307");
+ assertStringTrimRight("UNICODE_CI", "i\u0307", "\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE_CI", "i\u0307i\u0307", "i\u0307",
"i\u0307i\u0307");
+ assertStringTrimRight("UNICODE_CI", "i\u0307\u0307", "i\u0307",
"i\u0307\u0307");
+ assertStringTrimRight("UNICODE_CI", "i\u0307i", "i\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE_CI", "i\u0307i", "İ", "i\u0307i");
+ assertStringTrimRight("UNICODE_CI", "i\u0307İ", "i\u0307", "i\u0307İ");
+ assertStringTrimRight("UNICODE_CI", "i\u0307İ", "İ", "");
+ assertStringTrimRight("UNICODE_CI", "İ", "İ", "");
+ assertStringTrimRight("UNICODE_CI", "IXi", "İ", "IXi");
+ assertStringTrimRight("UNICODE_CI", "ix\u0307", "Ixİ", "ix\u0307");
+ assertStringTrimRight("UNICODE_CI", "i\u0307x", "IXİ", "");
+ assertStringTrimRight("UNICODE_CI", "i\u0307x", "I\u0307xİ", "");
+ assertStringTrimRight("UNICODE_CI", "İ", "i", "İ");
+ assertStringTrimRight("UNICODE_CI", "İ", "\u0307", "İ");
+ assertStringTrimRight("UNICODE_CI", "i\u0307", "i\u0307", "i\u0307");
+ assertStringTrimRight("UNICODE_CI", "Ixİ", "i\u0307", "Ixİ");
+ assertStringTrimRight("UNICODE_CI", "IXİ", "ix\u0307", "IXİ");
+ assertStringTrimRight("UNICODE_CI", "xi\u0307", "\u0307IX", "xi\u0307");
+
+ // Greek sigmas - UTF8_BINARY.
+ assertStringTrim("UTF8_BINARY", "ςxς", "σ", "ςxς");
+ assertStringTrim("UTF8_BINARY", "ςxς", "ς", "x");
+ assertStringTrim("UTF8_BINARY", "ςxς", "Σ", "ςxς");
+ assertStringTrim("UTF8_BINARY", "σxσ", "σ", "x");
+ assertStringTrim("UTF8_BINARY", "σxσ", "ς", "σxσ");
+ assertStringTrim("UTF8_BINARY", "σxσ", "Σ", "σxσ");
+ assertStringTrim("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
+ assertStringTrim("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
+ assertStringTrim("UTF8_BINARY", "ΣxΣ", "Σ", "x");
+ assertStringTrimLeft("UTF8_BINARY", "ςxς", "σ", "ςxς");
+ assertStringTrimLeft("UTF8_BINARY", "ςxς", "ς", "xς");
+ assertStringTrimLeft("UTF8_BINARY", "ςxς", "Σ", "ςxς");
+ assertStringTrimLeft("UTF8_BINARY", "σxσ", "σ", "xσ");
+ assertStringTrimLeft("UTF8_BINARY", "σxσ", "ς", "σxσ");
+ assertStringTrimLeft("UTF8_BINARY", "σxσ", "Σ", "σxσ");
+ assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
+ assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
+ assertStringTrimLeft("UTF8_BINARY", "ΣxΣ", "Σ", "xΣ");
+ assertStringTrimRight("UTF8_BINARY", "ςxς", "σ", "ςxς");
+ assertStringTrimRight("UTF8_BINARY", "ςxς", "ς", "ςx");
+ assertStringTrimRight("UTF8_BINARY", "ςxς", "Σ", "ςxς");
+ assertStringTrimRight("UTF8_BINARY", "σxσ", "σ", "σx");
+ assertStringTrimRight("UTF8_BINARY", "σxσ", "ς", "σxσ");
+ assertStringTrimRight("UTF8_BINARY", "σxσ", "Σ", "σxσ");
+ assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "σ", "ΣxΣ");
+ assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "ς", "ΣxΣ");
+ assertStringTrimRight("UTF8_BINARY", "ΣxΣ", "Σ", "Σx");
+ // Greek sigmas - UTF8_LCASE.
+ assertStringTrim("UTF8_LCASE", "ςxς", "σ", "x");
+ assertStringTrim("UTF8_LCASE", "ςxς", "ς", "x");
+ assertStringTrim("UTF8_LCASE", "ςxς", "Σ", "x");
+ assertStringTrim("UTF8_LCASE", "σxσ", "σ", "x");
+ assertStringTrim("UTF8_LCASE", "σxσ", "ς", "x");
+ assertStringTrim("UTF8_LCASE", "σxσ", "Σ", "x");
+ assertStringTrim("UTF8_LCASE", "ΣxΣ", "σ", "x");
+ assertStringTrim("UTF8_LCASE", "ΣxΣ", "ς", "x");
+ assertStringTrim("UTF8_LCASE", "ΣxΣ", "Σ", "x");
+ assertStringTrimLeft("UTF8_LCASE", "ςxς", "σ", "xς");
+ assertStringTrimLeft("UTF8_LCASE", "ςxς", "ς", "xς");
+ assertStringTrimLeft("UTF8_LCASE", "ςxς", "Σ", "xς");
+ assertStringTrimLeft("UTF8_LCASE", "σxσ", "σ", "xσ");
+ assertStringTrimLeft("UTF8_LCASE", "σxσ", "ς", "xσ");
+ assertStringTrimLeft("UTF8_LCASE", "σxσ", "Σ", "xσ");
+ assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "σ", "xΣ");
+ assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "ς", "xΣ");
+ assertStringTrimLeft("UTF8_LCASE", "ΣxΣ", "Σ", "xΣ");
+ assertStringTrimRight("UTF8_LCASE", "ςxς", "σ", "ςx");
+ assertStringTrimRight("UTF8_LCASE", "ςxς", "ς", "ςx");
+ assertStringTrimRight("UTF8_LCASE", "ςxς", "Σ", "ςx");
+ assertStringTrimRight("UTF8_LCASE", "σxσ", "σ", "σx");
+ assertStringTrimRight("UTF8_LCASE", "σxσ", "ς", "σx");
+ assertStringTrimRight("UTF8_LCASE", "σxσ", "Σ", "σx");
+ assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "σ", "Σx");
+ assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "ς", "Σx");
+ assertStringTrimRight("UTF8_LCASE", "ΣxΣ", "Σ", "Σx");
+ // Greek sigmas - UNICODE.
+ assertStringTrim("UNICODE", "ςxς", "σ", "ςxς");
+ assertStringTrim("UNICODE", "ςxς", "ς", "x");
+ assertStringTrim("UNICODE", "ςxς", "Σ", "ςxς");
+ assertStringTrim("UNICODE", "σxσ", "σ", "x");
+ assertStringTrim("UNICODE", "σxσ", "ς", "σxσ");
+ assertStringTrim("UNICODE", "σxσ", "Σ", "σxσ");
+ assertStringTrim("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
+ assertStringTrim("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
+ assertStringTrim("UNICODE", "ΣxΣ", "Σ", "x");
+ assertStringTrimLeft("UNICODE", "ςxς", "σ", "ςxς");
+ assertStringTrimLeft("UNICODE", "ςxς", "ς", "xς");
+ assertStringTrimLeft("UNICODE", "ςxς", "Σ", "ςxς");
+ assertStringTrimLeft("UNICODE", "σxσ", "σ", "xσ");
+ assertStringTrimLeft("UNICODE", "σxσ", "ς", "σxσ");
+ assertStringTrimLeft("UNICODE", "σxσ", "Σ", "σxσ");
+ assertStringTrimLeft("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
+ assertStringTrimLeft("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
+ assertStringTrimLeft("UNICODE", "ΣxΣ", "Σ", "xΣ");
+ assertStringTrimRight("UNICODE", "ςxς", "σ", "ςxς");
+ assertStringTrimRight("UNICODE", "ςxς", "ς", "ςx");
+ assertStringTrimRight("UNICODE", "ςxς", "Σ", "ςxς");
+ assertStringTrimRight("UNICODE", "σxσ", "σ", "σx");
+ assertStringTrimRight("UNICODE", "σxσ", "ς", "σxσ");
+ assertStringTrimRight("UNICODE", "σxσ", "Σ", "σxσ");
+ assertStringTrimRight("UNICODE", "ΣxΣ", "σ", "ΣxΣ");
+ assertStringTrimRight("UNICODE", "ΣxΣ", "ς", "ΣxΣ");
+ assertStringTrimRight("UNICODE", "ΣxΣ", "Σ", "Σx");
+ // Greek sigmas - UNICODE_CI.
+ assertStringTrim("UNICODE_CI", "ςxς", "σ", "x");
+ assertStringTrim("UNICODE_CI", "ςxς", "ς", "x");
+ assertStringTrim("UNICODE_CI", "ςxς", "Σ", "x");
+ assertStringTrim("UNICODE_CI", "σxσ", "σ", "x");
+ assertStringTrim("UNICODE_CI", "σxσ", "ς", "x");
+ assertStringTrim("UNICODE_CI", "σxσ", "Σ", "x");
+ assertStringTrim("UNICODE_CI", "ΣxΣ", "σ", "x");
+ assertStringTrim("UNICODE_CI", "ΣxΣ", "ς", "x");
+ assertStringTrim("UNICODE_CI", "ΣxΣ", "Σ", "x");
+ assertStringTrimLeft("UNICODE_CI", "ςxς", "σ", "xς");
+ assertStringTrimLeft("UNICODE_CI", "ςxς", "ς", "xς");
+ assertStringTrimLeft("UNICODE_CI", "ςxς", "Σ", "xς");
+ assertStringTrimLeft("UNICODE_CI", "σxσ", "σ", "xσ");
+ assertStringTrimLeft("UNICODE_CI", "σxσ", "ς", "xσ");
+ assertStringTrimLeft("UNICODE_CI", "σxσ", "Σ", "xσ");
+ assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "σ", "xΣ");
+ assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "ς", "xΣ");
+ assertStringTrimLeft("UNICODE_CI", "ΣxΣ", "Σ", "xΣ");
+ assertStringTrimRight("UNICODE_CI", "ςxς", "σ", "ςx");
+ assertStringTrimRight("UNICODE_CI", "ςxς", "ς", "ςx");
+ assertStringTrimRight("UNICODE_CI", "ςxς", "Σ", "ςx");
+ assertStringTrimRight("UNICODE_CI", "σxσ", "σ", "σx");
+ assertStringTrimRight("UNICODE_CI", "σxσ", "ς", "σx");
+ assertStringTrimRight("UNICODE_CI", "σxσ", "Σ", "σx");
+ assertStringTrimRight("UNICODE_CI", "ΣxΣ", "σ", "Σx");
+ assertStringTrimRight("UNICODE_CI", "ΣxΣ", "ς", "Σx");
+ assertStringTrimRight("UNICODE_CI", "ΣxΣ", "Σ", "Σx");
+
+ // Unicode normalization - UTF8_BINARY.
+ assertStringTrim("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A");
+ assertStringTrimLeft("UTF8_BINARY", "åβγδa\u030A", "å", "βγδa\u030A");
+ assertStringTrimRight("UTF8_BINARY", "åβγδa\u030A", "å", "åβγδa\u030A");
+ // Unicode normalization - UTF8_LCASE.
+ assertStringTrim("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A");
+ assertStringTrimLeft("UTF8_LCASE", "åβγδa\u030A", "Å", "βγδa\u030A");
+ assertStringTrimRight("UTF8_LCASE", "åβγδa\u030A", "Å", "åβγδa\u030A");
+ // Unicode normalization - UNICODE.
+ assertStringTrim("UNICODE", "åβγδa\u030A", "å", "βγδ");
+ assertStringTrimLeft("UNICODE", "åβγδa\u030A", "å", "βγδa\u030A");
+ assertStringTrimRight("UNICODE", "åβγδa\u030A", "å", "åβγδ");
+ // Unicode normalization - UNICODE_CI.
+ assertStringTrim("UNICODE_CI", "åβγδa\u030A", "Å", "βγδ");
+ assertStringTrimLeft("UNICODE_CI", "åβγδa\u030A", "Å", "βγδa\u030A");
+ assertStringTrimRight("UNICODE_CI", "åβγδa\u030A", "Å", "åβγδ");
}
private void assertStringTranslate(
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
index 1302ca80e51a..fec782002bb7 100755
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala
@@ -38,7 +38,7 @@ import org.apache.spark.sql.catalyst.trees.TreePattern.{TreePattern, UPPER_OR_LO
import org.apache.spark.sql.catalyst.util.{ArrayData, CharsetProvider,
CollationFactory, CollationSupport, GenericArrayData, TypeUtils}
import org.apache.spark.sql.errors.{QueryCompilationErrors,
QueryExecutionErrors}
import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation, StringTypeBinaryLcase}
+import org.apache.spark.sql.internal.types.{AbstractArrayType, StringTypeAnyCollation}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.UTF8StringBuilder
import org.apache.spark.unsafe.array.ByteArrayMethods
@@ -1232,7 +1232,7 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes {
override def children: Seq[Expression] = srcStr +: trimStr.toSeq
override def dataType: DataType = srcStr.dataType
- override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringTypeBinaryLcase)
+ override def inputTypes: Seq[AbstractDataType] = Seq.fill(children.size)(StringTypeAnyCollation)
final lazy val collationId: Int =
srcStr.dataType.asInstanceOf[StringType].collationId
@@ -1260,11 +1260,11 @@ trait String2TrimExpression extends Expression with ImplicitCastInputTypes {
if (evals.length == 1) {
val stringTrimCode: String = this match {
case _: StringTrim =>
- CollationSupport.StringTrim.genCode(srcString.value, collationId)
+ CollationSupport.StringTrim.genCode(srcString.value)
case _: StringTrimLeft =>
- CollationSupport.StringTrimLeft.genCode(srcString.value, collationId)
+ CollationSupport.StringTrimLeft.genCode(srcString.value)
case _: StringTrimRight =>
- CollationSupport.StringTrimRight.genCode(srcString.value, collationId)
+ CollationSupport.StringTrimRight.genCode(srcString.value)
}
ev.copy(code = code"""
|${srcString.code}
@@ -1390,7 +1390,7 @@ case class StringTrim(srcStr: Expression, trimStr: Option[Expression] = None)
override protected def direction: String = "BOTH"
override def doEval(srcString: UTF8String): UTF8String =
- CollationSupport.StringTrim.exec(srcString, collationId)
+ CollationSupport.StringTrim.exec(srcString)
override def doEval(srcString: UTF8String, trimString: UTF8String):
UTF8String =
CollationSupport.StringTrim.exec(srcString, trimString, collationId)
@@ -1497,7 +1497,7 @@ case class StringTrimLeft(srcStr: Expression, trimStr: Option[Expression] = None
override protected def direction: String = "LEADING"
override def doEval(srcString: UTF8String): UTF8String =
- CollationSupport.StringTrimLeft.exec(srcString, collationId)
+ CollationSupport.StringTrimLeft.exec(srcString)
override def doEval(srcString: UTF8String, trimString: UTF8String):
UTF8String =
CollationSupport.StringTrimLeft.exec(srcString, trimString, collationId)
@@ -1557,7 +1557,7 @@ case class StringTrimRight(srcStr: Expression, trimStr: Option[Expression] = Non
override protected def direction: String = "TRAILING"
override def doEval(srcString: UTF8String): UTF8String =
- CollationSupport.StringTrimRight.exec(srcString, collationId)
+ CollationSupport.StringTrimRight.exec(srcString)
override def doEval(srcString: UTF8String, trimString: UTF8String):
UTF8String =
CollationSupport.StringTrimRight.exec(srcString, trimString, collationId)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
index 5f722b2f01fb..815a8bc59529 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala
@@ -875,6 +875,37 @@ class CollationStringExpressionsSuite
}
test("StringTrim* functions - unit tests for both paths (codegen and eval)")
{
+ def evalStringTrim(src: Any, trim: Any, result: String): Unit = {
+ Seq("UTF8_BINARY", "UTF8_LCASE", "UNICODE", "UNICODE_CI").foreach { collation =>
+ val dt: DataType = StringType(collation)
+ checkEvaluation(StringTrim(Literal.create(src, dt), Literal.create(trim, dt)), result)
+ checkEvaluation(StringTrimLeft(Literal.create(src, dt), Literal.create(trim, dt)), result)
+ checkEvaluation(StringTrimRight(Literal.create(src, dt), Literal.create(trim, dt)), result)
+ }
+ }
+ // General edge cases and basic tests.
+ evalStringTrim(null, null, null)
+ evalStringTrim(null, "", null)
+ evalStringTrim(null, "a", null)
+ evalStringTrim("", null, null)
+ evalStringTrim("a", null, null)
+ evalStringTrim("", "", "")
+ evalStringTrim("", " ", "")
+ evalStringTrim("", "a", "")
+ evalStringTrim("", "aaa", "")
+ evalStringTrim(" ", "", " ")
+ evalStringTrim("a", "", "a")
+ evalStringTrim("aaa", "", "aaa")
+ evalStringTrim(" ", " ", "")
+ evalStringTrim(" ", " ", "")
+ evalStringTrim(" ", " ", "")
+ evalStringTrim(" ", " ", "")
+ evalStringTrim("a", "aaa", "")
+ evalStringTrim("aaa", "a", "")
+ evalStringTrim("aaa", "aaa", "")
+ evalStringTrim("abc", "cba", "")
+ evalStringTrim("cba", "abc", "")
+
// Without trimString param.
checkEvaluation(
StringTrim(Literal.create( " asd ", StringType("UTF8_BINARY"))), "asd")
@@ -1019,20 +1050,6 @@ class CollationStringExpressionsSuite
assert(collationMismatch.getErrorClass === "COLLATION_MISMATCH.EXPLICIT")
}
- test("StringTrim* functions - unsupported collation types") {
- List("TRIM", "LTRIM", "RTRIM").foreach(func => {
- val collationMismatch = intercept[AnalysisException] {
- sql("SELECT " + func + "(COLLATE('x', 'UNICODE_CI'), COLLATE('xxaaaxx', 'UNICODE_CI'))")
- }
- assert(collationMismatch.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE")
- })
-
- val collationMismatch = intercept[AnalysisException] {
- sql("SELECT BTRIM(COLLATE('xxaaaxx', 'UNICODE_CI'), COLLATE('x', 'UNICODE_CI'))")
- }
- assert(collationMismatch.getErrorClass === "DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE")
- }
-
// TODO: Add more tests for other string expressions
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]