uros-db commented on code in PR #48521:
URL: https://github.com/apache/spark/pull/48521#discussion_r1815201714
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -1260,7 +1364,32 @@ public static UTF8String lowercaseTrimRight(
// Iterate over `srcString` from the right to find the first character
that is not in the set.
int searchIndex = srcString.numChars(), codePoint, codePointBuffer = -1;
+
+ // In cases of trim collation (rtrim for example) trailing spaces should
be ignored.
+ // If trimString contains spaces this behaviour is not important as they
would get trimmed
+ // anyway. However, if it is not the case they should be ignored and then
appended after
+ // trimming other characters.
+ int lastNonSpaceByteIdx = srcString.numBytes(), lastNonSpaceCharacterIdx =
srcString.numChars();
+ if (!trimChars.contains(0x20) &&
+ CollationFactory.ignoresSpacesInTrimFunctions(
+ collationId, /*isLTrim=*/ false, /*isRTrim=*/true)) {
+ while (lastNonSpaceByteIdx > 0 &&
+ srcString.getByte(lastNonSpaceByteIdx - 1) == ' ') {
+ --lastNonSpaceByteIdx;
+ }
+ // In case of src string contains only spaces there is no need to do any
trimming, since it's
+ // already checked that trim string does not contain any spaces.
+ if (lastNonSpaceByteIdx == 0) {
+ return srcString;
+ }
+ searchIndex = lastNonSpaceCharacterIdx =
+ srcString.numChars() - (srcString.numBytes() - lastNonSpaceByteIdx);
+ }
Iterator<Integer> srcIter = srcString.reverseCodePointIterator();
+ for(int i = lastNonSpaceCharacterIdx; i < srcString.numChars(); i++) {
Review Comment:
```suggestion
for (int i = lastNonSpaceCharacterIdx; i < srcString.numChars(); i++) {
```
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -1232,22 +1255,103 @@ public static UTF8String trimLeft(
// Return the substring from the calculated position until the end of the
string.
return UTF8String.fromString(src.substring(charIndex));
}
+ /**
+ * Trims the `srcString` string from the right side using the specified
`trimString` characters,
+ * with respect to the UTF8_BINARY trim collation. For UTF8_BINARY trim
collation, the method has
+ * one special case to cover with respect to trimRight function for regular
UTF8_Binary collation.
+ * Trailing spaces should be ignored in case of trim collation (rtrim for
example) and if
+ * trimString does not contain spaces. In this case, the method trims the
string from the right
+ * and after call of regular trim functions returns back trimmed spaces as
those should not get
+ * removed.
+ * @param srcString the input string to be trimmed from the right end of the
string
+ * @param trimString the trim string characters to trim
+ * @param collationId the collation ID to use for string trim
+ * @return the trimmed string (for UTF8_LCASE collation)
+ */
+ public static UTF8String binaryTrimRight(
+ final UTF8String srcString,
+ final UTF8String trimString,
+ final int collationId) {
+ // Matching the default UTF8String behavior for null `trimString`.
+ if (trimString == null) {
+ return null;
+ }
+
+ // Create a hash set of code points for all characters of `trimString`.
+ HashSet<Integer> trimChars = new HashSet<>();
+ Iterator<Integer> trimIter = trimString.codePointIterator();
+ while (trimIter.hasNext()) trimChars.add(trimIter.next());
+
+ // Iterate over `srcString` from the right to find the first character
that is not in the set.
+ int searchIndex = srcString.numChars(), codePoint, codePointBuffer = -1;
+
+ // In cases of trim collation (rtrim for example) trailing spaces should
be ignored.
+ // If trimString contains spaces this behaviour is not important as they
would get trimmed
+ // anyway. However, if it is not the case they should be ignored and then
appended after
+ // trimming other characters.
+ int lastNonSpaceByteIdx = srcString.numBytes(), lastNonSpaceCharacterIdx =
srcString.numChars();
+ if (!trimChars.contains(0x20) &&
+ CollationFactory.ignoresSpacesInTrimFunctions(
+ collationId, /*isLTrim=*/ false, /*isRTrim=*/true)) {
+ while (lastNonSpaceByteIdx > 0 &&
+ srcString.getByte(lastNonSpaceByteIdx - 1) == ' ') {
+ --lastNonSpaceByteIdx;
+ }
+ // In case of src string contains only spaces there is no need to do any
trimming, since it's
+ // already checked that trim string does not contain any spaces.
+ if (lastNonSpaceByteIdx == 0) {
+ return srcString;
+ }
+ searchIndex = lastNonSpaceCharacterIdx =
+ srcString.numChars() - (srcString.numBytes() - lastNonSpaceByteIdx);
+ }
+ Iterator<Integer> srcIter = srcString.reverseCodePointIterator();
+ for(int i = lastNonSpaceCharacterIdx; i < srcString.numChars(); i++) {
Review Comment:
```suggestion
for (int i = lastNonSpaceCharacterIdx; i < srcString.numChars(); i++) {
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]