uros-db commented on code in PR #46682:
URL: https://github.com/apache/spark/pull/46682#discussion_r1613447350
##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##########
@@ -34,6 +34,155 @@
* Utility class for collation-aware UTF8String operations.
*/
public class CollationAwareUTF8String {
+
+ /**
+ * The constant value to indicate that the match is not found when searching
for a pattern
+ * string in a target string.
+ */
+ private static final int MATCH_NOT_FOUND = -1;
+
+ /**
+ * Returns whether the target string starts with the specified prefix,
starting from the
+ * specified position (0-based index referring to character position in
UTF8String), with respect
+ * to the UTF8_BINARY_LCASE collation. The method assumes that the prefix is
already lowercased
+ * prior to method call to avoid the overhead of calling .toLowerCase()
multiple times on the
+ * same prefix string.
+ *
+ * @param target the string to be searched in
+ * @param lowercasePattern the string to be searched for
+ * @param startPos the start position for searching (in the target string)
+ * @return whether the target string starts with the specified prefix in
UTF8_BINARY_LCASE
+ */
+ public static boolean lowercaseMatchFrom(
+ final UTF8String target,
+ final UTF8String lowercasePattern,
+ int startPos) {
+ return lowercaseMatchLengthFrom(target, lowercasePattern, startPos) !=
MATCH_NOT_FOUND;
+ }
+
+ /**
+ * Returns the length of the substring of the target string that starts with
the specified
+ * prefix, starting from the specified position (0-based index referring to
character position
+ * in UTF8String), with respect to the UTF8_BINARY_LCASE collation. The
method assumes that the
+ * prefix is already lowercased. The method only considers the part of
target string that
+ * starts from the specified (inclusive) position (that is, the method does
not look at UTF8
+ * characters of the target string before position `startPos`). If the
prefix is not found,
+ * MATCH_NOT_FOUND is returned.
+ *
+ * @param target the string to be searched in
+ * @param lowercasePattern the string to be searched for
+ * @param startPos the start position for searching (in the target string)
+ * @return length of the target substring that starts with the specified
prefix in lowercase
+ */
+ public static int lowercaseMatchLengthFrom(
+ final UTF8String target,
+ final UTF8String lowercasePattern,
+ int startPos) {
+ assert startPos >= 0;
+ for (int len = 0; len <= target.numChars() - startPos; ++len) {
+ if (target.substring(startPos, startPos +
len).toLowerCase().equals(lowercasePattern)) {
+ return len;
+ }
+ }
+ return MATCH_NOT_FOUND;
+ }
+
+ /**
+ * Returns the position of the first occurrence of the pattern string in the
target string,
+ * starting from the specified position (0-based index referring to
character position in
+ * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method
assumes that the
+ * pattern string is already lowercased prior to call. If the pattern is not
found,
+ * MATCH_NOT_FOUND is returned.
+ *
+ * @param target the string to be searched in
+ * @param lowercasePattern the string to be searched for
+ * @param startPos the start position for searching (in the target string)
+ * @return the position of the first occurrence of pattern in target
+ */
+ public static int lowercaseFind(
+ final UTF8String target,
+ final UTF8String lowercasePattern,
+ int startPos) {
+ assert startPos >= 0;
+ for (int i = startPos; i <= target.numChars(); ++i) {
+ if (lowercaseMatchFrom(target, lowercasePattern, i)) {
+ return i;
+ }
+ }
+ return MATCH_NOT_FOUND;
+ }
+
+ /**
+ * Returns whether the target string ends with the specified suffix, ending
at the specified
+ * position (0-based index referring to character position in UTF8String),
with respect to the
+ * UTF8_BINARY_LCASE collation. The method assumes that the suffix is
already lowercased prior
+ * to method call to avoid the overhead of calling .toLowerCase() multiple
times on the same
+ * suffix string.
+ *
+ * @param target the string to be searched in
+ * @param lowercasePattern the string to be searched for
+ * @param endPos the end position for searching (in the target string)
+ * @return whether the target string ends with the specified suffix in
lowercase
+ */
+ public static boolean lowercaseMatchUntil(
+ final UTF8String target,
+ final UTF8String lowercasePattern,
+ int endPos) {
+ return lowercaseMatchLengthUntil(target, lowercasePattern, endPos) !=
MATCH_NOT_FOUND;
+ }
+
+ /**
+ * Returns the length of the substring of the target string that ends with
the specified
+ * suffix, ending at the specified position (0-based index referring to
character position in
+ * UTF8String), with respect to the UTF8_BINARY_LCASE collation. The method
assumes that the
+ * suffix is already lowercased. The method only considers the part of
target string that ends
+ * at the specified (non-inclusive) position (that is, the method does not
look at UTF8
+ * characters of the target string at or after position `endPos`). If the
suffix is not found,
+ * MATCH_NOT_FOUND is returned.
+ *
+ * @param target the string to be searched in
+ * @param lowercasePattern the string to be searched for
+ * @param endPos the end position for searching (in the target string)
+ * @return length of the target substring that ends with the specified
suffix in lowercase
+ */
+ public static int lowercaseMatchLengthUntil(
Review Comment:
this one can go private
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]